From 4d594660858f2e83a63745625fcc895aecfba0ff Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Wed, 12 Mar 2025 00:54:37 -0400 Subject: [PATCH 001/103] Add a simple threadpool. --- cpp/benchmarks/CMakeLists.txt | 10 +++ .../threadpool/threadpool_benchmark.cpp | 3 + cpp/benchmarks/utils/utils.cpp | 31 +++++++++ cpp/benchmarks/utils/utils.hpp | 21 +++++++ cpp/include/kvikio/threadpool_simple.hpp | 63 +++++++++++++++++++ 5 files changed, 128 insertions(+) create mode 100644 cpp/benchmarks/utils/utils.cpp create mode 100644 cpp/benchmarks/utils/utils.hpp create mode 100644 cpp/include/kvikio/threadpool_simple.hpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 665d45edfb..a103231406 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -44,6 +44,10 @@ function(kvikio_add_benchmark) add_executable(${_KVIKIO_NAME} ${_KVIKIO_SOURCES}) set_target_properties(${_KVIKIO_NAME} PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib") +<<<<<<< HEAD +======= + target_include_directories(${_KVIKIO_NAME} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}") +>>>>>>> e54d507 (Add a simple threadpool.) target_link_libraries(${_KVIKIO_NAME} PUBLIC benchmark::benchmark kvikio::kvikio) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") @@ -59,4 +63,10 @@ function(kvikio_add_benchmark) ) endfunction() +<<<<<<< HEAD kvikio_add_benchmark(NAME THREADPOOL_BENCHMARK SOURCES "threadpool/threadpool_benchmark.cpp") +======= +kvikio_add_benchmark( + NAME THREADPOOL_BENCHMARK SOURCES "threadpool/threadpool_benchmark.cpp" "utils/utils.cpp" +) +>>>>>>> e54d507 (Add a simple threadpool.) diff --git a/cpp/benchmarks/threadpool/threadpool_benchmark.cpp b/cpp/benchmarks/threadpool/threadpool_benchmark.cpp index 1c90579d22..86dcb9c9ad 100644 --- a/cpp/benchmarks/threadpool/threadpool_benchmark.cpp +++ b/cpp/benchmarks/threadpool/threadpool_benchmark.cpp @@ -28,6 +28,7 @@ #include #include +#include namespace kvikio { enum class ScalingType : uint8_t { @@ -88,6 +89,8 @@ int main(int argc, char** argv) ->Unit(benchmark::kMillisecond) ->MinTime(2); + kvikio::utils::explain_default_metrics(); + benchmark::RunSpecifiedBenchmarks(); benchmark::Shutdown(); } diff --git a/cpp/benchmarks/utils/utils.cpp b/cpp/benchmarks/utils/utils.cpp new file mode 100644 index 0000000000..ae71a2c2bc --- /dev/null +++ b/cpp/benchmarks/utils/utils.cpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace kvikio::utils { +void explain_default_metrics() +{ + benchmark::AddCustomContext( + "Time", + "The average real time (i.e. wall-clock time) of the entire process per benchmark iteration."); + benchmark::AddCustomContext( + "CPU", + "The average CPU time of the main thread per benchmark iteration. 
The timer is accumulated "
+    "only when the main thread is being executed.");
+}
+}  // namespace kvikio::utils
diff --git a/cpp/benchmarks/utils/utils.hpp b/cpp/benchmarks/utils/utils.hpp
new file mode 100644
index 0000000000..dd0090f27b
--- /dev/null
+++ b/cpp/benchmarks/utils/utils.hpp
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace kvikio::utils {
+void explain_default_metrics();
+}
diff --git a/cpp/include/kvikio/threadpool_simple.hpp b/cpp/include/kvikio/threadpool_simple.hpp
new file mode 100644
index 0000000000..20e995a6d7
--- /dev/null
+++ b/cpp/include/kvikio/threadpool_simple.hpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <atomic>
+#include <functional>
+#include <future>
+#include <thread>
+#include <type_traits>
+#include <vector>
+
+namespace kvikio {
+/**
+ * @brief A simple thread pool that executes tasks in an embarrassingly parallel manner.
+ *
+ * The implementation is header-only.
+ */ +class ThreadPoolSimple { + public: + ThreadPoolSimple( + unsigned int num_threads, const std::function& worker_thread_init_func = [] {}) + : _num_threads{num_threads}, _worker_thread_init_func{worker_thread_init_func} + { + } + + void reset(); + + template >> + [[nodiscard]] std::future submit_task(F&& task) + { + } + + private: + void worker() {} + + void create_threads() + { + for (unsigned int i = 0; i < _num_threads; ++i) { + _thread_container.emplace_back(&ThreadPoolSimple::worker, _worker_thread_init_func); + } + } + + void destroy_threads() {} + + std::atomic_bool _done{false}; + unsigned int _num_threads{}; + std::function _worker_thread_init_func{}; + std::vector _thread_container{}; +}; + +} // namespace kvikio \ No newline at end of file From 3458b3722c419cee3b9b28b12c7c4390faf45c11 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 17 Mar 2025 23:00:57 -0400 Subject: [PATCH 002/103] Update --- cpp/benchmarks/CMakeLists.txt | 7 - .../threadpool/threadpool_benchmark.cpp | 53 +++++- cpp/include/kvikio/function_wrapper.hpp | 67 +++++++ cpp/include/kvikio/threadpool_simple.hpp | 172 ++++++++++++++++-- 4 files changed, 268 insertions(+), 31 deletions(-) create mode 100644 cpp/include/kvikio/function_wrapper.hpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index a103231406..18694bafc0 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -44,10 +44,7 @@ function(kvikio_add_benchmark) add_executable(${_KVIKIO_NAME} ${_KVIKIO_SOURCES}) set_target_properties(${_KVIKIO_NAME} PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib") -<<<<<<< HEAD -======= target_include_directories(${_KVIKIO_NAME} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}") ->>>>>>> e54d507 (Add a simple threadpool.) target_link_libraries(${_KVIKIO_NAME} PUBLIC benchmark::benchmark kvikio::kvikio) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") @@ -63,10 +60,6 @@ function(kvikio_add_benchmark) ) endfunction() -<<<<<<< HEAD -kvikio_add_benchmark(NAME THREADPOOL_BENCHMARK SOURCES "threadpool/threadpool_benchmark.cpp") -======= kvikio_add_benchmark( NAME THREADPOOL_BENCHMARK SOURCES "threadpool/threadpool_benchmark.cpp" "utils/utils.cpp" ) ->>>>>>> e54d507 (Add a simple threadpool.) diff --git a/cpp/benchmarks/threadpool/threadpool_benchmark.cpp b/cpp/benchmarks/threadpool/threadpool_benchmark.cpp index 86dcb9c9ad..0f7725bb72 100644 --- a/cpp/benchmarks/threadpool/threadpool_benchmark.cpp +++ b/cpp/benchmarks/threadpool/threadpool_benchmark.cpp @@ -28,6 +28,7 @@ #include #include +#include #include namespace kvikio { @@ -46,7 +47,7 @@ void task_compute(std::size_t num_compute_iterations) } template -void BM_threadpool_compute(benchmark::State& state) +void BM_BS_threadpool_compute(benchmark::State& state) { auto const num_threads = state.range(0); @@ -67,22 +68,64 @@ void BM_threadpool_compute(benchmark::State& state) state.counters["threads"] = num_threads; } + +template +void BM_simple_threadpool_compute(benchmark::State& state) +{ + auto const num_threads = state.range(0); + + std::size_t const num_compute_tasks = + (scaling_type == ScalingType::STRONG_SCALING) ? 10'000 : (1'000 * num_threads); + + std::size_t constexpr num_compute_iterations{1'000}; + kvikio::ThreadPoolSimple thread_pool(num_threads); + + for (auto _ : state) { + // Submit a total of "num_compute_tasks" tasks to the thread pool. 
+    for (auto i = std::size_t{0}; i < num_compute_tasks; ++i) {
+      [[maybe_unused]] auto fut =
+        thread_pool.submit_task([] { task_compute(num_compute_iterations); });
+    }
+    thread_pool.wait();
+  }
+
+  state.counters["threads"] = num_threads;
+}
 }  // namespace kvikio

 int main(int argc, char** argv)
 {
   benchmark::Initialize(&argc, argv);
-  benchmark::RegisterBenchmark("BM_threadpool_compute:strong_scaling",
-                               kvikio::BM_threadpool_compute<kvikio::ScalingType::STRONG_SCALING>)
+  benchmark::RegisterBenchmark(
+    "BS_threadpool_compute:strong_scaling",
+    kvikio::BM_BS_threadpool_compute<kvikio::ScalingType::STRONG_SCALING>)
+    ->RangeMultiplier(2)
+    ->Range(1, 64)  // Increase from 1 to 64 (inclusive of both endpoints) with x2 stepping.
+    ->UseRealTime()  // Use the wall clock to determine the number of benchmark iterations.
+    ->Unit(benchmark::kMillisecond)
+    ->MinTime(2);  // Minimum of 2 seconds.
+
+  benchmark::RegisterBenchmark("BS_threadpool_compute:weak_scaling",
+                               kvikio::BM_BS_threadpool_compute<kvikio::ScalingType::WEAK_SCALING>)
+    ->RangeMultiplier(2)
+    ->Range(1, 64)
+    ->UseRealTime()
+    ->Unit(benchmark::kMillisecond)
+    ->MinTime(2);
+
+  benchmark::RegisterBenchmark(
+    "simple_threadpool_compute:strong_scaling",
+    kvikio::BM_simple_threadpool_compute<kvikio::ScalingType::STRONG_SCALING>)
     ->RangeMultiplier(2)
     ->Range(1, 64)  // Increase from 1 to 64 (inclusive of both endpoints) with x2 stepping.
     ->UseRealTime()  // Use the wall clock to determine the number of benchmark iterations.
     ->Unit(benchmark::kMillisecond)
     ->MinTime(2);  // Minimum of 2 seconds.

-  benchmark::RegisterBenchmark("BM_threadpool_compute:weak_scaling",
-                               kvikio::BM_threadpool_compute<kvikio::ScalingType::WEAK_SCALING>)
+  benchmark::RegisterBenchmark(
+    "simple_threadpool_compute:weak_scaling",
+    kvikio::BM_simple_threadpool_compute<kvikio::ScalingType::WEAK_SCALING>)
     ->RangeMultiplier(2)
     ->Range(1, 64)
     ->UseRealTime()
diff --git a/cpp/include/kvikio/function_wrapper.hpp b/cpp/include/kvikio/function_wrapper.hpp
new file mode 100644
index 0000000000..81362e4b9a
--- /dev/null
+++ b/cpp/include/kvikio/function_wrapper.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include +#include + +namespace kvikio { +class SimpleFunctionWrapper { + private: + struct inner_base { + virtual void operator()() = 0; + + virtual ~inner_base() = default; + }; + + template + struct inner : inner_base { + using F_decay = std::decay_t; + static_assert(std::is_invocable_r_v); + + inner(F&& f) : _f(std::forward(f)) {} + + void operator()() override { std::invoke(_f); } + + ~inner() override = default; + + F_decay _f; + }; + + std::unique_ptr _callable; + + public: + template + SimpleFunctionWrapper(F&& f) : _callable(std::make_unique>(std::forward(f))) + { + using F_decay = std::decay_t; + static_assert(std::is_invocable_r_v); + } + + SimpleFunctionWrapper() = default; + + SimpleFunctionWrapper(SimpleFunctionWrapper&&) = default; + SimpleFunctionWrapper& operator=(SimpleFunctionWrapper&&) = default; + + SimpleFunctionWrapper(const SimpleFunctionWrapper&) = delete; + SimpleFunctionWrapper& operator=(const SimpleFunctionWrapper&) = delete; + + void operator()() { return _callable->operator()(); } + + operator bool() { return _callable != nullptr; } +}; + +using FunctionWrapper = SimpleFunctionWrapper; +} // namespace kvikio diff --git a/cpp/include/kvikio/threadpool_simple.hpp b/cpp/include/kvikio/threadpool_simple.hpp index 20e995a6d7..d97032acd3 100644 --- a/cpp/include/kvikio/threadpool_simple.hpp +++ b/cpp/include/kvikio/threadpool_simple.hpp @@ -15,49 +15,183 @@ */ #include -#include +#include #include +#include +#include #include -#include -#include +#include + +#include -namespace kvikio { /** - * @brief A simple thread pool that executes tasks in an embarrassingly parallel manner. - * - * The implementation is header-only. + * @brief A simple, header-only thread pool that executes tasks in an embarrassingly parallel + * manner. 
*/ +namespace kvikio { +class this_thread { + public: + static bool is_from_pool() { return get_thread_idx().has_value(); } + + static std::optional get_thread_idx() { return this_thread_idx; } + + private: + friend class ThreadPoolSimple; + + static void set_thread_idx(std::size_t thread_idx) { this_thread_idx = thread_idx; } + + inline static thread_local std::optional this_thread_idx{std::nullopt}; +}; + +struct Worker { + std::thread thread; + std::condition_variable task_available_cv; + std::condition_variable task_done_cv; + std::mutex task_mutex; + std::queue task_queue; + bool should_stop{false}; +}; + class ThreadPoolSimple { public: - ThreadPoolSimple( - unsigned int num_threads, const std::function& worker_thread_init_func = [] {}) - : _num_threads{num_threads}, _worker_thread_init_func{worker_thread_init_func} + template + ThreadPoolSimple(unsigned int num_threads, F&& worker_thread_init_func) + : _num_threads{num_threads}, _worker_thread_init_func{std::forward(worker_thread_init_func)} { + create_threads(); } - void reset(); + ThreadPoolSimple(unsigned int num_threads) : ThreadPoolSimple(num_threads, FunctionWrapper{}) {} + + ~ThreadPoolSimple() { destroy_threads(); } + + template + void reset(unsigned int num_threads, F&& worker_thread_init_func) + { + wait(); + destroy_threads(); + + _num_threads = num_threads; + _worker_thread_init_func = std::forward(worker_thread_init_func); + create_threads(); + } + + void reset(unsigned int num_threads) { reset(num_threads, FunctionWrapper{}); } + + void wait() + { + for (unsigned int thread_idx = 0; thread_idx < _num_threads; ++thread_idx) { + auto& task_done_cv = _workers[thread_idx].task_done_cv; + auto& mut = _workers[thread_idx].task_mutex; + auto& task_queue = _workers[thread_idx].task_queue; + + std::unique_lock lock(mut); + task_done_cv.wait(lock, [&] { return task_queue.empty(); }); + } + } + + unsigned int num_thread() const { return _num_threads; } template >> [[nodiscard]] std::future submit_task(F&& task) { + auto tid = + std::atomic_fetch_add_explicit(&_task_submission_counter, 1, std::memory_order_relaxed); + tid %= _num_threads; + + return submit_task_to_thread(std::forward(task), tid); + } + + template >> + [[nodiscard]] std::future submit_task_to_thread(F&& task, std::size_t thread_idx) + { + auto& task_available_cv = _workers[thread_idx].task_available_cv; + auto& mut = _workers[thread_idx].task_mutex; + auto& task_queue = _workers[thread_idx].task_queue; + + std::promise p; + auto fut = p.get_future(); + + { + std::lock_guard lock(mut); + + task_queue.emplace([task = std::forward(task), p = std::move(p), thread_idx]() mutable { + try { + if constexpr (std::is_same_v) { + task(); + p.set_value(); + } else { + p.set_value(task()); + } + } catch (...) 
{ + p.set_exception(std::current_exception()); + } + }); + } + + task_available_cv.notify_one(); + return fut; } private: - void worker() {} + void run_worker(std::size_t thread_idx) + { + this_thread::set_thread_idx(thread_idx); + + auto& task_available_cv = _workers[thread_idx].task_available_cv; + auto& task_done_cv = _workers[thread_idx].task_done_cv; + auto& mut = _workers[thread_idx].task_mutex; + auto& task_queue = _workers[thread_idx].task_queue; + auto& should_stop = _workers[thread_idx].should_stop; + + if (_worker_thread_init_func) { std::invoke(_worker_thread_init_func); } + + while (true) { + std::unique_lock lock(mut); + + if (task_queue.empty()) { task_done_cv.notify_all(); } + + task_available_cv.wait(lock, [&] { return !task_queue.empty() || should_stop; }); + + if (should_stop) { break; } + + auto task = std::move(task_queue.front()); + task_queue.pop(); + lock.unlock(); + + task(); + } + } void create_threads() { - for (unsigned int i = 0; i < _num_threads; ++i) { - _thread_container.emplace_back(&ThreadPoolSimple::worker, _worker_thread_init_func); + _workers = std::make_unique(_num_threads); + for (unsigned int thread_idx = 0; thread_idx < _num_threads; ++thread_idx) { + _workers[thread_idx].thread = std::thread([this, thread_idx] { run_worker(thread_idx); }); } } - void destroy_threads() {} + void destroy_threads() + { + for (unsigned int thread_idx = 0; thread_idx < _num_threads; ++thread_idx) { + auto& task_available_cv = _workers[thread_idx].task_available_cv; + auto& mut = _workers[thread_idx].task_mutex; + + { + std::lock_guard lock(mut); + _workers[thread_idx].should_stop = true; + } + + task_available_cv.notify_one(); + + _workers[thread_idx].thread.join(); + } + } - std::atomic_bool _done{false}; unsigned int _num_threads{}; - std::function _worker_thread_init_func{}; - std::vector _thread_container{}; + FunctionWrapper _worker_thread_init_func; + std::unique_ptr _workers; + std::atomic_size_t _task_submission_counter{0}; }; -} // namespace kvikio \ No newline at end of file +} // namespace kvikio From 205ea93d582d65142dd3df0d63960e09a76f69ec Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Tue, 18 Mar 2025 00:37:55 -0400 Subject: [PATCH 003/103] Add more comments --- cpp/include/kvikio/threadpool_simple.hpp | 139 ++++++++++++++++++++++- 1 file changed, 135 insertions(+), 4 deletions(-) diff --git a/cpp/include/kvikio/threadpool_simple.hpp b/cpp/include/kvikio/threadpool_simple.hpp index d97032acd3..5db1992d56 100644 --- a/cpp/include/kvikio/threadpool_simple.hpp +++ b/cpp/include/kvikio/threadpool_simple.hpp @@ -25,24 +25,51 @@ #include /** + * @file * @brief A simple, header-only thread pool that executes tasks in an embarrassingly parallel - * manner. + * manner. Inspired by the BS threadpool that KvikIO has been using. */ + namespace kvikio { -class this_thread { +/** + * @brief Utility class for the calling thread. + */ +class ThisThread { public: + /** + * @brief Check if the calling thread is from ThreadPoolSimple. + * + * @return Boolean answer. + */ static bool is_from_pool() { return get_thread_idx().has_value(); } + /** + * @brief Get the index of the calling thread. + * + * If the calling thread is not from ThreadPoolSimple, return std::nullopt. Otherwise, return the + * thread index ranging from 0 to (N-1) where N is the thread count. + * + * @return Index of the calling thread. + */ static std::optional get_thread_idx() { return this_thread_idx; } private: friend class ThreadPoolSimple; + /** + * @brief Set the index of the calling thread. 
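+   * Called once by each worker thread when it enters the worker loop.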
+ * + * @param thread_idx Index of the calling thread. + */ static void set_thread_idx(std::size_t thread_idx) { this_thread_idx = thread_idx; } inline static thread_local std::optional this_thread_idx{std::nullopt}; }; +/** + * @brief Struct to hold per-thread data. + * + */ struct Worker { std::thread thread; std::condition_variable task_available_cv; @@ -52,8 +79,46 @@ struct Worker { bool should_stop{false}; }; +/** + * @brief A simple thread pool that executes tasks in an embarrassingly parallel manner. + * + * Each worker thread has their own task queue, mutex and condition variables. The per-thread mutex + * and condition variables are shared with the main thread for synchronization. Tasks are submitted + * on the main thread to the worker threads in a round-robin fashion, unless the target thread index + * is specified by the user. + * + * Example: + * ```cpp + * // Create a thread pool with 4 threads, and pass an optional callable with which to initialize + * // each worker thread. + * kvikio::ThreadPoolSimple thread_pool{4, [] { + * // Initialize worker thread + * }}; + * + * // Submit the task to the thread pool. The worker thread is selected automatically in a + * // round-robin fashion. + * auto fut = thread_pool.submit_task([] { + * // Task logic + * }); + * + * // Submit the task to a specific thread. + * auto fut = thread_pool.submit_task_to_thread([] { + * // Task logic + * }); + * + * // Wait until the result is ready. + * auto result = fut.get(); + * ``` + */ class ThreadPoolSimple { public: + /** + * @brief Constructor. Create a thread pool. + * + * @tparam F Type of the user-defined worker thread initialization. + * @param num_threads Number of threads. + * @param worker_thread_init_func User-defined worker thread initialization. + */ template ThreadPoolSimple(unsigned int num_threads, F&& worker_thread_init_func) : _num_threads{num_threads}, _worker_thread_init_func{std::forward(worker_thread_init_func)} @@ -61,10 +126,30 @@ class ThreadPoolSimple { create_threads(); } + /** + * @brief Constructor, without user-defined worker thread initialization. + * + * @param num_threads Number of threads. + */ ThreadPoolSimple(unsigned int num_threads) : ThreadPoolSimple(num_threads, FunctionWrapper{}) {} - ~ThreadPoolSimple() { destroy_threads(); } + /** + * @brief Destructor. Wait until all worker threads complete their tasks, then join the threads. + */ + ~ThreadPoolSimple() + { + wait(); + destroy_threads(); + } + /** + * @brief Wait until all worker threads complete their tasks. Then join the threads, and + * reinitialize the thread pool with new threads. + * + * @tparam F Type of the user-defined worker thread initialization. + * @param num_threads Number of threads. + * @param worker_thread_init_func User-defined worker thread initialization. + */ template void reset(unsigned int num_threads, F&& worker_thread_init_func) { @@ -76,8 +161,16 @@ class ThreadPoolSimple { create_threads(); } + /** + * @brief Overload of reset(), without user-defined worker thread initialization. + * + * @param num_threads Number of threads. + */ void reset(unsigned int num_threads) { reset(num_threads, FunctionWrapper{}); } + /** + * @brief Block the calling thread until all worker threads complete their tasks. + */ void wait() { for (unsigned int thread_idx = 0; thread_idx < _num_threads; ++thread_idx) { @@ -90,11 +183,28 @@ class ThreadPoolSimple { } } + /** + * @brief Get the number of threads from the thread pool. + * + * @return Thread count. 
+ */ unsigned int num_thread() const { return _num_threads; } + /** + * @brief Submit the task to the thread pool for execution. The worker thread is selected + * automatically in a round-robin fashion. + * + * @tparam F Type of the task callable. + * @tparam R Return type of the task callable. + * @param task Task callable. The task can either be copyable or move-only. + * @return An std::future object. R can be void or other types. + */ template >> [[nodiscard]] std::future submit_task(F&& task) { + // The call index is atomically incremented on each submit_task call, and will wrap around once + // it reaches the maximum value the integer type `std::size_t` can hold (this overflow + // behavior is well-defined in C++). auto tid = std::atomic_fetch_add_explicit(&_task_submission_counter, 1, std::memory_order_relaxed); tid %= _num_threads; @@ -102,6 +212,15 @@ class ThreadPoolSimple { return submit_task_to_thread(std::forward(task), tid); } + /** + * @brief Submit the task to a specific thread for execution. + * + * @tparam F Type of the task callable. + * @tparam R Return type of the task callable. + * @param task Task callable. The task can either be copyable or move-only. + * @param thread_idx Index of the thread to which the task is submitted. + * @return An std::future object. R can be void or other types. + */ template >> [[nodiscard]] std::future submit_task_to_thread(F&& task, std::size_t thread_idx) { @@ -134,9 +253,14 @@ class ThreadPoolSimple { } private: + /** + * @brief Worker thread loop. + * + * @param thread_idx Worker thread index. + */ void run_worker(std::size_t thread_idx) { - this_thread::set_thread_idx(thread_idx); + ThisThread::set_thread_idx(thread_idx); auto& task_available_cv = _workers[thread_idx].task_available_cv; auto& task_done_cv = _workers[thread_idx].task_done_cv; @@ -163,6 +287,9 @@ class ThreadPoolSimple { } } + /** + * @brief Create worker threads. + */ void create_threads() { _workers = std::make_unique(_num_threads); @@ -171,6 +298,10 @@ class ThreadPoolSimple { } } + /** + * @brief Notify each work thread of the intention to stop and join the threads. Pre-condition: + * Each worker thread has finished all the tasks in their task queue. + */ void destroy_threads() { for (unsigned int thread_idx = 0; thread_idx < _num_threads; ++thread_idx) { From 74af1557ed9e3f23a7e9cde0f1317910d7a34693 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Tue, 18 Mar 2025 00:53:10 -0400 Subject: [PATCH 004/103] Add more comments to the function wrapper --- cpp/include/kvikio/function_wrapper.hpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/cpp/include/kvikio/function_wrapper.hpp b/cpp/include/kvikio/function_wrapper.hpp index 81362e4b9a..eaae062a26 100644 --- a/cpp/include/kvikio/function_wrapper.hpp +++ b/cpp/include/kvikio/function_wrapper.hpp @@ -18,6 +18,14 @@ #include namespace kvikio { +/** + * @brief Type-erased function wrapper that can hold either a copyable or move-only callable. This + * class avoids the limitation and inconvenience of std::function whose target has to be copyable. + * + * @todo Use small object optimization to avoid heap allocation. + * @note This class will be deprecated in the far future when C++23 is adopted that offers + * std::move_only_function. + */ class SimpleFunctionWrapper { private: struct inner_base { @@ -43,6 +51,13 @@ class SimpleFunctionWrapper { std::unique_ptr _callable; public: + /** + * @brief Constructor. Create a function wrapper that can hold either a copyable or move-only + * callable. 
+ * + * @tparam F Callable type. + * @param f Callable. + */ template SimpleFunctionWrapper(F&& f) : _callable(std::make_unique>(std::forward(f))) { @@ -60,6 +75,12 @@ class SimpleFunctionWrapper { void operator()() { return _callable->operator()(); } + /** + * @brief Conversion function that tells whether the wrapper has a target (true) or is empty + * (false). + * + * @return Boolean answer. + */ operator bool() { return _callable != nullptr; } }; From e7cbc04c58d75d9324927e7f6293b0e7ba08e9ba Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Tue, 18 Mar 2025 00:56:25 -0400 Subject: [PATCH 005/103] Cleanup --- cpp/include/kvikio/threadpool_simple.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/include/kvikio/threadpool_simple.hpp b/cpp/include/kvikio/threadpool_simple.hpp index 5db1992d56..eb026fcd9d 100644 --- a/cpp/include/kvikio/threadpool_simple.hpp +++ b/cpp/include/kvikio/threadpool_simple.hpp @@ -68,7 +68,6 @@ class ThisThread { /** * @brief Struct to hold per-thread data. - * */ struct Worker { std::thread thread; From 9c0ecb92ba7443d27ff791671049a9a99bbe394e Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Tue, 18 Mar 2025 09:08:16 -0400 Subject: [PATCH 006/103] Fix typo --- cpp/include/kvikio/threadpool_simple.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/kvikio/threadpool_simple.hpp b/cpp/include/kvikio/threadpool_simple.hpp index eb026fcd9d..d4193f2814 100644 --- a/cpp/include/kvikio/threadpool_simple.hpp +++ b/cpp/include/kvikio/threadpool_simple.hpp @@ -187,7 +187,7 @@ class ThreadPoolSimple { * * @return Thread count. */ - unsigned int num_thread() const { return _num_threads; } + unsigned int num_threads() const { return _num_threads; } /** * @brief Submit the task to the thread pool for execution. The worker thread is selected From 1dc94f56684db7ee2db5d9638f6bc72610a34d09 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Tue, 18 Mar 2025 11:46:13 -0400 Subject: [PATCH 007/103] Add include guard --- cpp/include/kvikio/function_wrapper.hpp | 2 ++ cpp/include/kvikio/threadpool_simple.hpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/cpp/include/kvikio/function_wrapper.hpp b/cpp/include/kvikio/function_wrapper.hpp index eaae062a26..02a3260b13 100644 --- a/cpp/include/kvikio/function_wrapper.hpp +++ b/cpp/include/kvikio/function_wrapper.hpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + #include #include diff --git a/cpp/include/kvikio/threadpool_simple.hpp b/cpp/include/kvikio/threadpool_simple.hpp index d4193f2814..7b022fd676 100644 --- a/cpp/include/kvikio/threadpool_simple.hpp +++ b/cpp/include/kvikio/threadpool_simple.hpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + #include #include #include From 0e7a55d0bb3ce92c87c47ca773e18690aa4f959d Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Tue, 18 Mar 2025 21:11:52 -0400 Subject: [PATCH 008/103] Update comments --- cpp/include/kvikio/threadpool_simple.hpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/include/kvikio/threadpool_simple.hpp b/cpp/include/kvikio/threadpool_simple.hpp index 7b022fd676..0bed02ed54 100644 --- a/cpp/include/kvikio/threadpool_simple.hpp +++ b/cpp/include/kvikio/threadpool_simple.hpp @@ -28,8 +28,9 @@ /** * @file - * @brief A simple, header-only thread pool that executes tasks in an embarrassingly parallel - * manner. Inspired by the BS threadpool that KvikIO has been using. + * @brief A simple, header-only thread pool that uses per-thread task queues. 
Synchronization only + * exists between the pairs of the main thread and each worker thread, but not among the worker + * threads themselves. Inspired by the BS threadpool that KvikIO has been using. */ namespace kvikio { @@ -81,12 +82,12 @@ struct Worker { }; /** - * @brief A simple thread pool that executes tasks in an embarrassingly parallel manner. + * @brief A simple thread pool that uses per-thread task queues. * - * Each worker thread has their own task queue, mutex and condition variables. The per-thread mutex - * and condition variables are shared with the main thread for synchronization. Tasks are submitted - * on the main thread to the worker threads in a round-robin fashion, unless the target thread index - * is specified by the user. + * Each worker thread has their own task queue, mutex and condition variable. The per-thread + * synchronization primitives (mutex and condition variable) are shared with the main thread. Tasks + * are submitted to the worker threads in a round-robin fashion, unless the target thread index is + * specified by the user. * * Example: * ```cpp From 489d9a9a4d8459bd7af0ac71c205fb0a4d247ae6 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Wed, 19 Mar 2025 00:15:52 -0400 Subject: [PATCH 009/103] Improve clarity --- .../threadpool/threadpool_benchmark.cpp | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/cpp/benchmarks/threadpool/threadpool_benchmark.cpp b/cpp/benchmarks/threadpool/threadpool_benchmark.cpp index 0f7725bb72..aba4244ef4 100644 --- a/cpp/benchmarks/threadpool/threadpool_benchmark.cpp +++ b/cpp/benchmarks/threadpool/threadpool_benchmark.cpp @@ -37,6 +37,12 @@ enum class ScalingType : uint8_t { WEAK_SCALING, }; +namespace constant { +std::size_t constexpr ntasks_strong_scaling{10'000}; +std::size_t constexpr ntasks_weak_scaling{1'000}; +std::size_t constexpr num_compute_iterations{1'000}; +} // namespace constant + void task_compute(std::size_t num_compute_iterations) { [[maybe_unused]] double res{0.0}; @@ -51,17 +57,17 @@ void BM_BS_threadpool_compute(benchmark::State& state) { auto const num_threads = state.range(0); - std::size_t const num_compute_tasks = - (scaling_type == ScalingType::STRONG_SCALING) ? 10'000 : (1'000 * num_threads); + std::size_t const num_compute_tasks = (scaling_type == ScalingType::STRONG_SCALING) + ? constant::ntasks_strong_scaling + : (constant::ntasks_weak_scaling * num_threads); - std::size_t constexpr num_compute_iterations{1'000}; kvikio::defaults::set_thread_pool_nthreads(num_threads); for (auto _ : state) { // Submit a total of "num_compute_tasks" tasks to the thread pool. for (auto i = std::size_t{0}; i < num_compute_tasks; ++i) { - [[maybe_unused]] auto fut = - kvikio::defaults::thread_pool().submit_task([] { task_compute(num_compute_iterations); }); + [[maybe_unused]] auto fut = kvikio::defaults::thread_pool().submit_task( + [] { task_compute(constant::num_compute_iterations); }); } kvikio::defaults::thread_pool().wait(); } @@ -74,17 +80,17 @@ void BM_simple_threadpool_compute(benchmark::State& state) { auto const num_threads = state.range(0); - std::size_t const num_compute_tasks = - (scaling_type == ScalingType::STRONG_SCALING) ? 10'000 : (1'000 * num_threads); + std::size_t const num_compute_tasks = (scaling_type == ScalingType::STRONG_SCALING) + ? 
constant::ntasks_strong_scaling + : (constant::ntasks_weak_scaling * num_threads); - std::size_t constexpr num_compute_iterations{1'000}; kvikio::ThreadPoolSimple thread_pool(num_threads); for (auto _ : state) { // Submit a total of "num_compute_tasks" tasks to the thread pool. for (auto i = std::size_t{0}; i < num_compute_tasks; ++i) { [[maybe_unused]] auto fut = - thread_pool.submit_task([] { task_compute(num_compute_iterations); }); + thread_pool.submit_task([] { task_compute(constant::num_compute_iterations); }); } thread_pool.wait(); } @@ -118,10 +124,10 @@ int main(int argc, char** argv) "simple_threadpool_compute:strong_scaling", kvikio::BM_simple_threadpool_compute) ->RangeMultiplier(2) - ->Range(1, 64) // Increase from 1 to 64 (inclusive of both endpoints) with x2 stepping. - ->UseRealTime() // Use the wall clock to determine the number of benchmark iterations. + ->Range(1, 64) + ->UseRealTime() ->Unit(benchmark::kMillisecond) - ->MinTime(2); // Minimum of 2 seconds. + ->MinTime(2); benchmark::RegisterBenchmark( "simple_threadpool_compute:weak_scaling", From 3f31cf0615d1be8b18dbd1b85a2d8b6c04b3d2ba Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Wed, 19 Mar 2025 00:27:52 -0400 Subject: [PATCH 010/103] Add static task benchmark --- .../threadpool/threadpool_benchmark.cpp | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/cpp/benchmarks/threadpool/threadpool_benchmark.cpp b/cpp/benchmarks/threadpool/threadpool_benchmark.cpp index aba4244ef4..ea65389b82 100644 --- a/cpp/benchmarks/threadpool/threadpool_benchmark.cpp +++ b/cpp/benchmarks/threadpool/threadpool_benchmark.cpp @@ -25,6 +25,8 @@ #include #include +#include +#include #include #include @@ -97,6 +99,39 @@ void BM_simple_threadpool_compute(benchmark::State& state) state.counters["threads"] = num_threads; } + +template +void BM_static_task_compute(benchmark::State& state) +{ + auto const num_threads = state.range(0); + + for (auto _ : state) { + std::vector threads(num_threads); + for (auto&& thread : threads) { + thread = std::thread([=] { + std::size_t num_tasks_this_thread{}; + if constexpr (scaling_type == ScalingType::STRONG_SCALING) { + auto const p = constant::ntasks_strong_scaling / num_threads; + auto const q = constant::ntasks_strong_scaling % num_threads; + num_tasks_this_thread = + (static_cast(state.thread_index()) < q) ? 
(p + 1) : p; + } else { + num_tasks_this_thread = constant::ntasks_weak_scaling; + } + + for (std::size_t i = 0; i < num_tasks_this_thread; ++i) { + task_compute(constant::num_compute_iterations); + } + }); + } + + for (auto&& thread : threads) { + thread.join(); + } + } + + state.counters["threads"] = num_threads; +} } // namespace kvikio int main(int argc, char** argv) @@ -138,6 +173,22 @@ int main(int argc, char** argv) ->Unit(benchmark::kMillisecond) ->MinTime(2); + benchmark::RegisterBenchmark("static_task_compute:strong_scaling", + kvikio::BM_static_task_compute) + ->RangeMultiplier(2) + ->Range(1, 64) + ->UseRealTime() + ->Unit(benchmark::kMillisecond) + ->MinTime(2); + + benchmark::RegisterBenchmark("static_task_compute:weak_scaling", + kvikio::BM_static_task_compute) + ->RangeMultiplier(2) + ->Range(1, 64) + ->UseRealTime() + ->Unit(benchmark::kMillisecond) + ->MinTime(2); + kvikio::utils::explain_default_metrics(); benchmark::RunSpecifiedBenchmarks(); From 0656f7ba48faed927b0b945f082e4c317bc7b6ac Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 30 Apr 2025 15:11:34 -0400 Subject: [PATCH 011/103] DOC v25.08 Updates [skip ci] --- .../cuda11.8-conda/devcontainer.json | 6 ++--- .devcontainer/cuda11.8-pip/devcontainer.json | 6 ++--- .../cuda12.8-conda/devcontainer.json | 6 ++--- .devcontainer/cuda12.8-pip/devcontainer.json | 6 ++--- .github/workflows/build.yaml | 16 ++++++------ .github/workflows/pr.yaml | 26 +++++++++---------- .github/workflows/test.yaml | 6 ++--- .../trigger-breaking-change-alert.yaml | 2 +- VERSION | 2 +- .../all_cuda-118_arch-aarch64.yaml | 2 +- .../all_cuda-118_arch-x86_64.yaml | 2 +- .../all_cuda-128_arch-aarch64.yaml | 2 +- .../all_cuda-128_arch-x86_64.yaml | 2 +- .../downstream/cmake/get_kvikio.cmake | 2 +- dependencies.yaml | 22 ++++++++-------- java/pom.xml | 2 +- python/kvikio/pyproject.toml | 6 ++--- 17 files changed, 58 insertions(+), 58 deletions(-) diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 184d664799..610e38b8c7 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.06-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.08-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.08-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.6": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.8": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 702422dbd4..8297d7282b 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.06-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.08-cpp-cuda11.8-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda11.8-pip" + 
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.08-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.6": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.8": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.8-conda/devcontainer.json b/.devcontainer/cuda12.8-conda/devcontainer.json index ee9b1ddad7..4074dc74f4 100644 --- a/.devcontainer/cuda12.8-conda/devcontainer.json +++ b/.devcontainer/cuda12.8-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.06-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.08-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda12.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.08-cuda12.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.6": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.8": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.8-pip/devcontainer.json b/.devcontainer/cuda12.8-pip/devcontainer.json index 84454b0886..7d761d40b7 100644 --- a/.devcontainer/cuda12.8-pip/devcontainer.json +++ b/.devcontainer/cuda12.8-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.06-cpp-cuda12.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.08-cpp-cuda12.8-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda12.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.08-cuda12.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.6": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.8": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 09e48cd547..2c14863a32 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -56,7 +56,7 @@ jobs: if: github.ref_type == 
'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -68,7 +68,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cpp: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: ${{ inputs.build_type || 'branch' }} @@ -81,7 +81,7 @@ jobs: wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -93,7 +93,7 @@ jobs: wheel-publish-cpp: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -104,7 +104,7 @@ jobs: wheel-publish-python: needs: wheel-build-python secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 6cabc922c0..98be309f11 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -27,7 +27,7 @@ jobs: - wheel-python-tests - telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.08 if: always() with: needs: ${{ toJSON(needs) }} @@ -46,7 +46,7 @@ jobs: repo: kvikio changed-files: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.08 with: files_yaml: | test_cpp: @@ -86,26 +86,26 @@ jobs: checks: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.08 with: ignored_pr_jobs: telemetry-summarize conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.08 with: build_type: pull-request conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.08 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 with: build_type: pull-request node_type: 
"gpu-l4-latest-1" @@ -115,20 +115,20 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.08 with: build_type: pull-request conda-python-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.08 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -138,7 +138,7 @@ jobs: devcontainer: needs: telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.08 with: arch: '["amd64"]' cuda: '["12.8"]' @@ -149,7 +149,7 @@ jobs: sccache -s; wheel-cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: pull-request @@ -159,7 +159,7 @@ jobs: wheel-python-build: needs: wheel-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 with: build_type: pull-request script: ci/build_wheel_python.sh @@ -168,7 +168,7 @@ jobs: wheel-python-tests: needs: [wheel-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 33b36c08a3..48dd2fc399 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -19,7 +19,7 @@ on: jobs: cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.08 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -27,7 +27,7 @@ jobs: sha: ${{ inputs.sha }} python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.08 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -35,7 +35,7 @@ jobs: sha: ${{ inputs.sha }} conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/trigger-breaking-change-alert.yaml 
b/.github/workflows/trigger-breaking-change-alert.yaml index 475380cb68..593fcb1086 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.06 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.08 with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/VERSION b/VERSION index cc83d7ab43..3af4bda020 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -25.06.00 +25.08.00 diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index c4eff1f684..edde8e7ff8 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -34,7 +34,7 @@ dependencies: - python>=3.10,<3.13 - rangehttpserver - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.6.*,>=0.0.0a0 +- rapids-dask-dependency==25.8.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 294562d18a..a95b31909a 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -36,7 +36,7 @@ dependencies: - python>=3.10,<3.13 - rangehttpserver - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.6.*,>=0.0.0a0 +- rapids-dask-dependency==25.8.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click diff --git a/conda/environments/all_cuda-128_arch-aarch64.yaml b/conda/environments/all_cuda-128_arch-aarch64.yaml index 3d9ed82d71..2b1cb8121c 100644 --- a/conda/environments/all_cuda-128_arch-aarch64.yaml +++ b/conda/environments/all_cuda-128_arch-aarch64.yaml @@ -34,7 +34,7 @@ dependencies: - python>=3.10,<3.13 - rangehttpserver - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.6.*,>=0.0.0a0 +- rapids-dask-dependency==25.8.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click diff --git a/conda/environments/all_cuda-128_arch-x86_64.yaml b/conda/environments/all_cuda-128_arch-x86_64.yaml index fc0827fb93..2e2729c7e5 100644 --- a/conda/environments/all_cuda-128_arch-x86_64.yaml +++ b/conda/environments/all_cuda-128_arch-x86_64.yaml @@ -34,7 +34,7 @@ dependencies: - python>=3.10,<3.13 - rangehttpserver - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.6.*,>=0.0.0a0 +- rapids-dask-dependency==25.8.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click diff --git a/cpp/examples/downstream/cmake/get_kvikio.cmake b/cpp/examples/downstream/cmake/get_kvikio.cmake index efd3b1435f..5e4132f8e5 100644 --- a/cpp/examples/downstream/cmake/get_kvikio.cmake +++ b/cpp/examples/downstream/cmake/get_kvikio.cmake @@ -30,4 +30,4 @@ function(find_and_configure_kvikio MIN_VERSION) endfunction() -find_and_configure_kvikio("25.06") +find_and_configure_kvikio("25.08") diff --git a/dependencies.yaml b/dependencies.yaml index 368d8b5425..6cf81f1a0b 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -171,7 +171,7 @@ dependencies: common: - output_types: conda packages: &libkvikio_packages - - libkvikio==25.6.*,>=0.0.0a0 + - libkvikio==25.8.*,>=0.0.0a0 specific: - output_types: [requirements, pyproject] 
matrices: @@ -179,12 +179,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libkvikio-cu12==25.6.*,>=0.0.0a0 + - libkvikio-cu12==25.8.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - libkvikio-cu11==25.6.*,>=0.0.0a0 + - libkvikio-cu11==25.8.*,>=0.0.0a0 - {matrix: null, packages: *libkvikio_packages} build-py-wrapper: common: @@ -326,7 +326,7 @@ dependencies: common: - output_types: conda packages: - - &libkvikio_unsuffixed libkvikio==25.6.*,>=0.0.0a0 + - &libkvikio_unsuffixed libkvikio==25.8.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -340,12 +340,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libkvikio-cu12==25.6.*,>=0.0.0a0 + - libkvikio-cu12==25.8.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - libkvikio-cu11==25.6.*,>=0.0.0a0 + - libkvikio-cu11==25.8.*,>=0.0.0a0 - {matrix: null, packages: [*libkvikio_unsuffixed]} docs: common: @@ -401,14 +401,14 @@ dependencies: common: - output_types: conda packages: - - libkvikio==25.6.*,>=0.0.0a0 - - libkvikio-tests==25.6.*,>=0.0.0a0 + - libkvikio==25.8.*,>=0.0.0a0 + - libkvikio-tests==25.8.*,>=0.0.0a0 test_kvikio: common: - output_types: conda packages: - - libkvikio==25.6.*,>=0.0.0a0 - - kvikio==25.6.*,>=0.0.0a0 + - libkvikio==25.8.*,>=0.0.0a0 + - kvikio==25.8.*,>=0.0.0a0 test_cpp: common: - output_types: conda @@ -418,7 +418,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - rapids-dask-dependency==25.6.*,>=0.0.0a0 + - rapids-dask-dependency==25.8.*,>=0.0.0a0 - pytest - pytest-asyncio - pytest-cov diff --git a/java/pom.xml b/java/pom.xml index c6aa3eb8ce..228c82a3d5 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ ai.rapids.kvikio cufile - 25.06.0-SNAPSHOT + 25.08.0-SNAPSHOT cuFile diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index c0f16012ff..9c7c63480a 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -20,7 +20,7 @@ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "cupy-cuda11x>=12.0.0", - "libkvikio==25.6.*,>=0.0.0a0", + "libkvikio==25.8.*,>=0.0.0a0", "numcodecs !=0.12.0", "numpy>=1.23,<3.0a0", "nvidia-nvcomp==4.2.0.11", @@ -48,7 +48,7 @@ test = [ "pytest-cov", "pytest-timeout", "rangehttpserver", - "rapids-dask-dependency==25.6.*,>=0.0.0a0", + "rapids-dask-dependency==25.8.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] @@ -127,7 +127,7 @@ matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" requires = [ "cmake>=3.30.4", "cython>=3.0.0", - "libkvikio==25.6.*,>=0.0.0a0", + "libkvikio==25.8.*,>=0.0.0a0", "ninja", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
From 75adaf444a850fa1e4918054a6fff0dc4346be5f Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 27 May 2025 17:35:50 -0500 Subject: [PATCH 012/103] stop uploading packages to downloads.rapids.ai (#734) Contributes to https://github.com/rapidsai/build-planning/issues/181 * removes all uploads of conda packages and wheels to `downloads.rapids.ai` ## Notes for Reviewers ### How I identified changes Looked for uses of the relevant `gha-tools` tools, as well as documentation about `downloads.rapids.ai`, being on the NVIDIA VPN, using S3, etc. like this: ```shell git grep -i -E 's3|upload|downloads\.rapids|vpn' ``` ### How I tested this See "How I tested this" on https://github.com/rapidsai/shared-workflows/pull/364 Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/734 --- ci/build_cpp.sh | 2 -- ci/build_python.sh | 2 -- ci/build_wheel_cpp.sh | 4 ---- ci/build_wheel_python.sh | 2 -- 4 files changed, 10 deletions(-) diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 8cf60a3d13..7639ddd5c4 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -33,5 +33,3 @@ sccache --show-adv-stats # remove build_cache directory to avoid uploading the entire source tree # tracked in https://github.com/prefix-dev/rattler-build/issues/1424 rm -rf "$RAPIDS_CONDA_BLD_OUTPUT_DIR"/build_cache - -rapids-upload-conda-to-s3 cpp diff --git a/ci/build_python.sh b/ci/build_python.sh index 8bf17ea2a0..1559350ed9 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -41,5 +41,3 @@ sccache --show-adv-stats # remove build_cache directory to avoid uploading the entire source tree # tracked in https://github.com/prefix-dev/rattler-build/issues/1424 rm -rf "$RAPIDS_CONDA_BLD_OUTPUT_DIR"/build_cache - -rapids-upload-conda-to-s3 python diff --git a/ci/build_wheel_cpp.sh b/ci/build_wheel_cpp.sh index 1bdf439a29..8107f24160 100755 --- a/ci/build_wheel_cpp.sh +++ b/ci/build_wheel_cpp.sh @@ -30,12 +30,8 @@ export PIP_NO_BUILD_ISOLATION=0 export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=OFF" ./ci/build_wheel.sh "${package_name}" "${package_dir}" -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" - python -m auditwheel repair \ -w "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" \ ${package_dir}/dist/* ./ci/validate_wheel.sh ${package_dir} "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" - -RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" diff --git a/ci/build_wheel_python.sh b/ci/build_wheel_python.sh index f00609cc17..55ee0c05a0 100755 --- a/ci/build_wheel_python.sh +++ b/ci/build_wheel_python.sh @@ -27,5 +27,3 @@ python -m auditwheel repair \ ${package_dir}/dist/* ./ci/validate_wheel.sh ${package_dir} "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" - -RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" From de4bccdfc6c7e80025d77b1a97bf1e815de475c5 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 30 May 2025 09:32:55 -0400 Subject: [PATCH 013/103] Add an overload getenv_or that supports env var alias. Add new env var KVIKIO_NUM_THREADS. Fix UB. (#735) This PR performs the following items: - Add a new overload `kvikio::getenv_or` that supports env var alias, i.e. multiple different env vars referring to the same property. 
- Add new env var `KVIKIO_NUM_THREADS` as the alias of `KVIKIO_NTHREADS`, and add new C++ API `num_threads`/`set_thread_pool_nthreads` as the alias of existing `thread_pool_nthreads`/`set_thread_pool_nthreads`. This makes C++ and Python API (i.e. `kvikio.defaults.get("num_threads")`) consistent. - Fix a well-hidden UB related to string temporaries and `std::initializer_list`. Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/735 --- cpp/include/kvikio/defaults.hpp | 80 ++++++++++++++- cpp/src/defaults.cpp | 15 ++- cpp/tests/CMakeLists.txt | 2 +- cpp/tests/test_defaults.cpp | 168 ++++++++++++++++++++++++++++++++ cpp/tests/utils/env.cpp | 11 ++- cpp/tests/utils/env.hpp | 3 +- 6 files changed, 266 insertions(+), 13 deletions(-) diff --git a/cpp/include/kvikio/defaults.hpp b/cpp/include/kvikio/defaults.hpp index d1e17f0b85..200f248d79 100644 --- a/cpp/include/kvikio/defaults.hpp +++ b/cpp/include/kvikio/defaults.hpp @@ -18,11 +18,14 @@ #include #include +#include #include #include #include +#include #include +#include #include #include #include @@ -41,9 +44,13 @@ T getenv_or(std::string_view env_var_name, T default_val) std::stringstream sstream(env_val); T converted_val; sstream >> converted_val; - KVIKIO_EXPECT(!sstream.fail(), - "unknown config value " + std::string{env_var_name} + "=" + std::string{env_val}, - std::invalid_argument); + + if constexpr (!std::is_same_v) { + KVIKIO_EXPECT(!sstream.fail(), + "unknown config value " + std::string{env_var_name} + "=" + std::string{env_val}, + std::invalid_argument); + } + return converted_val; } @@ -56,6 +63,59 @@ CompatMode getenv_or(std::string_view env_var_name, CompatMode default_val); template <> std::vector getenv_or(std::string_view env_var_name, std::vector default_val); +/** + * @brief Get the environment variable value from a candidate list + * + * @tparam T Type of the environment variable value + * @param env_var_names Candidate list containing the names of environment variable + * @param default_val Default value of the environment variable, if none of the candidates has been + * found + * @return A tuple of (`env_var_name`, `result`, `has_found`), where: + * - If the environment variable is not set by any of the candidates, `has_found` will be false, + * `result` will be `default_val`, and `env_var_name` will be empty. + * - If the environment variable is set by `env_var_name`, then `has_found` will be true, and + * `result` be the set value. If more than one candidates have been set with the same value, + * `env_var_name` will be assigned the last candidate. + * + * @throws std::invalid_argument if: + * - `env_var_names` is empty. + * - The environment variable is not defined to be string type and is assigned an empty value (in + * other words, string-type environment variables are allowed to hold an empty value). + * - More than one candidates have been set with different values. + * - An invalid value is given, e.g. value that cannot be converted to type T. 
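+ *
+ * @note A usage sketch (this mirrors how `defaults.cpp` reads the thread
+ * count in this patch):
+ * @code{.cpp}
+ * auto const [env_var_name, num_threads, has_found] =
+ *   getenv_or({"KVIKIO_NTHREADS", "KVIKIO_NUM_THREADS"}, 1);
+ * @endcode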
+ */ +template +std::tuple getenv_or( + std::initializer_list env_var_names, T default_val) +{ + KVIKIO_EXPECT(env_var_names.size() > 0, + "`env_var_names` must contain at least one environment variable name.", + std::invalid_argument); + std::string_view env_name_target; + std::string_view env_val_target; + + for (auto const& env_var_name : env_var_names) { + auto const* env_val = std::getenv(env_var_name.data()); + if (env_val == nullptr) { continue; } + + if (!env_name_target.empty() && env_val_target != env_val) { + std::stringstream ss; + ss << "Environment variable " << env_var_name << " (" << env_val + << ") has already been set by its alias " << env_name_target << " (" << env_val_target + << ") with a different value."; + KVIKIO_FAIL(ss.str(), std::invalid_argument); + } + + env_name_target = env_var_name; + env_val_target = env_val; + } + + if (env_name_target.empty()) { return {env_name_target, default_val, false}; } + + auto res = getenv_or(env_name_target, default_val); + return {env_name_target, res, true}; +} + /** * @brief Singleton class of default values used throughout KvikIO. * @@ -183,6 +243,20 @@ class defaults { */ static void set_thread_pool_nthreads(unsigned int nthreads); + /** + * @brief Alias of `thread_pool_nthreads` + * + * @return The number of threads + */ + [[nodiscard]] static unsigned int num_threads(); + + /** + * @brief Alias of `set_thread_pool_nthreads` + * + * @param nthreads The number of threads to use + */ + static void set_num_threads(unsigned int nthreads); + /** * @brief Get the default task size used for parallel IO operations. * diff --git a/cpp/src/defaults.cpp b/cpp/src/defaults.cpp index f492d39a23..24a4de0898 100644 --- a/cpp/src/defaults.cpp +++ b/cpp/src/defaults.cpp @@ -28,6 +28,7 @@ #include #include #include +#include namespace kvikio { template <> @@ -88,9 +89,13 @@ std::vector getenv_or(std::string_view env_var_name, std::vector defau unsigned int defaults::get_num_threads_from_env() { KVIKIO_NVTX_FUNC_RANGE(); - int const ret = getenv_or("KVIKIO_NTHREADS", 1); - KVIKIO_EXPECT(ret > 0, "KVIKIO_NTHREADS has to be a positive integer", std::invalid_argument); - return ret; + + auto const [env_var_name, num_threads, _] = + getenv_or({"KVIKIO_NTHREADS", "KVIKIO_NUM_THREADS"}, 1); + KVIKIO_EXPECT(num_threads > 0, + std::string{env_var_name} + " has to be a positive integer", + std::invalid_argument); + return num_threads; } defaults::defaults() @@ -183,6 +188,10 @@ void defaults::set_thread_pool_nthreads(unsigned int nthreads) thread_pool().reset(nthreads); } +unsigned int defaults::num_threads() { return thread_pool_nthreads(); } + +void defaults::set_num_threads(unsigned int nthreads) { set_thread_pool_nthreads(nthreads); } + std::size_t defaults::task_size() { return instance()->_task_size; } void defaults::set_task_size(std::size_t nbytes) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 4b4ad1049a..47be1d9899 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -72,7 +72,7 @@ endfunction() kvikio_add_test(NAME BASIC_IO_TEST SOURCES test_basic_io.cpp) -kvikio_add_test(NAME DEFAULTS_TEST SOURCES test_defaults.cpp) +kvikio_add_test(NAME DEFAULTS_TEST SOURCES test_defaults.cpp utils/env.cpp) kvikio_add_test(NAME ERROR_TEST SOURCES test_error.cpp) diff --git a/cpp/tests/test_defaults.cpp b/cpp/tests/test_defaults.cpp index f4f3a92dc1..9c283d658a 100644 --- a/cpp/tests/test_defaults.cpp +++ b/cpp/tests/test_defaults.cpp @@ -15,10 +15,18 @@ */ #include +#include +#include #include #include +#include 
"kvikio/compat_mode.hpp" +#include "utils/env.hpp" + +using ::testing::HasSubstr; +using ::testing::ThrowsMessage; + TEST(DefaultsTest, parse_compat_mode_str) { { @@ -72,3 +80,163 @@ TEST(DefaultsTest, parse_http_status_codes) } } } + +TEST(DefaultsTest, alias_for_getenv_or) +{ + // Passed initializer list is empty + { + EXPECT_THAT([=] { kvikio::getenv_or({}, 123); }, + ThrowsMessage(HasSubstr( + "`env_var_names` must contain at least one environment variable name"))); + } + + // Non-string env var has an empty value + { + kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS", ""}}}; + EXPECT_THAT( + [=] { kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, 123); }, + ThrowsMessage(HasSubstr("unknown config value KVIKIO_TEST_ALIAS="))); + } + + // Non-string env var and alias have an empty value + { + kvikio::test::EnvVarContext env_var_ctx{ + {{"KVIKIO_TEST_ALIAS_1", ""}, {"KVIKIO_TEST_ALIAS_2", ""}}}; + EXPECT_THAT( + [=] { kvikio::getenv_or({"KVIKIO_TEST_ALIAS_1", "KVIKIO_TEST_ALIAS_2"}, 123); }, + ThrowsMessage(HasSubstr("unknown config value KVIKIO_TEST_ALIAS_2="))); + } + + // String env var has an empty value + { + kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS", ""}}}; + auto const [env_var_name, result, has_found] = + kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, std::string{"abc"}); + EXPECT_EQ(env_var_name, "KVIKIO_TEST_ALIAS"); + EXPECT_TRUE(result.empty()); + EXPECT_TRUE(has_found); + } + + // String env var and alias have an empty value + { + kvikio::test::EnvVarContext env_var_ctx{ + {{"KVIKIO_TEST_ALIAS_1", ""}, {"KVIKIO_TEST_ALIAS_2", ""}}}; + auto const [env_var_name, result, has_found] = + kvikio::getenv_or({"KVIKIO_TEST_ALIAS_1", "KVIKIO_TEST_ALIAS_2"}, std::string{"abc"}); + EXPECT_EQ(env_var_name, "KVIKIO_TEST_ALIAS_2"); + EXPECT_TRUE(result.empty()); + EXPECT_TRUE(has_found); + } + + // Env var has already been set by its alias with the same value + { + kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS_1", "10"}, + {"KVIKIO_TEST_ALIAS_2", "10"}, + {"KVIKIO_TEST_ALIAS_3", "10"}}}; + auto const [env_var_name, result, has_found] = + kvikio::getenv_or({"KVIKIO_TEST_ALIAS_1", "KVIKIO_TEST_ALIAS_2", "KVIKIO_TEST_ALIAS_3"}, 123); + EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS_3"}); + EXPECT_EQ(result, 10); + EXPECT_TRUE(has_found); + } + + // Env var has already been set by its alias with a different value + { + kvikio::test::EnvVarContext env_var_ctx{ + {{"KVIKIO_TEST_ALIAS_1", "10"}, {"KVIKIO_TEST_ALIAS_2", "20"}}}; + EXPECT_THAT([=] { kvikio::getenv_or({"KVIKIO_TEST_ALIAS_1", "KVIKIO_TEST_ALIAS_2"}, 123); }, + ThrowsMessage(HasSubstr( + "Environment variable KVIKIO_TEST_ALIAS_2 (20) has already been set by its alias " + "KVIKIO_TEST_ALIAS_1 (10) with a different value"))); + } + + // Env var has invalid value + { + kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS", "abc"}}}; + EXPECT_THAT([=] { kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, 123); }, + ThrowsMessage( + HasSubstr("unknown config value KVIKIO_TEST_ALIAS=abc"))); + } + + // 1st alias has a set value + { + kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS_1", "654.321"}}}; + auto const [env_var_name, result, has_found] = + kvikio::getenv_or({"KVIKIO_TEST_ALIAS_1", "KVIKIO_TEST_ALIAS_2"}, 123.456); + EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS_1"}); + EXPECT_EQ(result, 654.321); + EXPECT_TRUE(has_found); + } + + // 2nd alias has a set value + { + kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS_2", "654.321"}}}; + auto const 
[env_var_name, result, has_found] = + kvikio::getenv_or({"KVIKIO_TEST_ALIAS_1", "KVIKIO_TEST_ALIAS_2"}, 123.456); + EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS_2"}); + EXPECT_EQ(result, 654.321); + EXPECT_TRUE(has_found); + } + + // Neither alias has a set value + { + auto const [env_var_name, result, has_found] = + kvikio::getenv_or({"KVIKIO_TEST_ALIAS_1", "KVIKIO_TEST_ALIAS_2"}, 123.456); + EXPECT_TRUE(env_var_name.empty()); + EXPECT_EQ(result, 123.456); + EXPECT_FALSE(has_found); + } + + // Special type: bool + { + kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS", "yes"}}}; + auto const [env_var_name, result, has_found] = kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, false); + EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS"}); + EXPECT_TRUE(result); + EXPECT_TRUE(has_found); + } + { + kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS", "OFF"}}}; + auto const [env_var_name, result, has_found] = kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, false); + EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS"}); + EXPECT_FALSE(result); + EXPECT_TRUE(has_found); + } + + // Special type: CompatMode + { + kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS", "yes"}}}; + auto const [env_var_name, result, has_found] = + kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, kvikio::CompatMode::AUTO); + EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS"}); + EXPECT_EQ(result, kvikio::CompatMode::ON); + EXPECT_TRUE(has_found); + } + { + kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS", "FALSE"}}}; + auto const [env_var_name, result, has_found] = + kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, kvikio::CompatMode::AUTO); + EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS"}); + EXPECT_EQ(result, kvikio::CompatMode::OFF); + EXPECT_TRUE(has_found); + } + { + kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS", "aUtO"}}}; + auto const [env_var_name, result, has_found] = + kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, kvikio::CompatMode::ON); + EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS"}); + EXPECT_EQ(result, kvikio::CompatMode::AUTO); + EXPECT_TRUE(has_found); + } + + // Special type: std::vector + { + kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS", "109, 108, 107"}}}; + auto const [env_var_name, result, has_found] = + kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, std::vector{111, 112, 113}); + EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS"}); + std::vector expected{109, 108, 107}; + EXPECT_EQ(result, expected); + EXPECT_TRUE(has_found); + } +} diff --git a/cpp/tests/utils/env.cpp b/cpp/tests/utils/env.cpp index 5e713dca04..9495289157 100644 --- a/cpp/tests/utils/env.cpp +++ b/cpp/tests/utils/env.cpp @@ -23,21 +23,22 @@ namespace kvikio::test { EnvVarContext::EnvVarContext( - std::initializer_list> env_var_entries) + std::initializer_list> env_var_entries) { for (auto const& [key, current_value] : env_var_entries) { EnvVarState env_var_state; - if (auto const res = std::getenv(key.c_str()); res != nullptr) { + if (auto const res = std::getenv(key.data()); res != nullptr) { env_var_state.existed_before = true; env_var_state.previous_value = res; } - SYSCALL_CHECK(setenv(key.c_str(), current_value.c_str(), 1 /* allow overwrite */)); - if (_env_var_map.find(key) != _env_var_map.end()) { + SYSCALL_CHECK(setenv(key.data(), current_value.data(), 1 /* allow overwrite */)); + std::string key_str{key}; + if (_env_var_map.find(key_str) != _env_var_map.end()) { std::stringstream ss; ss << 
"Environment variable " << key << " has already been set in this context."; KVIKIO_FAIL(ss.str()); } - _env_var_map.insert({key, std::move(env_var_state)}); + _env_var_map.insert({std::move(key_str), std::move(env_var_state)}); } } diff --git a/cpp/tests/utils/env.hpp b/cpp/tests/utils/env.hpp index 8ec51ad27b..e6e84cd9b6 100644 --- a/cpp/tests/utils/env.hpp +++ b/cpp/tests/utils/env.hpp @@ -45,7 +45,8 @@ class EnvVarContext { * @param env_var_entries User-specified environment variables. Each entry includes the variable * name and value. */ - EnvVarContext(std::initializer_list> env_var_entries); + EnvVarContext( + std::initializer_list> env_var_entries); /** * @brief Restore the environment variables to previous values From c6e423714699fdfac27f9e2f03a08d73ecbd5ad2 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 30 May 2025 11:56:15 -0500 Subject: [PATCH 014/103] Remove CUDA 11 devcontainers and update CI scripts (#738) This PR removes CUDA 11 devcontainers and updates CI scripts. xref: https://github.com/rapidsai/build-planning/issues/184 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/kvikio/pull/738 --- .../cuda11.8-conda/devcontainer.json | 42 ------------------- .devcontainer/cuda11.8-pip/devcontainer.json | 41 ------------------ .github/workflows/pr.yaml | 5 +-- ci/run_pytests.sh | 9 +--- ci/test_wheel.sh | 7 +--- 5 files changed, 4 insertions(+), 100 deletions(-) delete mode 100644 .devcontainer/cuda11.8-conda/devcontainer.json delete mode 100644 .devcontainer/cuda11.8-pip/devcontainer.json diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json deleted file mode 100644 index 610e38b8c7..0000000000 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "build": { - "context": "${localWorkspaceFolder}/.devcontainer", - "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", - "args": { - "CUDA": "11.8", - "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.08-cpp-cuda11.8-mambaforge-ubuntu22.04" - } - }, - "runArgs": [ - "--rm", - "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.08-cuda11.8-conda" - ], - "hostRequirements": {"gpu": "optional"}, - "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.8": {} - }, - "overrideFeatureInstallOrder": [ - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" - ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda11.8-envs}"], - "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], - "workspaceFolder": "/home/coder", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/kvikio,type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda11.8-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" - ], - "customizations": { - "vscode": { - "extensions": [ - "ms-python.flake8", - "nvidia.nsight-vscode-edition" - ] - } - } -} diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json deleted file mode 100644 index 8297d7282b..0000000000 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "build": { - "context": "${localWorkspaceFolder}/.devcontainer", - "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", - "args": { - "CUDA": "11.8", - "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.08-cpp-cuda11.8-ubuntu22.04" - } - }, - "runArgs": [ - "--rm", - "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.08-cuda11.8-pip" - ], - "hostRequirements": {"gpu": "optional"}, - "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.8": {} - }, - "overrideFeatureInstallOrder": [ - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" - ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs}"], - "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], - "workspaceFolder": "/home/coder", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/kvikio,type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" - ], - "customizations": { - "vscode": { - "extensions": [ - "ms-python.flake8", - "nvidia.nsight-vscode-edition" - ] - } - } -} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 938111c080..b0b196a1b0 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -32,10 +32,7 @@ jobs: with: needs: ${{ toJSON(needs) }} check-nightly-ci: - # Switch to ubuntu-latest once it defaults to a version of Ubuntu that - # provides at least Python 3.11 (see - # https://docs.python.org/3/library/datetime.html#datetime.date.fromisoformat) - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest needs: telemetry-setup env: RAPIDS_GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/ci/run_pytests.sh b/ci/run_pytests.sh index 1a7edb5be5..e75c351700 100755 --- a/ci/run_pytests.sh +++ b/ci/run_pytests.sh @@ -1,14 +1,9 @@ #!/bin/bash -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. set -euo pipefail # Support invoking run_pytests.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/kvikio -# If running CUDA 11.8 on arm64, we skip tests marked "cufile" since -# cuFile didn't support arm until 12.4 -[[ "${CUDA_VERSION}" == "11.8.0" && "${RUNNER_ARCH}" == "ARM64" ]] \ - && PYTEST_MARK=( -m 'not cufile' ) || PYTEST_MARK=() - -pytest --cache-clear --verbose "${PYTEST_MARK[@]}" "$@" tests +pytest --cache-clear --verbose "$@" tests diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 66c7cfcb71..ca0fd7dd25 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -15,9 +15,4 @@ rapids-pip-retry install -v \ "$(echo "${LIBKVIKIO_WHEELHOUSE}"/libkvikio_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" \ "$(echo "${KVIKIO_WHEELHOUSE}"/kvikio_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test]" -# If running CUDA 11.8 on arm64, we skip tests marked "cufile" since -# cuFile didn't support arm until 12.4 -[[ "${CUDA_VERSION}" == "11.8.0" && "${RUNNER_ARCH}" == "ARM64" ]] \ - && PYTEST_MARK=( -m 'not cufile' ) || PYTEST_MARK=() - -python -m pytest --cache-clear --verbose "${PYTEST_MARK[@]}" ./python/kvikio/tests +python -m pytest --cache-clear --verbose ./python/kvikio/tests From 46a1ab465af0b2fdf14ba740a7c7b7ff0e72be96 Mon Sep 17 00:00:00 2001 From: Gil Forsyth Date: Mon, 2 Jun 2025 11:26:01 -0400 Subject: [PATCH 015/103] refactor(rattler): remove cuda11 options and general cleanup (#737) xref rapidsai/build-planning#184 Authors: - Gil Forsyth (https://github.com/gforsyth) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/737 --- conda/recipes/kvikio/conda_build_config.yaml | 19 ++----- conda/recipes/kvikio/recipe.yaml | 26 +++------ .../recipes/libkvikio/conda_build_config.yaml | 19 ++----- conda/recipes/libkvikio/recipe.yaml | 53 ++++--------------- 4 files changed, 21 insertions(+), 
96 deletions(-) diff --git a/conda/recipes/kvikio/conda_build_config.yaml b/conda/recipes/kvikio/conda_build_config.yaml index a746372833..c831f4c3cb 100644 --- a/conda/recipes/kvikio/conda_build_config.yaml +++ b/conda/recipes/kvikio/conda_build_config.yaml @@ -1,17 +1,14 @@ c_compiler_version: - - 13 # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] - - 11 # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] + - 13 cxx_compiler_version: - - 13 # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] - - 11 # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] + - 13 cmake_version: - ">=3.30.4" cuda_compiler: - - cuda-nvcc # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] - - nvcc # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] + - cuda-nvcc c_stdlib: - sysroot @@ -19,16 +16,6 @@ c_stdlib: c_stdlib_version: - "2.28" -# The CTK libraries below are missing from the conda-forge::cudatoolkit package -# for CUDA 11. The "*_host_*" version specifiers correspond to `11.8` packages -# and the "*_run_*" version specifiers correspond to `11.x` packages. - -cuda11_libcufile_host_version: - - "1.4.0.31" - -cuda11_libcufile_run_version: - - ">=1.0.0.82,<=1.4.0.31" - libcurl_version: - "==8.5.0" diff --git a/conda/recipes/kvikio/recipe.yaml b/conda/recipes/kvikio/recipe.yaml index 245773d6ee..135c751f9d 100644 --- a/conda/recipes/kvikio/recipe.yaml +++ b/conda/recipes/kvikio/recipe.yaml @@ -71,15 +71,9 @@ requirements: - python =${{ py_version }} - rapids-build-backend >=0.3.0,<0.4.0.dev0 - scikit-build-core >=0.10.0 - - if: cuda_major == "11" - then: - - cudatoolkit - - if: linux64 - then: libcufile-dev =${{ cuda11_libcufile_host_version }} - else: - - cuda-cudart-dev - - if: linux - then: libcufile-dev + - cuda-cudart-dev + - if: linux + then: libcufile-dev run: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - cupy >=12.0.0 @@ -91,15 +85,9 @@ requirements: - packaging - python - zarr >=2.0.0,<4.0.0a0 - - if: cuda_major == "11" - then: - - cudatoolkit - - if: linux64 - then: libcufile ${{ cuda11_libcufile_run_version }} - else: - - cuda-cudart - - if: linux - then: libcufile + - cuda-cudart + - if: linux + then: libcufile ignore_run_exports: by_name: - cuda-cudart @@ -108,8 +96,6 @@ requirements: - libcurl - libkvikio - nvcomp - - if: cuda_major == "11" - then: cudatoolkit tests: - python: diff --git a/conda/recipes/libkvikio/conda_build_config.yaml b/conda/recipes/libkvikio/conda_build_config.yaml index ba8163c7ff..1e3ee456a6 100644 --- a/conda/recipes/libkvikio/conda_build_config.yaml +++ b/conda/recipes/libkvikio/conda_build_config.yaml @@ -1,17 +1,14 @@ c_compiler_version: - - 13 # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] - - 11 # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] + - 13 cxx_compiler_version: - - 13 # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] - - 11 # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] + - 13 cmake_version: - ">=3.30.4" cuda_compiler: - - cuda-nvcc # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] - - nvcc # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")] + - cuda-nvcc c_stdlib: - sysroot @@ -19,15 +16,5 @@ c_stdlib: c_stdlib_version: - "2.28" -# The CTK libraries below are missing from the conda-forge::cudatoolkit package -# for CUDA 11. 
The "*_host_*" version specifiers correspond to `11.8` packages -# and the "*_run_*" version specifiers correspond to `11.x` packages. - -cuda11_libcufile_host_version: - - "1.4.0.31" - -cuda11_libcufile_run_version: - - ">=1.0.0.82,<=1.4.0.31" - libcurl_version: - "==8.5.0" diff --git a/conda/recipes/libkvikio/recipe.yaml b/conda/recipes/libkvikio/recipe.yaml index 8ee2448217..5a2fbab7ee 100644 --- a/conda/recipes/libkvikio/recipe.yaml +++ b/conda/recipes/libkvikio/recipe.yaml @@ -59,14 +59,7 @@ cache: host: - cuda-version =${{ cuda_version }} - libcurl ${{ libcurl_version }} - - if: cuda_major == "11" - then: - - cudatoolkit - - if: linux64 - then: - - libcufile =${{ cuda11_libcufile_host_version }} - - libcufile-dev =${{ cuda11_libcufile_host_version }} - else: libcufile-dev + - libcufile-dev outputs: - package: @@ -91,24 +84,13 @@ outputs: - libcurl ${{ libcurl_version }} run: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - - if: cuda_major == "11" - then: - - cudatoolkit - - if: linux64 - then: - - libcufile ${{ cuda11_libcufile_run_version }} - - libcufile-dev ${{ cuda11_libcufile_run_version }} - else: - - if: linux - then: libcufile-dev + - if: linux + then: libcufile-dev ignore_run_exports: by_name: - cuda-version - libcufile - libcurl - - if: cuda_major == "11" - then: - - cudatoolkit tests: - script: - test -f $PREFIX/include/kvikio/file_handle.hpp @@ -134,37 +116,20 @@ outputs: host: - ${{ pin_subpackage("libkvikio", exact=True) }} - cuda-version =${{ cuda_version }} - - if: cuda_major == "11" - then: - - cudatoolkit - - if: linux64 - then: - - libcufile-dev =${{ cuda11_libcufile_host_version }} - else: - - cuda-cudart-dev - - if: linux - then: libcufile-dev + - cuda-cudart-dev + - if: linux + then: libcufile-dev run: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - - if: cuda_major == "11" - then: - - cudatoolkit - - if: linux64 - then: - - libcufile =${{ cuda11_libcufile_host_version }} - else: - - cuda-cudart - - if: linux - then: libcufile + - cuda-cudart + - if: linux + then: libcufile ignore_run_exports: by_name: - cuda-cudart - cuda-version - libcufile - libcurl - - if: cuda_major == "11" - then: - - cudatoolkit about: homepage: ${{ load_from_file("python/libkvikio/pyproject.toml").project.urls.Homepage }} license: ${{ load_from_file("python/libkvikio/pyproject.toml").project.license.text }} From c147b62f7f535eea9e9a78a039dde625572a65ac Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Wed, 4 Jun 2025 22:52:36 -0400 Subject: [PATCH 016/103] Remove CUDA 11 from dependencies.yaml (#745) Issue: https://github.com/rapidsai/build-planning/issues/184 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Gil Forsyth (https://github.com/gforsyth) - https://github.com/jakirkham URL: https://github.com/rapidsai/kvikio/pull/745 --- .../all_cuda-118_arch-aarch64.yaml | 44 -------- .../all_cuda-118_arch-x86_64.yaml | 46 -------- dependencies.yaml | 103 +----------------- python/kvikio/pyproject.toml | 4 +- 4 files changed, 6 insertions(+), 191 deletions(-) delete mode 100644 conda/environments/all_cuda-118_arch-aarch64.yaml delete mode 100644 conda/environments/all_cuda-118_arch-x86_64.yaml diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml deleted file mode 100644 index 08904577c5..0000000000 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ /dev/null @@ -1,44 +0,0 @@ -# This file is generated by `rapids-dependency-file-generator`. 
-# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. -channels: -- rapidsai -- rapidsai-nightly -- conda-forge -- nvidia -dependencies: -- boto3>=1.21.21 -- c-compiler -- cmake>=3.30.4 -- cuda-python>=11.8.5,<12.0a0 -- cuda-version=11.8 -- cudatoolkit -- cupy>=12.0.0 -- cxx-compiler -- cython>=3.0.0 -- doxygen=1.9.1 -- gcc_linux-aarch64=11.* -- libcurl>=8.5.0,<9.0a0 -- moto>=4.0.8 -- ninja -- numcodecs !=0.12.0 -- numpy>=1.23,<3.0a0 -- numpydoc -- nvcc_linux-aarch64=11.8 -- nvcomp==4.2.0.11 -- packaging -- pre-commit -- pytest -- pytest-asyncio -- pytest-cov -- pytest-timeout -- python>=3.10,<3.14 -- rangehttpserver -- rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.8.*,>=0.0.0a0 -- scikit-build-core>=0.10.0 -- sphinx -- sphinx-click -- sphinx_rtd_theme -- sysroot_linux-aarch64=2.28 -- zarr>=2.0.0,<4.0.0 -name: all_cuda-118_arch-aarch64 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml deleted file mode 100644 index ebcc610f40..0000000000 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# This file is generated by `rapids-dependency-file-generator`. -# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. -channels: -- rapidsai -- rapidsai-nightly -- conda-forge -- nvidia -dependencies: -- boto3>=1.21.21 -- c-compiler -- cmake>=3.30.4 -- cuda-python>=11.8.5,<12.0a0 -- cuda-version=11.8 -- cudatoolkit -- cupy>=12.0.0 -- cxx-compiler -- cython>=3.0.0 -- doxygen=1.9.1 -- gcc_linux-64=11.* -- libcufile-dev=1.4.0.31 -- libcufile=1.4.0.31 -- libcurl>=8.5.0,<9.0a0 -- moto>=4.0.8 -- ninja -- numcodecs !=0.12.0 -- numpy>=1.23,<3.0a0 -- numpydoc -- nvcc_linux-64=11.8 -- nvcomp==4.2.0.11 -- packaging -- pre-commit -- pytest -- pytest-asyncio -- pytest-cov -- pytest-timeout -- python>=3.10,<3.14 -- rangehttpserver -- rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.8.*,>=0.0.0a0 -- scikit-build-core>=0.10.0 -- sphinx -- sphinx-click -- sphinx_rtd_theme -- sysroot_linux-64=2.28 -- zarr>=2.0.0,<4.0.0 -name: all_cuda-118_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index 1f194ebbe0..0a290e4424 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -3,7 +3,7 @@ files: all: output: conda matrix: - cuda: ["11.8", "12.8"] + cuda: ["12.8"] arch: [aarch64, x86_64] includes: - build-universal @@ -130,18 +130,6 @@ dependencies: specific: - output_types: conda matrices: - - matrix: - arch: x86_64 - cuda: "11.8" - packages: - - gcc_linux-64=11.* - - sysroot_linux-64=2.28 - - matrix: - arch: aarch64 - cuda: "11.8" - packages: - - gcc_linux-aarch64=11.* - - sysroot_linux-aarch64=2.28 - matrix: arch: x86_64 cuda: "12.*" @@ -156,16 +144,6 @@ dependencies: - sysroot_linux-aarch64=2.28 - output_types: conda matrices: - - matrix: - arch: x86_64 - cuda: "11.8" - packages: - - nvcc_linux-64=11.8 - - matrix: - arch: aarch64 - cuda: "11.8" - packages: - - nvcc_linux-aarch64=11.8 - matrix: cuda: "12.*" packages: @@ -183,11 +161,6 @@ dependencies: cuda_suffixed: "true" packages: - libkvikio-cu12==25.8.*,>=0.0.0a0 - - matrix: - cuda: "11.*" - cuda_suffixed: "true" - packages: - - libkvikio-cu11==25.8.*,>=0.0.0a0 - {matrix: null, packages: *libkvikio_packages} build-py-wrapper: common: @@ -203,22 +176,6 @@ dependencies: specific: - output_types: conda matrices: - - matrix: - cuda: "11.2" - packages: - - cuda-version=11.2 - - matrix: - cuda: "11.4" - packages: - - cuda-version=11.4 - - matrix: 
- cuda: "11.5" - packages: - - cuda-version=11.5 - - matrix: - cuda: "11.8" - packages: - - cuda-version=11.8 - matrix: cuda: "12.0" packages: @@ -243,10 +200,6 @@ dependencies: specific: - output_types: conda matrices: - - matrix: - cuda: "11.*" - packages: - - cudatoolkit - matrix: cuda: "12.*" packages: @@ -256,36 +209,6 @@ dependencies: cuda: "12.*" packages: - libcufile-dev - - matrix: - arch: aarch64 - cuda: "11.*" - packages: - - matrix: - cuda: "11.8" - arch: x86_64 - packages: - - libcufile=1.4.0.31 - - libcufile-dev=1.4.0.31 - - matrix: - cuda: "11.5" - arch: x86_64 - packages: - - libcufile>=1.1.0.37,<=1.1.1.25 - - libcufile-dev>=1.1.0.37,<=1.1.1.25 - - matrix: - cuda: "11.4" - arch: x86_64 - packages: - - &libcufile_114 libcufile>=1.0.0.82,<=1.0.2.10 - - &libcufile_dev114 libcufile-dev>=1.0.0.82,<=1.0.2.10 - - matrix: - cuda: "11.2" - arch: x86_64 - packages: - # The NVIDIA channel doesn't publish pkgs older than 11.4 for these libs, - # so 11.2 uses 11.4 packages (the oldest available). - - *libcufile_114 - - *libcufile_dev114 depends_on_cupy: common: - output_types: conda @@ -294,13 +217,9 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: # All CUDA 12 versions packages: - cupy-cuda12x>=12.0.0 - - matrix: {cuda: "11.*"} - packages: &cupy_packages_cu11 - - cupy-cuda11x>=12.0.0 - - {matrix: null, packages: *cupy_packages_cu11} depends_on_nvcomp: common: - output_types: conda @@ -314,11 +233,6 @@ dependencies: use_cuda_wheels: "true" packages: - nvidia-nvcomp-cu12==4.2.0.11 - - matrix: - cuda: "11.*" - use_cuda_wheels: "true" - packages: - - nvidia-nvcomp-cu11==4.2.0.11 # if use_cuda_wheels=false is provided, do not add dependencies on any CUDA wheels # (e.g. for DLFW and pip devcontainers) - matrix: @@ -337,7 +251,7 @@ dependencies: - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file - # This index is needed for libkvikio-cu{11,12}. + # This index is needed for libkvikio-cu12. 
- --extra-index-url=https://pypi.nvidia.com - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: @@ -348,11 +262,6 @@ dependencies: cuda_suffixed: "true" packages: - libkvikio-cu12==25.8.*,>=0.0.0a0 - - matrix: - cuda: "11.*" - cuda_suffixed: "true" - packages: - - libkvikio-cu11==25.8.*,>=0.0.0a0 - {matrix: null, packages: [*libkvikio_unsuffixed]} docs: common: @@ -445,13 +354,9 @@ dependencies: specific: - output_types: [conda, requirements, pyproject] matrices: - - matrix: - cuda: "12.*" + - matrix: # All CUDA 12 versions packages: - cuda-python>=12.6.2,<13.0a0 - - matrix: # All CUDA 11 versions - packages: - - cuda-python>=11.8.5,<12.0a0 test_java: common: - output_types: conda diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index dee6594304..b48b4312ee 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ - "cupy-cuda11x>=12.0.0", + "cupy-cuda12x>=12.0.0", "libkvikio==25.8.*,>=0.0.0a0", "numcodecs !=0.12.0", "numpy>=1.23,<3.0a0", @@ -41,7 +41,7 @@ classifiers = [ [project.optional-dependencies] test = [ "boto3>=1.21.21", - "cuda-python>=11.8.5,<12.0a0", + "cuda-python>=12.6.2,<13.0a0", "moto[server]>=4.0.8", "pytest", "pytest-asyncio", From d8baac9734841feb405a5eab76a7a7de776e3606 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 5 Jun 2025 15:41:58 -0400 Subject: [PATCH 017/103] Add the conda dependency libnuma necessary for GDS I/O (#744) This PR adds the conda dependency `libnuma` necessary for GDS I/O. For this PR, only CUDA 12 is considered. Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Bradley Dice (https://github.com/bdice) - Gil Forsyth (https://github.com/gforsyth) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/744 --- conda/environments/all_cuda-128_arch-aarch64.yaml | 1 + conda/environments/all_cuda-128_arch-x86_64.yaml | 1 + conda/recipes/libkvikio/recipe.yaml | 15 +++++++++------ dependencies.yaml | 1 + 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/conda/environments/all_cuda-128_arch-aarch64.yaml b/conda/environments/all_cuda-128_arch-aarch64.yaml index ae4bf15cc7..beb13ccd92 100644 --- a/conda/environments/all_cuda-128_arch-aarch64.yaml +++ b/conda/environments/all_cuda-128_arch-aarch64.yaml @@ -19,6 +19,7 @@ dependencies: - gcc_linux-aarch64=13.* - libcufile-dev - libcurl>=8.5.0,<9.0a0 +- libnuma - moto>=4.0.8 - ninja - numcodecs !=0.12.0 diff --git a/conda/environments/all_cuda-128_arch-x86_64.yaml b/conda/environments/all_cuda-128_arch-x86_64.yaml index 0283f650e0..0df124c208 100644 --- a/conda/environments/all_cuda-128_arch-x86_64.yaml +++ b/conda/environments/all_cuda-128_arch-x86_64.yaml @@ -19,6 +19,7 @@ dependencies: - gcc_linux-64=13.* - libcufile-dev - libcurl>=8.5.0,<9.0a0 +- libnuma - moto>=4.0.8 - ninja - numcodecs !=0.12.0 diff --git a/conda/recipes/libkvikio/recipe.yaml b/conda/recipes/libkvikio/recipe.yaml index 5a2fbab7ee..25caa15401 100644 --- a/conda/recipes/libkvikio/recipe.yaml +++ b/conda/recipes/libkvikio/recipe.yaml @@ -60,6 +60,7 @@ cache: - cuda-version =${{ cuda_version }} - libcurl ${{ libcurl_version }} - libcufile-dev + - libnuma outputs: - package: @@ -82,10 +83,13 @@ outputs: host: - cuda-version =${{ cuda_version }} - libcurl ${{ libcurl_version }} + - libnuma run: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - - if: linux - then: 
libcufile-dev + # Needed until libnuma run-exports are fixed: + # https://github.com/conda-forge/numactl-feedstock/pull/18 + - ${{ pin_compatible("libnuma", upper_bound="x", lower_bound="x") }} + - libcufile-dev ignore_run_exports: by_name: - cuda-version @@ -117,19 +121,18 @@ outputs: - ${{ pin_subpackage("libkvikio", exact=True) }} - cuda-version =${{ cuda_version }} - cuda-cudart-dev - - if: linux - then: libcufile-dev + - libcufile-dev run: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - cuda-cudart - - if: linux - then: libcufile + - libcufile ignore_run_exports: by_name: - cuda-cudart - cuda-version - libcufile - libcurl + - libnuma about: homepage: ${{ load_from_file("python/libkvikio/pyproject.toml").project.urls.Homepage }} license: ${{ load_from_file("python/libkvikio/pyproject.toml").project.license.text }} diff --git a/dependencies.yaml b/dependencies.yaml index 0a290e4424..c54a09d8d9 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -209,6 +209,7 @@ dependencies: cuda: "12.*" packages: - libcufile-dev + - libnuma depends_on_cupy: common: - output_types: conda From 6d767f75dc7553065746855e22e3d207a6527a5d Mon Sep 17 00:00:00 2001 From: Gil Forsyth Date: Wed, 11 Jun 2025 15:59:15 -0400 Subject: [PATCH 018/103] refactor(rattler): remove explicit `libnuma` pin now that upstream feedstock is fixed (#752) xref #744 Authors: - Gil Forsyth (https://github.com/gforsyth) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/752 --- conda/recipes/libkvikio/recipe.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/conda/recipes/libkvikio/recipe.yaml b/conda/recipes/libkvikio/recipe.yaml index 25caa15401..3bf8671daa 100644 --- a/conda/recipes/libkvikio/recipe.yaml +++ b/conda/recipes/libkvikio/recipe.yaml @@ -83,12 +83,8 @@ outputs: host: - cuda-version =${{ cuda_version }} - libcurl ${{ libcurl_version }} - - libnuma run: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - # Needed until libnuma run-exports are fixed: - # https://github.com/conda-forge/numactl-feedstock/pull/18 - - ${{ pin_compatible("libnuma", upper_bound="x", lower_bound="x") }} - libcufile-dev ignore_run_exports: by_name: From 8dcbdfcc6c94fd0c9ebd8c85b5c4ebdd9e1990c9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 12 Jun 2025 12:10:38 -0700 Subject: [PATCH 019/103] Only use cufile on x86 (#753) This is a temporary patch for the problem that cufile is not available on arm before CUDA 12.2, which results in these libkvikio packages not currently being installable on (for example) CUDA 12.0 arm systems. We should follow this PR up with a fix to actually build separate packages for older and newer CUDA versions, but this quick fix should unblock other work today. 
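Until then, a quick way to check which variant an environment resolved to is to inspect whether cuFile was pulled in at all (illustrative shell sketch, not part of this change):

```shell
conda list | grep -E 'libkvikio|libcufile'
```
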
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Gil Forsyth (https://github.com/gforsyth) URL: https://github.com/rapidsai/kvikio/pull/753 --- ci/build_cpp.sh | 4 ++++ conda/recipes/libkvikio/recipe.yaml | 12 +++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 7639ddd5c4..961e0c7e36 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -18,6 +18,10 @@ sccache --zero-stats RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) export RAPIDS_PACKAGE_VERSION +RAPIDS_ARTIFACTS_DIR=${RAPIDS_ARTIFACTS_DIR:-"${PWD}/artifacts"} +mkdir -p "${RAPIDS_ARTIFACTS_DIR}" +export RAPIDS_ARTIFACTS_DIR + # populates `RATTLER_CHANNELS` array and `RATTLER_ARGS` array source rapids-rattler-channel-string diff --git a/conda/recipes/libkvikio/recipe.yaml b/conda/recipes/libkvikio/recipe.yaml index 3bf8671daa..3d09516cae 100644 --- a/conda/recipes/libkvikio/recipe.yaml +++ b/conda/recipes/libkvikio/recipe.yaml @@ -59,7 +59,9 @@ cache: host: - cuda-version =${{ cuda_version }} - libcurl ${{ libcurl_version }} - - libcufile-dev + - if: (linux and x86_64) or (linux and aarch64 and cuda_version >= "12.2") + then: + - libcufile-dev - libnuma outputs: @@ -85,7 +87,9 @@ outputs: - libcurl ${{ libcurl_version }} run: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - - libcufile-dev + - if: linux and x86_64 + then: + - libcufile-dev ignore_run_exports: by_name: - cuda-version @@ -117,7 +121,9 @@ outputs: - ${{ pin_subpackage("libkvikio", exact=True) }} - cuda-version =${{ cuda_version }} - cuda-cudart-dev - - libcufile-dev + - if: linux and x86_64 + then: + - libcufile-dev run: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - cuda-cudart From c3509acaecacc30c71363cd5f75ddd63fd0872fd Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 13 Jun 2025 09:49:56 -0700 Subject: [PATCH 020/103] Use variants to produce separate builds with and without cufile support (#754) On arm cufile was not supported until CUDA 12.2, whereas support exists since 12.0 on x86 architectures. To properly reflect these dependencies, we need to build separate variants of cufile on arm for cuda versions before and after 12.2. This PR updates the recipe to support that. 
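With this change `cufile_version()` is always compiled (see the shim diff below): it returns 0 when KvikIO was built without `cuFile.h`, or when the installed cuFile predates 1.8, so callers can branch at runtime instead of at build time. A minimal sketch under that assumption:

```c++
#include <kvikio/shim/cufile.hpp>

#include <iostream>

int main()
{
  // cufile_version() reports 1000*major + 10*minor, or 0 as described above.
  if (kvikio::cufile_version() == 0) {
    std::cout << "cuFile unavailable; KvikIO falls back to POSIX I/O\n";
  } else if (kvikio::is_batch_api_available()) {  // i.e. cufile_version() >= 1060
    std::cout << "cuFile batch API available\n";
  }
  return 0;
}
```
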
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Gil Forsyth (https://github.com/gforsyth) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/754 --- ci/build_cpp.sh | 32 +++++++ .../recipes/libkvikio/conda_build_config.yaml | 6 -- conda/recipes/libkvikio/recipe.yaml | 54 +++++++++--- cpp/include/kvikio/shim/cufile.hpp | 4 - cpp/src/shim/cufile.cpp | 2 + python/kvikio/tests/test_cufile_driver.py | 86 +++++++++++-------- 6 files changed, 123 insertions(+), 61 deletions(-) diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 961e0c7e36..9258c3dd94 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -25,10 +25,42 @@ export RAPIDS_ARTIFACTS_DIR # populates `RATTLER_CHANNELS` array and `RATTLER_ARGS` array source rapids-rattler-channel-string +# Construct the extra variants according to the architecture +if [[ "$(arch)" == "x86_64" ]]; then + cat > variants.yaml << EOF + c_compiler_version: + - 13 + + cxx_compiler_version: + - 13 + + cuda_version: + - ${RAPIDS_CUDA_VERSION} +EOF +else + cat > variants.yaml << EOF + zip_keys: + - [c_compiler_version, cxx_compiler_version, cuda_version] + + c_compiler_version: + - 12 + - 13 + + cxx_compiler_version: + - 12 + - 13 + + cuda_version: + - 12.1 # The last version to not support cufile + - ${RAPIDS_CUDA_VERSION} +EOF +fi + # --no-build-id allows for caching with `sccache` # more info is available at # https://rattler.build/latest/tips_and_tricks/#using-sccache-or-ccache-with-rattler-build rattler-build build --recipe conda/recipes/libkvikio \ + --variant-config variants.yaml \ "${RATTLER_ARGS[@]}" \ "${RATTLER_CHANNELS[@]}" diff --git a/conda/recipes/libkvikio/conda_build_config.yaml b/conda/recipes/libkvikio/conda_build_config.yaml index 1e3ee456a6..1149e0f9fa 100644 --- a/conda/recipes/libkvikio/conda_build_config.yaml +++ b/conda/recipes/libkvikio/conda_build_config.yaml @@ -1,9 +1,3 @@ -c_compiler_version: - - 13 - -cxx_compiler_version: - - 13 - cmake_version: - ">=3.30.4" diff --git a/conda/recipes/libkvikio/recipe.yaml b/conda/recipes/libkvikio/recipe.yaml index 3d09516cae..a4e423d3ed 100644 --- a/conda/recipes/libkvikio/recipe.yaml +++ b/conda/recipes/libkvikio/recipe.yaml @@ -4,11 +4,18 @@ schema_version: 1 context: version: ${{ env.get("RAPIDS_PACKAGE_VERSION") }} minor_version: ${{ (version | split("."))[:2] | join(".") }} - cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} - cuda_major: '${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[0] }}' + # We need to support three cases: + # 1. Linux x86_64, which always uses libcufile + # 2. Linux aarch64 with CUDA >= 12.2, which uses libcufile + # 3. 
Linux aarch64 with CUDA < 12.2, which does not use libcufile + # Each case has different cuda-version constraints as expressed below + should_use_cufile: ${{ x86_64 or (aarch64 and cuda_version >= "12.2") }} + # When reverting, instances of cuda_key_string can be replaced with cuda_major + cuda_key_string: ${{ cuda_version | replace(".", "_") }} + #cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} + #cuda_major: '${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[0] }}' date_string: '${{ env.get("RAPIDS_DATE_STRING") }}' head_rev: '${{ git.head_rev(".")[:8] }}' - linux64: ${{ linux and x86_64 }} recipe: name: libkvikio-split @@ -46,7 +53,7 @@ cache: SCCACHE_REGION: ${{ env.get("SCCACHE_REGION") }} SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL") }} SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS") }} - SCCACHE_S3_KEY_PREFIX: libkvikio/${{ env.get("RAPIDS_CONDA_ARCH") }}/cuda${{ cuda_major }} + SCCACHE_S3_KEY_PREFIX: libkvikio/${{ env.get("RAPIDS_CONDA_ARCH") }}/cuda${{ cuda_key_string }} requirements: build: - ${{ compiler("c") }} @@ -59,7 +66,7 @@ cache: host: - cuda-version =${{ cuda_version }} - libcurl ${{ libcurl_version }} - - if: (linux and x86_64) or (linux and aarch64 and cuda_version >= "12.2") + - if: should_use_cufile then: - libcufile-dev - libnuma @@ -72,7 +79,7 @@ outputs: script: content: | cmake --install cpp/build - string: cuda${{ cuda_major }}_${{ date_string }}_${{ head_rev }} + string: cuda${{ cuda_key_string }}_${{ date_string }}_${{ head_rev }} dynamic_linking: overlinking_behavior: "error" prefix_detection: @@ -86,15 +93,25 @@ outputs: - cuda-version =${{ cuda_version }} - libcurl ${{ libcurl_version }} run: - - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - - if: linux and x86_64 + - if: x86_64 + then: + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} + else: + - if: aarch64 and cuda_version >= "12.2" + then: + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="12.2.0a0") }} + else: + - ${{ pin_compatible("cuda-version", upper_bound="12.2.0a0", lower_bound="12.0") }} + - if: should_use_cufile then: - libcufile-dev ignore_run_exports: by_name: - cuda-version - - libcufile - libcurl + - if: should_use_cufile + then: + - libcufile tests: - script: - test -f $PREFIX/include/kvikio/file_handle.hpp @@ -107,7 +124,7 @@ outputs: name: libkvikio-tests version: ${{ version }} build: - string: cuda${{ cuda_major }}_${{ date_string }}_${{ head_rev }} + string: cuda${{ cuda_key_string }}_${{ date_string }}_${{ head_rev }} dynamic_linking: overlinking_behavior: "error" script: @@ -121,20 +138,29 @@ outputs: - ${{ pin_subpackage("libkvikio", exact=True) }} - cuda-version =${{ cuda_version }} - cuda-cudart-dev - - if: linux and x86_64 + - if: should_use_cufile then: - libcufile-dev run: - - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} + - if: x86_64 + then: + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} + else: + - if: aarch64 and cuda_version >= "12.2" + then: + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="12.2.0a0") }} + else: + - ${{ pin_compatible("cuda-version", upper_bound="12.2.0a0", lower_bound="12.0") }} - cuda-cudart - - libcufile ignore_run_exports: by_name: - cuda-cudart - cuda-version - - libcufile - libcurl - libnuma + - if: should_use_cufile + then: + - libcufile about: homepage: ${{ load_from_file("python/libkvikio/pyproject.toml").project.urls.Homepage }} license: 
${{ load_from_file("python/libkvikio/pyproject.toml").project.license.text }} diff --git a/cpp/include/kvikio/shim/cufile.hpp b/cpp/include/kvikio/shim/cufile.hpp index 4823fd42f5..29d99ed5a1 100644 --- a/cpp/include/kvikio/shim/cufile.hpp +++ b/cpp/include/kvikio/shim/cufile.hpp @@ -128,11 +128,7 @@ bool is_cufile_available() noexcept; * * @return The version (1000*major + 10*minor) or zero if older than 1080. */ -#ifdef KVIKIO_CUFILE_FOUND int cufile_version() noexcept; -#else -constexpr int cufile_version() noexcept { return 0; } -#endif /** * @brief Check if cuFile's batch API is available. diff --git a/cpp/src/shim/cufile.cpp b/cpp/src/shim/cufile.cpp index 1f849263e8..7e69c84456 100644 --- a/cpp/src/shim/cufile.cpp +++ b/cpp/src/shim/cufile.cpp @@ -147,6 +147,8 @@ int cufile_version() noexcept return 0; } } +#else +int cufile_version() noexcept { return 0; } #endif bool is_batch_api_available() noexcept { return cufile_version() >= 1060; } diff --git a/python/kvikio/tests/test_cufile_driver.py b/python/kvikio/tests/test_cufile_driver.py index a325272426..d85cd35d3e 100644 --- a/python/kvikio/tests/test_cufile_driver.py +++ b/python/kvikio/tests/test_cufile_driver.py @@ -13,47 +13,59 @@ def test_version(): @pytest.mark.cufile -def test_open_and_close(): - kvikio.cufile_driver.driver_open() - kvikio.cufile_driver.driver_close() +def test_open_and_close(request): + try: + kvikio.cufile_driver.driver_open() + kvikio.cufile_driver.driver_close() + except RuntimeError as e: + if "KvikIO not compiled with cuFile.h" in str(e): + pytest.skip("KvikIO not compiled with cuFile.h, skipping cuFile tests") @pytest.mark.cufile def test_property_accessor(): """Test the method `get` and `set`""" - # Attempt to set a nonexistent property - with pytest.raises(KeyError): - kvikio.cufile_driver.set("nonexistent_property", 123) - - # Attempt to get a nonexistent property - with pytest.raises(KeyError): - kvikio.cufile_driver.get("nonexistent_property") - - # Attempt to set a read-only property - with pytest.raises(KeyError, match="read-only"): - kvikio.cufile_driver.set("major_version", 2077) - - # Nested context managers - poll_thresh_size_default = kvikio.cufile_driver.get("poll_thresh_size") - with kvikio.cufile_driver.set("poll_thresh_size", 1024): - assert kvikio.cufile_driver.get("poll_thresh_size") == 1024 - with kvikio.cufile_driver.set("poll_thresh_size", 2048): - assert kvikio.cufile_driver.get("poll_thresh_size") == 2048 - with kvikio.cufile_driver.set("poll_thresh_size", 4096): - assert kvikio.cufile_driver.get("poll_thresh_size") == 4096 - assert kvikio.cufile_driver.get("poll_thresh_size") == 2048 - assert kvikio.cufile_driver.get("poll_thresh_size") == 1024 - assert kvikio.cufile_driver.get("poll_thresh_size") == poll_thresh_size_default - - # Multiple context managers - poll_mode_default = kvikio.cufile_driver.get("poll_mode") - max_device_cache_size_default = kvikio.cufile_driver.get("max_device_cache_size") - with kvikio.cufile_driver.set({"poll_mode": True, "max_device_cache_size": 2048}): - assert kvikio.cufile_driver.get("poll_mode") and ( - kvikio.cufile_driver.get("max_device_cache_size") == 2048 + try: + # Attempt to set a nonexistent property + with pytest.raises(KeyError): + kvikio.cufile_driver.set("nonexistent_property", 123) + + # Attempt to get a nonexistent property + with pytest.raises(KeyError): + kvikio.cufile_driver.get("nonexistent_property") + + # Attempt to set a read-only property + with pytest.raises(KeyError, match="read-only"): + 
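+            # "major_version" is read-only, so attempting to set it must raise KeyError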
kvikio.cufile_driver.set("major_version", 2077) + + # Nested context managers + poll_thresh_size_default = kvikio.cufile_driver.get("poll_thresh_size") + with kvikio.cufile_driver.set("poll_thresh_size", 1024): + assert kvikio.cufile_driver.get("poll_thresh_size") == 1024 + with kvikio.cufile_driver.set("poll_thresh_size", 2048): + assert kvikio.cufile_driver.get("poll_thresh_size") == 2048 + with kvikio.cufile_driver.set("poll_thresh_size", 4096): + assert kvikio.cufile_driver.get("poll_thresh_size") == 4096 + assert kvikio.cufile_driver.get("poll_thresh_size") == 2048 + assert kvikio.cufile_driver.get("poll_thresh_size") == 1024 + assert kvikio.cufile_driver.get("poll_thresh_size") == poll_thresh_size_default + + # Multiple context managers + poll_mode_default = kvikio.cufile_driver.get("poll_mode") + max_device_cache_size_default = kvikio.cufile_driver.get( + "max_device_cache_size" + ) + with kvikio.cufile_driver.set( + {"poll_mode": True, "max_device_cache_size": 2048} + ): + assert kvikio.cufile_driver.get("poll_mode") and ( + kvikio.cufile_driver.get("max_device_cache_size") == 2048 + ) + assert (kvikio.cufile_driver.get("poll_mode") == poll_mode_default) and ( + kvikio.cufile_driver.get("max_device_cache_size") + == max_device_cache_size_default ) - assert (kvikio.cufile_driver.get("poll_mode") == poll_mode_default) and ( - kvikio.cufile_driver.get("max_device_cache_size") - == max_device_cache_size_default - ) + except RuntimeError as e: + if "KvikIO not compiled with cuFile.h" in str(e): + pytest.skip("KvikIO not compiled with cuFile.h, skipping cuFile tests") From 03351cf67ffba067fb9ad619330943fe32b00e8e Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 13 Jun 2025 13:43:32 -0400 Subject: [PATCH 021/103] Fix unit test UB resulting from incorrect initialization (#751) KvikIO unit test has a utility class `EnvVarContext` class, introduced in https://github.com/rapidsai/kvikio/pull/700, and slightly improved in https://github.com/rapidsai/kvikio/pull/735. It has been identified that this class was incorrectly initialized, resulting in UB: it causes the unit test failure in C++20, which by fluke was not observed in C++17. This PR fixes this error. Specifically, the constructor of `EnvVarContext` is: ``` EnvVarContext(std::initializer_list> env_var_entries); ``` There are several ways of instantiation: ``` // Direct initialization EnvVarContext env_var_ctx({{"env_1", "v1"}, {"env_2", "v2"}}); // Direct list initialization EnvVarContext env_var_ctx{{"env_1", "v1"}, {"env_2", "v2"}}; // Copy list initialization EnvVarContext env_var_ctx = {{"env_1", "v1"}, {"env_2", "v2"}}; ``` The erroneous instantiation performed is: ``` // Extra pair of braces // {}: brace-enclosed initializer list // {{"env_1", "v1"}, {"env_2", "v2"}}: one element of type pair // {"env_1", "v1"}: first // {"env_2", "v2"}: second EnvVarContext env_var_ctx{{{"env_1", "v1"}, {"env_2", "v2"}}}; ``` As a result, the initializer list only has 1 pair, with the key being `{"env_1", "v1"}` and value being `{"env_2", "v2"}`. For the key, for instance, the 5-th overload (https://en.cppreference.com/w/cpp/string/basic_string_view/basic_string_view.html) of the constructor was used, where `first` points to "env_1" and `last` points to "v1". Since the two iterators do not form a valid range, UB ensues. Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/751 --- cpp/tests/test_defaults.cpp | 39 ++++++++++++++++---------------- cpp/tests/test_remote_handle.cpp | 9 ++++---- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/cpp/tests/test_defaults.cpp b/cpp/tests/test_defaults.cpp index 9c283d658a..89bbe7399c 100644 --- a/cpp/tests/test_defaults.cpp +++ b/cpp/tests/test_defaults.cpp @@ -92,7 +92,7 @@ TEST(DefaultsTest, alias_for_getenv_or) // Non-string env var has an empty value { - kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS", ""}}}; + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS", ""}}; EXPECT_THAT( [=] { kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, 123); }, ThrowsMessage(HasSubstr("unknown config value KVIKIO_TEST_ALIAS="))); @@ -100,8 +100,8 @@ TEST(DefaultsTest, alias_for_getenv_or) // Non-string env var and alias have an empty value { - kvikio::test::EnvVarContext env_var_ctx{ - {{"KVIKIO_TEST_ALIAS_1", ""}, {"KVIKIO_TEST_ALIAS_2", ""}}}; + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS_1", ""}, + {"KVIKIO_TEST_ALIAS_2", ""}}; EXPECT_THAT( [=] { kvikio::getenv_or({"KVIKIO_TEST_ALIAS_1", "KVIKIO_TEST_ALIAS_2"}, 123); }, ThrowsMessage(HasSubstr("unknown config value KVIKIO_TEST_ALIAS_2="))); @@ -109,7 +109,7 @@ TEST(DefaultsTest, alias_for_getenv_or) // String env var has an empty value { - kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS", ""}}}; + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS", ""}}; auto const [env_var_name, result, has_found] = kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, std::string{"abc"}); EXPECT_EQ(env_var_name, "KVIKIO_TEST_ALIAS"); @@ -119,8 +119,8 @@ TEST(DefaultsTest, alias_for_getenv_or) // String env var and alias have an empty value { - kvikio::test::EnvVarContext env_var_ctx{ - {{"KVIKIO_TEST_ALIAS_1", ""}, {"KVIKIO_TEST_ALIAS_2", ""}}}; + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS_1", ""}, + {"KVIKIO_TEST_ALIAS_2", ""}}; auto const [env_var_name, result, has_found] = kvikio::getenv_or({"KVIKIO_TEST_ALIAS_1", "KVIKIO_TEST_ALIAS_2"}, std::string{"abc"}); EXPECT_EQ(env_var_name, "KVIKIO_TEST_ALIAS_2"); @@ -130,9 +130,8 @@ TEST(DefaultsTest, alias_for_getenv_or) // Env var has already been set by its alias with the same value { - kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS_1", "10"}, - {"KVIKIO_TEST_ALIAS_2", "10"}, - {"KVIKIO_TEST_ALIAS_3", "10"}}}; + kvikio::test::EnvVarContext env_var_ctx{ + {"KVIKIO_TEST_ALIAS_1", "10"}, {"KVIKIO_TEST_ALIAS_2", "10"}, {"KVIKIO_TEST_ALIAS_3", "10"}}; auto const [env_var_name, result, has_found] = kvikio::getenv_or({"KVIKIO_TEST_ALIAS_1", "KVIKIO_TEST_ALIAS_2", "KVIKIO_TEST_ALIAS_3"}, 123); EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS_3"}); @@ -142,8 +141,8 @@ TEST(DefaultsTest, alias_for_getenv_or) // Env var has already been set by its alias with a different value { - kvikio::test::EnvVarContext env_var_ctx{ - {{"KVIKIO_TEST_ALIAS_1", "10"}, {"KVIKIO_TEST_ALIAS_2", "20"}}}; + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS_1", "10"}, + {"KVIKIO_TEST_ALIAS_2", "20"}}; EXPECT_THAT([=] { kvikio::getenv_or({"KVIKIO_TEST_ALIAS_1", "KVIKIO_TEST_ALIAS_2"}, 123); }, ThrowsMessage(HasSubstr( "Environment variable KVIKIO_TEST_ALIAS_2 (20) has already been set by its alias " @@ -152,7 +151,7 @@ TEST(DefaultsTest, alias_for_getenv_or) // Env var has invalid value { - kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS", "abc"}}}; + 
kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS", "abc"}}; EXPECT_THAT([=] { kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, 123); }, ThrowsMessage( HasSubstr("unknown config value KVIKIO_TEST_ALIAS=abc"))); @@ -160,7 +159,7 @@ TEST(DefaultsTest, alias_for_getenv_or) // 1st alias has a set value { - kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS_1", "654.321"}}}; + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS_1", "654.321"}}; auto const [env_var_name, result, has_found] = kvikio::getenv_or({"KVIKIO_TEST_ALIAS_1", "KVIKIO_TEST_ALIAS_2"}, 123.456); EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS_1"}); @@ -170,7 +169,7 @@ TEST(DefaultsTest, alias_for_getenv_or) // 2nd alias has a set value { - kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS_2", "654.321"}}}; + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS_2", "654.321"}}; auto const [env_var_name, result, has_found] = kvikio::getenv_or({"KVIKIO_TEST_ALIAS_1", "KVIKIO_TEST_ALIAS_2"}, 123.456); EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS_2"}); @@ -189,14 +188,14 @@ TEST(DefaultsTest, alias_for_getenv_or) // Special type: bool { - kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS", "yes"}}}; + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS", "yes"}}; auto const [env_var_name, result, has_found] = kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, false); EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS"}); EXPECT_TRUE(result); EXPECT_TRUE(has_found); } { - kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS", "OFF"}}}; + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS", "OFF"}}; auto const [env_var_name, result, has_found] = kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, false); EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS"}); EXPECT_FALSE(result); @@ -205,7 +204,7 @@ TEST(DefaultsTest, alias_for_getenv_or) // Special type: CompatMode { - kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS", "yes"}}}; + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS", "yes"}}; auto const [env_var_name, result, has_found] = kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, kvikio::CompatMode::AUTO); EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS"}); @@ -213,7 +212,7 @@ TEST(DefaultsTest, alias_for_getenv_or) EXPECT_TRUE(has_found); } { - kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS", "FALSE"}}}; + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS", "FALSE"}}; auto const [env_var_name, result, has_found] = kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, kvikio::CompatMode::AUTO); EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS"}); @@ -221,7 +220,7 @@ TEST(DefaultsTest, alias_for_getenv_or) EXPECT_TRUE(has_found); } { - kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS", "aUtO"}}}; + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS", "aUtO"}}; auto const [env_var_name, result, has_found] = kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, kvikio::CompatMode::ON); EXPECT_EQ(env_var_name, std::string_view{"KVIKIO_TEST_ALIAS"}); @@ -231,7 +230,7 @@ TEST(DefaultsTest, alias_for_getenv_or) // Special type: std::vector { - kvikio::test::EnvVarContext env_var_ctx{{{"KVIKIO_TEST_ALIAS", "109, 108, 107"}}}; + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_TEST_ALIAS", "109, 108, 107"}}; auto const [env_var_name, result, has_found] = kvikio::getenv_or({"KVIKIO_TEST_ALIAS"}, std::vector{111, 112, 113}); EXPECT_EQ(env_var_name, 
std::string_view{"KVIKIO_TEST_ALIAS"}); diff --git a/cpp/tests/test_remote_handle.cpp b/cpp/tests/test_remote_handle.cpp index 650f1500f3..918479b0f0 100644 --- a/cpp/tests/test_remote_handle.cpp +++ b/cpp/tests/test_remote_handle.cpp @@ -17,15 +17,14 @@ #include #include -#include #include "utils/env.hpp" TEST(RemoteHandleTest, s3_endpoint_constructor) { - kvikio::test::EnvVarContext env_var_ctx{{{"AWS_DEFAULT_REGION", "my_aws_default_region"}, - {"AWS_ACCESS_KEY_ID", "my_aws_access_key_id"}, - {"AWS_SECRET_ACCESS_KEY", "my_aws_secrete_access_key"}, - {"AWS_ENDPOINT_URL", "https://my_aws_endpoint_url"}}}; + kvikio::test::EnvVarContext env_var_ctx{{"AWS_DEFAULT_REGION", "my_aws_default_region"}, + {"AWS_ACCESS_KEY_ID", "my_aws_access_key_id"}, + {"AWS_SECRET_ACCESS_KEY", "my_aws_secrete_access_key"}, + {"AWS_ENDPOINT_URL", "https://my_aws_endpoint_url"}}; std::string url = "https://my_aws_endpoint_url/bucket_name/object_name"; std::string aws_region = "my_aws_region"; // Use the overload where the full url and the optional aws_region are specified. From b87d11137103d4b89584f9cf56ac330a6ff35150 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 16 Jun 2025 16:16:11 -0400 Subject: [PATCH 022/103] Add the utility function to clear page cache (#741) This PR introduces a utility function to clear page cache in C++ and Python. Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Mads R. B. Kristensen (https://github.com/madsbk) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/741 --- cpp/include/kvikio/file_utils.hpp | 23 ++++++++++++ cpp/src/file_utils.cpp | 43 +++++++++++++++++++++++ docs/source/api.rst | 4 +++ python/kvikio/kvikio/__init__.py | 5 +-- python/kvikio/kvikio/_lib/file_handle.pyx | 8 +++++ python/kvikio/kvikio/cufile.py | 24 +++++++++++++ 6 files changed, 105 insertions(+), 2 deletions(-) diff --git a/cpp/include/kvikio/file_utils.hpp b/cpp/include/kvikio/file_utils.hpp index 778608a03b..2f03f0cbdc 100644 --- a/cpp/include/kvikio/file_utils.hpp +++ b/cpp/include/kvikio/file_utils.hpp @@ -180,4 +180,27 @@ std::pair get_page_cache_info(std::string const& file_ * @sa `get_page_cache_info(std::string const&)` overload. */ std::pair get_page_cache_info(int fd); + +/** + * @brief Clear the page cache + * + * @param reclaim_dentries_and_inodes Whether to free reclaimable slab objects which include + * dentries and inodes. + * - If `true`, equivalent to executing `/sbin/sysctl vm.drop_caches=3`; + * - If `false`, equivalent to executing `/sbin/sysctl vm.drop_caches=1`. + * @param clear_dirty_pages Whether to trigger the writeback process to clear the dirty pages. If + * `true`, `sync` will be called prior to cache clearing. + * @return Whether the page cache has been successfully cleared + * + * @note This function creates a child process and executes the cache clearing shell command in the + * following order + * - Execute the command without `sudo` prefix. This is for the superuser and also for specially + * configured systems where unprivileged users cannot execute `/usr/bin/sudo` but can execute + * `/sbin/sysctl`. If this step succeeds, the function returns `true` immediately. + * - Execute the command with `sudo` prefix. This is for the general case where selective + * unprivileged users have permission to run `/sbin/sysctl` with `sudo` prefix. + * + * @throws kvikio::GenericSystemError if somehow the child process could not be created. 
+ */ +bool clear_page_cache(bool reclaim_dentries_and_inodes = true, bool clear_dirty_pages = true); } // namespace kvikio diff --git a/cpp/src/file_utils.cpp b/cpp/src/file_utils.cpp index 3c7951effe..124edf7b86 100644 --- a/cpp/src/file_utils.cpp +++ b/cpp/src/file_utils.cpp @@ -18,7 +18,11 @@ #include #include #include + +#include #include +#include +#include #include #include #include @@ -209,4 +213,43 @@ std::pair get_page_cache_info(int fd) SYSCALL_CHECK(munmap(addr, file_size)); return {num_pages_in_page_cache, num_pages}; } + +bool clear_page_cache(bool reclaim_dentries_and_inodes, bool clear_dirty_pages) +{ + KVIKIO_NVTX_FUNC_RANGE(); + if (clear_dirty_pages) { sync(); } + std::string param = reclaim_dentries_and_inodes ? "3" : "1"; + + auto exec_cmd = [](std::string_view cmd) -> bool { + // Prevent the output from the command from mixing with the original process' output. + fflush(nullptr); + // popen only handles stdout. Switch stderr and stdout to only capture stderr. + auto const redirected_cmd = + std::string{"( "}.append(cmd).append(" 3>&2 2>&1 1>&3) 2>/dev/null"); + std::unique_ptr pipe(popen(redirected_cmd.c_str(), "r"), pclose); + KVIKIO_EXPECT(pipe != nullptr, "popen() failed", GenericSystemError); + + std::array buffer; + std::string error_out; + while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { + error_out += buffer.data(); + } + return error_out.empty(); + }; + + std::array cmds{ + // Special case: + // - Unprivileged users who cannot execute `/usr/bin/sudo` but can execute `/sbin/sysctl`, and + // - Superuser + std::string{"/sbin/sysctl vm.drop_caches=" + param}, + // General case: + // - Unprivileged users who can execute `sudo`, and + // - Superuser + std::string{"sudo /sbin/sysctl vm.drop_caches=" + param}}; + + for (auto const& cmd : cmds) { + if (exec_cmd(cmd)) { return true; } + } + return false; +} } // namespace kvikio diff --git a/docs/source/api.rst b/docs/source/api.rst index 5cba4fd8d3..1e19f12bdc 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -11,6 +11,10 @@ CuFile .. autoclass:: IOFuture :members: +.. autofunction:: get_page_cache_info + +.. autofunction:: clear_page_cache + CuFile driver ------------- .. 
currentmodule:: kvikio.cufile_driver diff --git a/python/kvikio/kvikio/__init__.py b/python/kvikio/kvikio/__init__.py index a1f3c483f6..9208d4e3ce 100644 --- a/python/kvikio/kvikio/__init__.py +++ b/python/kvikio/kvikio/__init__.py @@ -14,14 +14,15 @@ from kvikio._lib.defaults import CompatMode # noqa: F401 from kvikio._version import __git_commit__, __version__ -from kvikio.cufile import CuFile, get_page_cache_info +from kvikio.cufile import CuFile, clear_page_cache, get_page_cache_info from kvikio.remote_file import RemoteFile, is_remote_file_available __all__ = [ "__git_commit__", "__version__", + "clear_page_cache", "CuFile", "get_page_cache_info", - "RemoteFile", "is_remote_file_available", + "RemoteFile", ] diff --git a/python/kvikio/kvikio/_lib/file_handle.pyx b/python/kvikio/kvikio/_lib/file_handle.pyx index 6ac3cc14d8..c0d71f36a7 100644 --- a/python/kvikio/kvikio/_lib/file_handle.pyx +++ b/python/kvikio/kvikio/_lib/file_handle.pyx @@ -185,6 +185,10 @@ cdef extern from "" nogil: pair[size_t, size_t] cpp_get_page_cache_info_int \ "kvikio::get_page_cache_info"(int fd) except + + bool cpp_clear_page_cache "kvikio::clear_page_cache" \ + (bool reclaim_dentries_and_inodes, bool clear_dirty_pages) \ + except + + def get_page_cache_info(file: Union[os.PathLike, str, int, io.IOBase]) \ -> tuple[int, int]: @@ -202,3 +206,7 @@ def get_page_cache_info(file: Union[os.PathLike, str, int, io.IOBase]) \ else: raise ValueError("The type of `file` must be `os.PathLike`, `str`, `int`, " "or `io.IOBase`") + + +def clear_page_cache(reclaim_dentries_and_inodes: bool, clear_dirty_pages: bool): + return cpp_clear_page_cache(reclaim_dentries_and_inodes, clear_dirty_pages) diff --git a/python/kvikio/kvikio/cufile.py b/python/kvikio/kvikio/cufile.py index e703c703bc..fd1021d8ca 100644 --- a/python/kvikio/kvikio/cufile.py +++ b/python/kvikio/kvikio/cufile.py @@ -458,3 +458,27 @@ def get_page_cache_info( and the total number of pages. """ return file_handle.get_page_cache_info(file) + + +def clear_page_cache( + reclaim_dentries_and_inodes: bool = True, clear_dirty_pages: bool = True +) -> bool: + """Clear the page cache + + Parameters + ---------- + reclaim_dentries_and_inodes: bool, optional + Whether to free reclaimable slab objects which include dentries and inodes. + + - If `true`, equivalent to executing `/sbin/sysctl vm.drop_caches=3`; + - If `false`, equivalent to executing `/sbin/sysctl vm.drop_caches=1`. + clear_dirty_pages: bool, optional + Whether to trigger the writeback process to clear the dirty pages. If `true`, + `sync` will be called prior to cache dropping. + + Returns + ------- + bool + Whether the page cache has been successfully cleared. + """ + return file_handle.clear_page_cache(reclaim_dentries_and_inodes, clear_dirty_pages) From 5918cf3fb30d02de067347dc5ba24f5affc49e28 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 17 Jun 2025 12:24:41 -0700 Subject: [PATCH 023/103] Set compiler versions in context (#755) Simplify the logic necessary to handle compiler versions by folding it into the context. 
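For reference, the recipe-side pattern that replaces the CI-side variants looks like this condensed sketch (taken from the `conda/recipes/libkvikio/recipe.yaml` hunk in the diff below; the `should_use_cufile` condition already existed before this change):

```
context:
  # True on x86_64, or on aarch64 with CUDA >= 12.2 (the configurations that use cuFile)
  should_use_cufile: ${{ x86_64 or (aarch64 and cuda_version >= "12.2") }}
  # Compiler major versions are now derived from the same context variable
  c_compiler_version: ${{ 13 if should_use_cufile else 12 }}
  cxx_compiler_version: ${{ c_compiler_version }}
```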
Authors: - https://github.com/jakirkham Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/kvikio/pull/755 --- ci/build_cpp.sh | 32 ++++++----------------------- conda/recipes/libkvikio/recipe.yaml | 2 ++ 2 files changed, 8 insertions(+), 26 deletions(-) diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 9258c3dd94..b87ea972a9 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -26,33 +26,13 @@ export RAPIDS_ARTIFACTS_DIR source rapids-rattler-channel-string # Construct the extra variants according to the architecture -if [[ "$(arch)" == "x86_64" ]]; then - cat > variants.yaml << EOF - c_compiler_version: - - 13 - - cxx_compiler_version: - - 13 - - cuda_version: - - ${RAPIDS_CUDA_VERSION} +cat > variants.yaml << EOF +cuda_version: + - ${RAPIDS_CUDA_VERSION} EOF -else - cat > variants.yaml << EOF - zip_keys: - - [c_compiler_version, cxx_compiler_version, cuda_version] - - c_compiler_version: - - 12 - - 13 - - cxx_compiler_version: - - 12 - - 13 - - cuda_version: - - 12.1 # The last version to not support cufile - - ${RAPIDS_CUDA_VERSION} +if [[ "$(arch)" == "aarch64" ]]; then + cat >> variants.yaml << EOF + - 12.1 # The last version to not support cufile EOF fi diff --git a/conda/recipes/libkvikio/recipe.yaml b/conda/recipes/libkvikio/recipe.yaml index a4e423d3ed..6d6316593e 100644 --- a/conda/recipes/libkvikio/recipe.yaml +++ b/conda/recipes/libkvikio/recipe.yaml @@ -10,6 +10,8 @@ context: # 3. Linux aarch64 with CUDA < 12.2, which does not use libcufile # Each case has different cuda-version constraints as expressed below should_use_cufile: ${{ x86_64 or (aarch64 and cuda_version >= "12.2") }} + c_compiler_version: ${{ 13 if should_use_cufile else 12 }} + cxx_compiler_version: ${{ c_compiler_version }} # When reverting, instances of cuda_key_string can be replaced with cuda_major cuda_key_string: ${{ cuda_version | replace(".", "_") }} #cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} From 007c0671004c33c46635c345060e11dcc612c26c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 24 Jun 2025 14:57:20 -0700 Subject: [PATCH 024/103] Use RAPIDS' CUDA major-minor version in Conda's `cuda_version` (#758) We need to drop the CUDA patch version from `cuda_version`. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/758 --- ci/build_cpp.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index b87ea972a9..b12ed048ad 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -28,7 +28,7 @@ source rapids-rattler-channel-string # Construct the extra variants according to the architecture cat > variants.yaml << EOF cuda_version: - - ${RAPIDS_CUDA_VERSION} + - ${RAPIDS_CUDA_VERSION%.*} EOF if [[ "$(arch)" == "aarch64" ]]; then cat >> variants.yaml << EOF From 9ec0f956c20ad982590daeea9df5eb4e9220f43e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 26 Jun 2025 12:32:47 -0700 Subject: [PATCH 025/103] Remove nvidia and dask channels (#759) Now that we have dropped support for CUDA 11 we no longer require the nvidia channel. With the changes in https://github.com/rapidsai/rapids-dask-dependency/pull/85, RAPIDS now only uses released versions of dask, so we no longer need the dask channel either. 
This PR also removes the explicit cufile dependence in the kvikio conda packages, which should no longer be necessary now that we have variants of the libkvikio package for different CUDA versions handling this dependency (see https://github.com/rapidsai/kvikio/pull/754). Contributes to https://github.com/rapidsai/build-planning/issues/184 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/kvikio/pull/759 --- .pre-commit-config.yaml | 2 +- conda/environments/all_cuda-128_arch-aarch64.yaml | 1 - conda/environments/all_cuda-128_arch-x86_64.yaml | 1 - conda/recipes/kvikio/recipe.yaml | 5 ----- dependencies.yaml | 1 - 5 files changed, 1 insertion(+), 9 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index db0629d88f..5795ae6a25 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -95,7 +95,7 @@ repos: - id: verify-codeowners args: [--fix, --project-prefix=kvikio] - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.17.0 + rev: v1.19.0 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/conda/environments/all_cuda-128_arch-aarch64.yaml b/conda/environments/all_cuda-128_arch-aarch64.yaml index beb13ccd92..7afb90236e 100644 --- a/conda/environments/all_cuda-128_arch-aarch64.yaml +++ b/conda/environments/all_cuda-128_arch-aarch64.yaml @@ -4,7 +4,6 @@ channels: - rapidsai - rapidsai-nightly - conda-forge -- nvidia dependencies: - boto3>=1.21.21 - c-compiler diff --git a/conda/environments/all_cuda-128_arch-x86_64.yaml b/conda/environments/all_cuda-128_arch-x86_64.yaml index 0df124c208..bd00b38b72 100644 --- a/conda/environments/all_cuda-128_arch-x86_64.yaml +++ b/conda/environments/all_cuda-128_arch-x86_64.yaml @@ -4,7 +4,6 @@ channels: - rapidsai - rapidsai-nightly - conda-forge -- nvidia dependencies: - boto3>=1.21.21 - c-compiler diff --git a/conda/recipes/kvikio/recipe.yaml b/conda/recipes/kvikio/recipe.yaml index 135c751f9d..742f96bf48 100644 --- a/conda/recipes/kvikio/recipe.yaml +++ b/conda/recipes/kvikio/recipe.yaml @@ -72,8 +72,6 @@ requirements: - rapids-build-backend >=0.3.0,<0.4.0.dev0 - scikit-build-core >=0.10.0 - cuda-cudart-dev - - if: linux - then: libcufile-dev run: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - cupy >=12.0.0 @@ -86,13 +84,10 @@ requirements: - python - zarr >=2.0.0,<4.0.0a0 - cuda-cudart - - if: linux - then: libcufile ignore_run_exports: by_name: - cuda-cudart - cuda-version - - libcufile - libcurl - libkvikio - nvcomp diff --git a/dependencies.yaml b/dependencies.yaml index c54a09d8d9..f298ce2ac1 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -112,7 +112,6 @@ channels: - rapidsai - rapidsai-nightly - conda-forge - - nvidia dependencies: build-universal: common: From 3d59ec90baf58ad62b22040758b956dcc741f4d7 Mon Sep 17 00:00:00 2001 From: Gil Forsyth Date: Mon, 30 Jun 2025 12:15:38 -0400 Subject: [PATCH 026/103] refactor(shellcheck): enable for all files and fix remaining warnings (#760) xref rapidsai/build-planning#135 Authors: - Gil Forsyth (https://github.com/gforsyth) Approvers: - James Lamb (https://github.com/jameslamb) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/kvikio/pull/760 --- .github/CODEOWNERS | 1 + .pre-commit-config.yaml | 2 -- .shellcheckrc | 2 ++ build.sh | 32 +++++++++++++++++--------------- cpp/scripts/run-cmake-format.sh | 12 ++++++------ 5 files changed, 26 insertions(+), 23 
deletions(-) create mode 100644 .shellcheckrc diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index f5fe2c9d4c..d94819012e 100755 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -14,6 +14,7 @@ python/setup.py @rapidsai/kvikio-cmake-codeowners @rapidsai/kvikio-python-cod #CI code owners /.github/ @rapidsai/ci-codeowners /ci/ @rapidsai/ci-codeowners +/.shellcheckrc @rapidsai/ci-codeowners #packaging code owners /.pre-commit-config.yaml @rapidsai/packaging-codeowners diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5795ae6a25..f095cc2807 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -103,8 +103,6 @@ repos: rev: v0.10.0.1 hooks: - id: shellcheck - args: ["--severity=warning"] - files: ^ci/ default_language_version: diff --git a/.shellcheckrc b/.shellcheckrc new file mode 100644 index 0000000000..b57b9d1962 --- /dev/null +++ b/.shellcheckrc @@ -0,0 +1,2 @@ +# Disable file checks (otherwise every use of `gha-tools` will get flagged) +disable=SC1091 diff --git a/build.sh b/build.sh index c02d0c876a..bf4c0780f6 100755 --- a/build.sh +++ b/build.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. # kvikio build script @@ -16,7 +16,7 @@ ARGS=$* # NOTE: ensure all dir changes are relative to the location of this # script, and that this script resides in the repo dir! -REPODIR=$(cd $(dirname $0); pwd) +REPODIR=$(cd "$(dirname "$0")"; pwd) VALIDARGS="clean libkvikio kvikio -v -g -n --pydevelop -h" HELP="$0 [clean] [libkvikio] [kvikio] [-v] [-g] [-n] [--cmake-args=\"\"] [-h] @@ -40,7 +40,7 @@ VERBOSE_FLAG="" BUILD_TYPE=Release INSTALL_TARGET=install RAN_CMAKE=0 -PYTHON_ARGS_FOR_INSTALL="-v --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true" +PYTHON_ARGS_FOR_INSTALL=("-v" "--no-build-isolation" "--no-deps" "--config-settings" "rapidsai.disable-cuda=true") # Set defaults for vars that may not have been defined externally @@ -55,24 +55,25 @@ function hasArg { function cmakeArgs { # Check for multiple cmake args options - if [[ $(echo $ARGS | { grep -Eo "\-\-cmake\-args" || true; } | wc -l ) -gt 1 ]]; then + if [[ $(echo "$ARGS" | { grep -Eo "\-\-cmake\-args" || true; } | wc -l ) -gt 1 ]]; then echo "Multiple --cmake-args options were provided, please provide only one: ${ARGS}" exit 1 fi # Check for cmake args option - if [[ -n $(echo $ARGS | { grep -E "\-\-cmake\-args" || true; } ) ]]; then + if [[ -n $(echo "$ARGS" | { grep -E "\-\-cmake\-args" || true; } ) ]]; then # There are possible weird edge cases that may cause this regex filter to output nothing and fail silently # the true pipe will catch any weird edge cases that may happen and will cause the program to fall back # on the invalid option error - EXTRA_CMAKE_ARGS=$(echo $ARGS | { grep -Eo "\-\-cmake\-args=\".+\"" || true; }) + EXTRA_CMAKE_ARGS=$(echo "$ARGS" | { grep -Eo "\-\-cmake\-args=\".+\"" || true; }) if [[ -n ${EXTRA_CMAKE_ARGS} ]]; then # Remove the full EXTRA_CMAKE_ARGS argument from list of args so that it passes validArgs function ARGS=${ARGS//$EXTRA_CMAKE_ARGS/} # Filter the full argument down to just the extra string that will be added to cmake call - EXTRA_CMAKE_ARGS=$(echo $EXTRA_CMAKE_ARGS | grep -Eo "\".+\"" | sed -e 's/^"//' -e 's/"$//') + EXTRA_CMAKE_ARGS=$(echo "$EXTRA_CMAKE_ARGS" | grep -Eo "\".+\"" | sed -e 's/^"//' -e 's/"$//') fi fi + read -ra EXTRA_CMAKE_ARGS <<< "$EXTRA_CMAKE_ARGS" } @@ -80,13 +81,13 @@ function cmakeArgs { # LIBKVIKIO_BUILD_DIR function ensureCMakeRan { 
mkdir -p "${LIBKVIKIO_BUILD_DIR}" - cd ${REPODIR}/cpp + cd "${REPODIR}"/cpp if (( RAN_CMAKE == 0 )); then echo "Executing cmake for libkvikio..." cmake -B "${LIBKVIKIO_BUILD_DIR}" -S . \ -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX}" \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ - ${EXTRA_CMAKE_ARGS} + "${EXTRA_CMAKE_ARGS[@]}" RAN_CMAKE=1 fi } @@ -97,7 +98,7 @@ if hasArg -h || hasArg --help; then fi # Check for valid usage -if (( ${NUMARGS} != 0 )); then +if (( NUMARGS != 0 )); then # Check for cmake args cmakeArgs for a in ${ARGS}; do @@ -124,7 +125,7 @@ if hasArg -n; then INSTALL_TARGET="" fi if hasArg --pydevelop; then - PYTHON_ARGS_FOR_INSTALL="${PYTHON_ARGS_FOR_INSTALL} -e" + PYTHON_ARGS_FOR_INSTALL+=("-e") fi # If clean given, run it prior to any other steps @@ -146,7 +147,7 @@ fi if (( NUMARGS == 0 )) || hasArg libkvikio; then ensureCMakeRan echo "building libkvikio..." - cmake --build "${LIBKVIKIO_BUILD_DIR}" -j${PARALLEL_LEVEL} ${VERBOSE_FLAG} + cmake --build "${LIBKVIKIO_BUILD_DIR}" -j"${PARALLEL_LEVEL}" ${VERBOSE_FLAG} if [[ ${INSTALL_TARGET} != "" ]]; then echo "installing libkvikio..." cmake --build "${LIBKVIKIO_BUILD_DIR}" --target install ${VERBOSE_FLAG} @@ -156,7 +157,8 @@ fi # Build and install the kvikio Python package if (( NUMARGS == 0 )) || hasArg kvikio; then echo "building kvikio..." - cd ${REPODIR}/python/kvikio - SKBUILD_CMAKE_ARGS="-DCMAKE_PREFIX_PATH=${INSTALL_PREFIX};-DCMAKE_LIBRARY_PATH=${LIBKVIKIO_BUILD_DIR};${EXTRA_CMAKE_ARGS}" \ - python -m pip install ${PYTHON_ARGS_FOR_INSTALL} . + cd "${REPODIR}"/python/kvikio + _EXTRA_CMAKE_ARGS=$(IFS=';'; echo "${EXTRA_CMAKE_ARGS[*]}") + SKBUILD_CMAKE_ARGS="-DCMAKE_PREFIX_PATH=${INSTALL_PREFIX};-DCMAKE_LIBRARY_PATH=${LIBKVIKIO_BUILD_DIR};$_EXTRA_CMAKE_ARGS" \ + python -m pip install "${PYTHON_ARGS_FOR_INSTALL[@]}" . fi diff --git a/cpp/scripts/run-cmake-format.sh b/cpp/scripts/run-cmake-format.sh index 963fc7359b..37c57b5af5 100755 --- a/cpp/scripts/run-cmake-format.sh +++ b/cpp/scripts/run-cmake-format.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. # This script is a wrapper for cmakelang that may be used with pre-commit. The # wrapping is necessary because RAPIDS libraries split configuration for @@ -37,7 +37,7 @@ if ! [ ${status} -eq 0 ]; then echo "This script must be run inside the kvikio repository, or the KVIKIO_ROOT environment variable must be set." else echo "Script failed with unknown error attempting to determine project root:" - echo ${KVIKIO_BUILD_DIR} + echo "${KVIKIO_BUILD_DIR}" fi exit 1 fi @@ -48,8 +48,8 @@ DEFAULT_FORMAT_FILE_LOCATIONS=( ) if [ -z ${RAPIDS_CMAKE_FORMAT_FILE:+PLACEHOLDER} ]; then - for file_path in ${DEFAULT_FORMAT_FILE_LOCATIONS[@]}; do - if [ -f ${file_path} ]; then + for file_path in "${DEFAULT_FORMAT_FILE_LOCATIONS[@]}"; do + if [ -f "${file_path}" ]; then RAPIDS_CMAKE_FORMAT_FILE=${file_path} break fi @@ -68,12 +68,12 @@ else fi if [[ $1 == "cmake-format" ]]; then - cmake-format -i --config-files cpp/cmake/config.json ${RAPIDS_CMAKE_FORMAT_FILE} -- ${@:2} + cmake-format -i --config-files cpp/cmake/config.json "${RAPIDS_CMAKE_FORMAT_FILE}" -- "${@:2}" elif [[ $1 == "cmake-lint" ]]; then # Since the pre-commit hook is verbose, we have to be careful to only # present cmake-lint's output (which is quite verbose) if we actually # observe a failure. 
- OUTPUT=$(cmake-lint --config-files cpp/cmake/config.json ${RAPIDS_CMAKE_FORMAT_FILE} -- ${@:2}) + OUTPUT=$(cmake-lint --config-files cpp/cmake/config.json "${RAPIDS_CMAKE_FORMAT_FILE}" -- "${@:2}") status=$? if ! [ ${status} -eq 0 ]; then From fd4a3bf9faa323cb2390027d53d10c03df5ff177 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Mon, 30 Jun 2025 11:04:41 -0700 Subject: [PATCH 027/103] Use CUDA 12.9 in Conda, Devcontainers, Spark, GHA, etc. (#762) Use CUDA 12.9 throughout different build and test environments. Authors: - https://github.com/jakirkham Approvers: - Jake Awe (https://github.com/AyodeAwe) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/kvikio/pull/762 --- .../devcontainer.json | 8 ++++---- .../{cuda12.8-pip => cuda12.9-pip}/devcontainer.json | 10 +++++----- .github/workflows/pr.yaml | 2 +- ...rch-aarch64.yaml => all_cuda-129_arch-aarch64.yaml} | 4 ++-- ..._arch-x86_64.yaml => all_cuda-129_arch-x86_64.yaml} | 4 ++-- cpp/doxygen/main_page.md | 4 ++-- dependencies.yaml | 2 +- docs/source/install.rst | 4 ++-- 8 files changed, 19 insertions(+), 19 deletions(-) rename .devcontainer/{cuda12.8-conda => cuda12.9-conda}/devcontainer.json (91%) rename .devcontainer/{cuda12.8-pip => cuda12.9-pip}/devcontainer.json (87%) rename conda/environments/{all_cuda-128_arch-aarch64.yaml => all_cuda-129_arch-aarch64.yaml} (94%) rename conda/environments/{all_cuda-128_arch-x86_64.yaml => all_cuda-129_arch-x86_64.yaml} (94%) diff --git a/.devcontainer/cuda12.8-conda/devcontainer.json b/.devcontainer/cuda12.9-conda/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.8-conda/devcontainer.json rename to .devcontainer/cuda12.9-conda/devcontainer.json index 4074dc74f4..7e000efb91 100644 --- a/.devcontainer/cuda12.8-conda/devcontainer.json +++ b/.devcontainer/cuda12.9-conda/devcontainer.json @@ -3,7 +3,7 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.8", + "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "conda", "BASE": "rapidsai/devcontainers:25.08-cpp-mambaforge-ubuntu22.04" } @@ -11,7 +11,7 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.08-cuda12.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.08-cuda12.9-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { @@ -20,7 +20,7 @@ "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.8-envs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.9-envs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/kvikio,type=bind,consistency=consistent", @@ -29,7 +29,7 @@ "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.8-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.9-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.8-pip/devcontainer.json b/.devcontainer/cuda12.9-pip/devcontainer.json similarity index 87% rename from .devcontainer/cuda12.8-pip/devcontainer.json rename to .devcontainer/cuda12.9-pip/devcontainer.json index 7d761d40b7..087acd2900 100644 --- a/.devcontainer/cuda12.8-pip/devcontainer.json +++ b/.devcontainer/cuda12.9-pip/devcontainer.json @@ -3,15 +3,15 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.8", + "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.08-cpp-cuda12.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.08-cpp-cuda12.9-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.08-cuda12.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.08-cuda12.9-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { @@ -20,7 +20,7 @@ "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.8-venvs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.9-venvs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/kvikio,type=bind,consistency=consistent", @@ -28,7 +28,7 @@ "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.8-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.9-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index b0b196a1b0..908d30f9e4 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -145,7 +145,7 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.08 with: arch: '["amd64"]' - cuda: '["12.8"]' + cuda: '["12.9"]' build_command: | sccache -z; build-all --verbose; diff --git a/conda/environments/all_cuda-128_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml similarity index 94% rename from conda/environments/all_cuda-128_arch-aarch64.yaml rename to conda/environments/all_cuda-129_arch-aarch64.yaml index 7afb90236e..0c352383d8 100644 --- a/conda/environments/all_cuda-128_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -10,7 +10,7 @@ dependencies: - cmake>=3.30.4 - cuda-nvcc - cuda-python>=12.6.2,<13.0a0 -- cuda-version=12.8 +- cuda-version=12.9 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 @@ -41,4 +41,4 @@ dependencies: - sphinx_rtd_theme - sysroot_linux-aarch64=2.28 - zarr>=2.0.0,<4.0.0 -name: all_cuda-128_arch-aarch64 +name: all_cuda-129_arch-aarch64 diff --git a/conda/environments/all_cuda-128_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml similarity index 94% rename from conda/environments/all_cuda-128_arch-x86_64.yaml rename to conda/environments/all_cuda-129_arch-x86_64.yaml index bd00b38b72..51d46cafd7 100644 --- a/conda/environments/all_cuda-128_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -10,7 +10,7 @@ dependencies: - cmake>=3.30.4 - cuda-nvcc - cuda-python>=12.6.2,<13.0a0 -- cuda-version=12.8 +- cuda-version=12.9 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 @@ -41,4 +41,4 @@ dependencies: - sphinx_rtd_theme - sysroot_linux-64=2.28 - zarr>=2.0.0,<4.0.0 -name: all_cuda-128_arch-x86_64 +name: all_cuda-129_arch-x86_64 diff --git a/cpp/doxygen/main_page.md b/cpp/doxygen/main_page.md index 7dbcf8edea..ab9a613663 100644 --- a/cpp/doxygen/main_page.md +++ b/cpp/doxygen/main_page.md @@ -34,7 +34,7 @@ Install the **stable release** from the ``rapidsai`` channel with the following: # Install in existing environment mamba install -c rapidsai -c conda-forge libkvikio # Create new environment (CUDA 12) -mamba create -n libkvikio-env -c rapidsai -c conda-forge cuda-version=12.8 libkvikio +mamba create -n libkvikio-env -c rapidsai -c conda-forge cuda-version=12.9 libkvikio # Create new environment (CUDA 11) mamba create -n libkvikio-env -c rapidsai -c conda-forge cuda-version=11.8 libkvikio ``` @@ -45,7 +45,7 @@ Install the **nightly release** from the ``rapidsai-nightly`` channel with the f # Install in 
existing environment mamba install -c rapidsai-nightly -c conda-forge libkvikio # Create new environment (CUDA 12) -mamba create -n libkvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=12.8 libkvikio +mamba create -n libkvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=12.9 libkvikio # Create new environment (CUDA 11) mamba create -n libkvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=11.8 libkvikio ``` diff --git a/dependencies.yaml b/dependencies.yaml index f298ce2ac1..34d1f0b249 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -3,7 +3,7 @@ files: all: output: conda matrix: - cuda: ["12.8"] + cuda: ["12.9"] arch: [aarch64, x86_64] includes: - build-universal diff --git a/docs/source/install.rst b/docs/source/install.rst index 4b62dd02ca..f9b2918617 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -16,7 +16,7 @@ Install the **stable release** from the ``rapidsai`` channel like: # Install in existing environment mamba install -c rapidsai -c conda-forge kvikio # Create new environment (CUDA 12) - mamba create -n kvikio-env -c rapidsai -c conda-forge python=3.13 cuda-version=12.8 kvikio + mamba create -n kvikio-env -c rapidsai -c conda-forge python=3.13 cuda-version=12.9 kvikio # Create new environment (CUDA 11) mamba create -n kvikio-env -c rapidsai -c conda-forge python=3.13 cuda-version=11.8 kvikio @@ -27,7 +27,7 @@ Install the **nightly release** from the ``rapidsai-nightly`` channel like: # Install in existing environment mamba install -c rapidsai-nightly -c conda-forge kvikio # Create new environment (CUDA 12) - mamba create -n kvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=12.8 kvikio + mamba create -n kvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=12.9 kvikio # Create new environment (CUDA 11) mamba create -n kvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=11.8 kvikio From e94379a38829126690fdec7d4c86e2e77f15360f Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 1 Jul 2025 12:14:06 -0500 Subject: [PATCH 028/103] add docs on CI workflow inputs (#764) Contributes to https://github.com/rapidsai/shared-workflows/issues/376 * adds descriptions for all inputs to workflows triggered by `workflow_dispatch` ## Notes for Reviewers ### Motivation The input descriptions show up in the UI when you go to trigger these workflows. Like this: ![image](https://github.com/user-attachments/assets/fc62d1ff-39eb-47c7-9a21-57aab959e64f) I'm hoping that will make it easier for developers to manually trigger workflows. Inspired by being asked multiple times "what format is `date` supposed to be in?". Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/kvikio/pull/764 --- .github/workflows/build.yaml | 6 ++++++ .github/workflows/test.yaml | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 513396befc..53d0402fef 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -9,15 +9,21 @@ on: workflow_dispatch: inputs: branch: + description: | + branch: git branch the workflow run targets. + Required even when 'sha' is provided because it is also used for organizing artifacts. required: true type: string date: + description: "date: Date (YYYY-MM-DD) this run is for. 
Used to organize artifacts produced by nightly builds" required: true type: string sha: + description: "sha: full git commit SHA to check out" required: true type: string build_type: + description: "build_type: one of [branch, nightly, pull-request]" type: string default: nightly diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index ea9beb54d6..b58dbd3ade 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -4,15 +4,21 @@ on: workflow_dispatch: inputs: branch: + description: | + branch: git branch the workflow run targets. + Required even when 'sha' is provided because it is also used for organizing artifacts. required: true type: string date: + description: "date: Date (YYYY-MM-DD) this run is for. Used to organize artifacts produced by nightly builds" required: true type: string sha: + description: "sha: full git commit SHA to check out" required: true type: string build_type: + description: "build_type: one of [branch, nightly, pull-request]" type: string default: nightly From f091ebc0b358c76f27b622a19df2efade53f099a Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Tue, 8 Jul 2025 17:27:02 -0400 Subject: [PATCH 029/103] Fix a case sensitivity bug in S3 remote I/O (#765) AWS S3 object key name is case sensitive. Current implementation of `open_s3_url` converts all the letters of URL to lowercase before passing it to the wrapped C++ library. As a result, if the object key name contains any capital letter, the following error will occur: ``` RuntimeError: KvikIO failure at: /home/coder/kvikio/cpp/src/shim/libcurl.cpp:176: curl_easy_perform() error (The requested URL returned error: 404) ``` This PR fixes this issue by forwarding the user-provided URL as-is. Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Tom Augspurger (https://github.com/TomAugspurger) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/765 --- python/kvikio/kvikio/remote_file.py | 7 ++++--- python/kvikio/tests/test_s3_io.py | 6 +++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index 55cce53115..41ec216e5c 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -4,6 +4,7 @@ from __future__ import annotations import functools +import urllib.parse from typing import Optional from kvikio.cufile import IOFuture @@ -130,12 +131,12 @@ def open_s3_url( The size of the file. If None, KvikIO will ask the server for the file size. 
""" - url = url.lower() - if url.startswith("http://") or url.startswith("https://"): + parsed_result = urllib.parse.urlparse(url.lower()) + if parsed_result.scheme in ("http", "https"): return RemoteFile( _get_remote_module().RemoteFile.open_s3_from_http_url(url, nbytes) ) - if url.startswith("s3://"): + if parsed_result.scheme == "s3": return RemoteFile( _get_remote_module().RemoteFile.open_s3_from_s3_url(url, nbytes) ) diff --git a/python/kvikio/tests/test_s3_io.py b/python/kvikio/tests/test_s3_io.py index 45997b1e71..58a73184a8 100644 --- a/python/kvikio/tests/test_s3_io.py +++ b/python/kvikio/tests/test_s3_io.py @@ -81,7 +81,7 @@ def s3_context(s3_base, bucket, files=None): def test_read_access(s3_base): bucket_name = "bucket" - object_name = "data" + object_name = "Data" data = b"file content" with s3_context( s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(data)} @@ -119,7 +119,7 @@ def test_read_access(s3_base): @pytest.mark.parametrize("buffer_size", [101, 1001]) def test_read(s3_base, xp, size, nthreads, tasksize, buffer_size): bucket_name = "test_read" - object_name = "a1" + object_name = "Aa1" a = xp.arange(size) with s3_context( s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(a)} @@ -151,7 +151,7 @@ def test_read(s3_base, xp, size, nthreads, tasksize, buffer_size): ) def test_read_with_file_offset(s3_base, xp, start, end): bucket_name = "test_read_with_file_offset" - object_name = "a1" + object_name = "Aa1" a = xp.arange(end, dtype=xp.int64) with s3_context( s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(a)} From 83a25ec05d75b11e0f04a37cf25639b425d61cb0 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Wed, 9 Jul 2025 11:29:33 -0400 Subject: [PATCH 030/103] Fix remote I/O file size overflow in Python binding (#766) KvikIO remote I/O interface requires users to provide a buffer to read the remote data into. The following pattern is often used: ```python import cupy as cp # Create a remote handle from the URL remote_handle = kvikio.RemoteFile.open_s3_url(url) # Query the remote file size and preallocate the user-provided buffer buf = cp.empty(remote_handle.nbytes(), dtype=cp.int8) # Read into the buffer fut = remote_handle.pread(buf) fut.get() ``` Currently in Cython, the `extern` method `nbytes()` (remote file size) is given a return type of `int`, whereas its initial return type in the C++ library is `std::size_t`. The `int` here is interpreted as the `int` in C++ as opposed to the variable-length `int` in Python. Consequently, integer overflow occurs when reading from a large-size file, in which case `nbytes()` returns negative values. This PR fixes this bug. 
Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Tom Augspurger (https://github.com/TomAugspurger) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/kvikio/pull/766 --- python/kvikio/kvikio/_lib/remote_handle.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx index 5a7ba2c846..dfb662a9fb 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -36,7 +36,7 @@ cdef extern from "" nogil: unique_ptr[cpp_RemoteEndpoint] endpoint, size_t nbytes ) except + cpp_RemoteHandle(unique_ptr[cpp_RemoteEndpoint] endpoint) except + - int nbytes() except + + size_t nbytes() except + const cpp_RemoteEndpoint& endpoint() except + size_t read( void* buf, From 54c420a652bf29e3c7de9ac2e2e19af07de7c256 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 16 Jul 2025 21:39:30 +0200 Subject: [PATCH 031/103] Remove CUDA 11 mentions from docs (#769) As part of #768, remove CUDA 11 from docs. Authors: - Peter Andreas Entschev (https://github.com/pentschev) - Tom Augspurger (https://github.com/TomAugspurger) Approvers: - Bradley Dice (https://github.com/bdice) - https://github.com/jakirkham URL: https://github.com/rapidsai/kvikio/pull/769 --- cpp/doxygen/main_page.md | 4 ---- docs/source/install.rst | 7 ------- python/kvikio/pyproject.toml | 3 ++- 3 files changed, 2 insertions(+), 12 deletions(-) diff --git a/cpp/doxygen/main_page.md b/cpp/doxygen/main_page.md index ab9a613663..8d9d0e9320 100644 --- a/cpp/doxygen/main_page.md +++ b/cpp/doxygen/main_page.md @@ -35,8 +35,6 @@ Install the **stable release** from the ``rapidsai`` channel with the following: mamba install -c rapidsai -c conda-forge libkvikio # Create new environment (CUDA 12) mamba create -n libkvikio-env -c rapidsai -c conda-forge cuda-version=12.9 libkvikio -# Create new environment (CUDA 11) -mamba create -n libkvikio-env -c rapidsai -c conda-forge cuda-version=11.8 libkvikio ``` Install the **nightly release** from the ``rapidsai-nightly`` channel with the following: @@ -46,8 +44,6 @@ Install the **nightly release** from the ``rapidsai-nightly`` channel with the f mamba install -c rapidsai-nightly -c conda-forge libkvikio # Create new environment (CUDA 12) mamba create -n libkvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=12.9 libkvikio -# Create new environment (CUDA 11) -mamba create -n libkvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=11.8 libkvikio ``` --- diff --git a/docs/source/install.rst b/docs/source/install.rst index f9b2918617..5642b137af 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -17,8 +17,6 @@ Install the **stable release** from the ``rapidsai`` channel like: mamba install -c rapidsai -c conda-forge kvikio # Create new environment (CUDA 12) mamba create -n kvikio-env -c rapidsai -c conda-forge python=3.13 cuda-version=12.9 kvikio - # Create new environment (CUDA 11) - mamba create -n kvikio-env -c rapidsai -c conda-forge python=3.13 cuda-version=11.8 kvikio Install the **nightly release** from the ``rapidsai-nightly`` channel like: @@ -28,8 +26,6 @@ Install the **nightly release** from the ``rapidsai-nightly`` channel like: mamba install -c rapidsai-nightly -c conda-forge kvikio # Create new environment (CUDA 12) mamba create -n kvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=12.9 kvikio - # Create new environment 
(CUDA 11) - mamba create -n kvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=11.8 kvikio .. note:: @@ -44,7 +40,6 @@ KvikIO is also available on PyPI. Install the latest release like: .. code-block:: - pip install kvikio-cu11 # for CUDA 11 pip install kvikio-cu12 # for CUDA 12 @@ -57,8 +52,6 @@ In order to setup a development environment, we recommend Conda: # CUDA 12 mamba env create --name kvikio-dev --file conda/environments/all_cuda-128_arch-x86_64.yaml - # CUDA 11 - mamba env create --name kvikio-dev --file conda/environments/all_cuda-118_arch-x86_64.yaml The Python library depends on the C++ library, thus we build and install both: diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index b48b4312ee..d71abdb608 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -163,6 +163,7 @@ filterwarnings = [ "ignore:This module is deprecated since.*Use the official nvCOMP API from 'nvidia.nvcomp' instead.:FutureWarning:.*nvcomp|.*nvcomp_codec", ] markers = [ - "cufile: tests to skip if cuFile isn't available e.g. run with `pytest -m 'not cufile'`" + "cufile: tests to skip if cuFile isn't available e.g. run with `pytest -m 'not cufile'`", + "gpu: zarr-python tests requiring a GPU to run." ] asyncio_mode = "auto" From b7ec813176371d123eea294307483ff1d682dc74 Mon Sep 17 00:00:00 2001 From: Jake Awe Date: Thu, 17 Jul 2025 12:32:58 -0700 Subject: [PATCH 032/103] DOC v25.10 Updates [skip ci] --- .../cuda12.9-conda/devcontainer.json | 6 ++--- .devcontainer/cuda12.9-pip/devcontainer.json | 6 ++--- .github/workflows/build.yaml | 16 ++++++------ .github/workflows/pr.yaml | 26 +++++++++---------- .github/workflows/test.yaml | 6 ++--- .../trigger-breaking-change-alert.yaml | 2 +- VERSION | 2 +- .../all_cuda-129_arch-aarch64.yaml | 2 +- .../all_cuda-129_arch-x86_64.yaml | 2 +- .../downstream/cmake/get_kvikio.cmake | 2 +- dependencies.yaml | 18 ++++++------- java/pom.xml | 2 +- python/kvikio/pyproject.toml | 6 ++--- 13 files changed, 48 insertions(+), 48 deletions(-) diff --git a/.devcontainer/cuda12.9-conda/devcontainer.json b/.devcontainer/cuda12.9-conda/devcontainer.json index 7e000efb91..462343b8cc 100644 --- a/.devcontainer/cuda12.9-conda/devcontainer.json +++ b/.devcontainer/cuda12.9-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.08-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.10-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.08-cuda12.9-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.9-pip/devcontainer.json b/.devcontainer/cuda12.9-pip/devcontainer.json index 087acd2900..4acd41599a 100644 --- a/.devcontainer/cuda12.9-pip/devcontainer.json +++ b/.devcontainer/cuda12.9-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.08-cpp-cuda12.9-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.10-cpp-cuda12.9-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - 
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.08-cuda12.9-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 53d0402fef..d643e1aa9e 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -34,7 +34,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -44,7 +44,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -54,7 +54,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -76,7 +76,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cpp: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: ${{ inputs.build_type || 'branch' }} @@ -89,7 +89,7 @@ jobs: wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -101,7 +101,7 @@ jobs: wheel-publish-cpp: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -112,7 +112,7 @@ jobs: wheel-publish-python: needs: wheel-build-python secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 908d30f9e4..3ef91ed573 100644 --- a/.github/workflows/pr.yaml +++ 
b/.github/workflows/pr.yaml @@ -27,7 +27,7 @@ jobs: - wheel-python-tests - telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.10 if: always() with: needs: ${{ toJSON(needs) }} @@ -43,7 +43,7 @@ jobs: repo: kvikio changed-files: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.10 with: files_yaml: | test_cpp: @@ -86,20 +86,20 @@ jobs: checks: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.10 with: ignored_pr_jobs: telemetry-summarize conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 with: build_type: pull-request script: ci/build_cpp.sh conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request @@ -107,7 +107,7 @@ jobs: conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -117,14 +117,14 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 with: build_type: pull-request script: ci/build_python.sh conda-python-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -132,7 +132,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -142,7 +142,7 @@ jobs: devcontainer: needs: telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.10 with: arch: '["amd64"]' cuda: '["12.9"]' @@ -153,7 +153,7 @@ jobs: sccache -s; wheel-cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: pull-request @@ -163,7 +163,7 @@ jobs: wheel-python-build: needs: wheel-cpp-build secrets: 
inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: build_type: pull-request script: ci/build_wheel_python.sh @@ -172,7 +172,7 @@ jobs: wheel-python-tests: needs: [wheel-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index b58dbd3ade..d98e3f330c 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -25,7 +25,7 @@ on: jobs: cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -43,7 +43,7 @@ jobs: sha: ${{ inputs.sha }} conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 593fcb1086..48bf37afc4 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.10 with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/VERSION b/VERSION index 3af4bda020..296e35288d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -25.08.00 +25.10.00 diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 0c352383d8..98070be5c6 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -34,7 +34,7 @@ dependencies: - python>=3.10,<3.14 - rangehttpserver - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.8.*,>=0.0.0a0 +- rapids-dask-dependency==25.10.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 51d46cafd7..bb54f98ef5 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -34,7 +34,7 @@ dependencies: - python>=3.10,<3.14 - rangehttpserver - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.8.*,>=0.0.0a0 +- rapids-dask-dependency==25.10.*,>=0.0.0a0 - scikit-build-core>=0.10.0 
- sphinx - sphinx-click diff --git a/cpp/examples/downstream/cmake/get_kvikio.cmake b/cpp/examples/downstream/cmake/get_kvikio.cmake index 5e4132f8e5..4e03834e56 100644 --- a/cpp/examples/downstream/cmake/get_kvikio.cmake +++ b/cpp/examples/downstream/cmake/get_kvikio.cmake @@ -30,4 +30,4 @@ function(find_and_configure_kvikio MIN_VERSION) endfunction() -find_and_configure_kvikio("25.08") +find_and_configure_kvikio("25.10") diff --git a/dependencies.yaml b/dependencies.yaml index 34d1f0b249..f8038dc76c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -151,7 +151,7 @@ dependencies: common: - output_types: conda packages: &libkvikio_packages - - libkvikio==25.8.*,>=0.0.0a0 + - libkvikio==25.10.*,>=0.0.0a0 specific: - output_types: [requirements, pyproject] matrices: @@ -159,7 +159,7 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libkvikio-cu12==25.8.*,>=0.0.0a0 + - libkvikio-cu12==25.10.*,>=0.0.0a0 - {matrix: null, packages: *libkvikio_packages} build-py-wrapper: common: @@ -247,7 +247,7 @@ dependencies: common: - output_types: conda packages: - - &libkvikio_unsuffixed libkvikio==25.8.*,>=0.0.0a0 + - &libkvikio_unsuffixed libkvikio==25.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -261,7 +261,7 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libkvikio-cu12==25.8.*,>=0.0.0a0 + - libkvikio-cu12==25.10.*,>=0.0.0a0 - {matrix: null, packages: [*libkvikio_unsuffixed]} docs: common: @@ -321,14 +321,14 @@ dependencies: common: - output_types: conda packages: - - libkvikio==25.8.*,>=0.0.0a0 - - libkvikio-tests==25.8.*,>=0.0.0a0 + - libkvikio==25.10.*,>=0.0.0a0 + - libkvikio-tests==25.10.*,>=0.0.0a0 test_kvikio: common: - output_types: conda packages: - - libkvikio==25.8.*,>=0.0.0a0 - - kvikio==25.8.*,>=0.0.0a0 + - libkvikio==25.10.*,>=0.0.0a0 + - kvikio==25.10.*,>=0.0.0a0 test_cpp: common: - output_types: conda @@ -338,7 +338,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - rapids-dask-dependency==25.8.*,>=0.0.0a0 + - rapids-dask-dependency==25.10.*,>=0.0.0a0 - pytest - pytest-asyncio - pytest-cov diff --git a/java/pom.xml b/java/pom.xml index 228c82a3d5..7e00c88661 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ ai.rapids.kvikio cufile - 25.08.0-SNAPSHOT + 25.10.0-SNAPSHOT cuFile diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index d71abdb608..b8ddc09945 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -20,7 +20,7 @@ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "cupy-cuda12x>=12.0.0", - "libkvikio==25.8.*,>=0.0.0a0", + "libkvikio==25.10.*,>=0.0.0a0", "numcodecs !=0.12.0", "numpy>=1.23,<3.0a0", "packaging", @@ -48,7 +48,7 @@ test = [ "pytest-cov", "pytest-timeout", "rangehttpserver", - "rapids-dask-dependency==25.8.*,>=0.0.0a0", + "rapids-dask-dependency==25.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] @@ -127,7 +127,7 @@ matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" requires = [ "cmake>=3.30.4", "cython>=3.0.0", - "libkvikio==25.8.*,>=0.0.0a0", + "libkvikio==25.10.*,>=0.0.0a0", "ninja", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
From ef7d256bfbc0cf627a73d7f9687803ecd2950a17 Mon Sep 17 00:00:00 2001 From: Gil Forsyth Date: Thu, 17 Jul 2025 13:44:30 -0700 Subject: [PATCH 033/103] fix(docker): use versioned `-latest` tag for all `rapidsai` images (#771) In rapidsai/build-planning#187 we switched the docker image tagging scheme over to include the CalVer information. This was done to allow us to make changes to the images during burndown without breaking release pipelines. This PR moves all of the existing `latest` tags to the newer versioned tag `25.08-latest` and also modifies the `update_version.sh` script to bump that version at branch creation time. xref: https://github.com/rapidsai/build-planning/issues/187 Authors: - Gil Forsyth (https://github.com/gforsyth) - James Lamb (https://github.com/jameslamb) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/kvikio/pull/771 --- .github/workflows/build.yaml | 2 +- .github/workflows/pr.yaml | 4 ++-- .github/workflows/test.yaml | 2 +- ci/release/update-version.sh | 1 + 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 53d0402fef..26013cb954 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -69,7 +69,7 @@ jobs: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:25.08-latest" date: ${{ inputs.date }} node_type: "gpu-l4-latest-1" script: "ci/build_docs.sh" diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 908d30f9e4..6decb7f4ef 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -112,7 +112,7 @@ jobs: build_type: pull-request node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:25.08-latest" script: "ci/test_java.sh" conda-python-build: needs: conda-cpp-build @@ -137,7 +137,7 @@ jobs: build_type: pull-request node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:25.08-latest" script: "ci/build_docs.sh" devcontainer: needs: telemetry-setup diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index b58dbd3ade..2164e3ced8 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -51,5 +51,5 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:25.08-latest" script: "ci/test_java.sh" diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 75029bc0c4..4d8f8cffcd 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -51,6 +51,7 @@ done # CI files for FILE in .github/workflows/*.yaml; do sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" + sed_runner "s/:[0-9]*\\.[0-9]*-/:${NEXT_SHORT_TAG}-/g" "${FILE}" done # .devcontainer files From 33a734cde6f4c5af5960fed826131e8797b5b46a Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 21 Jul 2025 08:40:15 -0500 Subject: [PATCH 034/103] Use libnvcomp conda package (#774) The `nvcomp` conda package is being split into a C++ package `libnvcomp` and a Python bindings package `nvcomp`. We want to use the C++ package only, so we are adopting `libnvcomp`. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Gil Forsyth (https://github.com/gforsyth) URL: https://github.com/rapidsai/kvikio/pull/774 --- .../all_cuda-129_arch-aarch64.yaml | 2 +- .../all_cuda-129_arch-x86_64.yaml | 2 +- conda/recipes/kvikio/recipe.yaml | 4 +-- dependencies.yaml | 28 ++++--------------- 4 files changed, 8 insertions(+), 28 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 0c352383d8..1ba892c85a 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -19,12 +19,12 @@ dependencies: - libcufile-dev - libcurl>=8.5.0,<9.0a0 - libnuma +- libnvcomp-dev==4.2.0.11 - moto>=4.0.8 - ninja - numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 - numpydoc -- nvcomp==4.2.0.11 - packaging - pre-commit - pytest diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 51d46cafd7..d95bcba871 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -19,12 +19,12 @@ dependencies: - libcufile-dev - libcurl>=8.5.0,<9.0a0 - libnuma +- libnvcomp-dev==4.2.0.11 - moto>=4.0.8 - ninja - numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 - numpydoc -- nvcomp==4.2.0.11 - packaging - pre-commit - pytest diff --git a/conda/recipes/kvikio/recipe.yaml b/conda/recipes/kvikio/recipe.yaml index 742f96bf48..3a7957133e 100644 --- a/conda/recipes/kvikio/recipe.yaml +++ b/conda/recipes/kvikio/recipe.yaml @@ -66,7 +66,7 @@ requirements: - cython >=3.0.0 - libcurl ${{ libcurl_version }} - libkvikio =${{ version }} - - nvcomp ${{ nvcomp_version }} + - libnvcomp-dev ${{ nvcomp_version }} - pip - python =${{ py_version }} - rapids-build-backend >=0.3.0,<0.4.0.dev0 @@ -79,7 +79,6 @@ requirements: # See https://github.com/zarr-developers/numcodecs/pull/475 - numcodecs !=0.12.0 - numpy >=1.23,<3.0a0 - - nvcomp ${{ nvcomp_version }} - packaging - python - zarr >=2.0.0,<4.0.0a0 @@ -90,7 +89,6 @@ requirements: - cuda-version - libcurl - libkvikio - - nvcomp tests: - python: diff --git a/dependencies.yaml b/dependencies.yaml index 34d1f0b249..d753e75ded 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -13,7 +13,7 @@ files: - cuda - cuda_version - depends_on_cupy - - depends_on_nvcomp + - depends_on_libnvcomp - docs - py_version - rapids_build_skbuild @@ -67,9 +67,9 @@ files: includes: - depends_on_cupy # TODO: restore runtime dependency when we no longer vendor nvcomp - # (when nvcomp supports Python 3.13) + # (when nvcomp ships C++ wheels) # https://github.com/rapidsai/build-planning/issues/171 - # - depends_on_nvcomp + # - depends_on_libnvcomp - depends_on_libkvikio - run py_rapids_build_libkvikio: @@ -220,29 +220,11 @@ dependencies: - matrix: # All CUDA 12 versions packages: - cupy-cuda12x>=12.0.0 - depends_on_nvcomp: + depends_on_libnvcomp: common: - output_types: conda packages: - - nvcomp==4.2.0.11 - specific: - - output_types: [requirements, pyproject] - matrices: - - matrix: - cuda: "12.*" - use_cuda_wheels: "true" - packages: - - nvidia-nvcomp-cu12==4.2.0.11 - # if use_cuda_wheels=false is provided, do not add dependencies on any CUDA wheels - # (e.g. 
for DLFW and pip devcontainers) - - matrix: use_cuda_wheels: "false" packages: # if no matching matrix selectors passed, list the unsuffixed packages # (just as a source of documentation, as this populates pyproject.toml in source control) - - matrix: packages: - - nvidia-nvcomp==4.2.0.11 + - libnvcomp-dev==4.2.0.11 depends_on_libkvikio: common: - output_types: conda From 068e11a632c14114636f1a6432ed070ddeb453f1 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 23 Jul 2025 20:26:15 +0200 Subject: [PATCH 035/103] Remove CUDA 11 workarounds (#770) As part of https://github.com/rapidsai/kvikio/issues/768, remove CUDA 11 workarounds that should no longer be necessary given that CUDA 11 support is being dropped. Authors: - Peter Andreas Entschev (https://github.com/pentschev) - https://github.com/jakirkham Approvers: - Bradley Dice (https://github.com/bdice) - Tianyu Liu (https://github.com/kingcrimsontianyu) URL: https://github.com/rapidsai/kvikio/pull/770 --- cpp/include/kvikio/shim/cufile.hpp | 9 --------- cpp/include/kvikio/shim/utils.hpp | 9 --------- cpp/src/shim/cufile.cpp | 27 +--------------------------- cpp/src/shim/utils.cpp | 14 -------------- 4 files changed, 1 insertion(+), 58 deletions(-) diff --git a/cpp/include/kvikio/shim/cufile.hpp b/cpp/include/kvikio/shim/cufile.hpp index 29d99ed5a1..987ab6aa6f 100644 --- a/cpp/include/kvikio/shim/cufile.hpp +++ b/cpp/include/kvikio/shim/cufile.hpp @@ -63,15 +63,6 @@ class cuFileAPI { private: cuFileAPI(); -#ifdef KVIKIO_CUFILE_FOUND - // Notice, we have to close the driver at program exit (if we opened it) even though we are - // not allowed to call CUDA after main[1]. This is because, cuFile will segfault if the - // driver isn't closed on program exit i.e. we are doomed if we do, doomed if we don't, but - // this seems to be the lesser of two evils. - // [1] - ~cuFileAPI(); -#endif - public: cuFileAPI(cuFileAPI const&) = delete; void operator=(cuFileAPI const&) = delete; diff --git a/cpp/include/kvikio/shim/utils.hpp b/cpp/include/kvikio/shim/utils.hpp index 03c9773e9f..1beb67ef0d 100644 --- a/cpp/include/kvikio/shim/utils.hpp +++ b/cpp/include/kvikio/shim/utils.hpp @@ -48,15 +48,6 @@ namespace kvikio { */ void* load_library(std::string const& name, int mode = RTLD_LAZY | RTLD_LOCAL | RTLD_NODELETE); -/** - * @brief Load shared library - * - * @param names Vector of names to try when loading shared library. - * @return The library handle. - */ -void* load_library(std::vector<std::string> const& names, - int mode = RTLD_LAZY | RTLD_LOCAL | RTLD_NODELETE); - /** * @brief Get symbol using `dlsym` * diff --git a/cpp/src/shim/cufile.cpp b/cpp/src/shim/cufile.cpp index 7e69c84456..3daa52cd62 100644 --- a/cpp/src/shim/cufile.cpp +++ b/cpp/src/shim/cufile.cpp @@ -27,17 +27,7 @@ namespace kvikio { #ifdef KVIKIO_CUFILE_FOUND cuFileAPI::cuFileAPI() { - // CUDA versions before CUDA 11.7.1 did not ship libcufile.so.0, so this is - // a workaround that adds support for all prior versions of libcufile.
- void* lib = load_library({"libcufile.so.0", "libcufile.so.1.3.0" /* 11.7.0 */, "libcufile.so.1.2.1" /* 11.6.2, 11.6.1 */, "libcufile.so.1.2.0" /* 11.6.0 */, "libcufile.so.1.1.1" /* 11.5.1 */, "libcufile.so.1.1.0" /* 11.5.0 */, "libcufile.so.1.0.2" /* 11.4.4, 11.4.3, 11.4.2 */, "libcufile.so.1.0.1" /* 11.4.1 */, "libcufile.so.1.0.0" /* 11.4.0 */}); + void* lib = load_library("libcufile.so.0"); get_symbol(HandleRegister, lib, KVIKIO_STRINGIFY(cuFileHandleRegister)); get_symbol(HandleDeregister, lib, KVIKIO_STRINGIFY(cuFileHandleDeregister)); get_symbol(Read, lib, KVIKIO_STRINGIFY(cuFileRead)); @@ -79,21 +69,6 @@ cuFileAPI::cuFileAPI() get_symbol(StreamRegister, lib, KVIKIO_STRINGIFY(cuFileStreamRegister)); get_symbol(StreamDeregister, lib, KVIKIO_STRINGIFY(cuFileStreamDeregister)); } - - // cuFile is supposed to open and close the driver automatically but - // because of a bug in cuFile v1.4 (CUDA v11.8) it sometimes segfaults: - // . - if (version < 1050) { driver_open(); } -} - -// Notice, we have to close the driver at program exit (if we opened it) even though we are -// not allowed to call CUDA after main[1]. This is because, cuFile will segfault if the -// driver isn't closed on program exit i.e. we are doomed if we do, doomed if we don't, but -// this seems to be the lesser of two evils. -// [1] -cuFileAPI::~cuFileAPI() -{ - if (version < 1050) { driver_close(); } } #else cuFileAPI::cuFileAPI() { KVIKIO_FAIL("KvikIO not compiled with cuFile.h", std::runtime_error); } diff --git a/cpp/src/shim/utils.cpp b/cpp/src/shim/utils.cpp index ab418c3a0f..e393cf77c1 100644 --- a/cpp/src/shim/utils.cpp +++ b/cpp/src/shim/utils.cpp @@ -33,20 +33,6 @@ void* load_library(std::string const& name, int mode) return ret; } -void* load_library(std::vector<std::string> const& names, int mode) -{ - std::stringstream ss; - for (auto const& name : names) { - ss << name << " "; - try { - return load_library(name, mode); - } catch (std::runtime_error const&) { - } - } - KVIKIO_FAIL("cannot open shared object file, tried: " + ss.str(), std::runtime_error); - return {}; -} - bool is_running_in_wsl() noexcept { try { From cb76305eef35d370a1ff62d4f5171dea5df1926b Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Wed, 23 Jul 2025 15:45:30 -0400 Subject: [PATCH 036/103] Support file-backed mapping (1/n): host/device read C++ implementation (#740) This PR introduces memory-mapped I/O (`MmapHandle`) as an alternative to standard file I/O (`FileHandle`). The benchmark results are at https://github.com/rapidsai/kvikio/issues/530#issuecomment-2994871923 Partially addresses #530 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B.
Kristensen (https://github.com/madsbk) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/kvikio/pull/740 --- cpp/CMakeLists.txt | 1 + cpp/include/kvikio/defaults.hpp | 2 +- cpp/include/kvikio/detail/utils.hpp | 58 +++ cpp/include/kvikio/file_utils.hpp | 14 +- cpp/include/kvikio/mmap.hpp | 189 +++++++++ cpp/include/kvikio/remote_handle.hpp | 4 +- cpp/include/kvikio/shim/cuda.hpp | 68 +++ cpp/include/kvikio/shim/cuda_h_wrapper.hpp | 17 + cpp/src/file_utils.cpp | 12 + cpp/src/mmap.cpp | 468 +++++++++++++++++++++ cpp/src/shim/cuda.cpp | 23 + cpp/src/utils.cpp | 27 ++ cpp/tests/CMakeLists.txt | 2 + cpp/tests/test_mmap.cpp | 348 +++++++++++++++ cpp/tests/utils/utils.hpp | 1 + 15 files changed, 1228 insertions(+), 6 deletions(-) create mode 100644 cpp/include/kvikio/detail/utils.hpp create mode 100644 cpp/include/kvikio/mmap.hpp create mode 100644 cpp/src/mmap.cpp create mode 100644 cpp/tests/test_mmap.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3d61d4a4e1..37d237e0e3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -147,6 +147,7 @@ set(SOURCES "src/error.cpp" "src/file_handle.cpp" "src/file_utils.cpp" + "src/mmap.cpp" "src/nvtx.cpp" "src/posix_io.cpp" "src/shim/cuda.cpp" diff --git a/cpp/include/kvikio/defaults.hpp b/cpp/include/kvikio/defaults.hpp index 200f248d79..6576d286fe 100644 --- a/cpp/include/kvikio/defaults.hpp +++ b/cpp/include/kvikio/defaults.hpp @@ -77,7 +77,7 @@ std::vector getenv_or(std::string_view env_var_name, std::vector defau * `result` be the set value. If more than one candidates have been set with the same value, * `env_var_name` will be assigned the last candidate. * - * @throws std::invalid_argument if: + * @exception std::invalid_argument if: * - `env_var_names` is empty. * - The environment variable is not defined to be string type and is assigned an empty value (in * other words, string-type environment variables are allowed to hold an empty value). diff --git a/cpp/include/kvikio/detail/utils.hpp b/cpp/include/kvikio/detail/utils.hpp new file mode 100644 index 0000000000..b3ce990c44 --- /dev/null +++ b/cpp/include/kvikio/detail/utils.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +namespace kvikio::detail { + +/** + * @brief Round up `value` to multiples of `alignment` + * + * @param value Value to be rounded up + * @param alignment Must be a power of 2 + * @return Rounded result + */ +[[nodiscard]] std::size_t align_up(std::size_t value, std::size_t alignment) noexcept; + +/** + * @brief Align the address `addr` up to the boundary of `alignment` + * + * @param addr Address to be aligned up + * @param alignment Must be a power of 2 + * @return Aligned address + */ +[[nodiscard]] void* align_up(void* addr, std::size_t alignment) noexcept; + +/** + * @brief Round down `value` to multiples of `alignment` + * + * @param value Value to be rounded down + * @param alignment Must be a power of 2 + * @return Rounded result + */ +[[nodiscard]] std::size_t align_down(std::size_t value, std::size_t alignment) noexcept; + +/** + * @brief Align the address `addr` down to the boundary of `alignment` + * + * @param addr Address to be aligned down + * @param alignment Must be a power of 2 + * @return Aligned address + */ +[[nodiscard]] void* align_down(void* addr, std::size_t alignment) noexcept; + +} // namespace kvikio::detail diff --git a/cpp/include/kvikio/file_utils.hpp b/cpp/include/kvikio/file_utils.hpp index 2f03f0cbdc..aa3e6fd380 100644 --- a/cpp/include/kvikio/file_utils.hpp +++ b/cpp/include/kvikio/file_utils.hpp @@ -132,8 +132,8 @@ class CUFileHandleWrapper { * @param o_direct Append O_DIRECT to the open flags * @return oflags * - * @throw std::invalid_argument if the specified flags are not supported. - * @throw std::invalid_argument if `o_direct` is true, but `O_DIRECT` is not supported. + * @exception std::invalid_argument if the specified flags are not supported. + * @exception std::invalid_argument if `o_direct` is true, but `O_DIRECT` is not supported. */ int open_fd_parse_flags(std::string const& flags, bool o_direct); @@ -160,6 +160,14 @@ int open_fd(std::string const& file_path, std::string const& flags, bool o_direc * @param file_descriptor Open file descriptor * @return The number of bytes */ +[[nodiscard]] std::size_t get_file_size(std::string const& file_path); + +/** + * @brief Get file size given the file path + * + * @param file_path Path to a file + * @return The number of bytes + */ [[nodiscard]] std::size_t get_file_size(int file_descriptor); /** @@ -200,7 +208,7 @@ std::pair get_page_cache_info(int fd); * - Execute the command with `sudo` prefix. This is for the general case where selective * unprivileged users have permission to run `/sbin/sysctl` with `sudo` prefix. * - * @throws kvikio::GenericSystemError if somehow the child process could not be created. + * @exception kvikio::GenericSystemError if somehow the child process could not be created. */ bool clear_page_cache(bool reclaim_dentries_and_inodes = true, bool clear_dirty_pages = true); } // namespace kvikio diff --git a/cpp/include/kvikio/mmap.hpp b/cpp/include/kvikio/mmap.hpp new file mode 100644 index 0000000000..f5245858ef --- /dev/null +++ b/cpp/include/kvikio/mmap.hpp @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+#include
+
+#include <kvikio/defaults.hpp>
+#include <kvikio/file_handle.hpp>
+#include <kvikio/file_utils.hpp>
+
+namespace kvikio {
+
+/**
+ * @brief Handle of a memory-mapped file
+ *
+ * This utility class facilitates the use of file-backed memory by providing a performant method
+ * `pread()` to read a range of data into user-provided memory residing on the host or device.
+ *
+ * File-backed memory can be considered when a large number of nonadjacent file ranges (specified by
+ * the `offset` and `size` pair) are to be frequently accessed. It can potentially reduce memory
+ * usage due to demand paging (compared to reading the entire file with `read(2)`), and may improve
+ * I/O performance compared to frequent calls to `read(2)`.
+ */
+class MmapHandle {
+ private:
+  void* _buf{};
+  std::size_t _initial_map_size{};
+  std::size_t _initial_map_offset{};
+  std::size_t _file_size{};
+  std::size_t _map_offset{};
+  std::size_t _map_size{};
+  void* _map_addr{};
+  bool _initialized{};
+  int _map_protection{};
+  int _map_flags{};
+  FileWrapper _file_wrapper{};
+
+  /**
+   * @brief Validate and adjust the read arguments.
+   *
+   * @param size Size in bytes to read. If not specified, set it to the bytes from `offset` to
+   * the end of file
+   * @param offset File offset
+   * @return Adjusted size in bytes to read
+   *
+   * @exception std::out_of_range if the read region specified by `offset` and `size` is
+   * outside the initial region specified when the mapping handle was constructed
+   * @exception std::invalid_argument if the size is given but is 0
+   * @exception std::runtime_error if the mapping handle is closed
+   */
+  std::size_t validate_and_adjust_read_args(std::optional<std::size_t> const& size,
+                                            std::size_t offset);
+
+ public:
+  /**
+   * @brief Construct an empty memory-mapped file
+   *
+   */
+  MmapHandle() noexcept = default;
+
+  /**
+   * @brief Construct a new memory-mapped file
+   *
+   * @param file_path File path
+   * @param flags Open flags (see also `fopen(3)`):
+   *   - "r": "open for reading (default)"
+   *   - "w": "open for writing, truncating the file first"
+   *   - "a": "open for writing, appending to the end of file if it exists"
+   *   - "+": "open for updating (reading and writing)"
+   * @param initial_map_size Size in bytes of the mapped region. If not specified, map the region
+   * starting from `initial_map_offset` to the end of file
+   * @param initial_map_offset File offset of the mapped region
+   * @param mode Access mode
+   * @param map_flags Flags to be passed to the system call `mmap`. See `mmap(2)` for details
+   */
+  MmapHandle(std::string const& file_path,
+             std::string const& flags = "r",
+             std::optional<std::size_t> initial_map_size = std::nullopt,
+             std::size_t initial_map_offset = 0,
+             mode_t mode = FileHandle::m644,
+             std::optional<int> map_flags = std::nullopt);
+
+  MmapHandle(MmapHandle const&) = delete;
+  MmapHandle& operator=(MmapHandle const&) = delete;
+  MmapHandle(MmapHandle&& o) noexcept;
+  MmapHandle& operator=(MmapHandle&& o) noexcept;
+  ~MmapHandle() noexcept;
+
+  /**
+   * @brief Size in bytes of the mapped region when the mapping handle was constructed
+   *
+   * @return Initial size of the mapped region
+   */
+  [[nodiscard]] std::size_t initial_map_size() const noexcept;
+
+  /**
+   * @brief File offset of the mapped region when the mapping handle was constructed
+   *
+   * @return Initial file offset of the mapped region
+   */
+  [[nodiscard]] std::size_t initial_map_offset() const noexcept;
+
+  /**
+   * @brief Get the file size if the file is open. Returns 0 if the file is closed.
+   *
+   * The behavior of this method is consistent with `FileHandle::nbytes`.
+   *
+   * @return The file size in bytes
+   */
+  [[nodiscard]] std::size_t file_size() const;
+
+  /**
+   * @brief Alias of `file_size`
+   *
+   * @return The file size in bytes
+   */
+  [[nodiscard]] std::size_t nbytes() const;
+
+  /**
+   * @brief Whether the mapping handle is closed
+   *
+   * @return Boolean answer
+   */
+  [[nodiscard]] bool closed() const noexcept;
+
+  /**
+   * @brief Close the mapping handle if it is open; do nothing otherwise
+   */
+  void close() noexcept;
+
+  /**
+   * @brief Sequentially read `size` bytes from the file (with the offset `offset`) into the
+   * destination buffer `buf`
+   *
+   * @param buf Address of the host or device memory (destination buffer)
+   * @param size Size in bytes to read. If not specified, read starts from `offset` to the end
+   * of file
+   * @param offset File offset
+   * @return Number of bytes that have been read
+   *
+   * @exception std::out_of_range if the read region specified by `offset` and `size` is
+   * outside the initial region specified when the mapping handle was constructed
+   * @exception std::invalid_argument if the size is given but is 0
+   * @exception std::runtime_error if the mapping handle is closed
+   */
+  std::size_t read(void* buf,
+                   std::optional<std::size_t> size = std::nullopt,
+                   std::size_t offset = 0);
+
+  /**
+   * @brief Read `size` bytes in parallel from the file (with the offset `offset`) into the
+   * destination buffer `buf`
+   *
+   * @param buf Address of the host or device memory (destination buffer)
+   * @param size Size in bytes to read. If not specified, read starts from `offset` to the end
+   * of file
+   * @param offset File offset
+   * @param task_size Size of each task in bytes
+   * @return Future that on completion returns the size of bytes that were successfully read.
+   *
+   * @exception std::out_of_range if the read region specified by `offset` and `size` is
+   * outside the initial region specified when the mapping handle was constructed
+   * @exception std::invalid_argument if the size is given but is 0
+   * @exception std::runtime_error if the mapping handle is closed
+   *
+   * @note The `std::future` object's `wait()` or `get()` should not be called after the lifetime of
+   * the MmapHandle object ends. Otherwise, the behavior is undefined.
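+   *
+   * A minimal usage sketch (illustrative only; the file path and sizes below are assumptions,
+   * not part of this API):
+   * @code{.cpp}
+   * kvikio::MmapHandle handle("/path/to/data.bin", "r");
+   * std::vector<char> dst(4096);
+   * auto fut = handle.pread(dst.data(), dst.size(), 0);
+   * auto const nbytes_read = fut.get();  // blocks until all parallel read tasks complete
+   * @endcode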
+   */
+  std::future<std::size_t> pread(void* buf,
+                                 std::optional<std::size_t> size = std::nullopt,
+                                 std::size_t offset = 0,
+                                 std::size_t task_size = defaults::task_size());
+};
+
+}  // namespace kvikio
diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp
index b67ae5560e..7c197ea2b0 100644
--- a/cpp/include/kvikio/remote_handle.hpp
+++ b/cpp/include/kvikio/remote_handle.hpp
@@ -117,7 +117,7 @@ class S3Endpoint : public RemoteEndpoint {
   /**
    * @brief Get url from an AWS S3 bucket and object name.
    *
-   * @throws std::invalid_argument if no region is specified and no default region is
+   * @exception std::invalid_argument if no region is specified and no default region is
    * specified in the environment.
    *
    * @param bucket_name The name of the S3 bucket.
@@ -137,7 +137,7 @@ class S3Endpoint : public RemoteEndpoint {
   /**
    * @brief Given a url like "s3://<bucket>/<object>", return the name of the bucket and object.
    *
-   * @throws std::invalid_argument if url is ill-formed or is missing the bucket or object name.
+   * @exception std::invalid_argument if url is ill-formed or is missing the bucket or object name.
    *
    * @param s3_url S3 url.
    * @return Pair of strings: [bucket-name, object-name].
diff --git a/cpp/include/kvikio/shim/cuda.hpp b/cpp/include/kvikio/shim/cuda.hpp
index 9aaac08827..4ebf243fbb 100644
--- a/cpp/include/kvikio/shim/cuda.hpp
+++ b/cpp/include/kvikio/shim/cuda.hpp
@@ -15,11 +15,70 @@
  */
 #pragma once
 
+#include <any>
+#include <functional>
+
 #include
 #include
+#include <stdexcept>
 
 namespace kvikio {
 
+namespace detail {
+/**
+ * @brief Non-templated class to hold any callable that returns CUresult
+ */
+class AnyCallable {
+ private:
+  std::any _callable;
+
+ public:
+  /**
+   * @brief Assign a callable to the object
+   *
+   * @tparam Callable A callable that must return CUresult
+   * @param c The callable object
+   */
+  template <typename Callable>
+  void set(Callable&& c)
+  {
+    _callable = std::function(c);
+  }
+
+  /**
+   * @brief Destroy the contained callable
+   */
+  void reset() { _callable.reset(); }
+
+  /**
+   * @brief Invoke the contained callable
+   *
+   * @tparam Args Types of the arguments. Must exactly match the parameter types of the contained
+   * callable. For example, if the parameter is `std::size_t*`, an argument of `nullptr` must be
+   * explicitly cast to `std::size_t*`.
+   * @param args Arguments to be passed
+   * @return CUDA driver API error code
+   * @exception std::bad_any_cast if any argument type does not exactly match the parameter type of
+   * the contained callable.
+   */
+  template <typename... Args>
+  CUresult operator()(Args...
args)
+  {
+    using T = std::function<CUresult(Args...)>;
+    if (!_callable.has_value()) {
+      throw std::runtime_error("No callable has been assigned to the wrapper yet.");
+    }
+    return std::any_cast<T>(_callable)(args...);
+  }
+
+  /**
+   * @brief Check if the object holds a callable
+   */
+  operator bool() const { return _callable.has_value(); }
+};
+
+}  // namespace detail
+
 /**
  * @brief Shim layer of the cuda C-API
  *
@@ -29,25 +88,34 @@ namespace kvikio {
  */
 class cudaAPI {
  public:
+  int driver_version{0};
+
   decltype(cuInit)* Init{nullptr};
   decltype(cuMemHostAlloc)* MemHostAlloc{nullptr};
   decltype(cuMemFreeHost)* MemFreeHost{nullptr};
   decltype(cuMemcpyHtoDAsync)* MemcpyHtoDAsync{nullptr};
   decltype(cuMemcpyDtoHAsync)* MemcpyDtoHAsync{nullptr};
+
+  detail::AnyCallable MemcpyBatchAsync{};
+
   decltype(cuPointerGetAttribute)* PointerGetAttribute{nullptr};
   decltype(cuPointerGetAttributes)* PointerGetAttributes{nullptr};
   decltype(cuCtxPushCurrent)* CtxPushCurrent{nullptr};
   decltype(cuCtxPopCurrent)* CtxPopCurrent{nullptr};
   decltype(cuCtxGetCurrent)* CtxGetCurrent{nullptr};
+  decltype(cuCtxGetDevice)* CtxGetDevice{nullptr};
   decltype(cuMemGetAddressRange)* MemGetAddressRange{nullptr};
   decltype(cuGetErrorName)* GetErrorName{nullptr};
   decltype(cuGetErrorString)* GetErrorString{nullptr};
   decltype(cuDeviceGet)* DeviceGet{nullptr};
+  decltype(cuDeviceGetCount)* DeviceGetCount{nullptr};
+  decltype(cuDeviceGetAttribute)* DeviceGetAttribute{nullptr};
   decltype(cuDevicePrimaryCtxRetain)* DevicePrimaryCtxRetain{nullptr};
   decltype(cuDevicePrimaryCtxRelease)* DevicePrimaryCtxRelease{nullptr};
   decltype(cuStreamSynchronize)* StreamSynchronize{nullptr};
   decltype(cuStreamCreate)* StreamCreate{nullptr};
   decltype(cuStreamDestroy)* StreamDestroy{nullptr};
+  decltype(cuDriverGetVersion)* DriverGetVersion{nullptr};
 
  private:
   cudaAPI();
diff --git a/cpp/include/kvikio/shim/cuda_h_wrapper.hpp b/cpp/include/kvikio/shim/cuda_h_wrapper.hpp
index ee3c855efe..ca6ab5fa59 100644
--- a/cpp/include/kvikio/shim/cuda_h_wrapper.hpp
+++ b/cpp/include/kvikio/shim/cuda_h_wrapper.hpp
@@ -60,19 +60,36 @@ CUresult cuMemHostAlloc(...);
 CUresult cuMemFreeHost(...);
 CUresult cuMemcpyHtoDAsync(...);
 CUresult cuMemcpyDtoHAsync(...);
+CUresult cuMemcpyBatchAsync(...);
 CUresult cuPointerGetAttribute(...);
 CUresult cuPointerGetAttributes(...);
 CUresult cuCtxPushCurrent(...);
 CUresult cuCtxPopCurrent(...);
 CUresult cuCtxGetCurrent(...);
+CUresult cuCtxGetDevice(...);
 CUresult cuMemGetAddressRange(...);
 CUresult cuGetErrorName(...);
 CUresult cuGetErrorString(...);
 CUresult cuDeviceGet(...);
+CUresult cuDeviceGetCount(...);
+CUresult cuDeviceGetAttribute(...);
 CUresult cuDevicePrimaryCtxRetain(...);
 CUresult cuDevicePrimaryCtxRelease(...);
 CUresult cuStreamCreate(...);
 CUresult cuStreamDestroy(...);
 CUresult cuStreamSynchronize(...);
+CUresult cuDriverGetVersion(...);
+
+enum CUdevice_attribute {
+  CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+};
+
+enum CUmemcpySrcAccessOrder_enum {
+  CU_MEMCPY_SRC_ACCESS_ORDER_STREAM,
+};
+
+struct CUmemcpyAttributes {
+  int srcAccessOrder;
+};
 #endif
diff --git a/cpp/src/file_utils.cpp b/cpp/src/file_utils.cpp
index 124edf7b86..e567b86476 100644
--- a/cpp/src/file_utils.cpp
+++ b/cpp/src/file_utils.cpp
@@ -169,6 +169,18 @@ int open_fd(std::string const& file_path, std::string const& flags, bool o_direc
   return ret;
 }
 
+[[nodiscard]] std::size_t get_file_size(std::string const& file_path)
+{
+  KVIKIO_NVTX_FUNC_RANGE();
+  std::string const flags{"r"};
+  bool const o_direct{false};
+  mode_t const mode{FileHandle::m644};
+  auto fd = open_fd(file_path, flags, o_direct, mode);
+  auto result = get_file_size(fd);
+  SYSCALL_CHECK(close(fd));
+  return result;
+}
+
 [[nodiscard]] std::size_t get_file_size(int file_descriptor)
 {
   KVIKIO_NVTX_FUNC_RANGE();
diff --git a/cpp/src/mmap.cpp b/cpp/src/mmap.cpp
new file mode 100644
index 0000000000..11b0416c29
--- /dev/null
+++ b/cpp/src/mmap.cpp
@@ -0,0 +1,468 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <sys/mman.h>
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "kvikio/file_utils.hpp"
+
+namespace kvikio {
+
+namespace detail {
+/**
+ * @brief Prevent the compiler from optimizing away the read of a byte from a given address
+ *
+ * @param addr The address to read from
+ */
+void disable_read_optimization(void* addr)
+{
+  auto addr_byte = static_cast<std::byte*>(addr);
+  std::byte tmp{};
+  asm volatile("" : "+r,m"(tmp = *addr_byte) : : "memory");
+}
+
+/**
+ * @brief Change an address `p` by a signed difference of `v`
+ *
+ * @tparam Integer Signed integer type
+ * @param p An address
+ * @param v Change of `p` in bytes
+ * @return A new address as a result of applying `v` on `p`
+ *
+ * @note Technically, if the initial pointer is null, or does not point to an element of an
+ * array object, (p + v) is undefined behavior (https://eel.is/c++draft/expr.add#4). However,
+ * (p + v) on dynamic allocation is generally acceptable in practice, as long as users guarantee
+ * that the resulting pointer points to a valid region.
+ */
+template <typename Integer>
+void* pointer_add(void* p, Integer v)
+{
+  static_assert(std::is_integral_v<Integer>);
+  return static_cast<std::byte*>(p) + v;
+}
+
+/**
+ * @brief The distance in bytes between pointer `p1` and `p2`
+ *
+ * @param p1 The first pointer
+ * @param p2 The second pointer
+ * @return Signed result of (`p1` - `p2`). Both pointers are cast to std::byte* before subtraction.
+ *
+ * @note Technically, if two pointers do not point to elements from the same array, (p1 - p2) is
+ * undefined behavior (https://eel.is/c++draft/expr.add#5). However, (p1 - p2) on dynamic allocation
+ * is generally acceptable in practice, as long as users guarantee that both pointers are within the
+ * valid region.
+ */
+std::ptrdiff_t pointer_diff(void* p1, void* p2)
+{
+  return static_cast<std::byte*>(p1) - static_cast<std::byte*>(p2);
+}
+
+/**
+ * @brief Whether the current device supports address translation service (ATS), whereby the CPU and
+ * GPU share a single page table.
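+ * Page-table sharing of this kind is typically found on hardware-coherent platforms (for
+ * example, Grace-based systems connected over C2C), where the GPU can directly access
+ * file-backed host mappings; `read_impl` below exploits this to skip the bounce buffer.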
+ * + * @return Boolean answer + */ +bool is_ats_available() +{ + // Memoize the ATS availability record of all devices + static auto const ats_availability = []() -> auto { + std::unordered_map result; + int num_devices{}; + CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGetCount(&num_devices)); + for (int device_ordinal = 0; device_ordinal < num_devices; ++device_ordinal) { + CUdevice device_handle{}; + CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGet(&device_handle, device_ordinal)); + int attr{}; + CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGetAttribute( + &attr, + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + device_handle)); + result[device_handle] = attr; + } + return result; + }(); + + // Get current device + CUdevice device_handle{}; + CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetDevice(&device_handle)); + + // Look up the record + return ats_availability.at(device_handle); +} + +/** + * @brief For the specified memory range, touch the first byte of each page to cause page fault. + * + * For the first page, if the starting address is not aligned to the page boundary, the byte at + * that address is touched. + * + * @param buf The starting memory address + * @param size The size in bytes of the memory range + * @return The number of bytes touched + */ +std::size_t perform_prefault(void* buf, std::size_t size) +{ + KVIKIO_NVTX_FUNC_RANGE(); + auto const page_size = get_page_size(); + auto aligned_addr = detail::align_up(buf, page_size); + + std::size_t touched_bytes{0}; + + // If buf is not aligned, read the byte at buf. + auto num_bytes = detail::pointer_diff(aligned_addr, buf); + if (num_bytes > 0) { + detail::disable_read_optimization(buf); + touched_bytes += num_bytes; + if (size >= num_bytes) { size -= num_bytes; } + } + + if (num_bytes >= size) { return touched_bytes; } + + while (size > 0) { + detail::disable_read_optimization(aligned_addr); + if (size >= page_size) { + aligned_addr = detail::pointer_add(aligned_addr, page_size); + size -= page_size; + touched_bytes += page_size; + } else { + touched_bytes += size; + break; + } + } + return touched_bytes; +} + +/** + * @brief Implementation of read + * + * Copy data from the source buffer `src_mapped_buf + buf_offset` to the destination buffer + * `dst_buf + buf_offset`. + * + * @param dst_buf Address of the host or device memory (destination buffer) + * @param src_mapped_buf Address of the host memory (source buffer) + * @param size Size in bytes to read + * @param buf_offset Offset for both `dst_buf` and `src_mapped_buf` + * @param is_dst_buf_host_mem Whether the destination buffer is host memory or not + * @param ctx CUDA context when the destination buffer is not host memory + */ +void read_impl(void* dst_buf, + void* src_mapped_buf, + std::size_t size, + std::size_t buf_offset, + bool is_dst_buf_host_mem, + CUcontext ctx) +{ + KVIKIO_NVTX_FUNC_RANGE(); + auto const src = detail::pointer_add(src_mapped_buf, buf_offset); + auto const dst = detail::pointer_add(dst_buf, buf_offset); + + if (is_dst_buf_host_mem) { + // std::memcpy implicitly performs prefault for the mapped memory. 
+ std::memcpy(dst, src, size); + return; + } + + // Empirically, take the following steps to achieve good performance: + // - On C2C: + // - Explicitly prefault + // - Copy from the mapped memory (pageable) to the device buffer + // - On PCIe: + // - std::memcpy from the mapped memory to the pinned bounce buffer (which implicitly + // prefaults) + // - Copy from the bounce buffer to the device buffer + + PushAndPopContext c(ctx); + CUstream stream = detail::StreamsByThread::get(); + + auto h2d_batch_cpy_sync = + [](CUdeviceptr dst_devptr, CUdeviceptr src_devptr, std::size_t size, CUstream stream) { +#if CUDA_VERSION >= 12080 + if (cudaAPI::instance().MemcpyBatchAsync) { + CUmemcpyAttributes attrs{}; + std::size_t attrs_idxs[] = {0}; + attrs.srcAccessOrder = CUmemcpySrcAccessOrder_enum::CU_MEMCPY_SRC_ACCESS_ORDER_STREAM; + CUDA_DRIVER_TRY( + cudaAPI::instance().MemcpyBatchAsync(&dst_devptr, + &src_devptr, + &size, + static_cast(1) /* count */, + &attrs, + attrs_idxs, + static_cast(1) /* num_attrs */, + static_cast(nullptr), + stream)); + } else { + // Fall back to the conventional H2D copy if the batch copy API is not available. + CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync( + dst_devptr, reinterpret_cast(src_devptr), size, stream)); + } +#else + CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync( + dst_devptr, reinterpret_cast(src_devptr), size, stream)); +#endif + CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream)); + }; + + auto dst_devptr = convert_void2deviceptr(dst); + CUdeviceptr src_devptr{}; + if (detail::is_ats_available()) { + perform_prefault(src, size); + src_devptr = convert_void2deviceptr(src); + h2d_batch_cpy_sync(dst_devptr, src_devptr, size, stream); + } else { + auto alloc = AllocRetain::instance().get(); + std::memcpy(alloc.get(), src, size); + src_devptr = convert_void2deviceptr(alloc.get()); + h2d_batch_cpy_sync(dst_devptr, src_devptr, size, stream); + } +} + +} // namespace detail + +// |--> file start |<--page_size-->| +// | +// (0) |...............|...............|...............|...............|............ +// +// (1) |<---_initial_map_offset-->|<---------------_initial_map_size--------------->| +// |--> _buf +// +// (2) |<-_map_offset->|<----------------------_map_size----------------------->| +// |--> _map_addr +// +// (3) |<--------------------------offset--------------------->|<--size-->| +// |--> _buf +// +// (0): Layout of the file-backed memory mapping if the whole file were mapped +// (1): At mapping handle construction time, the member `_initial_map_offset` and +// `_initial_map_size` determine the mapped region +// (2): `_map_addr` is the page aligned address returned by `mmap`. `_map_offset` is the adjusted +// offset. +// (3): At read time, the argument `offset` and `size` determine the region to be read. +// This region must be a subset of the one defined at mapping handle construction time. 
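+//
+// A worked example of the arithmetic below (illustrative numbers only, assuming
+// page_size = 4096):
+//   _initial_map_offset = 5000 and _initial_map_size = 10000 give
+//   _map_offset  = align_down(5000, 4096) = 4096
+//   offset_delta = 5000 - 4096 = 904
+//   _map_size    = 10000 + 904 = 10904
+//   _buf         = _map_addr + 904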
+MmapHandle::MmapHandle(std::string const& file_path, + std::string const& flags, + std::optional initial_map_size, + std::size_t initial_map_offset, + mode_t mode, + std::optional map_flags) + : _initial_map_offset(initial_map_offset), + _initialized{true}, + _file_wrapper(file_path, flags, false /* o_direct */, mode) +{ + KVIKIO_NVTX_FUNC_RANGE(); + + _file_size = get_file_size(_file_wrapper.fd()); + if (_file_size == 0) { return; } + + KVIKIO_EXPECT( + _initial_map_offset < _file_size, "Offset is past the end of file", std::out_of_range); + + // An initial size of std::nullopt is a shorthand for "starting from _initial_map_offset to the + // end of file". + _initial_map_size = + initial_map_size.has_value() ? initial_map_size.value() : (_file_size - _initial_map_offset); + + KVIKIO_EXPECT( + _initial_map_size > 0, "Mapped region should not be zero byte", std::invalid_argument); + KVIKIO_EXPECT(_initial_map_offset + _initial_map_size <= _file_size, + "Mapped region is past the end of file", + std::out_of_range); + + auto const page_size = get_page_size(); + _map_offset = detail::align_down(_initial_map_offset, page_size); + auto const offset_delta = _initial_map_offset - _map_offset; + _map_size = _initial_map_size + offset_delta; + + switch (flags[0]) { + case 'r': { + _map_protection = PROT_READ; + break; + } + case 'w': { + KVIKIO_FAIL("File-backed mmap write is not supported yet", std::invalid_argument); + } + default: { + KVIKIO_FAIL("Unknown file open flag", std::invalid_argument); + } + } + + _map_flags = map_flags.has_value() ? map_flags.value() : MAP_PRIVATE; + + _map_addr = + mmap(nullptr, _map_size, _map_protection, _map_flags, _file_wrapper.fd(), _map_offset); + SYSCALL_CHECK(_map_addr, "Cannot create memory mapping", MAP_FAILED); + _buf = detail::pointer_add(_map_addr, offset_delta); +} + +MmapHandle::MmapHandle(MmapHandle&& o) noexcept + : _buf{std::exchange(o._buf, {})}, + _initial_map_size{std::exchange(o._initial_map_size, {})}, + _initial_map_offset{std::exchange(o._initial_map_offset, {})}, + _file_size{std::exchange(o._file_size, {})}, + _map_offset{std::exchange(o._map_offset, {})}, + _map_size{std::exchange(o._map_size, {})}, + _map_addr{std::exchange(o._map_addr, {})}, + _initialized{std::exchange(o._initialized, {})}, + _map_protection{std::exchange(o._map_protection, {})}, + _map_flags{std::exchange(o._map_flags, {})}, + _file_wrapper{std::exchange(o._file_wrapper, {})} +{ +} + +MmapHandle& MmapHandle::operator=(MmapHandle&& o) noexcept +{ + close(); + _buf = std::exchange(o._buf, {}); + _initial_map_size = std::exchange(o._initial_map_size, {}); + _initial_map_offset = std::exchange(o._initial_map_offset, {}); + _file_size = std::exchange(o._file_size, {}); + _map_offset = std::exchange(o._map_offset, {}); + _map_size = std::exchange(o._map_size, {}); + _map_addr = std::exchange(o._map_addr, {}); + _initialized = std::exchange(o._initialized, {}); + _map_protection = std::exchange(o._map_protection, {}); + _map_flags = std::exchange(o._map_flags, {}); + _file_wrapper = std::exchange(o._file_wrapper, {}); + return *this; +} + +MmapHandle::~MmapHandle() noexcept +{ + KVIKIO_NVTX_FUNC_RANGE(); + close(); +} + +bool MmapHandle::closed() const noexcept { return !_initialized; } + +void MmapHandle::close() noexcept +{ + KVIKIO_NVTX_FUNC_RANGE(); + if (closed() || _map_addr == nullptr) { return; } + try { + auto ret = munmap(_map_addr, _map_size); + SYSCALL_CHECK(ret); + } catch (...) 
{ + } + _buf = {}; + _initial_map_size = {}; + _initial_map_offset = {}; + _file_size = {}; + _map_offset = {}; + _map_size = {}; + _map_addr = {}; + _initialized = {}; + _map_protection = {}; + _map_flags = {}; + _file_wrapper = {}; +} + +std::size_t MmapHandle::initial_map_size() const noexcept { return _initial_map_size; } + +std::size_t MmapHandle::initial_map_offset() const noexcept { return _initial_map_offset; } + +std::size_t MmapHandle::file_size() const +{ + if (closed()) { return 0; } + return get_file_size(_file_wrapper.fd()); +} + +std::size_t MmapHandle::nbytes() const { return file_size(); } + +std::size_t MmapHandle::read(void* buf, std::optional size, std::size_t offset) +{ + KVIKIO_NVTX_FUNC_RANGE(); + + auto actual_size = validate_and_adjust_read_args(size, offset); + + auto const is_dst_buf_host_mem = is_host_memory(buf); + CUcontext ctx{}; + if (!is_dst_buf_host_mem) { ctx = get_context_from_pointer(buf); } + + // Copy `actual_size` bytes from `src_mapped_buf` (src) to `buf` (dst) + auto const src_mapped_buf = detail::pointer_add(_buf, offset - _initial_map_offset); + detail::read_impl(buf, src_mapped_buf, actual_size, 0, is_dst_buf_host_mem, ctx); + return actual_size; +} + +std::future MmapHandle::pread(void* buf, + std::optional size, + std::size_t offset, + std::size_t task_size) +{ + KVIKIO_EXPECT(task_size <= defaults::bounce_buffer_size(), + "bounce buffer size cannot be less than task size."); + auto actual_size = validate_and_adjust_read_args(size, offset); + + auto& [nvtx_color, call_idx] = detail::get_next_color_and_call_idx(); + KVIKIO_NVTX_FUNC_RANGE(actual_size, nvtx_color); + + auto const is_dst_buf_host_mem = is_host_memory(buf); + CUcontext ctx{}; + if (!is_dst_buf_host_mem) { ctx = get_context_from_pointer(buf); } + + // Copy `actual_size` bytes from `src_mapped_buf` (src) to `buf` (dst) + auto const src_mapped_buf = detail::pointer_add(_buf, offset - _initial_map_offset); + auto op = + [this, src_mapped_buf = src_mapped_buf, is_dst_buf_host_mem = is_dst_buf_host_mem, ctx = ctx]( + void* dst_buf, + std::size_t size, + std::size_t, // offset will be taken into account by dst_buf, hence no longer used here + std::size_t buf_offset // buf_offset will be incremented for each individual task + ) -> std::size_t { + detail::read_impl(dst_buf, src_mapped_buf, size, buf_offset, is_dst_buf_host_mem, ctx); + return size; + }; + + return parallel_io(op, + buf, + actual_size, + offset, + task_size, + 0, // dst buffer offset initial value + call_idx, + nvtx_color); +} + +std::size_t MmapHandle::validate_and_adjust_read_args(std::optional const& size, + std::size_t offset) +{ + KVIKIO_EXPECT(!closed(), "Cannot read from a closed MmapHandle", std::runtime_error); + KVIKIO_EXPECT(offset < _file_size, "Offset is past the end of file", std::out_of_range); + auto actual_size = size.has_value() ? size.value() : _file_size - offset; + KVIKIO_EXPECT(actual_size > 0, "Read size must be greater than 0", std::invalid_argument); + KVIKIO_EXPECT(offset >= _initial_map_offset && + offset + actual_size <= _initial_map_offset + _initial_map_size, + "Read is out of bound", + std::out_of_range); + return actual_size; +} + +} // namespace kvikio diff --git a/cpp/src/shim/cuda.cpp b/cpp/src/shim/cuda.cpp index 9e5c05bc05..8053ea152a 100644 --- a/cpp/src/shim/cuda.cpp +++ b/cpp/src/shim/cuda.cpp @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include #include #include @@ -38,15 +39,37 @@ cudaAPI::cudaAPI() get_symbol(CtxPushCurrent, lib, KVIKIO_STRINGIFY(cuCtxPushCurrent)); get_symbol(CtxPopCurrent, lib, KVIKIO_STRINGIFY(cuCtxPopCurrent)); get_symbol(CtxGetCurrent, lib, KVIKIO_STRINGIFY(cuCtxGetCurrent)); + get_symbol(CtxGetDevice, lib, KVIKIO_STRINGIFY(cuCtxGetDevice)); get_symbol(MemGetAddressRange, lib, KVIKIO_STRINGIFY(cuMemGetAddressRange)); get_symbol(GetErrorName, lib, KVIKIO_STRINGIFY(cuGetErrorName)); get_symbol(GetErrorString, lib, KVIKIO_STRINGIFY(cuGetErrorString)); get_symbol(DeviceGet, lib, KVIKIO_STRINGIFY(cuDeviceGet)); + get_symbol(DeviceGetCount, lib, KVIKIO_STRINGIFY(cuDeviceGetCount)); + get_symbol(DeviceGetAttribute, lib, KVIKIO_STRINGIFY(cuDeviceGetAttribute)); get_symbol(DevicePrimaryCtxRetain, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRetain)); get_symbol(DevicePrimaryCtxRelease, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRelease)); get_symbol(StreamSynchronize, lib, KVIKIO_STRINGIFY(cuStreamSynchronize)); get_symbol(StreamCreate, lib, KVIKIO_STRINGIFY(cuStreamCreate)); get_symbol(StreamDestroy, lib, KVIKIO_STRINGIFY(cuStreamDestroy)); + get_symbol(DriverGetVersion, lib, KVIKIO_STRINGIFY(cuDriverGetVersion)); + + CUDA_DRIVER_TRY(DriverGetVersion(&driver_version)); + +#if CUDA_VERSION >= 12080 + // cuMemcpyBatchAsync was introduced in CUDA 12.8. + try { + decltype(cuMemcpyBatchAsync)* fp; + get_symbol(fp, lib, KVIKIO_STRINGIFY(cuMemcpyBatchAsync)); + MemcpyBatchAsync.set(fp); + } catch (std::runtime_error const&) { + // Rethrow the exception if the CUDA driver version at runtime is satisfied but + // cuMemcpyBatchAsync is not found. + if (driver_version >= 12080) { throw; } + // If the CUDA driver version at runtime is not satisfied, reset the wrapper. At the call site, + // use the conventional cuMemcpyXtoXAsync API as the fallback. 
+ MemcpyBatchAsync.reset(); + } +#endif } #else cudaAPI::cudaAPI() { KVIKIO_FAIL("KvikIO not compiled with CUDA support", std::runtime_error); } diff --git a/cpp/src/utils.cpp b/cpp/src/utils.cpp index cb4a51b890..f86b539ab4 100644 --- a/cpp/src/utils.cpp +++ b/cpp/src/utils.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -23,6 +24,7 @@ #include #include +#include #include #include #include @@ -180,4 +182,29 @@ std::tuple get_alloc_info(void const* devPtr, C return std::make_tuple(reinterpret_cast(base_ptr), base_size, offset); } +namespace detail { + +std::size_t align_up(std::size_t value, std::size_t alignment) noexcept +{ + return (value + alignment - 1) & ~(alignment - 1); +} + +void* align_up(void* addr, std::size_t alignment) noexcept +{ + auto res = (reinterpret_cast(addr) + alignment - 1) & ~(alignment - 1); + return reinterpret_cast(res); +} + +std::size_t align_down(std::size_t value, std::size_t alignment) noexcept +{ + return value & ~(alignment - 1); +} + +void* align_down(void* addr, std::size_t alignment) noexcept +{ + auto res = reinterpret_cast(addr) & ~(alignment - 1); + return reinterpret_cast(res); +} + +} // namespace detail } // namespace kvikio diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 47be1d9899..a6b8391928 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -76,6 +76,8 @@ kvikio_add_test(NAME DEFAULTS_TEST SOURCES test_defaults.cpp utils/env.cpp) kvikio_add_test(NAME ERROR_TEST SOURCES test_error.cpp) +kvikio_add_test(NAME MMAP_TEST SOURCES test_mmap.cpp) + kvikio_add_test(NAME REMOTE_HANDLE_TEST SOURCES test_remote_handle.cpp utils/env.cpp) rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/tests/libkvikio) diff --git a/cpp/tests/test_mmap.cpp b/cpp/tests/test_mmap.cpp new file mode 100644 index 0000000000..b52730ee74 --- /dev/null +++ b/cpp/tests/test_mmap.cpp @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "utils/utils.hpp" + +using ::testing::HasSubstr; +using ::testing::ThrowsMessage; + +class MmapTest : public testing::Test { + protected: + void SetUp() override + { + kvikio::test::TempDir tmp_dir{false}; + _filepath = tmp_dir.path() / "test.bin"; + std::size_t num_elements = 1024ull * 1024ull; + _host_buf = CreateTempFile(_filepath, num_elements); + _dev_buf = kvikio::test::DevBuffer{_host_buf}; + _page_size = kvikio::get_page_size(); + } + + void TearDown() override {} + + template + std::vector CreateTempFile(std::string const& filepath, std::size_t num_elements) + { + std::vector v(num_elements); + std::iota(v.begin(), v.end(), 0); + kvikio::FileHandle f(filepath, "w"); + auto fut = f.pwrite(v.data(), v.size() * sizeof(T)); + fut.get(); + _file_size = f.nbytes(); + return v; + } + + std::filesystem::path _filepath; + std::size_t _file_size; + std::size_t _page_size; + std::vector _host_buf; + kvikio::test::DevBuffer _dev_buf; + + using value_type = decltype(_host_buf)::value_type; +}; + +TEST_F(MmapTest, invalid_file_open_flag) +{ + // Empty file open flag + EXPECT_THAT( + [=] { + { + kvikio::MmapHandle(_filepath, ""); + } + }, + ThrowsMessage(HasSubstr("Unknown file open flag"))); + + // Invalid file open flag + EXPECT_THAT( + [=] { + { + kvikio::MmapHandle(_filepath, "z"); + } + }, + ThrowsMessage(HasSubstr("Unknown file open flag"))); +} + +TEST_F(MmapTest, invalid_mmap_flag) +{ + EXPECT_THAT( + [=] { + { + int invalid_flag{-1}; + kvikio::MmapHandle(_filepath, "r", std::nullopt, 0, kvikio::FileHandle::m644, invalid_flag); + } + }, + ThrowsMessage(HasSubstr("Invalid argument"))); +} + +TEST_F(MmapTest, constructor_invalid_range) +{ + // init_size is too large (by 1 char) + EXPECT_THAT([&] { kvikio::MmapHandle(_filepath, "r", _file_size + 1); }, + ThrowsMessage(HasSubstr("Mapped region is past the end of file"))); + + // init_file_offset is too large (by 1 char) + EXPECT_THAT([=] { kvikio::MmapHandle(_filepath, "r", std::nullopt, _file_size); }, + ThrowsMessage(HasSubstr("Offset is past the end of file"))); + + // init_size is 0 + EXPECT_THAT( + [=] { kvikio::MmapHandle(_filepath, "r", 0); }, + ThrowsMessage(HasSubstr("Mapped region should not be zero byte"))); +} + +TEST_F(MmapTest, constructor_valid_range) +{ + // init_size is exactly equal to file size + EXPECT_NO_THROW({ kvikio::MmapHandle(_filepath, "r", _file_size); }); + + // init_file_offset is exactly on the last char + EXPECT_NO_THROW({ + kvikio::MmapHandle mmap_handle(_filepath, "r", std::nullopt, _file_size - 1); + EXPECT_EQ(mmap_handle.initial_map_size(), 1); + }); +} + +TEST_F(MmapTest, read_invalid_range) +{ + std::size_t const initial_map_size{1024}; + std::size_t const initial_file_offset{512}; + std::vector out_host_buf(_file_size / sizeof(value_type), {}); + + // file_offset is too large + EXPECT_THAT( + [&] { + kvikio::MmapHandle mmap_handle(_filepath, "r", initial_map_size, initial_file_offset); + mmap_handle.read(out_host_buf.data(), initial_map_size, _file_size); + }, + ThrowsMessage(HasSubstr("Offset is past the end of file"))); + + // file_offset is too small + EXPECT_THAT( + [&] { + kvikio::MmapHandle mmap_handle(_filepath, "r", initial_map_size, initial_file_offset); + mmap_handle.read(out_host_buf.data(), initial_map_size, initial_file_offset - 128); + }, + ThrowsMessage(HasSubstr("Read is out of bound"))); + + // size is 0 + EXPECT_THAT( + [&] { + kvikio::MmapHandle 
mmap_handle(_filepath, "r", initial_map_size, initial_file_offset); + mmap_handle.read(out_host_buf.data(), 0, initial_file_offset); + }, + ThrowsMessage(HasSubstr("Read size must be greater than 0"))); + + // size is too large + EXPECT_THAT( + [&] { + kvikio::MmapHandle mmap_handle(_filepath, "r", initial_map_size, initial_file_offset); + mmap_handle.read(out_host_buf.data(), initial_map_size + 128, initial_file_offset); + }, + ThrowsMessage(HasSubstr("Read is out of bound"))); +} + +TEST_F(MmapTest, read_seq) +{ + auto do_test = [&](std::size_t num_elements_to_skip, std::size_t num_elements_to_read) { + kvikio::MmapHandle mmap_handle(_filepath, "r"); + auto const offset = num_elements_to_skip * sizeof(value_type); + auto const expected_read_size = num_elements_to_read * sizeof(value_type); + + // host + { + std::vector out_host_buf(num_elements_to_read, {}); + auto const read_size = mmap_handle.read(out_host_buf.data(), expected_read_size, offset); + for (std::size_t i = num_elements_to_skip; i < num_elements_to_read; ++i) { + EXPECT_EQ(_host_buf[i], out_host_buf[i - num_elements_to_skip]); + } + EXPECT_EQ(read_size, expected_read_size); + } + + // device + { + kvikio::test::DevBuffer out_device_buf(num_elements_to_read); + auto const read_size = mmap_handle.read(out_device_buf.ptr, expected_read_size, offset); + auto out_host_buf = out_device_buf.to_vector(); + for (std::size_t i = num_elements_to_skip; i < num_elements_to_read; ++i) { + EXPECT_EQ(_host_buf[i], out_host_buf[i - num_elements_to_skip]); + } + EXPECT_EQ(read_size, expected_read_size); + } + }; + + for (const auto& num_elements_to_read : {10, 9999}) { + for (const auto& num_elements_to_skip : {0, 10, 100, 1000, 9999}) { + do_test(num_elements_to_skip, num_elements_to_read); + } + } +} + +TEST_F(MmapTest, read_parallel) +{ + auto do_test = + [&](std::size_t num_elements_to_skip, std::size_t num_elements_to_read, std::size_t task_size) { + kvikio::MmapHandle mmap_handle(_filepath, "r"); + auto const offset = num_elements_to_skip * sizeof(value_type); + auto const expected_read_size = num_elements_to_read * sizeof(value_type); + + // host + { + std::vector out_host_buf(num_elements_to_read, {}); + auto fut = mmap_handle.pread(out_host_buf.data(), expected_read_size, offset, task_size); + auto const read_size = fut.get(); + for (std::size_t i = num_elements_to_skip; i < num_elements_to_read; ++i) { + EXPECT_EQ(_host_buf[i], out_host_buf[i - num_elements_to_skip]); + } + EXPECT_EQ(read_size, expected_read_size); + } + + // device + { + kvikio::test::DevBuffer out_device_buf(num_elements_to_read); + auto fut = mmap_handle.pread(out_device_buf.ptr, expected_read_size, offset); + auto const read_size = fut.get(); + auto out_host_buf = out_device_buf.to_vector(); + for (std::size_t i = num_elements_to_skip; i < num_elements_to_read; ++i) { + EXPECT_EQ(_host_buf[i], out_host_buf[i - num_elements_to_skip]); + } + EXPECT_EQ(read_size, expected_read_size); + } + }; + + std::vector task_sizes{256, 1024, kvikio::defaults::task_size()}; + for (const auto& task_size : task_sizes) { + for (const auto& num_elements_to_read : {10, 9999}) { + for (const auto& num_elements_to_skip : {0, 10, 100, 1000, 9999}) { + do_test(num_elements_to_skip, num_elements_to_read, task_size); + } + } + } +} + +TEST_F(MmapTest, read_with_default_arguments) +{ + std::size_t num_elements = _file_size / sizeof(value_type); + kvikio::MmapHandle mmap_handle(_filepath, "r"); + + // host + { + std::vector out_host_buf(num_elements, {}); + + { + auto const read_size = 
mmap_handle.read(out_host_buf.data()); + for (std::size_t i = 0; i < num_elements; ++i) { + EXPECT_EQ(_host_buf[i], out_host_buf[i]); + } + EXPECT_EQ(read_size, _file_size); + } + + { + auto fut = mmap_handle.pread(out_host_buf.data()); + auto const read_size = fut.get(); + for (std::size_t i = 0; i < num_elements; ++i) { + EXPECT_EQ(_host_buf[i], out_host_buf[i]); + } + EXPECT_EQ(read_size, _file_size); + } + } + + // device + { + kvikio::test::DevBuffer out_device_buf(num_elements); + + { + auto const read_size = mmap_handle.read(out_device_buf.ptr); + auto out_host_buf = out_device_buf.to_vector(); + for (std::size_t i = 0; i < num_elements; ++i) { + EXPECT_EQ(_host_buf[i], out_host_buf[i]); + } + EXPECT_EQ(read_size, _file_size); + } + + { + auto fut = mmap_handle.pread(out_device_buf.ptr); + auto const read_size = fut.get(); + auto out_host_buf = out_device_buf.to_vector(); + for (std::size_t i = 0; i < num_elements; ++i) { + EXPECT_EQ(_host_buf[i], out_host_buf[i]); + } + EXPECT_EQ(read_size, _file_size); + } + } +} + +TEST_F(MmapTest, closed_handle) +{ + kvikio::MmapHandle mmap_handle(_filepath, "r"); + mmap_handle.close(); + + EXPECT_TRUE(mmap_handle.closed()); + EXPECT_EQ(mmap_handle.file_size(), 0); + + std::size_t num_elements = _file_size / sizeof(value_type); + std::vector out_host_buf(num_elements, {}); + + EXPECT_THAT([&] { mmap_handle.read(out_host_buf.data()); }, + ThrowsMessage(HasSubstr("Cannot read from a closed MmapHandle"))); + + EXPECT_THAT([&] { mmap_handle.pread(out_host_buf.data()); }, + ThrowsMessage(HasSubstr("Cannot read from a closed MmapHandle"))); +} + +TEST_F(MmapTest, cpp_move) +{ + auto do_test = [&](kvikio::MmapHandle& mmap_handle) { + std::size_t num_elements = _file_size / sizeof(value_type); + std::vector out_host_buf(num_elements, {}); + + EXPECT_NO_THROW({ mmap_handle.read(out_host_buf.data()); }); + auto fut = mmap_handle.pread(out_host_buf.data()); + auto const read_size = fut.get(); + for (std::size_t i = 0; i < num_elements; ++i) { + EXPECT_EQ(_host_buf[i], out_host_buf[i]); + } + EXPECT_EQ(read_size, _file_size); + }; + + { + kvikio::MmapHandle mmap_handle{}; + EXPECT_TRUE(mmap_handle.closed()); + mmap_handle = kvikio::MmapHandle(_filepath, "r"); + EXPECT_FALSE(mmap_handle.closed()); + do_test(mmap_handle); + } + + { + kvikio::MmapHandle mmap_handle_1(_filepath, "r"); + kvikio::MmapHandle mmap_handle_2{std::move(mmap_handle_1)}; + EXPECT_TRUE(mmap_handle_1.closed()); + EXPECT_FALSE(mmap_handle_2.closed()); + do_test(mmap_handle_2); + } +} diff --git a/cpp/tests/utils/utils.hpp b/cpp/tests/utils/utils.hpp index 5722d3db25..bb50d07c13 100644 --- a/cpp/tests/utils/utils.hpp +++ b/cpp/tests/utils/utils.hpp @@ -119,6 +119,7 @@ class DevBuffer { DevBuffer(std::size_t nelem) : nelem{nelem}, nbytes{nelem * sizeof(std::int64_t)} { KVIKIO_CHECK_CUDA(cudaMalloc(&ptr, nbytes)); + KVIKIO_CHECK_CUDA(cudaMemset(ptr, 0, nbytes)); } DevBuffer(std::vector const& host_buffer) : DevBuffer{host_buffer.size()} { From fab9f0e568cab45acab76ddc7f2f206b95f2e5cc Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 24 Jul 2025 10:19:34 -0500 Subject: [PATCH 037/103] Allow latest OS in devcontainers (#780) This PR removes the OS suffix from devcontainers, allowing the upstream devcontainer images to determine the OS version. Contributes to https://github.com/rapidsai/build-planning/issues/200. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Gil Forsyth (https://github.com/gforsyth) URL: https://github.com/rapidsai/kvikio/pull/780 --- .devcontainer/cuda12.9-conda/devcontainer.json | 2 +- .devcontainer/cuda12.9-pip/devcontainer.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.devcontainer/cuda12.9-conda/devcontainer.json b/.devcontainer/cuda12.9-conda/devcontainer.json index 462343b8cc..fc3e89da7b 100644 --- a/.devcontainer/cuda12.9-conda/devcontainer.json +++ b/.devcontainer/cuda12.9-conda/devcontainer.json @@ -5,7 +5,7 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.10-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.10-cpp-mambaforge" } }, "runArgs": [ diff --git a/.devcontainer/cuda12.9-pip/devcontainer.json b/.devcontainer/cuda12.9-pip/devcontainer.json index 4acd41599a..334b4b6d6c 100644 --- a/.devcontainer/cuda12.9-pip/devcontainer.json +++ b/.devcontainer/cuda12.9-pip/devcontainer.json @@ -5,7 +5,7 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.10-cpp-cuda12.9-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.10-cpp-cuda12.9" } }, "runArgs": [ From 888fcbf0169acec7bdb3ea3d1faa452fcfbfd60e Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 25 Jul 2025 10:50:55 -0400 Subject: [PATCH 038/103] Relax mmap read requirement. Improve error message. (#781) cuDF PR https://github.com/rapidsai/cudf/pull/19164 currently has 4 failed unit tests when `LIBCUDF_MMAP_ENABLED=ON`: ``` 28 - CSV_TEST (Failed) 29 - ORC_TEST (Failed) 32 - JSON_TEST (Failed) 40 - DATA_CHUNK_SOURCE_TEST (Failed) ``` The fix entails code changes on both the KvikIO and cuDF sides. On the KvikIO side, the `MmapHandle::read()` and `MmapHandle::pread()` methods need to: - Allow the read size to be 0 - Allow `offset` to be equal to `initial_map_offset` (when the read size is 0) This PR makes this change. In addition, this PR adds more detailed error messages when out-of-range exception occurs. Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/781 --- cpp/include/kvikio/mmap.hpp | 20 ++++++++------- cpp/src/mmap.cpp | 49 ++++++++++++++++++++++++++++--------- cpp/tests/test_mmap.cpp | 44 ++++++++++++++++++++++++++------- 3 files changed, 83 insertions(+), 30 deletions(-) diff --git a/cpp/include/kvikio/mmap.hpp b/cpp/include/kvikio/mmap.hpp index f5245858ef..b5730727f1 100644 --- a/cpp/include/kvikio/mmap.hpp +++ b/cpp/include/kvikio/mmap.hpp @@ -59,7 +59,6 @@ class MmapHandle { * * @exception std::out_of_range if the read region specified by `offset` and `size` is * outside the initial region specified when the mapping handle was constructed - * @exception std::invalid_argument if the size is given but is 0 * @exception std::runtime_error if the mapping handle is closed */ std::size_t validate_and_adjust_read_args(std::optional const& size, @@ -81,11 +80,16 @@ class MmapHandle { * - "w": "open for writing, truncating the file first" * - "a": "open for writing, appending to the end of file if it exists" * - "+": "open for updating (reading and writing)" - * @param initial_map_size Size in bytes of the mapped region. If not specified, map the region - * starting from `initial_map_offset` to the end of file + * @param initial_map_size Size in bytes of the mapped region. Must be greater than 0. 
If not + * specified, map the region starting from `initial_map_offset` to the end of file * @param initial_map_offset File offset of the mapped region * @param mode Access mode * @param map_flags Flags to be passed to the system call `mmap`. See `mmap(2)` for details + * @exception std::out_of_range if `initial_map_offset` (left bound of the mapped region) is equal + * to or greater than the file size + * @exception std::out_of_range if the sum of `initial_map_offset` and `initial_map_size` (right + * bound of the mapped region) is greater than the file size + * @exception std::invalid_argument if `initial_map_size` is given but is 0 */ MmapHandle(std::string const& file_path, std::string const& flags = "r", @@ -147,14 +151,13 @@ class MmapHandle { * destination buffer `buf` * * @param buf Address of the host or device memory (destination buffer) - * @param size Size in bytes to read. If not specified, read starts from `offset` to the end - * of file + * @param size Size in bytes to read. Can be 0 in which case nothing will be read. If not + * specified, read starts from `offset` to the end of file * @param offset File offset * @return Number of bytes that have been read * * @exception std::out_of_range if the read region specified by `offset` and `size` is * outside the initial region specified when the mapping handle was constructed - * @exception std::invalid_argument if the size is given but is 0 * @exception std::runtime_error if the mapping handle is closed */ std::size_t read(void* buf, @@ -166,15 +169,14 @@ class MmapHandle { * destination buffer `buf` * * @param buf Address of the host or device memory (destination buffer) - * @param size Size in bytes to read. If not specified, read starts from `offset` to the end - * of file + * @param size Size in bytes to read. Can be 0 in which case nothing will be read. If not + * specified, read starts from `offset` to the end of file * @param offset File offset * @param task_size Size of each task in bytes * @return Future that on completion returns the size of bytes that were successfully read. * * @exception std::out_of_range if the read region specified by `offset` and `size` is * outside the initial region specified when the mapping handle was constructed - * @exception std::invalid_argument if the size is given but is 0 * @exception std::runtime_error if the mapping handle is closed * * @note The `std::future` object's `wait()` or `get()` should not be called after the lifetime of diff --git a/cpp/src/mmap.cpp b/cpp/src/mmap.cpp index 11b0416c29..671340cb09 100644 --- a/cpp/src/mmap.cpp +++ b/cpp/src/mmap.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -282,8 +283,12 @@ MmapHandle::MmapHandle(std::string const& file_path, _file_size = get_file_size(_file_wrapper.fd()); if (_file_size == 0) { return; } - KVIKIO_EXPECT( - _initial_map_offset < _file_size, "Offset is past the end of file", std::out_of_range); + { + std::stringstream ss; + ss << "Offset must be less than the file size. initial_map_offset: " << _initial_map_offset + << ", file size: " << _file_size << "\n"; + KVIKIO_EXPECT(_initial_map_offset < _file_size, ss.str(), std::out_of_range); + } // An initial size of std::nullopt is a shorthand for "starting from _initial_map_offset to the // end of file". 
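The next hunk's context shows how the constructor turns the user-requested window into a page-aligned mapping. As a rough standalone sketch of that arithmetic (all constant values below are illustrative assumptions, not taken from the patch):

```
// Sketch of the page-alignment arithmetic used by the MmapHandle constructor.
#include <cstddef>
#include <cstdio>

int main()
{
  std::size_t const page_size          = 4096;  // e.g. get_page_size() on Linux
  std::size_t const initial_map_offset = 5000;  // user-requested file offset
  std::size_t const initial_map_size   = 1000;  // user-requested read window

  // align_down(v, a) == v & ~(a - 1) for power-of-two alignments
  std::size_t const map_offset   = initial_map_offset & ~(page_size - 1);  // 4096
  std::size_t const offset_delta = initial_map_offset - map_offset;        // 904
  std::size_t const map_size     = initial_map_size + offset_delta;        // 1904

  // mmap() then maps the page-aligned window [map_offset, map_offset + map_size)
  // and reads begin at map_addr + offset_delta.
  std::printf("mmap window: offset=%zu size=%zu\n", map_offset, map_size);
  return 0;
}
```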
@@ -292,9 +297,14 @@ MmapHandle::MmapHandle(std::string const& file_path, KVIKIO_EXPECT( _initial_map_size > 0, "Mapped region should not be zero byte", std::invalid_argument); - KVIKIO_EXPECT(_initial_map_offset + _initial_map_size <= _file_size, - "Mapped region is past the end of file", - std::out_of_range); + + { + std::stringstream ss; + ss << "Mapped region is past the end of file. initial map offset: " << _initial_map_offset + << ", initial map size: " << _initial_map_size << ", file size: " << _file_size << "\n"; + KVIKIO_EXPECT( + _initial_map_offset + _initial_map_size <= _file_size, ss.str(), std::out_of_range); + } auto const page_size = get_page_size(); _map_offset = detail::align_down(_initial_map_offset, page_size); @@ -401,6 +411,7 @@ std::size_t MmapHandle::read(void* buf, std::optional size, std::si KVIKIO_NVTX_FUNC_RANGE(); auto actual_size = validate_and_adjust_read_args(size, offset); + if (actual_size == 0) { return actual_size; } auto const is_dst_buf_host_mem = is_host_memory(buf); CUcontext ctx{}; @@ -420,6 +431,7 @@ std::future MmapHandle::pread(void* buf, KVIKIO_EXPECT(task_size <= defaults::bounce_buffer_size(), "bounce buffer size cannot be less than task size."); auto actual_size = validate_and_adjust_read_args(size, offset); + if (actual_size == 0) { return make_ready_future(actual_size); } auto& [nvtx_color, call_idx] = detail::get_next_color_and_call_idx(); KVIKIO_NVTX_FUNC_RANGE(actual_size, nvtx_color); @@ -454,14 +466,27 @@ std::future MmapHandle::pread(void* buf, std::size_t MmapHandle::validate_and_adjust_read_args(std::optional const& size, std::size_t offset) { - KVIKIO_EXPECT(!closed(), "Cannot read from a closed MmapHandle", std::runtime_error); - KVIKIO_EXPECT(offset < _file_size, "Offset is past the end of file", std::out_of_range); + { + std::stringstream ss; + KVIKIO_EXPECT(!closed(), "Cannot read from a closed MmapHandle", std::runtime_error); + + ss << "Offset is past the end of file. offset: " << offset << ", file size: " << _file_size + << "\n"; + KVIKIO_EXPECT(offset <= _file_size, ss.str(), std::out_of_range); + } + auto actual_size = size.has_value() ? size.value() : _file_size - offset; - KVIKIO_EXPECT(actual_size > 0, "Read size must be greater than 0", std::invalid_argument); - KVIKIO_EXPECT(offset >= _initial_map_offset && - offset + actual_size <= _initial_map_offset + _initial_map_size, - "Read is out of bound", - std::out_of_range); + + { + std::stringstream ss; + ss << "Read is out of bound. 
offset: " << offset << ", actual size to read: " << actual_size + << ", initial map offset: " << _initial_map_offset + << ", initial map size: " << _initial_map_size << "\n"; + KVIKIO_EXPECT(offset >= _initial_map_offset && + offset + actual_size <= _initial_map_offset + _initial_map_size, + ss.str(), + std::out_of_range); + } return actual_size; } diff --git a/cpp/tests/test_mmap.cpp b/cpp/tests/test_mmap.cpp index b52730ee74..2de89c5cd7 100644 --- a/cpp/tests/test_mmap.cpp +++ b/cpp/tests/test_mmap.cpp @@ -107,8 +107,9 @@ TEST_F(MmapTest, constructor_invalid_range) ThrowsMessage(HasSubstr("Mapped region is past the end of file"))); // init_file_offset is too large (by 1 char) - EXPECT_THAT([=] { kvikio::MmapHandle(_filepath, "r", std::nullopt, _file_size); }, - ThrowsMessage(HasSubstr("Offset is past the end of file"))); + EXPECT_THAT( + [=] { kvikio::MmapHandle(_filepath, "r", std::nullopt, _file_size); }, + ThrowsMessage(HasSubstr("Offset must be less than the file size"))); // init_size is 0 EXPECT_THAT( @@ -134,29 +135,36 @@ TEST_F(MmapTest, read_invalid_range) std::size_t const initial_file_offset{512}; std::vector out_host_buf(_file_size / sizeof(value_type), {}); - // file_offset is too large + // Right bound is too large EXPECT_THAT( [&] { kvikio::MmapHandle mmap_handle(_filepath, "r", initial_map_size, initial_file_offset); mmap_handle.read(out_host_buf.data(), initial_map_size, _file_size); }, - ThrowsMessage(HasSubstr("Offset is past the end of file"))); + ThrowsMessage(HasSubstr("Read is out of bound"))); - // file_offset is too small + // Left bound is too large EXPECT_THAT( [&] { kvikio::MmapHandle mmap_handle(_filepath, "r", initial_map_size, initial_file_offset); - mmap_handle.read(out_host_buf.data(), initial_map_size, initial_file_offset - 128); + mmap_handle.read(out_host_buf.data(), 0, initial_file_offset + initial_map_size + 1); }, ThrowsMessage(HasSubstr("Read is out of bound"))); - // size is 0 + EXPECT_THAT( + [&] { + kvikio::MmapHandle mmap_handle(_filepath, "r"); + mmap_handle.read(out_host_buf.data(), 0, _file_size + 1); + }, + ThrowsMessage(HasSubstr("Offset is past the end of file"))); + + // Left bound is too small EXPECT_THAT( [&] { kvikio::MmapHandle mmap_handle(_filepath, "r", initial_map_size, initial_file_offset); - mmap_handle.read(out_host_buf.data(), 0, initial_file_offset); + mmap_handle.read(out_host_buf.data(), initial_map_size, initial_file_offset - 128); }, - ThrowsMessage(HasSubstr("Read size must be greater than 0"))); + ThrowsMessage(HasSubstr("Read is out of bound"))); // size is too large EXPECT_THAT( @@ -167,6 +175,24 @@ TEST_F(MmapTest, read_invalid_range) ThrowsMessage(HasSubstr("Read is out of bound"))); } +TEST_F(MmapTest, read_valid_range) +{ + std::size_t const initial_map_size{1024}; + std::size_t const initial_file_offset{512}; + std::vector out_host_buf(_file_size / sizeof(value_type), {}); + + // size is 0 + EXPECT_NO_THROW({ + kvikio::MmapHandle mmap_handle(_filepath, "r", initial_map_size, initial_file_offset); + mmap_handle.read(out_host_buf.data(), 0, initial_file_offset + initial_map_size); + }); + + EXPECT_NO_THROW({ + kvikio::MmapHandle mmap_handle(_filepath, "r"); + mmap_handle.read(out_host_buf.data(), 0, _file_size); + }); +} + TEST_F(MmapTest, read_seq) { auto do_test = [&](std::size_t num_elements_to_skip, std::size_t num_elements_to_read) { From 003d6711f08b2b2bc8ae0c227cb67d92c0a03980 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 25 Jul 2025 08:34:14 -0700 Subject: [PATCH 039/103] Use GCC 14 in conda 
builds. (#756) conda-forge is migrating to gcc 14, so this PR is updating for alignment. See https://github.com/rapidsai/build-planning/issues/188 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Gil Forsyth (https://github.com/gforsyth) URL: https://github.com/rapidsai/kvikio/pull/756 --- conda/environments/all_cuda-129_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-129_arch-x86_64.yaml | 2 +- conda/recipes/kvikio/conda_build_config.yaml | 4 ++-- conda/recipes/libkvikio/recipe.yaml | 2 +- dependencies.yaml | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 3f0a88d8b6..45a7d1297c 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -15,7 +15,7 @@ dependencies: - cxx-compiler - cython>=3.0.0 - doxygen=1.9.1 -- gcc_linux-aarch64=13.* +- gcc_linux-aarch64=14.* - libcufile-dev - libcurl>=8.5.0,<9.0a0 - libnuma diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 48a3f925f8..4ade45d196 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -15,7 +15,7 @@ dependencies: - cxx-compiler - cython>=3.0.0 - doxygen=1.9.1 -- gcc_linux-64=13.* +- gcc_linux-64=14.* - libcufile-dev - libcurl>=8.5.0,<9.0a0 - libnuma diff --git a/conda/recipes/kvikio/conda_build_config.yaml b/conda/recipes/kvikio/conda_build_config.yaml index c831f4c3cb..2694d62eac 100644 --- a/conda/recipes/kvikio/conda_build_config.yaml +++ b/conda/recipes/kvikio/conda_build_config.yaml @@ -1,8 +1,8 @@ c_compiler_version: - - 13 + - 14 cxx_compiler_version: - - 13 + - 14 cmake_version: - ">=3.30.4" diff --git a/conda/recipes/libkvikio/recipe.yaml b/conda/recipes/libkvikio/recipe.yaml index 6d6316593e..55ddc6fcaf 100644 --- a/conda/recipes/libkvikio/recipe.yaml +++ b/conda/recipes/libkvikio/recipe.yaml @@ -10,7 +10,7 @@ context: # 3. 
Linux aarch64 with CUDA < 12.2, which does not use libcufile # Each case has different cuda-version constraints as expressed below should_use_cufile: ${{ x86_64 or (aarch64 and cuda_version >= "12.2") }} - c_compiler_version: ${{ 13 if should_use_cufile else 12 }} + c_compiler_version: ${{ 14 if should_use_cufile else 12 }} cxx_compiler_version: ${{ c_compiler_version }} # When reverting, instances of cuda_key_string can be replaced with cuda_major cuda_key_string: ${{ cuda_version | replace(".", "_") }} diff --git a/dependencies.yaml b/dependencies.yaml index 589715cdef..9b7b899780 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -133,13 +133,13 @@ dependencies: arch: x86_64 cuda: "12.*" packages: - - gcc_linux-64=13.* + - gcc_linux-64=14.* - sysroot_linux-64=2.28 - matrix: arch: aarch64 cuda: "12.*" packages: - - gcc_linux-aarch64=13.* + - gcc_linux-aarch64=14.* - sysroot_linux-aarch64=2.28 - output_types: conda matrices: From 074758eb31725660217c67b6caddf412512ea544 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Fri, 25 Jul 2025 13:58:04 -0400 Subject: [PATCH 040/103] Update build infra to support new branching strategy (#776) rapids_config will use `RAPIDS_BRANCH` contents to determine what branch to use Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/776 --- RAPIDS_BRANCH | 1 + cmake/RAPIDS.cmake | 6 +++--- cmake/rapids_config.cmake | 10 ++++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) create mode 100644 RAPIDS_BRANCH diff --git a/RAPIDS_BRANCH b/RAPIDS_BRANCH new file mode 100644 index 0000000000..9b1c52d941 --- /dev/null +++ b/RAPIDS_BRANCH @@ -0,0 +1 @@ +branch-25.10 diff --git a/cmake/RAPIDS.cmake b/cmake/RAPIDS.cmake index d112951d3c..40de7cefcd 100644 --- a/cmake/RAPIDS.cmake +++ b/cmake/RAPIDS.cmake @@ -18,9 +18,9 @@ cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR) # Allow users to control which version is used -if(NOT rapids-cmake-version OR NOT rapids-cmake-version MATCHES [[^([0-9][0-9])\.([0-9][0-9])$]]) +if(NOT rapids-cmake-branch OR NOT rapids-cmake-version) message( - FATAL_ERROR "The CMake variable rapids-cmake-version must be defined in the format MAJOR.MINOR." + FATAL_ERROR "The CMake variable `rapids-cmake-branch` or `rapids-cmake-version` must be defined" ) endif() @@ -33,7 +33,7 @@ endif() # Allow users to control which branch is fetched if(NOT rapids-cmake-branch) # Define a default branch if the user doesn't set one - set(rapids-cmake-branch "branch-${rapids-cmake-version}") + set(rapids-cmake-branch "release/${rapids-cmake-version}") endif() # Allow users to control the exact URL passed to FetchContent diff --git a/cmake/rapids_config.cmake b/cmake/rapids_config.cmake index abe468dce8..b706c926e7 100644 --- a/cmake/rapids_config.cmake +++ b/cmake/rapids_config.cmake @@ -26,5 +26,15 @@ else() ) endif() +# Use STRINGS to trim whitespace/newlines +file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/../RAPIDS_BRANCH" _rapids_branch) +if(NOT _rapids_branch) + message( + FATAL_ERROR + "Could not determine branch name to use for checking out rapids-cmake. The file \"${CMAKE_CURRENT_LIST_DIR}/../RAPIDS_BRANCH\" is missing." 
+ ) +endif() + set(rapids-cmake-version "${RAPIDS_VERSION_MAJOR_MINOR}") +set(rapids-cmake-branch "${_rapids_branch}") include("${CMAKE_CURRENT_LIST_DIR}/RAPIDS.cmake") From a1bd3eaed2297d615d8377a5ae8c99cebcf2c033 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 25 Jul 2025 11:38:49 -0700 Subject: [PATCH 041/103] Revert "Set compiler versions in context (#755)" (#784) Closes https://github.com/rapidsai/kvikio/issues/773 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/784 --- ci/build_cpp.sh | 32 +++++++++++++++++++++++------ conda/recipes/libkvikio/recipe.yaml | 2 -- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index b12ed048ad..700ee25707 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -26,13 +26,33 @@ export RAPIDS_ARTIFACTS_DIR source rapids-rattler-channel-string # Construct the extra variants according to the architecture -cat > variants.yaml << EOF -cuda_version: - - ${RAPIDS_CUDA_VERSION%.*} +if [[ "$(arch)" == "x86_64" ]]; then + cat > variants.yaml << EOF + c_compiler_version: + - 14 + + cxx_compiler_version: + - 14 + + cuda_version: + - ${RAPIDS_CUDA_VERSION%.*} EOF -if [[ "$(arch)" == "aarch64" ]]; then - cat >> variants.yaml << EOF - - 12.1 # The last version to not support cufile +else + cat > variants.yaml << EOF + zip_keys: + - [c_compiler_version, cxx_compiler_version, cuda_version] + + c_compiler_version: + - 12 + - 14 + + cxx_compiler_version: + - 12 + - 14 + + cuda_version: + - 12.1 # The last version to not support cufile + - ${RAPIDS_CUDA_VERSION%.*} EOF fi diff --git a/conda/recipes/libkvikio/recipe.yaml b/conda/recipes/libkvikio/recipe.yaml index 55ddc6fcaf..a4e423d3ed 100644 --- a/conda/recipes/libkvikio/recipe.yaml +++ b/conda/recipes/libkvikio/recipe.yaml @@ -10,8 +10,6 @@ context: # 3. Linux aarch64 with CUDA < 12.2, which does not use libcufile # Each case has different cuda-version constraints as expressed below should_use_cufile: ${{ x86_64 or (aarch64 and cuda_version >= "12.2") }} - c_compiler_version: ${{ 14 if should_use_cufile else 12 }} - cxx_compiler_version: ${{ c_compiler_version }} # When reverting, instances of cuda_key_string can be replaced with cuda_major cuda_key_string: ${{ cuda_version | replace(".", "_") }} #cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} From 7bfaf3482505d305c04239ed08e62ef5adf9e05e Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 25 Jul 2025 16:06:06 -0400 Subject: [PATCH 042/103] Use C++20 standard (#749) This PR changes KvikIO C++ standard from 17 to 20. Depends on https://github.com/rapidsai/kvikio/pull/751 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/749 --- cpp/CMakeLists.txt | 5 +++++ cpp/benchmarks/CMakeLists.txt | 9 ++++++++- cpp/tests/CMakeLists.txt | 6 ++---- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 37d237e0e3..53351393ee 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -29,6 +29,11 @@ project( set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/") +# For now, disable CMake's automatic module scanning for C++ files. 
There is an sccache bug in the +# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with +# gcc>=14. We can remove this once we upgrade to a newer sccache version. +set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + # Write the version header rapids_cmake_write_version_file(include/kvikio/version_config.hpp) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 665d45edfb..fc857fdfdf 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -42,7 +42,14 @@ function(kvikio_add_benchmark) endif() add_executable(${_KVIKIO_NAME} ${_KVIKIO_SOURCES}) - set_target_properties(${_KVIKIO_NAME} PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib") + set_target_properties( + ${_KVIKIO_NAME} + PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 20 + CUDA_STANDARD_REQUIRED ON + ) target_link_libraries(${_KVIKIO_NAME} PUBLIC benchmark::benchmark kvikio::kvikio) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a6b8391928..3d53bbd86f 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -50,11 +50,9 @@ function(kvikio_add_test) set_target_properties( ${_KVIKIO_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" - CXX_STANDARD 17 + CXX_STANDARD 20 CXX_STANDARD_REQUIRED ON - # For std:: support of __int128_t. Can be removed once using cuda::std - CXX_EXTENSIONS ON - CUDA_STANDARD 17 + CUDA_STANDARD 20 CUDA_STANDARD_REQUIRED ON ) target_link_libraries( From c9935d144f56e5b2d2c3557db0417ce2c1e9207c Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 28 Jul 2025 11:02:57 -0400 Subject: [PATCH 043/103] Fix C++20 warning in the mmap test (#785) The unit tests of mmap contain lambda expressions. The style of capturing the current object (`*this`) is not consistent: some places use `[&]` and others use `[=]`. In both cases, `*this` is captured by reference. However, in C++20, implicit capture of `*this` when the capture default is `=` is deprecated. This PR fixes the warning messages by consistently using `[&]` on the ground that the lifetime of `*this` is longer than the point the closure is being called. Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/785 --- cpp/tests/test_mmap.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/tests/test_mmap.cpp b/cpp/tests/test_mmap.cpp index 2de89c5cd7..7fb1e3a4fd 100644 --- a/cpp/tests/test_mmap.cpp +++ b/cpp/tests/test_mmap.cpp @@ -71,7 +71,7 @@ TEST_F(MmapTest, invalid_file_open_flag) { // Empty file open flag EXPECT_THAT( - [=] { + [&] { { kvikio::MmapHandle(_filepath, ""); } @@ -80,7 +80,7 @@ TEST_F(MmapTest, invalid_file_open_flag) // Invalid file open flag EXPECT_THAT( - [=] { + [&] { { kvikio::MmapHandle(_filepath, "z"); } @@ -91,7 +91,7 @@ TEST_F(MmapTest, invalid_file_open_flag) TEST_F(MmapTest, invalid_mmap_flag) { EXPECT_THAT( - [=] { + [&] { { int invalid_flag{-1}; kvikio::MmapHandle(_filepath, "r", std::nullopt, 0, kvikio::FileHandle::m644, invalid_flag); @@ -108,12 +108,12 @@ TEST_F(MmapTest, constructor_invalid_range) // init_file_offset is too large (by 1 char) EXPECT_THAT( - [=] { kvikio::MmapHandle(_filepath, "r", std::nullopt, _file_size); }, + [&] { kvikio::MmapHandle(_filepath, "r", std::nullopt, _file_size); }, ThrowsMessage(HasSubstr("Offset must be less than the file size"))); // init_size is 0 EXPECT_THAT( - [=] { kvikio::MmapHandle(_filepath, "r", 0); }, + [&] { kvikio::MmapHandle(_filepath, "r", 0); }, ThrowsMessage(HasSubstr("Mapped region should not be zero byte"))); } From 71c7638ef47cf137839385b8f5875e403d7dd4bf Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 31 Jul 2025 11:34:48 -0400 Subject: [PATCH 044/103] Support file-backed mapping (2/n): host/device read Python binding (#742) On top of https://github.com/rapidsai/kvikio/pull/740, this PR provides Python binding for file-backed memory mapping. Closes https://github.com/rapidsai/kvikio/issues/530 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Tom Augspurger (https://github.com/TomAugspurger) - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/742 --- cpp/src/mmap.cpp | 37 ++--- docs/source/api.rst | 7 + python/kvikio/kvikio/__init__.py | 2 + python/kvikio/kvikio/_lib/CMakeLists.txt | 2 +- python/kvikio/kvikio/_lib/defaults.pyx | 1 - python/kvikio/kvikio/_lib/file_handle.pyx | 7 +- python/kvikio/kvikio/_lib/mmap.pyx | 116 +++++++++++++ python/kvikio/kvikio/mmap.py | 171 ++++++++++++++++++++ python/kvikio/tests/test_mmap.py | 188 ++++++++++++++++++++++ 9 files changed, 505 insertions(+), 26 deletions(-) create mode 100644 python/kvikio/kvikio/_lib/mmap.pyx create mode 100644 python/kvikio/kvikio/mmap.py create mode 100644 python/kvikio/tests/test_mmap.py diff --git a/cpp/src/mmap.cpp b/cpp/src/mmap.cpp index 671340cb09..968b96aa42 100644 --- a/cpp/src/mmap.cpp +++ b/cpp/src/mmap.cpp @@ -274,13 +274,25 @@ MmapHandle::MmapHandle(std::string const& file_path, std::size_t initial_map_offset, mode_t mode, std::optional map_flags) - : _initial_map_offset(initial_map_offset), - _initialized{true}, - _file_wrapper(file_path, flags, false /* o_direct */, mode) + : _initial_map_offset(initial_map_offset), _initialized{true} { KVIKIO_NVTX_FUNC_RANGE(); - _file_size = get_file_size(_file_wrapper.fd()); + switch (flags[0]) { + case 'r': { + _map_protection = PROT_READ; + break; + } + case 'w': { + KVIKIO_FAIL("File-backed mmap write is not supported yet", std::invalid_argument); + } + default: { + KVIKIO_FAIL("Unknown file open flag", std::invalid_argument); + } + } + + _file_wrapper = FileWrapper(file_path, flags, false /* o_direct */, mode); + _file_size = get_file_size(_file_wrapper.fd()); if (_file_size == 0) { return; } { @@ -310,22 +322,7 @@ MmapHandle::MmapHandle(std::string const& file_path, _map_offset = detail::align_down(_initial_map_offset, page_size); auto const offset_delta = _initial_map_offset - _map_offset; _map_size = _initial_map_size + offset_delta; - - switch (flags[0]) { - case 'r': { - _map_protection = PROT_READ; - break; - } - case 'w': { - KVIKIO_FAIL("File-backed mmap write is not supported yet", std::invalid_argument); - } - default: { - KVIKIO_FAIL("Unknown file open flag", std::invalid_argument); - } - } - - _map_flags = map_flags.has_value() ? map_flags.value() : MAP_PRIVATE; - + _map_flags = map_flags.has_value() ? map_flags.value() : MAP_PRIVATE; _map_addr = mmap(nullptr, _map_size, _map_protection, _map_flags, _file_wrapper.fd(), _map_offset); SYSCALL_CHECK(_map_addr, "Cannot create memory mapping", MAP_FAILED); diff --git a/docs/source/api.rst b/docs/source/api.rst index 1e19f12bdc..7ae724dec5 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -33,6 +33,13 @@ CuFile driver .. autofunction:: initialize +Mmap +---- +.. currentmodule:: kvikio.mmap + +.. autoclass:: Mmap + :members: + Zarr ---- .. 
currentmodule:: kvikio.zarr diff --git a/python/kvikio/kvikio/__init__.py b/python/kvikio/kvikio/__init__.py index 9208d4e3ce..5c6d8b6353 100644 --- a/python/kvikio/kvikio/__init__.py +++ b/python/kvikio/kvikio/__init__.py @@ -15,6 +15,7 @@ from kvikio._lib.defaults import CompatMode # noqa: F401 from kvikio._version import __git_commit__, __version__ from kvikio.cufile import CuFile, clear_page_cache, get_page_cache_info +from kvikio.mmap import Mmap from kvikio.remote_file import RemoteFile, is_remote_file_available __all__ = [ @@ -22,6 +23,7 @@ "__version__", "clear_page_cache", "CuFile", + "Mmap", "get_page_cache_info", "is_remote_file_available", "RemoteFile", diff --git a/python/kvikio/kvikio/_lib/CMakeLists.txt b/python/kvikio/kvikio/_lib/CMakeLists.txt index 1ea9b85dff..7dfb47cc0f 100644 --- a/python/kvikio/kvikio/_lib/CMakeLists.txt +++ b/python/kvikio/kvikio/_lib/CMakeLists.txt @@ -14,7 +14,7 @@ # Set the list of Cython files to build, one .so per file set(cython_modules arr.pyx buffer.pyx defaults.pyx cufile_driver.pyx file_handle.pyx future.pyx - libnvcomp.pyx libnvcomp_ll.pyx + libnvcomp.pyx libnvcomp_ll.pyx mmap.pyx ) if(KvikIO_REMOTE_SUPPORT) diff --git a/python/kvikio/kvikio/_lib/defaults.pyx b/python/kvikio/kvikio/_lib/defaults.pyx index 00f1de4ec1..de511211e6 100644 --- a/python/kvikio/kvikio/_lib/defaults.pyx +++ b/python/kvikio/kvikio/_lib/defaults.pyx @@ -37,7 +37,6 @@ cdef extern from "" namespace "kvikio" nogil: vector[int] cpp_http_status_codes "kvikio::defaults::http_status_codes"() except + void cpp_set_http_status_codes \ "kvikio::defaults::set_http_status_codes"(vector[int] status_codes) except + - long cpp_http_timeout "kvikio::defaults::http_timeout"() except + void cpp_set_http_timeout\ "kvikio::defaults::set_http_timeout"(long timeout_seconds) except + diff --git a/python/kvikio/kvikio/_lib/file_handle.pyx b/python/kvikio/kvikio/_lib/file_handle.pyx index c0d71f36a7..b17d283433 100644 --- a/python/kvikio/kvikio/_lib/file_handle.pyx +++ b/python/kvikio/kvikio/_lib/file_handle.pyx @@ -6,7 +6,6 @@ import io import os -import pathlib from typing import Optional, Union from posix cimport fcntl @@ -97,8 +96,8 @@ cdef class CuFile: def __init__(self, file_path, flags="r"): self._handle = move( FileHandle( - str.encode(str(pathlib.Path(file_path))), - str.encode(str(flags)) + os.fsencode(file_path), + str(flags).encode() ) ) @@ -194,7 +193,7 @@ def get_page_cache_info(file: Union[os.PathLike, str, int, io.IOBase]) \ -> tuple[int, int]: if isinstance(file, os.PathLike) or isinstance(file, str): # file is a path or a string object - path_bytes = str(pathlib.Path(file)).encode() + path_bytes = os.fsencode(file) return cpp_get_page_cache_info_str(path_bytes) elif isinstance(file, int): # file is a file descriptor diff --git a/python/kvikio/kvikio/_lib/mmap.pyx b/python/kvikio/kvikio/_lib/mmap.pyx new file mode 100644 index 0000000000..ac4889b25c --- /dev/null +++ b/python/kvikio/kvikio/_lib/mmap.pyx @@ -0,0 +1,116 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. 
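+#
+# Note on the marshalling pattern used below (descriptive only): a Python
+# ``None`` becomes C++'s ``std::nullopt`` before crossing the binding
+# boundary, so an omitted ``size`` keeps its "read to end of file" meaning
+# on the C++ side:
+#
+#   cdef optional[size_t] cpp_size
+#   if size is None:
+#       cpp_size = nullopt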
+ +# distutils: language = c++ +# cython: language_level=3 + +import os +from typing import Any, Optional + +from posix cimport fcntl, stat + +from libc.stdint cimport uintptr_t +from libcpp cimport bool +from libcpp.optional cimport nullopt, optional +from libcpp.string cimport string +from libcpp.utility cimport move, pair + +from kvikio._lib.arr cimport parse_buffer_argument +from kvikio._lib.future cimport IOFuture, _wrap_io_future, future + +from kvikio._lib import defaults + + +cdef extern from "" namespace "kvikio" nogil: + cdef cppclass CppMmapHandle "kvikio::MmapHandle": + CppMmapHandle() noexcept + CppMmapHandle(string file_path, string flags, optional[size_t] initial_map_size, + size_t initial_map_offset, fcntl.mode_t mode, + optional[int] map_flags) except + + size_t initial_map_size() noexcept + size_t initial_map_offset() noexcept + size_t file_size() except + + void close() noexcept + bool closed() noexcept + size_t read(void* buf, optional[size_t] size, size_t offset) except + + future[size_t] pread(void* buf, optional[size_t] size, size_t offset, + size_t task_size) except + + +cdef class InternalMmapHandle: + cdef CppMmapHandle _handle + + def __init__(self, file_path: os.PathLike, + flags: str = "r", + initial_map_size: Optional[int] = None, + initial_map_offset: int = 0, + mode: int = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH, + map_flags: Optional[int] = None): + if not os.path.exists(file_path): + raise RuntimeError("Unable to open file") + + cdef optional[size_t] cpp_initial_map_size + if initial_map_size is None: + cpp_initial_map_size = nullopt + else: + cpp_initial_map_size = (initial_map_size) + + path_bytes = os.fsencode(file_path) + flags_bytes = str(flags).encode() + + cdef optional[int] cpp_map_flags + if map_flags is None: + cpp_map_flags = nullopt + else: + cpp_map_flags = (map_flags) + + self._handle = move(CppMmapHandle(path_bytes, + flags_bytes, + cpp_initial_map_size, + initial_map_offset, + mode, + cpp_map_flags)) + + def initial_map_size(self) -> int: + return self._handle.initial_map_size() + + def initial_map_offset(self) -> int: + return self._handle.initial_map_offset() + + def file_size(self) -> int: + return self._handle.file_size() + + def close(self) -> None: + self._handle.close() + + def closed(self) -> bool: + return self._handle.closed() + + def read(self, buf: Any, size: Optional[int] = None, offset: int = 0) -> int: + cdef optional[size_t] cpp_size + if size is None: + cpp_size = nullopt + else: + cpp_size = (size) + cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) + return self._handle.read(info.first, + cpp_size, + offset) + + def pread(self, buf: Any, size: Optional[int] = None, offset: int = 0, + task_size: Optional[int] = None) -> IOFuture: + cdef optional[size_t] cpp_size + if size is None: + cpp_size = nullopt + else: + cpp_size = (size) + cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) + + if task_size is None: + cpp_task_size = defaults.task_size() + else: + cpp_task_size = task_size + + return _wrap_io_future(self._handle.pread(info.first, + cpp_size, + offset, + cpp_task_size)) diff --git a/python/kvikio/kvikio/mmap.py b/python/kvikio/kvikio/mmap.py new file mode 100644 index 0000000000..3b506ded02 --- /dev/null +++ b/python/kvikio/kvikio/mmap.py @@ -0,0 +1,171 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. 
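+#
+# Minimal usage sketch (illustrative only; the file name, dtype, and the use
+# of cupy are assumptions, not part of this module):
+#
+#   import cupy
+#   import kvikio
+#
+#   mm = kvikio.Mmap("data.bin", "r")             # map an existing file
+#   buf = cupy.empty(mm.file_size(), dtype="u1")  # pre-allocate destination
+#   nbytes = mm.pread(buf).get()                  # parallel read, then wait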
+ +import os +import stat +from typing import Any, Optional + +from kvikio._lib.mmap import InternalMmapHandle +from kvikio.cufile import IOFuture + + +class Mmap: + """Handle of a memory-mapped file""" + + def __init__( + self, + file_path: os.PathLike, + flags: str = "r", + initial_map_size: Optional[int] = None, + initial_map_offset: int = 0, + mode: int = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH, + map_flags: Optional[int] = None, + ): + """Construct a new memory-mapped file handle + + Parameters + ---------- + file_path : os.PathLike + File path. + flags : str, optional + + - ``r``: Open for reading (default) + - ``w``: (Not implemented yet) Open for writing, truncating the file first + - ``a``: (Not implemented yet) Open for writing, appending to the end of + file if it exists + - ``+``: (Not implemented yet) Open for updating (reading and writing) + initial_map_size : int, optional + Size in bytes of the mapped region. If not specified, map the region + starting from ``initial_map_offset`` to the end of file. + initial_map_offset : int, optional + File offset of the mapped region. Default is 0. + mode : int, optional + Access mode (permissions) to use if creating a new file. Default is + 0644 (octal), 420 (decimal). + map_flags : int, optional + Flags to be passed to the system call ``mmap``. See `mmap(2)` for details. + """ + self._handle = InternalMmapHandle( + file_path, flags, initial_map_size, initial_map_offset, mode, map_flags + ) + + def initial_map_size(self) -> int: + """Size in bytes of the mapped region when the mapping handle was constructed + + Returns + ------- + int + Initial size of the mapped region. + """ + return self._handle.initial_map_size() + + def initial_map_offset(self) -> int: + """File offset of the mapped region when the mapping handle was constructed + + Returns + ------- + int + Initial file offset of the mapped region. + """ + return self._handle.initial_map_offset() + + def file_size(self) -> int: + """Get the file size if the file is open + + Returns 0 if the file is closed. + + Returns + ------- + int + The file size in bytes. + """ + return self._handle.file_size() + + def close(self) -> None: + """Close the mapping handle if it is open; do nothing otherwise + + Unmaps the memory region and closes the underlying file descriptor. + """ + self._handle.close() + + def closed(self) -> bool: + """Whether the mapping handle is closed + + Returns + ------- + bool + Boolean answer. + """ + return self._handle.closed() + + def read(self, buf: Any, size: Optional[int] = None, offset: int = 0) -> int: + """Sequential read ``size`` bytes from the file to the destination buffer + ``buf`` + + Parameters + ---------- + buf : buffer-like or array-like + Address of the host or device memory (destination buffer). + size : int, optional + Size in bytes to read. If not specified, read starts from ``offset`` + to the end of file. + offset : int, optional + File offset. Default is 0. + + Returns + ------- + int + Number of bytes that have been read. + + Raises + ------ + IndexError + If the read region specified by ``offset`` and ``size`` is outside the + initial region specified when the mapping handle was constructed. + RuntimeError + If the mapping handle is closed. 
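+
+        Examples
+        --------
+        Illustrative only; assumes ``mm`` is an open ``Mmap`` over a file of
+        at least 1024 bytes and ``numpy`` has been imported.
+
+        >>> buf = numpy.zeros(1024, dtype="u1")      # doctest: +SKIP
+        >>> mm.read(buf, size=buf.nbytes, offset=0)  # doctest: +SKIP
+        1024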
+ """ + return self._handle.read(buf, size, offset) + + def pread( + self, + buf: Any, + size: Optional[int] = None, + offset: int = 0, + task_size: Optional[int] = None, + ) -> IOFuture: + """Parallel read ``size`` bytes from the file to the destination buffer ``buf`` + + Parameters + ---------- + buf : buffer-like or array-like + Address of the host or device memory (destination buffer). + size : int, optional + Size in bytes to read. If not specified, read starts from ``offset`` + to the end of file. + offset : int, optional + File offset. Default is 0. + task_size : int, optional + Size of each task in bytes for parallel execution. If None, uses + the default task size from :func:`kvikio.defaults.task_size`. + + Returns + ------- + IOFuture + Future that on completion returns the size of bytes that were successfully + read. + + Raises + ------ + IndexError + If the read region specified by ``offset`` and ``size`` is outside the + initial region specified when the mapping handle was constructed. + RuntimeError + If the mapping handle is closed. + + Notes + ----- + The returned IOFuture object's ``get()`` should not be called after the lifetime + of the MmapHandle object ends. Otherwise, the behavior is undefined. + """ + return IOFuture(self._handle.pread(buf, size, offset, task_size)) diff --git a/python/kvikio/tests/test_mmap.py b/python/kvikio/tests/test_mmap.py new file mode 100644 index 0000000000..6ad7468690 --- /dev/null +++ b/python/kvikio/tests/test_mmap.py @@ -0,0 +1,188 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +import contextlib +import os + +import pytest + +import kvikio.defaults + +cupy = pytest.importorskip("cupy") +numpy = pytest.importorskip("numpy") + + +def test_no_file(tmp_path): + nonexistent_file = tmp_path / "nonexistent_file" + with pytest.raises(RuntimeError, match=r".*Unable to open file.*"): + kvikio.Mmap(nonexistent_file) + + +def test_invalid_file_open_flag(tmp_path): + filename = tmp_path / "read-only-test-file" + expected_data = numpy.arange(1024) + expected_data.tofile(filename) + + with pytest.raises(ValueError, match=r".*Unknown file open flag.*"): + kvikio.Mmap(filename, "") + + with pytest.raises(ValueError, match=r".*Unknown file open flag.*"): + kvikio.Mmap(filename, "z") + + +def test_constructor_invalid_range(tmp_path, xp): + filename = tmp_path / "read-only-test-file" + test_data = xp.arange(1024 * 1024) + test_data.tofile(filename) + + with pytest.raises(IndexError, match=r".*Offset must be less than the file size.*"): + kvikio.Mmap(filename, "r", None, test_data.nbytes * 2) + + with pytest.raises(IndexError, match=r".*Mapped region is past the end of file.*"): + kvikio.Mmap(filename, "r", test_data.nbytes * 2) + + with pytest.raises(ValueError, match=r".*Mapped region should not be zero byte.*"): + kvikio.Mmap(filename, "r", 0) + + +def test_read_invalid_range(tmp_path, xp): + filename = tmp_path / "read-only-test-file" + test_data = xp.arange(1024 * 1024) + test_data.tofile(filename) + output_data = xp.zeros_like(test_data) + + initial_size = 1024 + initial_file_offset = 512 + + with pytest.raises(IndexError, match=r".*Offset is past the end of file.*"): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + mmap_handle.read(output_data, initial_size, test_data.nbytes + 1) + + with pytest.raises(IndexError, match=r".*Read is out of bound.*"): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + mmap_handle.read(output_data, 
initial_size, initial_file_offset + 1) + + with pytest.raises(IndexError, match=r".*Read is out of bound.*"): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + mmap_handle.read(output_data, initial_size, initial_file_offset - 128) + + with pytest.raises(IndexError, match=r".*Read is out of bound.*"): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + mmap_handle.read(output_data, initial_size + 128, initial_file_offset) + + +def test_read_valid_range(tmp_path, xp): + filename = tmp_path / "read-only-test-file" + test_data = xp.arange(1024 * 1024) + test_data.tofile(filename) + output_data = xp.zeros_like(test_data) + + initial_size = 1024 + initial_file_offset = 512 + + with contextlib.nullcontext(): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + mmap_handle.read(output_data, 0, initial_file_offset) + + +@pytest.mark.parametrize("num_elements_to_read", [None, 10, 9999]) +@pytest.mark.parametrize("num_elements_to_skip", [0, 10, 100, 1000, 9999]) +def test_read_seq(tmp_path, xp, num_elements_to_read, num_elements_to_skip): + filename = tmp_path / "read-only-test-file" + test_data = xp.arange(1024 * 1024) + test_data.tofile(filename) + + if num_elements_to_read is None: + initial_size = None + actual_num_elements_to_read = int( + os.path.getsize(filename) / test_data.itemsize + ) + else: + initial_size = num_elements_to_read * test_data.itemsize + actual_num_elements_to_read = num_elements_to_read + + initial_file_offset = num_elements_to_skip * test_data.itemsize + expected_data = test_data[ + num_elements_to_skip : (num_elements_to_skip + actual_num_elements_to_read) + ] + actual_data = xp.zeros_like(expected_data) + + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + read_size = mmap_handle.read(actual_data, initial_size, initial_file_offset) + + assert read_size == expected_data.nbytes + xp.testing.assert_array_equal(actual_data, expected_data) + + +@pytest.mark.parametrize("num_elements_to_read", [None, 10, 9999]) +@pytest.mark.parametrize("num_elements_to_skip", [0, 10, 100, 1000, 9999]) +@pytest.mark.parametrize("task_size", [1024, 12345]) +def test_read_parallel( + tmp_path, xp, num_elements_to_read, num_elements_to_skip, task_size +): + filename = tmp_path / "read-only-test-file" + test_data = xp.arange(1024 * 1024) + test_data.tofile(filename) + + if num_elements_to_read is None: + initial_size = None + actual_num_elements_to_read = int( + os.path.getsize(filename) / test_data.itemsize + ) + else: + initial_size = num_elements_to_read * test_data.itemsize + actual_num_elements_to_read = num_elements_to_read + + initial_file_offset = num_elements_to_skip * test_data.itemsize + expected_data = test_data[ + num_elements_to_skip : (num_elements_to_skip + actual_num_elements_to_read) + ] + actual_data = xp.zeros_like(expected_data) + + with kvikio.defaults.set("task_size", task_size): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + fut = mmap_handle.pread( + actual_data, initial_size, initial_file_offset, task_size + ) + + assert fut.get() == expected_data.nbytes + xp.testing.assert_array_equal(actual_data, expected_data) + + +def test_read_with_default_arguments(tmp_path, xp): + filename = tmp_path / "read-only-test-file" + expected_data = xp.arange(1024 * 1024) + expected_data.tofile(filename) + actual_data = xp.zeros_like(expected_data) + + # Workaround for a CI failure where defaults.task_size() is somehow 0 + # instead of 4 MiB when 
KVIKIO_TASK_SIZE is unset + with kvikio.defaults.set("task_size", 4 * 1024 * 1024): + mmap_handle = kvikio.Mmap(filename, "r") + + read_size = mmap_handle.read(actual_data) + assert read_size == expected_data.nbytes + xp.testing.assert_array_equal(actual_data, expected_data) + + fut = mmap_handle.pread(actual_data) + assert fut.get() == expected_data.nbytes + xp.testing.assert_array_equal(actual_data, expected_data) + + +def test_closed_handle(tmp_path, xp): + filename = tmp_path / "read-only-test-file" + expected_data = xp.arange(1024 * 1024) + expected_data.tofile(filename) + actual_data = xp.zeros_like(expected_data) + + mmap_handle = kvikio.Mmap(filename, "r") + mmap_handle.close() + + assert mmap_handle.closed() + assert mmap_handle.file_size() == 0 + + with pytest.raises(RuntimeError, match=r".*Cannot read from a closed MmapHandle.*"): + mmap_handle.read(actual_data) + + with pytest.raises(RuntimeError, match=r".*Cannot read from a closed MmapHandle.*"): + mmap_handle.pread(actual_data) From 6cffae5a905644f88227bdcad35cbad695258789 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 4 Aug 2025 09:35:54 -0400 Subject: [PATCH 045/103] Support file size query for S3 presigned URL (#789) ## Background Knowing the size of the remote file before reading is important in remote I/O, as it allows users to pre-allocate buffer to avoid expensive on-the-fly reallocation. Currently in KvikIO this is not possible for AWS S3 presigned URL, which is a special link generated by data owner to grant time-limited access without using AWS credentials. As is described in #585, file size query in KvikIO results in the HTTP 403 (forbidden) status code. This is because the query method is based on the `HEAD` request, and AWS S3 does not allow `HEAD` for presigned URL. ## Proposed solution This PR provides a solution. The idea is to send a `GET` request (instead of `HEAD`) with a 1-byte range, so that we can still obtain the header information at a negligible cost. Since the `content-length` header is now at a fixed value of 1, we instead extract the file size value from `content-range`. This PR adds a new C++ endpoint `S3EndpointWithPresignedUrl` and Python API `kvikio.RemoteFile.open_s3_presigned_url(url)`. ## Result The following code now works properly without 403 error: ```python import kvikio import cupy presigned_url = "" remote_file = kvikio.RemoteFile.open_s3_presigned_url(presigned_url) print("--> file size: {:}".format(remote_file.nbytes())) buf = cupy.zeros(remote_file.nbytes() // 8) fut = remote_file.pread(buf) read_size = fut.get() print("--> read_size: {:}", read_size) print(buf) ``` ## Limitation This PR is tested manually using a presigned URL. In a future PR, we need to add unit tests using `boto`. Closes #585 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/kvikio/pull/789 --- cpp/include/kvikio/remote_handle.hpp | 40 ++++-- cpp/src/remote_handle.cpp | 139 ++++++++++++++++++-- python/kvikio/kvikio/_lib/remote_handle.pyx | 16 +++ python/kvikio/kvikio/remote_file.py | 22 ++++ 4 files changed, 196 insertions(+), 21 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 7c197ea2b0..5f11d76f3d 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -18,12 +18,8 @@ #include #include #include -#include #include #include -#include -#include -#include #include #include @@ -48,6 +44,8 @@ class CurlHandle; // Prototype */ class RemoteEndpoint { public: + virtual ~RemoteEndpoint() = default; + /** * @brief Set needed connection options on a curl handle. * @@ -64,7 +62,12 @@ class RemoteEndpoint { */ virtual std::string str() const = 0; - virtual ~RemoteEndpoint() = default; + /** + * @brief Get the size of the remote file. + * + * @return The file size + */ + virtual std::size_t get_file_size() = 0; }; /** @@ -81,9 +84,11 @@ class HttpEndpoint : public RemoteEndpoint { * @param url The full http url to the remote file. */ HttpEndpoint(std::string url); + + ~HttpEndpoint() override = default; void setopt(CurlHandle& curl) override; std::string str() const override; - ~HttpEndpoint() override = default; + std::size_t get_file_size() override; }; /** @@ -189,9 +194,27 @@ class S3Endpoint : public RemoteEndpoint { std::optional aws_endpoint_url = std::nullopt, std::optional aws_session_token = std::nullopt); + ~S3Endpoint() override; void setopt(CurlHandle& curl) override; std::string str() const override; - ~S3Endpoint() override; + std::size_t get_file_size() override; +}; + +/** + * @brief A remote endpoint using AWS's S3 protocol and expecting a presigned URL. File access via + * this type of URL is time-limited and does not require AWS credentials. + */ +class S3EndpointWithPresignedUrl : public RemoteEndpoint { + private: + std::string _url; + + public: + explicit S3EndpointWithPresignedUrl(std::string presigned_url); + + ~S3EndpointWithPresignedUrl() override = default; + void setopt(CurlHandle& curl) override; + std::string str() const override; + std::size_t get_file_size() override; }; /** @@ -229,7 +252,8 @@ class RemoteHandle { /** * @brief Get the file size. * - * Note, this is very fast, no communication needed. + * Note, the file size is retrieved at construction so this method is very fast, no communication + * needed. * * @return The number of bytes. */ diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index 485e0739ac..1aba93bdc5 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -133,12 +134,46 @@ class BounceBufferH2D { } }; +/** + * @brief Get the file size, if using `HEAD` request to obtain the content-length header is + * permitted. + * + * This function works for the `HttpEndpoint` and `S3Endpoint`, but not for + * `S3EndpointWithPresignedUrl`, which does not allow `HEAD` request. 
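+ * For illustration: a `HEAD` request against a presigned URL typically fails with HTTP 403,
+ * whereas a `GET` request carrying the header `Range: bytes=0-0` succeeds, and the total size can
+ * then be recovered from the `Content-Range` response header (this is the strategy implemented in
+ * `S3EndpointWithPresignedUrl::get_file_size` below).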
+ * + * @param endpoint The remote endpoint + * @param url The URL of the remote file + * @return The file size + */ +std::size_t get_file_size_using_head_impl(RemoteEndpoint& endpoint, std::string const& url) +{ + auto curl = create_curl_handle(); + + endpoint.setopt(curl); + curl.setopt(CURLOPT_NOBODY, 1L); + curl.setopt(CURLOPT_FOLLOWLOCATION, 1L); + curl.perform(); + curl_off_t cl; + curl.getinfo(CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, &cl); + KVIKIO_EXPECT( + cl >= 0, + "cannot get size of " + endpoint.str() + ", content-length not provided by the server", + std::runtime_error); + return static_cast(cl); +} + } // namespace HttpEndpoint::HttpEndpoint(std::string url) : _url{std::move(url)} {} std::string HttpEndpoint::str() const { return _url; } +std::size_t HttpEndpoint::get_file_size() +{ + KVIKIO_NVTX_FUNC_RANGE(); + return get_file_size_using_head_impl(*this, _url); +} + void HttpEndpoint::setopt(CurlHandle& curl) { KVIKIO_NVTX_FUNC_RANGE(); @@ -286,28 +321,106 @@ S3Endpoint::~S3Endpoint() { curl_slist_free_all(_curl_header_list); } std::string S3Endpoint::str() const { return _url; } -RemoteHandle::RemoteHandle(std::unique_ptr endpoint, std::size_t nbytes) - : _endpoint{std::move(endpoint)}, _nbytes{nbytes} +std::size_t S3Endpoint::get_file_size() { KVIKIO_NVTX_FUNC_RANGE(); + return get_file_size_using_head_impl(*this, _url); } -RemoteHandle::RemoteHandle(std::unique_ptr endpoint) +S3EndpointWithPresignedUrl::S3EndpointWithPresignedUrl(std::string presigned_url) + : _url{std::move(presigned_url)} +{ +} + +void S3EndpointWithPresignedUrl::setopt(CurlHandle& curl) +{ + KVIKIO_NVTX_FUNC_RANGE(); + curl.setopt(CURLOPT_URL, _url.c_str()); +} + +std::string S3EndpointWithPresignedUrl::str() const { return _url; } + +namespace { +/** + * @brief Callback for the `CURLOPT_HEADERFUNCTION` parameter in libcurl + * + * The header callback is called once for each header and only complete header lines are passed on + * to the callback. The provided header line is not null-terminated. + * + * @param data Transfer buffer where new data is received + * @param size Curl internal implementation always sets this parameter to 1 + * @param num_bytes The size of new data received + * @param userdata User-defined data + * @return The number of bytes consumed by the callback + * @exception std::invalid_argument if the server does not know the file size, thereby using "*" as + * the filler text in the content-range header of the HTTP message. + */ +std::size_t callback_header(char* data, std::size_t size, std::size_t num_bytes, void* userdata) { + auto new_data_size = size * num_bytes; + auto* file_size = reinterpret_cast(userdata); + + // The header line is not null-terminated. This constructor overload ensures header_line.data() is + // null-terminated. + std::string const header_line{data, new_data_size}; + + // The content-range header has the format + // Content-Range: / + // Content-Range: /* + // Content-Range: */ + std::regex const pattern(R"(Content-Range:[^/]+/(.*))", std::regex::icase); + std::smatch match_result; + bool found = std::regex_search(header_line, match_result, pattern); + if (found) { + // If the file size is unknown (represented by "*" in the content-range header), string-to-long + // conversion will throw an `std::invalid_argument` exception. The exception message from + // `std::stol` is usually too concise to be useful (being simply a string of "stol"), so a + // custom exception is used instead. + try { + *file_size = std::stol(match_result[1].str()); + } catch (...) 
{
+      KVIKIO_FAIL("File size information missing on the server side.", std::invalid_argument);
+    }
+  }
+  return new_data_size;
+}
+}  // namespace
+
+std::size_t S3EndpointWithPresignedUrl::get_file_size()
+{
+  // Usually the `HEAD` request is used to obtain the content-length (file size). However, AWS S3
+  // does not allow it for presigned URL. The workaround here is to send the `GET` request with
+  // 1-byte range, so that we can still obtain the header information at a negligible cost. Since
+  // the content-length header is now at a fixed value of 1, we instead extract the file size value
+  // from content-range.
+  KVIKIO_NVTX_FUNC_RANGE();
+  auto curl = create_curl_handle();
+  curl.setopt(CURLOPT_URL, _url.c_str());
+
+  // 1-byte range, specified in the format "<start>-<end>"
+  std::string my_range{"0-0"};
+  curl.setopt(CURLOPT_RANGE, my_range.c_str());
+
+  long file_size{};
+  curl.setopt(CURLOPT_HEADERDATA, static_cast<void*>(&file_size));
+  curl.setopt(CURLOPT_HEADERFUNCTION, callback_header);
 
-  endpoint->setopt(curl);
-  curl.setopt(CURLOPT_NOBODY, 1L);
-  curl.setopt(CURLOPT_FOLLOWLOCATION, 1L);
   curl.perform();
-  curl_off_t cl;
-  curl.getinfo(CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, &cl);
-  KVIKIO_EXPECT(
-    cl >= 0,
-    "cannot get size of " + endpoint->str() + ", content-length not provided by the server",
-    std::runtime_error);
-  _nbytes = cl;
+  return file_size;
+}
+
+RemoteHandle::RemoteHandle(std::unique_ptr<RemoteEndpoint> endpoint, std::size_t nbytes)
+  : _endpoint{std::move(endpoint)}, _nbytes{nbytes}
+{
+  KVIKIO_NVTX_FUNC_RANGE();
+}
+
+RemoteHandle::RemoteHandle(std::unique_ptr<RemoteEndpoint> endpoint)
+{
+  KVIKIO_NVTX_FUNC_RANGE();
+  _nbytes   = endpoint->get_file_size();
   _endpoint = std::move(endpoint);
 }
 
diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx
index dfb662a9fb..0c2ae4c3e4 100644
--- a/python/kvikio/kvikio/_lib/remote_handle.pyx
+++ b/python/kvikio/kvikio/_lib/remote_handle.pyx
@@ -31,6 +31,10 @@ cdef extern from "<kvikio/remote_handle.hpp>" nogil:
         pair[string, string] cpp_parse_s3_url \
             "kvikio::S3Endpoint::parse_s3_url"(string url) except +
 
+    cdef cppclass cpp_S3EndpointWithPresignedUrl "kvikio::S3EndpointWithPresignedUrl" \
+        (cpp_RemoteEndpoint):
+        cpp_S3EndpointWithPresignedUrl(string presigned_url) except +
+
     cdef cppclass cpp_RemoteHandle "kvikio::RemoteHandle":
         cpp_RemoteHandle(
             unique_ptr[cpp_RemoteEndpoint] endpoint, size_t nbytes
@@ -141,6 +145,18 @@ cdef class RemoteFile:
             nbytes
         )
 
+    @staticmethod
+    def open_s3_from_http_presigned_url(
+        presigned_url: str,
+        nbytes: Optional[int],
+    ):
+        return RemoteFile._from_endpoint(
+            cast_to_remote_endpoint(
+                make_unique[cpp_S3EndpointWithPresignedUrl](_to_string(presigned_url))
+            ),
+            nbytes
+        )
+
     def __str__(self) -> str:
         cdef string ep_str = deref(self._handle).endpoint().str()
         return f'<{self.__class__.__name__} "{ep_str.decode()}">'
diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py
index 41ec216e5c..f06a40b45f 100644
--- a/python/kvikio/kvikio/remote_file.py
+++ b/python/kvikio/kvikio/remote_file.py
@@ -142,6 +142,28 @@ def open_s3_url(
         )
         raise ValueError(f"Unsupported protocol: {url}")
 
+    @classmethod
+    def open_s3_presigned_url(
+        cls,
+        presigned_url: str,
+        nbytes: Optional[int] = None,
+    ) -> RemoteFile:
+        """Open an AWS S3 file from a presigned URL.
+
+        Parameters
+        ----------
+        presigned_url
+            Presigned URL to the remote file.
+        nbytes
+            The size of the file. If None, KvikIO will ask the server
+            for the file size.
+ """ + return RemoteFile( + _get_remote_module().RemoteFile.open_s3_from_http_presigned_url( + presigned_url, nbytes + ) + ) + def close(self) -> None: """Close the file""" pass From ac5fc39b57e4761e8e9c768bb923093967be17d1 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Mon, 4 Aug 2025 14:26:33 -0400 Subject: [PATCH 046/103] Update rapids-build-backend to 0.4.0 (#790) Issue: https://github.com/rapidsai/build-planning/issues/207 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Lawrence Mitchell (https://github.com/wence-) - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/kvikio/pull/790 --- conda/environments/all_cuda-129_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-129_arch-x86_64.yaml | 2 +- conda/recipes/kvikio/recipe.yaml | 2 +- dependencies.yaml | 2 +- python/kvikio/pyproject.toml | 2 +- python/libkvikio/pyproject.toml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 45a7d1297c..f72a97faf3 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -33,7 +33,7 @@ dependencies: - pytest-timeout - python>=3.10,<3.14 - rangehttpserver -- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rapids-build-backend>=0.4.0,<0.5.0.dev0 - rapids-dask-dependency==25.10.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 4ade45d196..890bfc29d7 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -33,7 +33,7 @@ dependencies: - pytest-timeout - python>=3.10,<3.14 - rangehttpserver -- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rapids-build-backend>=0.4.0,<0.5.0.dev0 - rapids-dask-dependency==25.10.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx diff --git a/conda/recipes/kvikio/recipe.yaml b/conda/recipes/kvikio/recipe.yaml index 3a7957133e..2e9e411d9e 100644 --- a/conda/recipes/kvikio/recipe.yaml +++ b/conda/recipes/kvikio/recipe.yaml @@ -69,7 +69,7 @@ requirements: - libnvcomp-dev ${{ nvcomp_version }} - pip - python =${{ py_version }} - - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - rapids-build-backend >=0.4.0,<0.5.0.dev0 - scikit-build-core >=0.10.0 - cuda-cudart-dev run: diff --git a/dependencies.yaml b/dependencies.yaml index 9b7b899780..92fdec6bd5 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -283,7 +283,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - rapids-build-backend>=0.3.0,<0.4.0.dev0 + - rapids-build-backend>=0.4.0,<0.5.0.dev0 - output_types: conda packages: - scikit-build-core>=0.10.0 diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index b8ddc09945..8baa9aa89c 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -4,7 +4,7 @@ [build-system] build-backend = "rapids_build_backend.build" requires = [ - "rapids-build-backend>=0.3.0,<0.4.0.dev0", + "rapids-build-backend>=0.4.0,<0.5.0.dev0", "scikit-build-core[pyproject]>=0.10.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
diff --git a/python/libkvikio/pyproject.toml b/python/libkvikio/pyproject.toml index bbbd6f2e74..43b659aec8 100644 --- a/python/libkvikio/pyproject.toml +++ b/python/libkvikio/pyproject.toml @@ -4,7 +4,7 @@ [build-system] build-backend = "rapids_build_backend.build" requires = [ - "rapids-build-backend>=0.3.0,<0.4.0.dev0", + "rapids-build-backend>=0.4.0,<0.5.0.dev0", "scikit-build-core[pyproject]>=0.10.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From a2bbfeb0de49c29245da15d9df0ae7619c0a7531 Mon Sep 17 00:00:00 2001 From: Jake Awe Date: Wed, 6 Aug 2025 09:19:56 -0500 Subject: [PATCH 047/103] Update Changelog [skip ci] --- CHANGELOG.md | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6911639b10..a3aeaa2633 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,48 @@ +# kvikio 25.08.00 (6 Aug 2025) + +## 🚨 Breaking Changes + +- Remove CUDA 11 from dependencies.yaml ([#745](https://github.com/rapidsai/kvikio/pull/745)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- stop uploading packages to downloads.rapids.ai ([#734](https://github.com/rapidsai/kvikio/pull/734)) [@jameslamb](https://github.com/jameslamb) + +## 🐛 Bug Fixes + +- Fix remote I/O file size overflow in Python binding ([#766](https://github.com/rapidsai/kvikio/pull/766)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Fix a case sensitivity bug in S3 remote I/O ([#765](https://github.com/rapidsai/kvikio/pull/765)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Use RAPIDS' CUDA major-minor version in Conda's `cuda_version` ([#758](https://github.com/rapidsai/kvikio/pull/758)) [@vyasr](https://github.com/vyasr) +- Fix unit test UB resulting from incorrect initialization ([#751](https://github.com/rapidsai/kvikio/pull/751)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Add an overload getenv_or that supports env var alias. Add new env var KVIKIO_NUM_THREADS. Fix UB. ([#735](https://github.com/rapidsai/kvikio/pull/735)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) + +## 📖 Documentation + +- Remove CUDA 11 mentions from docs ([#769](https://github.com/rapidsai/kvikio/pull/769)) [@pentschev](https://github.com/pentschev) +- add docs on CI workflow inputs ([#764](https://github.com/rapidsai/kvikio/pull/764)) [@jameslamb](https://github.com/jameslamb) + +## 🚀 New Features + +- Add the utility function to clear page cache ([#741](https://github.com/rapidsai/kvikio/pull/741)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Support file-backed mapping (1/n): host/device read C++ implementation ([#740](https://github.com/rapidsai/kvikio/pull/740)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) + +## 🛠️ Improvements + +- Use libnvcomp conda package ([#774](https://github.com/rapidsai/kvikio/pull/774)) [@bdice](https://github.com/bdice) +- fix(docker): use versioned `-latest` tag for all `rapidsai` images ([#771](https://github.com/rapidsai/kvikio/pull/771)) [@gforsyth](https://github.com/gforsyth) +- Remove CUDA 11 workarounds ([#770](https://github.com/rapidsai/kvikio/pull/770)) [@pentschev](https://github.com/pentschev) +- Use CUDA 12.9 in Conda, Devcontainers, Spark, GHA, etc. 
([#762](https://github.com/rapidsai/kvikio/pull/762)) [@jakirkham](https://github.com/jakirkham) +- refactor(shellcheck): enable for all files and fix remaining warnings ([#760](https://github.com/rapidsai/kvikio/pull/760)) [@gforsyth](https://github.com/gforsyth) +- Remove nvidia and dask channels ([#759](https://github.com/rapidsai/kvikio/pull/759)) [@vyasr](https://github.com/vyasr) +- Set compiler versions in context ([#755](https://github.com/rapidsai/kvikio/pull/755)) [@jakirkham](https://github.com/jakirkham) +- Use variants to produce separate builds with and without cufile support ([#754](https://github.com/rapidsai/kvikio/pull/754)) [@vyasr](https://github.com/vyasr) +- Only use cufile on x86 ([#753](https://github.com/rapidsai/kvikio/pull/753)) [@vyasr](https://github.com/vyasr) +- refactor(rattler): remove explicit `libnuma` pin now that upstream feedstock is fixed ([#752](https://github.com/rapidsai/kvikio/pull/752)) [@gforsyth](https://github.com/gforsyth) +- Remove CUDA 11 from dependencies.yaml ([#745](https://github.com/rapidsai/kvikio/pull/745)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Add the conda dependency libnuma necessary for GDS I/O ([#744](https://github.com/rapidsai/kvikio/pull/744)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Remove CUDA 11 devcontainers and update CI scripts ([#738](https://github.com/rapidsai/kvikio/pull/738)) [@bdice](https://github.com/bdice) +- refactor(rattler): remove cuda11 options and general cleanup ([#737](https://github.com/rapidsai/kvikio/pull/737)) [@gforsyth](https://github.com/gforsyth) +- stop uploading packages to downloads.rapids.ai ([#734](https://github.com/rapidsai/kvikio/pull/734)) [@jameslamb](https://github.com/jameslamb) +- Forward-merge branch-25.06 into branch-25.08 ([#722](https://github.com/rapidsai/kvikio/pull/722)) [@gforsyth](https://github.com/gforsyth) +- resolve forward-merge from branch-25.06 to branch-25.08 ([#710](https://github.com/rapidsai/kvikio/pull/710)) [@gforsyth](https://github.com/gforsyth) + # kvikio 25.06.00 (5 Jun 2025) ## 🚨 Breaking Changes From 9afce55d806f806d8883ac207c90c11e33715e15 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Thu, 7 Aug 2025 14:22:21 -0400 Subject: [PATCH 048/103] Update rapids_config to handle user defined branch name (#794) rapids_config will use a user defined branch over `RAPIDS_BRANCH` contents Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/kvikio/pull/794 --- cmake/RAPIDS.cmake | 2 +- cmake/rapids_config.cmake | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/cmake/RAPIDS.cmake b/cmake/RAPIDS.cmake index 40de7cefcd..ddef819498 100644 --- a/cmake/RAPIDS.cmake +++ b/cmake/RAPIDS.cmake @@ -18,7 +18,7 @@ cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR) # Allow users to control which version is used -if(NOT rapids-cmake-branch OR NOT rapids-cmake-version) +if(NOT (rapids-cmake-branch OR rapids-cmake-version)) message( FATAL_ERROR "The CMake variable `rapids-cmake-branch` or `rapids-cmake-version` must be defined" ) diff --git a/cmake/rapids_config.cmake b/cmake/rapids_config.cmake index b706c926e7..b2c54a3f27 100644 --- a/cmake/rapids_config.cmake +++ b/cmake/rapids_config.cmake @@ -35,6 +35,10 @@ if(NOT _rapids_branch) ) endif() -set(rapids-cmake-version "${RAPIDS_VERSION_MAJOR_MINOR}") -set(rapids-cmake-branch "${_rapids_branch}") +if(NOT rapids-cmake-version) + set(rapids-cmake-version 
"${RAPIDS_VERSION_MAJOR_MINOR}") +endif() +if(NOT rapids-cmake-branch) + set(rapids-cmake-branch "${_rapids_branch}") +endif() include("${CMAKE_CURRENT_LIST_DIR}/RAPIDS.cmake") From acada8d748b5cc1371b856c4ef3876d61fea18dc Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 7 Aug 2025 22:23:27 -0400 Subject: [PATCH 049/103] Support WebHDFS (1/2): C++ implementation (#788) ## Summary This PR adds WebHDFS support to KvikIO. The background information is available at #787. ## Limitations This PR does not address: - Idiomatic and secure URL parsing and validation - Testing on URL encoding/decoding (which means percent-decoded URL may or may not work at the moment) - Advanced authentication such as Kerberos These features will be added in the future. Partially addresses #787 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Bradley Dice (https://github.com/bdice) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/788 --- .../recipes/libkvikio/conda_build_config.yaml | 2 +- conda/recipes/libkvikio/recipe.yaml | 7 +- cpp/CMakeLists.txt | 4 +- cpp/include/kvikio/detail/remote_handle.hpp | 34 +++ cpp/include/kvikio/hdfs.hpp | 62 ++++++ cpp/include/kvikio/remote_handle.hpp | 9 + cpp/src/detail/remote_handle.cpp | 32 +++ cpp/src/hdfs.cpp | 131 ++++++++++++ cpp/src/remote_handle.cpp | 43 +++- cpp/tests/CMakeLists.txt | 7 +- cpp/tests/test_basic_io.cpp | 9 +- cpp/tests/test_hdfs.cpp | 180 ++++++++++++++++ cpp/tests/test_mmap.cpp | 11 +- cpp/tests/utils/hdfs_helper.cpp | 197 ++++++++++++++++++ cpp/tests/utils/hdfs_helper.hpp | 61 ++++++ cpp/tests/utils/utils.hpp | 16 +- 16 files changed, 771 insertions(+), 34 deletions(-) create mode 100644 cpp/include/kvikio/detail/remote_handle.hpp create mode 100644 cpp/include/kvikio/hdfs.hpp create mode 100644 cpp/src/detail/remote_handle.cpp create mode 100644 cpp/src/hdfs.cpp create mode 100644 cpp/tests/test_hdfs.cpp create mode 100644 cpp/tests/utils/hdfs_helper.cpp create mode 100644 cpp/tests/utils/hdfs_helper.hpp diff --git a/conda/recipes/libkvikio/conda_build_config.yaml b/conda/recipes/libkvikio/conda_build_config.yaml index 1149e0f9fa..b67ab5d118 100644 --- a/conda/recipes/libkvikio/conda_build_config.yaml +++ b/conda/recipes/libkvikio/conda_build_config.yaml @@ -11,4 +11,4 @@ c_stdlib_version: - "2.28" libcurl_version: - - "==8.5.0" + - "8.5.0" diff --git a/conda/recipes/libkvikio/recipe.yaml b/conda/recipes/libkvikio/recipe.yaml index a4e423d3ed..f4164c9611 100644 --- a/conda/recipes/libkvikio/recipe.yaml +++ b/conda/recipes/libkvikio/recipe.yaml @@ -65,7 +65,7 @@ cache: - ${{ stdlib("c") }} host: - cuda-version =${{ cuda_version }} - - libcurl ${{ libcurl_version }} + - libcurl ==${{ libcurl_version }} - if: should_use_cufile then: - libcufile-dev @@ -91,7 +91,7 @@ outputs: - ${{ compiler("c") }} host: - cuda-version =${{ cuda_version }} - - libcurl ${{ libcurl_version }} + - libcurl ==${{ libcurl_version }} run: - if: x86_64 then: @@ -108,7 +108,6 @@ outputs: ignore_run_exports: by_name: - cuda-version - - libcurl - if: should_use_cufile then: - libcufile @@ -138,6 +137,7 @@ outputs: - ${{ pin_subpackage("libkvikio", exact=True) }} - cuda-version =${{ cuda_version }} - cuda-cudart-dev + - libcurl ==${{ libcurl_version }} - if: should_use_cufile then: - libcufile-dev @@ -156,7 +156,6 @@ outputs: by_name: - cuda-cudart - cuda-version - - libcurl - libnuma - if: should_use_cufile then: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt 
index 53351393ee..6107a0a795 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -163,7 +163,9 @@ set(SOURCES
 )
 
 if(KvikIO_REMOTE_SUPPORT)
-  list(APPEND SOURCES "src/remote_handle.cpp" "src/shim/libcurl.cpp")
+  list(APPEND SOURCES "src/hdfs.cpp" "src/remote_handle.cpp" "src/detail/remote_handle.cpp"
+       "src/shim/libcurl.cpp"
+  )
 endif()
 
 add_library(kvikio ${SOURCES})
diff --git a/cpp/include/kvikio/detail/remote_handle.hpp b/cpp/include/kvikio/detail/remote_handle.hpp
new file mode 100644
index 0000000000..f6bd55c4c0
--- /dev/null
+++ b/cpp/include/kvikio/detail/remote_handle.hpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cstddef>
+
+namespace kvikio::detail {
+/**
+ * @brief Callback for `CURLOPT_WRITEFUNCTION` that copies received data into a `std::string`.
+ *
+ * @param data Received data
+ * @param size Curl internal implementation always sets this parameter to 1
+ * @param num_bytes Number of bytes received
+ * @param userdata Must be cast from `std::string*`
+ * @return The number of bytes consumed by the callback
+ */
+std::size_t callback_get_string_response(char* data,
+                                         std::size_t size,
+                                         std::size_t num_bytes,
+                                         void* userdata);
+}  // namespace kvikio::detail
diff --git a/cpp/include/kvikio/hdfs.hpp b/cpp/include/kvikio/hdfs.hpp
new file mode 100644
index 0000000000..0b20d658bd
--- /dev/null
+++ b/cpp/include/kvikio/hdfs.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <optional>
+
+#include <kvikio/remote_handle.hpp>
+
+namespace kvikio {
+
+/**
+ * @brief A remote endpoint for Apache Hadoop WebHDFS.
+ *
+ * If KvikIO is run within a Docker container, the argument `--network host` needs to be passed to
+ * the `docker run` command.
+ */
+class WebHdfsEndpoint : public RemoteEndpoint {
+ private:
+  std::string _url;
+  std::optional<std::string> _username;
+
+ public:
+  /**
+   * @brief Create a WebHDFS endpoint from a URL.
+   *
+   * @param url The WebHDFS HTTP/HTTPS URL to the remote file.
+   */
+  explicit WebHdfsEndpoint(std::string url);
+
+  /**
+   * @brief Create a WebHDFS endpoint from the host, port, file path, and optionally the username.
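+   *
+   * The arguments are assembled into a URL of the form
+   * `http://<host>:<port>/webhdfs/v1<remote_file_path>`.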
+ * + * @param host Host + * @param port Port + * @param remote_file_path Remote file path + * @param username User name + */ + explicit WebHdfsEndpoint(std::string host, + std::string port, + std::string remote_file_path, + std::optional username = std::nullopt); + + ~WebHdfsEndpoint() override = default; + void setopt(CurlHandle& curl) override; + std::string str() const override; + std::size_t get_file_size() override; + void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; +}; +} // namespace kvikio diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 5f11d76f3d..b2e2d1d0ff 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -68,6 +68,12 @@ class RemoteEndpoint { * @return The file size */ virtual std::size_t get_file_size() = 0; + + /** + * @brief Set up the range request in order to read part of a file given the file offset and read + * size. + */ + virtual void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) = 0; }; /** @@ -89,6 +95,7 @@ class HttpEndpoint : public RemoteEndpoint { void setopt(CurlHandle& curl) override; std::string str() const override; std::size_t get_file_size() override; + void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; }; /** @@ -198,6 +205,7 @@ class S3Endpoint : public RemoteEndpoint { void setopt(CurlHandle& curl) override; std::string str() const override; std::size_t get_file_size() override; + void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; }; /** @@ -215,6 +223,7 @@ class S3EndpointWithPresignedUrl : public RemoteEndpoint { void setopt(CurlHandle& curl) override; std::string str() const override; std::size_t get_file_size() override; + void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; }; /** diff --git a/cpp/src/detail/remote_handle.cpp b/cpp/src/detail/remote_handle.cpp new file mode 100644 index 0000000000..b023859ef8 --- /dev/null +++ b/cpp/src/detail/remote_handle.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +namespace kvikio::detail { +std::size_t callback_get_string_response(char* data, + std::size_t size, + std::size_t num_bytes, + void* userdata) +{ + auto new_data_size = size * num_bytes; + auto* response = reinterpret_cast(userdata); + response->append(data, new_data_size); + return new_data_size; +} +} // namespace kvikio::detail diff --git a/cpp/src/hdfs.cpp b/cpp/src/hdfs.cpp new file mode 100644 index 0000000000..12455b3a26 --- /dev/null +++ b/cpp/src/hdfs.cpp @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include + +namespace kvikio { + +WebHdfsEndpoint::WebHdfsEndpoint(std::string url) +{ + // todo: Use libcurl URL API for more secure and idiomatic parsing. + // Split the URL into two parts: one without query and one with. + std::regex const pattern{R"(^([^?]+)\?([^#]*))"}; + // Regex meaning: + // ^: From the start of the line + // [^?]+: Matches non-question-mark characters one or more times. The question mark ushers in the + // URL query component. + // \?: Matches the question mark, which needs to be escaped. + // [^#]*: Matches the non-pound characters zero or more times. The pound sign ushers in the URL + // fragment component. It is very likely that this part does not exist. + std::smatch match_results; + bool found = std::regex_search(url, match_results, pattern); + // If the match is not found, the URL contains no query. + if (!found) { + _url = url; + return; + } + + _url = match_results[1].str(); + auto query = match_results[2].str(); + + { + // Extract user name if provided. In WebHDFS, user name is specified as the key=value pair in + // the query + std::regex const pattern{R"(user.name=([^&]+))"}; + // Regex meaning: + // [^&]+: Matches the non-ampersand character one or more times. The ampersand delimits + // different parameters. + std::smatch match_results; + if (std::regex_search(query, match_results, pattern)) { _username = match_results[1].str(); } + } +} + +WebHdfsEndpoint::WebHdfsEndpoint(std::string host, + std::string port, + std::string file_path, + std::optional username) + : _username{std::move(username)} +{ + std::stringstream ss; + ss << "http://" << host << ":" << port << "/webhdfs/v1" << file_path; + _url = ss.str(); +} + +std::string WebHdfsEndpoint::str() const { return _url; } + +void WebHdfsEndpoint::setopt(CurlHandle& curl) +{ + KVIKIO_NVTX_FUNC_RANGE(); + curl.setopt(CURLOPT_URL, _url.c_str()); + curl.setopt(CURLOPT_FOLLOWLOCATION, 1L); +} + +std::size_t WebHdfsEndpoint::get_file_size() +{ + KVIKIO_NVTX_FUNC_RANGE(); + + std::stringstream ss; + ss << _url << "?"; + if (_username.has_value()) { ss << "user.name=" << _username.value() << "&"; } + ss << "op=GETFILESTATUS"; + + auto curl = create_curl_handle(); + curl.setopt(CURLOPT_URL, ss.str().c_str()); + curl.setopt(CURLOPT_FOLLOWLOCATION, 1L); + + std::string response; + curl.setopt(CURLOPT_WRITEDATA, static_cast(&response)); + curl.setopt(CURLOPT_WRITEFUNCTION, detail::callback_get_string_response); + + curl.perform(); + + long http_status_code{}; + curl.getinfo(CURLINFO_RESPONSE_CODE, &http_status_code); + KVIKIO_EXPECT(http_status_code == 200, "HTTP response is not successful."); + + // The response is in JSON format. The file size is given by `"length":`. + std::regex const pattern{R"("length"\s*:\s*(\d+)[^\d])"}; + // Regex meaning: + // \s*: Matches the space character zero or more times. + // \d+: Matches the digit one or more times. + // [^\d]: Matches a non-digit character. 
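+  // For reference, a GETFILESTATUS response resembles (illustrative, abridged):
+  //   {"FileStatus":{"length":24930,"type":"FILE",...}}
+  // so the capture group above picks up the digits that follow `"length":`.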
+  std::smatch match_results;
+  bool found = std::regex_search(response, match_results, pattern);
+  KVIKIO_EXPECT(
+    found, "Regular expression search failed. Cannot extract file length from the JSON response.");
+  return std::stoull(match_results[1].str());
+}
+
+void WebHdfsEndpoint::setup_range_request(CurlHandle& curl,
+                                          std::size_t file_offset,
+                                          std::size_t size)
+{
+  // WebHDFS does not support CURLOPT_RANGE. The range is specified as query parameters in the URL.
+  KVIKIO_NVTX_FUNC_RANGE();
+  std::stringstream ss;
+  ss << _url << "?";
+  if (_username.has_value()) { ss << "user.name=" << _username.value() << "&"; }
+  ss << "op=OPEN&offset=" << file_offset << "&length=" << size;
+  curl.setopt(CURLOPT_URL, ss.str().c_str());
+}
+}  // namespace kvikio
diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp
index 1aba93bdc5..23cf5c6305 100644
--- a/cpp/src/remote_handle.cpp
+++ b/cpp/src/remote_handle.cpp
@@ -25,6 +25,7 @@
 #include <kvikio/defaults.hpp>
 #include <kvikio/error.hpp>
+#include <kvikio/hdfs.hpp>
 #include <kvikio/nvtx.hpp>
 #include <kvikio/remote_handle.hpp>
 #include <kvikio/shim/libcurl.hpp>
 
@@ -162,6 +163,20 @@ std::size_t get_file_size_using_head_impl(RemoteEndpoint& endpoint, std::string
   return static_cast<std::size_t>(cl);
 }
 
+/**
+ * @brief Set up the range request for libcurl. Use this method when the HTTP range request is
+ * supported.
+ *
+ * @param curl A curl handle
+ * @param file_offset File offset
+ * @param size Read size
+ */
+void setup_range_request_impl(CurlHandle& curl, std::size_t file_offset, std::size_t size)
+{
+  std::string const byte_range =
+    std::to_string(file_offset) + "-" + std::to_string(file_offset + size - 1);
+  curl.setopt(CURLOPT_RANGE, byte_range.c_str());
+}
+
 }  // namespace
 
 HttpEndpoint::HttpEndpoint(std::string url) : _url{std::move(url)} {}
@@ -174,15 +189,15 @@ std::size_t HttpEndpoint::get_file_size()
   return get_file_size_using_head_impl(*this, _url);
 }
 
-void HttpEndpoint::setopt(CurlHandle& curl)
+void HttpEndpoint::setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size)
 {
-  KVIKIO_NVTX_FUNC_RANGE();
-  curl.setopt(CURLOPT_URL, _url.c_str());
+  setup_range_request_impl(curl, file_offset, size);
 }
 
+void HttpEndpoint::setopt(CurlHandle& curl) { curl.setopt(CURLOPT_URL, _url.c_str()); }
+
 void S3Endpoint::setopt(CurlHandle& curl)
 {
-  KVIKIO_NVTX_FUNC_RANGE();
   curl.setopt(CURLOPT_URL, _url.c_str());
   curl.setopt(CURLOPT_AWS_SIGV4, _aws_sigv4.c_str());
   curl.setopt(CURLOPT_USERPWD, _aws_userpwd.c_str());
@@ -327,6 +342,12 @@ std::size_t S3Endpoint::get_file_size()
   return get_file_size_using_head_impl(*this, _url);
 }
 
+void S3Endpoint::setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size)
+{
+  KVIKIO_NVTX_FUNC_RANGE();
+  setup_range_request_impl(curl, file_offset, size);
+}
+
 S3EndpointWithPresignedUrl::S3EndpointWithPresignedUrl(std::string presigned_url)
   : _url{std::move(presigned_url)}
 {
@@ -334,7 +355,6 @@ S3EndpointWithPresignedUrl::S3EndpointWithPresignedUrl(std::string presigned_url
 
 void S3EndpointWithPresignedUrl::setopt(CurlHandle& curl)
 {
-  KVIKIO_NVTX_FUNC_RANGE();
   curl.setopt(CURLOPT_URL, _url.c_str());
 }
 
@@ -411,6 +431,14 @@ std::size_t S3EndpointWithPresignedUrl::get_file_size()
   return file_size;
 }
 
+void S3EndpointWithPresignedUrl::setup_range_request(CurlHandle& curl,
+                                                     std::size_t file_offset,
+                                                     std::size_t size)
+{
+  KVIKIO_NVTX_FUNC_RANGE();
+  setup_range_request_impl(curl, file_offset, size);
+}
+
 RemoteHandle::RemoteHandle(std::unique_ptr<RemoteEndpoint> endpoint, std::size_t nbytes)
   : _endpoint{std::move(endpoint)}, _nbytes{nbytes}
 {
   KVIKIO_NVTX_FUNC_RANGE();
@@ -510,10 +538,7 @@ std::size_t RemoteHandle::read(void* buf, std::size_t size, std::size_t
file_off bool const is_host_mem = is_host_memory(buf); auto curl = create_curl_handle(); _endpoint->setopt(curl); - - std::string const byte_range = - std::to_string(file_offset) + "-" + std::to_string(file_offset + size - 1); - curl.setopt(CURLOPT_RANGE, byte_range.c_str()); + _endpoint->setup_range_request(curl, file_offset, size); if (is_host_mem) { curl.setopt(CURLOPT_WRITEFUNCTION, callback_host_memory); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 3d53bbd86f..41de4bb6fa 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -57,7 +57,7 @@ function(kvikio_add_test) ) target_link_libraries( ${_KVIKIO_NAME} PRIVATE kvikio::kvikio GTest::gmock GTest::gmock_main GTest::gtest - GTest::gtest_main CUDA::cudart + GTest::gtest_main CUDA::cudart $ ) rapids_test_add( @@ -76,6 +76,9 @@ kvikio_add_test(NAME ERROR_TEST SOURCES test_error.cpp) kvikio_add_test(NAME MMAP_TEST SOURCES test_mmap.cpp) -kvikio_add_test(NAME REMOTE_HANDLE_TEST SOURCES test_remote_handle.cpp utils/env.cpp) +if(KvikIO_REMOTE_SUPPORT) + kvikio_add_test(NAME REMOTE_HANDLE_TEST SOURCES test_remote_handle.cpp utils/env.cpp) + kvikio_add_test(NAME HDFS_TEST SOURCES test_hdfs.cpp utils/hdfs_helper.cpp) +endif() rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/tests/libkvikio) diff --git a/cpp/tests/test_basic_io.cpp b/cpp/tests/test_basic_io.cpp index d72ba8841f..141fe386c3 100644 --- a/cpp/tests/test_basic_io.cpp +++ b/cpp/tests/test_basic_io.cpp @@ -27,15 +27,16 @@ class BasicIOTest : public testing::Test { TempDir tmp_dir{false}; _filepath = tmp_dir.path() / "test"; - _dev_a = std::move(DevBuffer::arange(100)); - _dev_b = std::move(DevBuffer::zero_like(_dev_a)); + _dev_a = std::move(DevBuffer::arange(100)); + _dev_b = std::move(DevBuffer::zero_like(_dev_a)); } void TearDown() override {} std::filesystem::path _filepath; - DevBuffer _dev_a; - DevBuffer _dev_b; + using value_type = std::int64_t; + DevBuffer _dev_a; + DevBuffer _dev_b; }; TEST_F(BasicIOTest, write_read) diff --git a/cpp/tests/test_hdfs.cpp b/cpp/tests/test_hdfs.cpp new file mode 100644 index 0000000000..354c8c4aca --- /dev/null +++ b/cpp/tests/test_hdfs.cpp @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include + +#include "utils/hdfs_helper.hpp" +#include "utils/utils.hpp" + +// This test makes the following assumptions: +// - This KvikIO unit test is run on the "name node" of a WebHDFS server. +// - Port 9870 (default for WebHDFS) is being used to listen to the requests. +// - The environment variable `KVIKIO_USER` is specified prior to the test. It contains a valid user +// name that has been granted access to the HDFS. +// - The user has the proper permission to create a file under the `/tmp` directory on the HDFS. +// - If the unit test is run within a Docker. 
The following arguments are passed to the `docker run` +// command: +// - `--network host` +// - `--env KVIKIO_USER=` +// +// If any of these assumptions is not satisfied, this unit test is expected to be skipped +// gracefully. + +using value_type = double; + +namespace kvikio::test { +struct Config { + std::size_t num_elements{1024ull * 1024ull}; + std::vector host_buf; + kvikio::test::DevBuffer dev_buf; + std::string host; + std::string port; + std::string _username; + std::string remote_file_path; + bool file_created{false}; +}; +} // namespace kvikio::test + +class WebHdfsTest : public testing::Test { + protected: + static void SetUpTestSuite() + { + config.num_elements = 1024ull * 1024ull; + config.host_buf.resize(config.num_elements); + std::iota(config.host_buf.begin(), config.host_buf.end(), 0); + + config.dev_buf = kvikio::test::DevBuffer{config.host_buf}; + + config.host = "localhost"; + config.port = "9870"; + + config.remote_file_path = "/tmp/kvikio-test-webhdfs.bin"; + + auto res = std::getenv("KVIKIO_USER"); + if (res) { + config._username = res; + } else { + GTEST_SKIP() << "Environment variable KVIKIO_USER is not set for this test."; + } + + webhdfs_helper = + std::make_unique(config.host, config.port, config._username); + + if (!webhdfs_helper->can_connect()) { + GTEST_SKIP() << "Cannot connect to WebHDFS. Skipping all tests for this fixture."; + } + + std::span buffer{reinterpret_cast(config.host_buf.data()), + config.host_buf.size() * sizeof(value_type)}; + if (!webhdfs_helper->upload_data(buffer, config.remote_file_path)) { + GTEST_SKIP() + << "Failed to upload test data using WebHDFS. Skipping all tests for this fixture."; + }; + + config.file_created = true; + } + + static void TearDownTestSuite() + { + if (config.file_created) { webhdfs_helper->delete_data(config.remote_file_path); } + } + + static kvikio::test::Config config; + static std::unique_ptr webhdfs_helper; +}; + +kvikio::test::Config WebHdfsTest::config{}; +std::unique_ptr WebHdfsTest::webhdfs_helper{}; + +TEST_F(WebHdfsTest, constructor) +{ + auto do_test = [&](kvikio::RemoteHandle& remote_handle) { + kvikio::test::DevBuffer out_device_buf(config.num_elements); + auto read_size = remote_handle.read(out_device_buf.ptr, remote_handle.nbytes()); + auto out_host_buf = out_device_buf.to_vector(); + for (std::size_t i = 0; i < config.num_elements; ++i) { + EXPECT_EQ(config.host_buf[i], out_host_buf[i]); + } + EXPECT_EQ(read_size, remote_handle.nbytes()); + }; + + std::stringstream ss; + ss << "http://" << config.host << ":" << config.port << "/webhdfs/v1" << config.remote_file_path + << "?user.name=" << config._username; + std::vector remote_handles; + + remote_handles.emplace_back(std::make_unique(ss.str())); + remote_handles.emplace_back(std::make_unique( + config.host, config.port, config.remote_file_path, config._username)); + + for (auto& remote_handle : remote_handles) { + do_test(remote_handle); + } +} + +TEST_F(WebHdfsTest, read_parallel) +{ + auto do_test = [&](std::string const& url, + std::size_t num_elements_to_skip, + std::size_t num_elements_to_read, + std::size_t task_size) { + kvikio::RemoteHandle remote_handle{std::make_unique(url)}; + auto const offset = num_elements_to_skip * sizeof(value_type); + auto const expected_read_size = num_elements_to_read * sizeof(value_type); + + // host + { + std::vector out_host_buf(num_elements_to_read, {}); + auto fut = remote_handle.pread(out_host_buf.data(), expected_read_size, offset, task_size); + auto const read_size = fut.get(); + for (std::size_t i = 
num_elements_to_skip; i < num_elements_to_read; ++i) { + EXPECT_EQ(config.host_buf[i], out_host_buf[i - num_elements_to_skip]); + } + EXPECT_EQ(read_size, expected_read_size); + } + + // device + { + kvikio::test::DevBuffer out_device_buf(num_elements_to_read); + auto fut = remote_handle.pread(out_device_buf.ptr, expected_read_size, offset, task_size); + auto const read_size = fut.get(); + auto out_host_buf = out_device_buf.to_vector(); + for (std::size_t i = num_elements_to_skip; i < num_elements_to_read; ++i) { + EXPECT_EQ(config.host_buf[i], out_host_buf[i - num_elements_to_skip]); + } + EXPECT_EQ(read_size, expected_read_size); + } + }; + + std::stringstream ss; + ss << "http://" << config.host << ":" << config.port << "/webhdfs/v1" << config.remote_file_path + << "?user.name=" << config._username; + std::vector task_sizes{256, 1024, kvikio::defaults::task_size()}; + + for (const auto& task_size : task_sizes) { + for (const auto& num_elements_to_read : {10, 9999}) { + for (const auto& num_elements_to_skip : {0, 10, 100, 1000, 9999}) { + do_test(ss.str(), num_elements_to_skip, num_elements_to_read, task_size); + } + } + } +} diff --git a/cpp/tests/test_mmap.cpp b/cpp/tests/test_mmap.cpp index 7fb1e3a4fd..1c6e3a54b3 100644 --- a/cpp/tests/test_mmap.cpp +++ b/cpp/tests/test_mmap.cpp @@ -40,7 +40,7 @@ class MmapTest : public testing::Test { _filepath = tmp_dir.path() / "test.bin"; std::size_t num_elements = 1024ull * 1024ull; _host_buf = CreateTempFile(_filepath, num_elements); - _dev_buf = kvikio::test::DevBuffer{_host_buf}; + _dev_buf = kvikio::test::DevBuffer{_host_buf}; _page_size = kvikio::get_page_size(); } @@ -62,9 +62,8 @@ class MmapTest : public testing::Test { std::size_t _file_size; std::size_t _page_size; std::vector _host_buf; - kvikio::test::DevBuffer _dev_buf; - using value_type = decltype(_host_buf)::value_type; + kvikio::test::DevBuffer _dev_buf; }; TEST_F(MmapTest, invalid_file_open_flag) @@ -212,7 +211,7 @@ TEST_F(MmapTest, read_seq) // device { - kvikio::test::DevBuffer out_device_buf(num_elements_to_read); + kvikio::test::DevBuffer out_device_buf(num_elements_to_read); auto const read_size = mmap_handle.read(out_device_buf.ptr, expected_read_size, offset); auto out_host_buf = out_device_buf.to_vector(); for (std::size_t i = num_elements_to_skip; i < num_elements_to_read; ++i) { @@ -250,7 +249,7 @@ TEST_F(MmapTest, read_parallel) // device { - kvikio::test::DevBuffer out_device_buf(num_elements_to_read); + kvikio::test::DevBuffer out_device_buf(num_elements_to_read); auto fut = mmap_handle.pread(out_device_buf.ptr, expected_read_size, offset); auto const read_size = fut.get(); auto out_host_buf = out_device_buf.to_vector(); @@ -300,7 +299,7 @@ TEST_F(MmapTest, read_with_default_arguments) // device { - kvikio::test::DevBuffer out_device_buf(num_elements); + kvikio::test::DevBuffer out_device_buf(num_elements); { auto const read_size = mmap_handle.read(out_device_buf.ptr); diff --git a/cpp/tests/utils/hdfs_helper.cpp b/cpp/tests/utils/hdfs_helper.cpp new file mode 100644 index 0000000000..2bcbc7fed2 --- /dev/null +++ b/cpp/tests/utils/hdfs_helper.cpp @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "hdfs_helper.hpp" + +#include +#include +#include +#include +#include + +#include +#include + +namespace kvikio::test { + +namespace { + +/** + * @brief Helper struct that wraps a buffer view and tracks how many data have been processed via an + * offset value. + */ +struct tracked_buffer_t { + std::span buffer; + std::size_t offset; +}; + +/** + * @brief Callback for `CURLOPT_READFUNCTION` to upload data. + * + * @param data + * @param size Curl internal implementation always sets this parameter to 1 + * @param num_bytes_max The maximum number of bytes that can be uploaded + * @param userdata Must be cast from `tracked_buffer_t*` + * @return The number of bytes that have been copied to the transfer buffer. + */ +std::size_t callback_upload(char* data, std::size_t size, std::size_t num_bytes_max, void* userdata) +{ + auto new_data_size_max = size * num_bytes_max; + auto* tracked_buffer = reinterpret_cast(userdata); + + // All data have been uploaded. Nothing more to do. + if (tracked_buffer->offset >= tracked_buffer->buffer.size()) { return 0; } + + auto copy_size = + std::min(new_data_size_max, tracked_buffer->buffer.size() - tracked_buffer->offset); + std::memcpy(data, tracked_buffer->buffer.data() + tracked_buffer->offset, copy_size); + tracked_buffer->offset += copy_size; + + return copy_size; +} +} // namespace + +WebHdfsTestHelper::WebHdfsTestHelper(std::string const& host, + std::string const& port, + std::string const& username) + : _host{host}, _port{port}, _username{username} +{ + std::stringstream ss; + ss << "http://" << host << ":" << port << "/webhdfs/v1"; + _url_before_path = ss.str(); +} + +bool WebHdfsTestHelper::can_connect() noexcept +{ + try { + auto curl = create_curl_handle(); + + std::stringstream ss; + ss << _url_before_path << "/?user.name=" << _username << "&op=GETHOMEDIRECTORY"; + + curl.setopt(CURLOPT_URL, ss.str().c_str()); + + std::string response{}; + curl.setopt(CURLOPT_WRITEDATA, &response); + curl.setopt(CURLOPT_WRITEFUNCTION, kvikio::detail::callback_get_string_response); + curl.setopt(CURLOPT_FOLLOWLOCATION, 1L); + curl.perform(); + return true; + } catch (std::exception const& e) { + std::cout << e.what() << "\n"; + return false; + } +} + +bool WebHdfsTestHelper::upload_data(std::span buffer, + std::string const& remote_file_path) noexcept +{ + try { + // Official reference on how to create and write to a file: + // https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/WebHDFS.html#Create_and_Write_to_a_File + std::string redirect_url; + + { + // Step 1: Submit a HTTP PUT request without automatically following redirects and without + // sending the file data. 
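+      // For example (names in angle brackets are placeholders):
+      //   PUT http://<host>:<port>/webhdfs/v1<path>?user.name=<user>&op=CREATE
+      // The name node is expected to answer "307 Temporary Redirect" with a Location header
+      // pointing at the data node that will actually receive the file content.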
+ auto curl = create_curl_handle(); + + std::stringstream ss; + ss << _url_before_path << remote_file_path << "?user.name=" << _username << "&op=CREATE"; + std::string redirect_data_node_location{}; + + curl.setopt(CURLOPT_URL, ss.str().c_str()); + curl.setopt(CURLOPT_FOLLOWLOCATION, 0L); + curl.setopt(CURLOPT_CUSTOMREQUEST, "PUT"); + + std::string response{}; + curl.setopt(CURLOPT_HEADERDATA, &response); + curl.setopt(CURLOPT_HEADERFUNCTION, kvikio::detail::callback_get_string_response); + + curl.perform(); + + long http_status_code{}; + curl.getinfo(CURLINFO_RESPONSE_CODE, &http_status_code); + KVIKIO_EXPECT(http_status_code == 307, "Redirection from name node to data node failed."); + + std::regex const pattern{R"(Location:\s*(.*)\s*)"}; + std::smatch match_results; + bool found = std::regex_search(response, match_results, pattern); + KVIKIO_EXPECT(found, + "Regular expression search failed. Cannot extract redirect location from the " + "JSON response."); + redirect_url = match_results[1].str(); + } + + { + // Step 2: Submit another HTTP PUT request using the URL in the Location header with the file + // data to be written. + auto curl = create_curl_handle(); + curl.setopt(CURLOPT_URL, redirect_url.c_str()); + curl.setopt(CURLOPT_UPLOAD, 1L); + + tracked_buffer_t tracked_buffer{.buffer = buffer, .offset = 0}; + curl.setopt(CURLOPT_READDATA, &tracked_buffer); + curl.setopt(CURLOPT_READFUNCTION, callback_upload); + curl.setopt(CURLOPT_INFILESIZE_LARGE, static_cast(buffer.size())); + + curl.perform(); + + long http_status_code{}; + curl.getinfo(CURLINFO_RESPONSE_CODE, &http_status_code); + KVIKIO_EXPECT(http_status_code == 201, "File creation failed."); + } + + return true; + } catch (std::exception const& e) { + std::cout << e.what() << "\n"; + return false; + } +} + +bool WebHdfsTestHelper::delete_data(std::string const& remote_file_path) noexcept +{ + try { + // Official reference on how to delete a file: + // https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/WebHDFS.html#Delete_a_File.2FDirectory + auto curl = create_curl_handle(); + + std::stringstream ss; + ss << _url_before_path << remote_file_path << "?user.name=" << _username << "&op=DELETE"; + std::string const url = ss.str(); + std::string redirect_data_node_location{}; + + curl.setopt(CURLOPT_URL, url.c_str()); + curl.setopt(CURLOPT_FOLLOWLOCATION, 1L); + curl.setopt(CURLOPT_CUSTOMREQUEST, "DELETE"); + + std::string response{}; + curl.setopt(CURLOPT_HEADERDATA, &response); + curl.setopt(CURLOPT_HEADERFUNCTION, kvikio::detail::callback_get_string_response); + + curl.perform(); + + long http_status_code{}; + curl.getinfo(CURLINFO_RESPONSE_CODE, &http_status_code); + KVIKIO_EXPECT(http_status_code == 200, "File deletion failed."); + + return true; + } catch (std::exception const& e) { + std::cout << e.what() << "\n"; + return false; + } +} +} // namespace kvikio::test diff --git a/cpp/tests/utils/hdfs_helper.hpp b/cpp/tests/utils/hdfs_helper.hpp new file mode 100644 index 0000000000..7cc5da377c --- /dev/null +++ b/cpp/tests/utils/hdfs_helper.hpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace kvikio::test { + +/** + * @brief Helper class to create and upload a file on WebHDFS so as to enable read testing. + */ +class WebHdfsTestHelper { + private: + std::string _host; + std::string _port; + std::string _username; + std::string _url_before_path; + + public: + WebHdfsTestHelper(std::string const& host, std::string const& port, std::string const& username); + + /** + * @brief Whether KvikIO can connect to the WebHDFS server. + * + * @return A boolean answer. + */ + bool can_connect() noexcept; + + /** + * @brief Copy the data from a host buffer to a remote file on the WebHDFS server. + * + * @param buffer View to the host buffer whose data will be copied to the WebHDFS server + * @param remote_file_path Remote file path + * @return True if the file has been successfully uploaded; false otherwise. + */ + bool upload_data(std::span buffer, std::string const& remote_file_path) noexcept; + + /** + * @brief Delete a remote file on the WebHDFS server. + * + * @param remote_file_path Remote file path + * @return True if the file has been successfully deleted; false otherwise. + */ + bool delete_data(std::string const& remote_file_path) noexcept; +}; + +} // namespace kvikio::test diff --git a/cpp/tests/utils/utils.hpp b/cpp/tests/utils/utils.hpp index bb50d07c13..7d733621a7 100644 --- a/cpp/tests/utils/utils.hpp +++ b/cpp/tests/utils/utils.hpp @@ -108,6 +108,7 @@ class TempDir { /** * @brief Help class for creating and comparing buffers. 
*/ +template class DevBuffer { public: std::size_t nelem; @@ -116,12 +117,12 @@ class DevBuffer { DevBuffer() : nelem{0}, nbytes{0} {}; - DevBuffer(std::size_t nelem) : nelem{nelem}, nbytes{nelem * sizeof(std::int64_t)} + DevBuffer(std::size_t nelem) : nelem{nelem}, nbytes{nelem * sizeof(T)} { KVIKIO_CHECK_CUDA(cudaMalloc(&ptr, nbytes)); KVIKIO_CHECK_CUDA(cudaMemset(ptr, 0, nbytes)); } - DevBuffer(std::vector const& host_buffer) : DevBuffer{host_buffer.size()} + DevBuffer(std::vector const& host_buffer) : DevBuffer{host_buffer.size()} { KVIKIO_CHECK_CUDA(cudaMemcpy(ptr, host_buffer.data(), nbytes, cudaMemcpyHostToDevice)); } @@ -143,9 +144,9 @@ class DevBuffer { ~DevBuffer() noexcept { cudaFree(ptr); } - [[nodiscard]] static DevBuffer arange(std::size_t nelem, std::int64_t start = 0) + [[nodiscard]] static DevBuffer arange(std::size_t nelem, T start = 0) { - std::vector host_buffer(nelem); + std::vector host_buffer(nelem); std::iota(host_buffer.begin(), host_buffer.end(), start); return DevBuffer{host_buffer}; } @@ -157,9 +158,9 @@ class DevBuffer { return ret; } - [[nodiscard]] std::vector to_vector() const + [[nodiscard]] std::vector to_vector() const { - std::vector ret(nelem); + std::vector ret(nelem); KVIKIO_CHECK_CUDA(cudaMemcpy(ret.data(), this->ptr, nbytes, cudaMemcpyDeviceToHost)); return ret; } @@ -177,7 +178,8 @@ class DevBuffer { /** * @brief Check that two buffers are equal */ -inline void expect_equal(DevBuffer const& a, DevBuffer const& b) +template +inline void expect_equal(DevBuffer const& a, DevBuffer const& b) { EXPECT_EQ(a.nbytes, b.nbytes); auto a_vec = a.to_vector(); From a35bf582306553c40dc6eaa9c29a1e49ddb969fc Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 11 Aug 2025 11:31:18 -0400 Subject: [PATCH 050/103] Improve KvikIO Python binding performance by releasing GIL wherever deemed necessary (#796) This PR improves the Python binding performance by releasing the Global Interpreter Lock (GIL) wherever necessary. The tasks include: - For function declarations, add `nogil` if missing. Only one such case has been identified, which defines an embedded template function. - At the call site of a C++ function, add `with nogil` context if missing. All the other changes fall into this category. Closes #795 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/796 --- python/kvikio/kvikio/_lib/buffer.pyx | 13 +- python/kvikio/kvikio/_lib/cufile_driver.pyx | 74 ++++++--- python/kvikio/kvikio/_lib/defaults.pyx | 71 +++++++-- python/kvikio/kvikio/_lib/file_handle.pyx | 159 +++++++++++++------- python/kvikio/kvikio/_lib/future.pyx | 12 +- python/kvikio/kvikio/_lib/mmap.pyx | 70 ++++++--- python/kvikio/kvikio/_lib/remote_handle.pyx | 116 ++++++++++---- 7 files changed, 370 insertions(+), 145 deletions(-) diff --git a/python/kvikio/kvikio/_lib/buffer.pyx b/python/kvikio/kvikio/_lib/buffer.pyx index 3b90f09816..a7b638eaf4 100644 --- a/python/kvikio/kvikio/_lib/buffer.pyx +++ b/python/kvikio/kvikio/_lib/buffer.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. # See file LICENSE for terms. 
# distutils: language = c++ @@ -17,14 +17,16 @@ def memory_register(buf) -> None: if not isinstance(buf, Array): buf = Array(buf) cdef Array arr = buf - cpp_memory_register(arr.ptr) + with nogil: + cpp_memory_register(arr.ptr) def memory_deregister(buf) -> None: if not isinstance(buf, Array): buf = Array(buf) cdef Array arr = buf - cpp_memory_deregister(arr.ptr) + with nogil: + cpp_memory_deregister(arr.ptr) cdef extern from "" nogil: @@ -32,4 +34,7 @@ cdef extern from "" nogil: def bounce_buffer_free() -> int: - return cpp_alloc_retain_clear() + cdef size_t result + with nogil: + result = cpp_alloc_retain_clear() + return result diff --git a/python/kvikio/kvikio/_lib/cufile_driver.pyx b/python/kvikio/kvikio/_lib/cufile_driver.pyx index 0488eb3b20..0bc44b2066 100644 --- a/python/kvikio/kvikio/_lib/cufile_driver.pyx +++ b/python/kvikio/kvikio/_lib/cufile_driver.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. # See file LICENSE for terms. # distutils: language = c++ @@ -15,15 +15,20 @@ cdef extern from "" nogil: def libcufile_version() -> int: - return cpp_libcufile_version() + cdef int version + with nogil: + version = cpp_libcufile_version() + return version def driver_open(): - cpp_driver_open() + with nogil: + cpp_driver_open() def driver_close(): - cpp_driver_close() + with nogil: + cpp_driver_close() cdef extern from "" nogil: @@ -49,55 +54,90 @@ cdef class DriverProperties: @property def is_gds_available(self) -> bool: + cdef bool result try: - return self._handle.is_gds_available() + with nogil: + result = self._handle.is_gds_available() + return result except RuntimeError: return False @property def major_version(self) -> bool: - return self._handle.get_nvfs_major_version() + cdef unsigned int version + with nogil: + version = self._handle.get_nvfs_major_version() + return version @property def minor_version(self) -> bool: - return self._handle.get_nvfs_minor_version() + cdef unsigned int version + with nogil: + version = self._handle.get_nvfs_minor_version() + return version @property def allow_compat_mode(self) -> bool: - return self._handle.get_nvfs_allow_compat_mode() + cdef bool result + with nogil: + result = self._handle.get_nvfs_allow_compat_mode() + return result @property def poll_mode(self) -> bool: - return self._handle.get_nvfs_poll_mode() + cdef bool result + with nogil: + result = self._handle.get_nvfs_poll_mode() + return result @poll_mode.setter def poll_mode(self, enable: bool) -> None: - self._handle.set_nvfs_poll_mode(enable) + cdef bool cpp_enable = enable + with nogil: + self._handle.set_nvfs_poll_mode(cpp_enable) @property def poll_thresh_size(self) -> int: - return self._handle.get_nvfs_poll_thresh_size() + cdef size_t size + with nogil: + size = self._handle.get_nvfs_poll_thresh_size() + return size @poll_thresh_size.setter def poll_thresh_size(self, size_in_kb: int) -> None: - self._handle.set_nvfs_poll_thresh_size(size_in_kb) + cdef size_t size = size_in_kb + with nogil: + self._handle.set_nvfs_poll_thresh_size(size) @property def max_device_cache_size(self) -> int: - return self._handle.get_max_device_cache_size() + cdef size_t size + with nogil: + size = self._handle.get_max_device_cache_size() + return size @max_device_cache_size.setter def max_device_cache_size(self, size_in_kb: int) -> None: - self._handle.set_max_device_cache_size(size_in_kb) + cdef size_t size = size_in_kb + with nogil: + self._handle.set_max_device_cache_size(size) @property def 
per_buffer_cache_size(self) -> int: - return self._handle.get_per_buffer_cache_size() + cdef size_t size + with nogil: + size = self._handle.get_per_buffer_cache_size() + return size @property def max_pinned_memory_size(self) -> int: - return self._handle.get_max_pinned_memory_size() + cdef size_t size + with nogil: + size = self._handle.get_max_pinned_memory_size() + return size @max_pinned_memory_size.setter def max_pinned_memory_size(self, size_in_kb: int) -> None: - self._handle.set_max_pinned_memory_size(size_in_kb) + cdef size_t size = size_in_kb + with nogil: + self._handle.set_max_pinned_memory_size(size) diff --git a/python/kvikio/kvikio/_lib/defaults.pyx b/python/kvikio/kvikio/_lib/defaults.pyx index de511211e6..2fb6065680 100644 --- a/python/kvikio/kvikio/_lib/defaults.pyx +++ b/python/kvikio/kvikio/_lib/defaults.pyx @@ -43,68 +43,107 @@ cdef extern from "" namespace "kvikio" nogil: def is_compat_mode_preferred() -> bool: - return cpp_is_compat_mode_preferred() + cdef bool result + with nogil: + result = cpp_is_compat_mode_preferred() + return result def compat_mode() -> CompatMode: - return cpp_compat_mode() + cdef CompatMode result + with nogil: + result = cpp_compat_mode() + return result def set_compat_mode(compat_mode: CompatMode) -> None: - cpp_set_compat_mode(compat_mode) + with nogil: + cpp_set_compat_mode(compat_mode) def thread_pool_nthreads() -> int: - return cpp_thread_pool_nthreads() + cdef unsigned int result + with nogil: + result = cpp_thread_pool_nthreads() + return result def set_thread_pool_nthreads(nthreads: int) -> None: - cpp_set_thread_pool_nthreads(nthreads) + cdef unsigned int cpp_nthreads = nthreads + with nogil: + cpp_set_thread_pool_nthreads(cpp_nthreads) def task_size() -> int: - return cpp_task_size() + cdef size_t result + with nogil: + result = cpp_task_size() + return result def set_task_size(nbytes: int) -> None: - cpp_set_task_size(nbytes) + cdef size_t cpp_nbytes = nbytes + with nogil: + cpp_set_task_size(cpp_nbytes) def gds_threshold() -> int: - return cpp_gds_threshold() + cdef size_t result + with nogil: + result = cpp_gds_threshold() + return result def set_gds_threshold(nbytes: int) -> None: - cpp_set_gds_threshold(nbytes) + cdef size_t cpp_nbytes = nbytes + with nogil: + cpp_set_gds_threshold(cpp_nbytes) def bounce_buffer_size() -> int: - return cpp_bounce_buffer_size() + cdef size_t result + with nogil: + result = cpp_bounce_buffer_size() + return result def set_bounce_buffer_size(nbytes: int) -> None: - cpp_set_bounce_buffer_size(nbytes) + cdef size_t cpp_nbytes = nbytes + with nogil: + cpp_set_bounce_buffer_size(cpp_nbytes) def http_max_attempts() -> int: - return cpp_http_max_attempts() + cdef size_t result + with nogil: + result = cpp_http_max_attempts() + return result def set_http_max_attempts(attempts: int) -> None: - cpp_set_http_max_attempts(attempts) + cdef size_t cpp_attempts = attempts + with nogil: + cpp_set_http_max_attempts(cpp_attempts) def http_timeout() -> int: - return cpp_http_timeout() + cdef long result + with nogil: + result = cpp_http_timeout() + return result def set_http_timeout(timeout: int) -> None: - return cpp_set_http_timeout(timeout) + cdef long cpp_timeout = timeout + with nogil: + cpp_set_http_timeout(cpp_timeout) def http_status_codes() -> list[int]: + # Cannot use nogil here because we need the GIL for list creation return cpp_http_status_codes() def set_http_status_codes(status_codes: list[int]) -> None: - return cpp_set_http_status_codes(status_codes) + # Cannot use nogil here because we need the 
GIL for list conversion + cpp_set_http_status_codes(status_codes) diff --git a/python/kvikio/kvikio/_lib/file_handle.pyx b/python/kvikio/kvikio/_lib/file_handle.pyx index b17d283433..062d0d9fb5 100644 --- a/python/kvikio/kvikio/_lib/file_handle.pyx +++ b/python/kvikio/kvikio/_lib/file_handle.pyx @@ -94,88 +94,127 @@ cdef class CuFile: cdef FileHandle _handle def __init__(self, file_path, flags="r"): - self._handle = move( - FileHandle( - os.fsencode(file_path), - str(flags).encode() + cdef string cpp_file_path = os.fsencode(file_path) + cdef string cpp_flags = str(flags).encode() + with nogil: + self._handle = move( + FileHandle( + cpp_file_path, + cpp_flags + ) ) - ) def close(self) -> None: - self._handle.close() + with nogil: + self._handle.close() def closed(self) -> bool: - return self._handle.closed() + cdef bool result + with nogil: + result = self._handle.closed() + return result def fileno(self) -> int: - return self._handle.fd() + cdef int result + with nogil: + result = self._handle.fd() + return result def open_flags(self) -> int: - return self._handle.fd_open_flags() + cdef int result + with nogil: + result = self._handle.fd_open_flags() + return result def pread(self, buf, size: Optional[int], file_offset: int, task_size) -> IOFuture: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) - return _wrap_io_future( - self._handle.pread( + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_task_size = task_size if task_size else defaults.task_size() + cdef future[size_t] fut + with nogil: + fut = self._handle.pread( info.first, info.second, - file_offset, - task_size if task_size else defaults.task_size() + cpp_file_offset, + cpp_task_size ) - ) + return _wrap_io_future(fut) def pwrite(self, buf, size: Optional[int], file_offset: int, task_size) -> IOFuture: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) - return _wrap_io_future( - self._handle.pwrite( + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_task_size = task_size if task_size else defaults.task_size() + cdef future[size_t] fut + with nogil: + fut = self._handle.pwrite( info.first, info.second, - file_offset, - task_size if task_size else defaults.task_size() + cpp_file_offset, + cpp_task_size ) - ) + return _wrap_io_future(fut) def read(self, buf, size: Optional[int], file_offset: int, dev_offset: int) -> int: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, False) - return self._handle.read( - info.first, - info.second, - file_offset, - dev_offset, - ) + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_dev_offset = dev_offset + cdef size_t result + with nogil: + result = self._handle.read( + info.first, + info.second, + cpp_file_offset, + cpp_dev_offset, + ) + return result def write(self, buf, size: Optional[int], file_offset: int, dev_offset: int) -> int: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, False) - return self._handle.write( - info.first, - info.second, - file_offset, - dev_offset, - ) + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_dev_offset = dev_offset + cdef size_t result + with nogil: + result = self._handle.write( + info.first, + info.second, + cpp_file_offset, + cpp_dev_offset, + ) + return result def read_async(self, buf, size: Optional[int], file_offset: int, dev_offset: int, st: uintptr_t) -> IOFutureStream: - stream = st + cdef CUstream stream = st cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, False) - return 
_wrap_stream_future(self._handle.read_async( - info.first, - info.second, - file_offset, - dev_offset, - stream, - )) + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_dev_offset = dev_offset + cdef cpp_StreamFuture fut + with nogil: + fut = self._handle.read_async( + info.first, + info.second, + cpp_file_offset, + cpp_dev_offset, + stream, + ) + return _wrap_stream_future(fut) def write_async(self, buf, size: Optional[int], file_offset: int, dev_offset: int, st: uintptr_t) -> IOFutureStream: - stream = st + cdef CUstream stream = st cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, False) - return _wrap_stream_future(self._handle.write_async( - info.first, - info.second, - file_offset, - dev_offset, - stream, - )) + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_dev_offset = dev_offset + cdef cpp_StreamFuture fut + with nogil: + fut = self._handle.write_async( + info.first, + info.second, + cpp_file_offset, + cpp_dev_offset, + stream, + ) + return _wrap_stream_future(fut) cdef extern from "" nogil: pair[size_t, size_t] cpp_get_page_cache_info_str \ @@ -191,21 +230,37 @@ cdef extern from "" nogil: def get_page_cache_info(file: Union[os.PathLike, str, int, io.IOBase]) \ -> tuple[int, int]: + cdef pair[size_t, size_t] result + cdef string path_bytes + cdef int fd + if isinstance(file, os.PathLike) or isinstance(file, str): # file is a path or a string object path_bytes = os.fsencode(file) - return cpp_get_page_cache_info_str(path_bytes) + with nogil: + result = cpp_get_page_cache_info_str(path_bytes) + return result elif isinstance(file, int): # file is a file descriptor - return cpp_get_page_cache_info_int(file) + fd = file + with nogil: + result = cpp_get_page_cache_info_int(fd) + return result elif isinstance(file, io.IOBase): # file is a file object # pass its file descriptor to the underlying C++ function - return cpp_get_page_cache_info_int(file.fileno()) + fd = file.fileno() + with nogil: + result = cpp_get_page_cache_info_int(fd) + return result else: raise ValueError("The type of `file` must be `os.PathLike`, `str`, `int`, " "or `io.IOBase`") -def clear_page_cache(reclaim_dentries_and_inodes: bool, clear_dirty_pages: bool): - return cpp_clear_page_cache(reclaim_dentries_and_inodes, clear_dirty_pages) +def clear_page_cache(reclaim_dentries_and_inodes: bool, + clear_dirty_pages: bool) -> bool: + cdef bool result + with nogil: + result = cpp_clear_page_cache(reclaim_dentries_and_inodes, clear_dirty_pages) + return result diff --git a/python/kvikio/kvikio/_lib/future.pyx b/python/kvikio/kvikio/_lib/future.pyx index da6ab308dc..94d9dd6d9c 100644 --- a/python/kvikio/kvikio/_lib/future.pyx +++ b/python/kvikio/kvikio/_lib/future.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. # See file LICENSE for terms. 
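# The IOFuture/IOFutureStream wrappers below expose KvikIO's non-blocking
# reads and writes to Python; with the GIL released during the C++ calls,
# other Python threads can run while the I/O proceeds. A short usage sketch
# (the file path and buffer size are illustrative):
#
#     import kvikio
#     import numpy as np
#
#     buf = np.empty(4096, dtype=np.uint8)
#     with kvikio.CuFile("/tmp/data.bin", "r") as f:
#         fut = f.pread(buf)      # non-blocking; returns an IOFuture
#         ...                     # overlap other Python work here
#         nbytes = fut.get()      # blocks until the parallel read completes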
# distutils: language = c++ @@ -15,7 +15,10 @@ cdef extern from "" namespace "kvikio" nogil: cdef class IOFutureStream: """Wrap a C++ StreamFuture in a Python object""" def check_bytes_done(self) -> int: - return self._handle.check_bytes_done() + cdef size_t bytes_done + with nogil: + bytes_done = self._handle.check_bytes_done() + return bytes_done cdef IOFutureStream _wrap_stream_future(cpp_StreamFuture &fut): @@ -34,7 +37,10 @@ cdef class IOFuture: return ret def done(self) -> bool: - return is_future_done(self._handle) + cdef bool result + with nogil: + result = is_future_done(self._handle) + return result cdef IOFuture _wrap_io_future(future[size_t] &fut): diff --git a/python/kvikio/kvikio/_lib/mmap.pyx b/python/kvikio/kvikio/_lib/mmap.pyx index ac4889b25c..46fc3846d0 100644 --- a/python/kvikio/kvikio/_lib/mmap.pyx +++ b/python/kvikio/kvikio/_lib/mmap.pyx @@ -48,14 +48,17 @@ cdef class InternalMmapHandle: if not os.path.exists(file_path): raise RuntimeError("Unable to open file") + cdef string cpp_path_bytes = os.fsencode(file_path) + cdef string cpp_flags_bytes = str(flags).encode() + cdef optional[size_t] cpp_initial_map_size if initial_map_size is None: cpp_initial_map_size = nullopt else: cpp_initial_map_size = (initial_map_size) - path_bytes = os.fsencode(file_path) - flags_bytes = str(flags).encode() + cdef size_t cpp_initial_map_offset = initial_map_offset + cdef fcntl.mode_t cpp_mode = mode cdef optional[int] cpp_map_flags if map_flags is None: @@ -63,54 +66,79 @@ cdef class InternalMmapHandle: else: cpp_map_flags = (map_flags) - self._handle = move(CppMmapHandle(path_bytes, - flags_bytes, - cpp_initial_map_size, - initial_map_offset, - mode, - cpp_map_flags)) + with nogil: + self._handle = move(CppMmapHandle(cpp_path_bytes, + cpp_flags_bytes, + cpp_initial_map_size, + cpp_initial_map_offset, + cpp_mode, + cpp_map_flags)) def initial_map_size(self) -> int: - return self._handle.initial_map_size() + cdef size_t result + with nogil: + result = self._handle.initial_map_size() + return result def initial_map_offset(self) -> int: - return self._handle.initial_map_offset() + cdef size_t result + with nogil: + result = self._handle.initial_map_offset() + return result def file_size(self) -> int: - return self._handle.file_size() + cdef size_t result + with nogil: + result = self._handle.file_size() + return result def close(self) -> None: - self._handle.close() + with nogil: + self._handle.close() def closed(self) -> bool: - return self._handle.closed() + cdef bool result + with nogil: + result = self._handle.closed() + return result def read(self, buf: Any, size: Optional[int] = None, offset: int = 0) -> int: + cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) cdef optional[size_t] cpp_size if size is None: cpp_size = nullopt else: cpp_size = (size) - cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) - return self._handle.read(info.first, - cpp_size, - offset) + cdef size_t cpp_offset = offset + cdef size_t result + with nogil: + result = self._handle.read(info.first, + cpp_size, + cpp_offset) + return result def pread(self, buf: Any, size: Optional[int] = None, offset: int = 0, task_size: Optional[int] = None) -> IOFuture: cdef optional[size_t] cpp_size + cdef size_t cpp_task_size + if size is None: cpp_size = nullopt else: cpp_size = (size) cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) + cdef size_t cpp_offset = offset + if task_size is None: cpp_task_size = defaults.task_size() else: cpp_task_size = 
task_size - return _wrap_io_future(self._handle.pread(info.first, - cpp_size, - offset, - cpp_task_size)) + cdef future[size_t] cpp_future + with nogil: + cpp_future = self._handle.pread(info.first, + cpp_size, + cpp_offset, + cpp_task_size) + return _wrap_io_future(cpp_future) diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx index 0c2ae4c3e4..5cb2d5cdd9 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -65,8 +65,9 @@ cdef pair[string, string] _to_string_pair(str s1, str s2): """Wrap two Python string objects in a C++ pair""" return pair[string, string](_to_string(s1), _to_string(s2)) + # Helper function to cast an endpoint to its base class `RemoteEndpoint` -cdef extern from *: +cdef extern from * nogil: """ template std::unique_ptr cast_to_remote_endpoint(T endpoint) @@ -86,11 +87,16 @@ cdef class RemoteFile: nbytes: Optional[int], ): cdef RemoteFile ret = RemoteFile() + if nbytes is None: - ret._handle = make_unique[cpp_RemoteHandle](move(ep)) + with nogil: + ret._handle = make_unique[cpp_RemoteHandle](move(ep)) return ret + cdef size_t n = nbytes - ret._handle = make_unique[cpp_RemoteHandle](move(ep), n) + + with nogil: + ret._handle = make_unique[cpp_RemoteHandle](move(ep), n) return ret @staticmethod @@ -98,10 +104,16 @@ cdef class RemoteFile: url: str, nbytes: Optional[int], ): + cdef string cpp_url = _to_string(url) + cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + cpp_endpoint = cast_to_remote_endpoint( + make_unique[cpp_HttpEndpoint](cpp_url) + ) + return RemoteFile._from_endpoint( - cast_to_remote_endpoint( - make_unique[cpp_HttpEndpoint](_to_string(url)) - ), + move(cpp_endpoint), nbytes ) @@ -111,12 +123,18 @@ cdef class RemoteFile: object_name: str, nbytes: Optional[int], ): + cdef pair[string, string] bucket_and_object_names = _to_string_pair( + bucket_name, object_name + ) + cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + cpp_endpoint = cast_to_remote_endpoint( + make_unique[cpp_S3Endpoint](bucket_and_object_names) + ) + return RemoteFile._from_endpoint( - cast_to_remote_endpoint( - make_unique[cpp_S3Endpoint]( - _to_string_pair(bucket_name, object_name) - ) - ), + move(cpp_endpoint), nbytes ) @@ -125,10 +143,16 @@ cdef class RemoteFile: url: str, nbytes: Optional[int], ): + cdef string cpp_url = _to_string(url) + cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + cpp_endpoint = cast_to_remote_endpoint( + make_unique[cpp_S3Endpoint](cpp_url) + ) + return RemoteFile._from_endpoint( - cast_to_remote_endpoint( - make_unique[cpp_S3Endpoint](_to_string(url)) - ), + move(cpp_endpoint), nbytes ) @@ -137,11 +161,18 @@ cdef class RemoteFile: url: str, nbytes: Optional[int], ): - cdef pair[string, string] bucket_and_object = cpp_parse_s3_url(_to_string(url)) + cdef string cpp_url = _to_string(url) + cdef pair[string, string] bucket_and_object_names + cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + bucket_and_object_names = cpp_parse_s3_url(cpp_url) + cpp_endpoint = cast_to_remote_endpoint( + make_unique[cpp_S3Endpoint](bucket_and_object_names) + ) + return RemoteFile._from_endpoint( - cast_to_remote_endpoint( - make_unique[cpp_S3Endpoint](bucket_and_object) - ), + move(cpp_endpoint), nbytes ) @@ -150,34 +181,55 @@ cdef class RemoteFile: presigned_url: str, nbytes: Optional[int], ): + cdef string cpp_url = _to_string(presigned_url) + cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + 
cpp_endpoint = cast_to_remote_endpoint( + make_unique[cpp_S3EndpointWithPresignedUrl](cpp_url) + ) + return RemoteFile._from_endpoint( - cast_to_remote_endpoint( - make_unique[cpp_S3EndpointWithPresignedUrl](_to_string(presigned_url)) - ), + move(cpp_endpoint), nbytes ) def __str__(self) -> str: - cdef string ep_str = deref(self._handle).endpoint().str() + cdef string ep_str + with nogil: + ep_str = deref(self._handle).endpoint().str() return f'<{self.__class__.__name__} "{ep_str.decode()}">' def nbytes(self) -> int: - return deref(self._handle).nbytes() + cdef size_t result + with nogil: + result = deref(self._handle).nbytes() + return result def read(self, buf, size: Optional[int], file_offset: int) -> int: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) - return deref(self._handle).read( - info.first, - info.second, - file_offset, - ) + cdef size_t cpp_file_offset = file_offset + cdef size_t result + + with nogil: + result = deref(self._handle).read( + info.first, + info.second, + cpp_file_offset, + ) + + return result def pread(self, buf, size: Optional[int], file_offset: int) -> IOFuture: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) - return _wrap_io_future( - deref(self._handle).pread( + cdef size_t cpp_file_offset = file_offset + cdef future[size_t] fut + + with nogil: + fut = deref(self._handle).pread( info.first, info.second, - file_offset, + cpp_file_offset, ) - ) + + return _wrap_io_future(fut) From 9f8e8736cc3c7ce4f9e39ed23add4d65889fd7c8 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 15 Aug 2025 11:20:42 -0400 Subject: [PATCH 051/103] Support WebHDFS (2/2): Python binding (#791) ## Summary This PR adds Python binding for the WebHDFS support Depends on PR https://github.com/rapidsai/kvikio/pull/788 Closes https://github.com/rapidsai/kvikio/issues/787 Python's built-in package `http.server` is well suited to server mocking. It enables high-level testing for the client. Closes https://github.com/rapidsai/kvikio/issues/634 too. Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/791 --- python/kvikio/kvikio/_lib/remote_handle.pyx | 15 ++ python/kvikio/kvikio/remote_file.py | 21 ++ python/kvikio/tests/test_hdfs_io.py | 234 ++++++++++++++++++++ python/kvikio/tests/test_s3_io.py | 13 +- python/kvikio/tests/utils.py | 15 ++ 5 files changed, 289 insertions(+), 9 deletions(-) create mode 100644 python/kvikio/tests/test_hdfs_io.py create mode 100644 python/kvikio/tests/utils.py diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx index 5cb2d5cdd9..17222685ae 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -53,6 +53,9 @@ cdef extern from "" nogil: size_t file_offset ) except + +cdef extern from "" nogil: + cdef cppclass cpp_WebHdfsEndpoint "kvikio::WebHdfsEndpoint"(cpp_RemoteEndpoint): + cpp_WebHdfsEndpoint(string url) except + cdef string _to_string(str s): """Convert Python object to a C++ string (if None, return the empty string)""" @@ -194,6 +197,18 @@ cdef class RemoteFile: nbytes ) + @staticmethod + def open_webhdfs( + url: str, + nbytes: Optional[int], + ): + return RemoteFile._from_endpoint( + cast_to_remote_endpoint( + make_unique[cpp_WebHdfsEndpoint](_to_string(url)) + ), + nbytes + ) + def __str__(self) -> str: cdef string ep_str with nogil: diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index f06a40b45f..105c42e438 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -164,6 +164,27 @@ def open_s3_presigned_url( ) ) + @classmethod + def open_webhdfs( + cls, + url: str, + nbytes: Optional[int] = None, + ) -> RemoteFile: + """Open a file on Apache Hadoop Distributed File System (HDFS) using WebHDFS. + + If KvikIO is run within a Docker, the argument ``--network host`` needs to be + passed to the ``docker run`` command. + + Parameters + ---------- + url + URL to the remote file. + nbytes + The size of the file. If None, KvikIO will ask the server for the file + size. + """ + return RemoteFile(_get_remote_module().RemoteFile.open_webhdfs(url, nbytes)) + def close(self) -> None: """Close the file""" pass diff --git a/python/kvikio/tests/test_hdfs_io.py b/python/kvikio/tests/test_hdfs_io.py new file mode 100644 index 0000000000..eab4d0a5fb --- /dev/null +++ b/python/kvikio/tests/test_hdfs_io.py @@ -0,0 +1,234 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. 
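# A minimal usage sketch for the WebHDFS binding added above (the namenode
# host, port, and file path are placeholders for a real HDFS deployment):
#
#     import numpy as np
#     from kvikio import remote_file
#
#     url = "http://namenode:9870/webhdfs/v1/home/user/data.bin"
#     handle = remote_file.RemoteFile.open_webhdfs(url)
#     buf = np.empty(handle.nbytes(), dtype=np.uint8)
#     handle.pread(buf).get()    # parallel read into the host buffer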
+ +from __future__ import annotations + +import json +import urllib.parse +from http.server import BaseHTTPRequestHandler, HTTPServer +from multiprocessing import Process, Queue +from typing import Any, Generator + +import cupy as cp +import numpy as np +import numpy.typing as npt +import pytest +import utils + +import kvikio.defaults +from kvikio import remote_file + + +class RemoteFileData: + def __init__(self, file_path: str, num_elements: int, dtype: npt.DTypeLike) -> None: + self.file_path = file_path + self.num_elements = num_elements + self.dtype = dtype + self.buf = np.arange(0, self.num_elements, dtype=self.dtype) + self.file_size = self.buf.nbytes + + +@pytest.fixture(scope="module") +def remote_file_data() -> RemoteFileData: + return RemoteFileData( + file_path="/webhdfs/v1/home/test_user/test_file.bin", + num_elements=1024 * 1024, + dtype=np.float64, + ) + + +def run_mock_server(queue: Queue[int], file_size: int, buf: npt.NDArray[Any]) -> None: + """Run HTTP server in a separate process""" + + class WebHdfsHandler(BaseHTTPRequestHandler): + def do_GET(self) -> None: + parsed_url = urllib.parse.urlparse(self.path) + query_dict = urllib.parse.parse_qs(parsed_url.query) + op = query_dict["op"] + + # Client requests file size + if op == ["GETFILESTATUS"]: + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.end_headers() + response = json.dumps({"length": file_size}) + self.wfile.write(response.encode()) + + # Client requests file content + elif op == ["OPEN"]: + offset = int(query_dict["offset"][0]) + length = int(query_dict["length"][0]) + + # Convert byte offsets to element indices + element_size = buf.itemsize + begin_idx = offset // element_size + end_idx = (offset + length) // element_size + range_data = buf[begin_idx:end_idx].tobytes() + + self.send_response(200) + self.send_header("Content-Type", "application/octet-stream") + self.send_header("Content-Length", str(len(range_data))) + self.end_headers() + self.wfile.write(range_data) + else: + self.send_response(400) + self.end_headers() + + def log_message(self, format: str, *args: Any) -> None: + pass + + port = utils.find_free_port() + server = HTTPServer((utils.localhost(), port), WebHdfsHandler) + + # Send port back to parent process + queue.put(port) + + server.serve_forever() + + +@pytest.fixture +def mock_webhdfs_server(remote_file_data: RemoteFileData) -> Generator[str, None, None]: + """Start WebHDFS mock server in a separate process""" + queue: Queue[int] = Queue() + server_process = Process( + target=run_mock_server, + args=( + queue, + remote_file_data.file_size, + remote_file_data.buf, + ), + daemon=True, + ) + server_process.start() + + # Get the port the server is running on + port = queue.get(timeout=5) + + yield f"http://{utils.localhost()}:{port}" + + # Cleanup + server_process.terminate() + server_process.join(timeout=1) + + +class TestWebHdfsOperations: + @pytest.mark.parametrize("url_query", ["", "?op=OPEN"]) + def test_get_file_size( + self, + mock_webhdfs_server: str, + remote_file_data: RemoteFileData, + url_query: str, + ) -> None: + url = f"{mock_webhdfs_server}{remote_file_data.file_path}{url_query}" + handle = remote_file.RemoteFile.open_webhdfs(url) + file_size = handle.nbytes() + assert file_size == remote_file_data.file_size + + def test_parallel_read( + self, mock_webhdfs_server: str, remote_file_data: RemoteFileData, xp: Any + ) -> None: + url = f"{mock_webhdfs_server}{remote_file_data.file_path}" + handle = remote_file.RemoteFile.open_webhdfs(url) + 
result_buf = xp.arange( + 0, remote_file_data.num_elements, dtype=remote_file_data.dtype + ) + fut = handle.pread(result_buf) + read_size = fut.get() + + assert read_size == remote_file_data.file_size + + result_buf_np = result_buf + if isinstance(result_buf, cp.ndarray): + result_buf_np = cp.asnumpy(result_buf) + assert np.array_equal(result_buf_np, remote_file_data.buf) + + @pytest.mark.parametrize("size", [80, 8 * 9999]) + @pytest.mark.parametrize("offset", [0, 800, 8000, 8 * 9999]) + @pytest.mark.parametrize("num_threads", [1, 4]) + @pytest.mark.parametrize("task_size", [1024, 4096]) + def test_parallel_read_partial( + self, + mock_webhdfs_server: str, + remote_file_data: RemoteFileData, + size: int, + offset: int, + num_threads: int, + task_size: int, + xp: Any, + ) -> None: + url = f"{mock_webhdfs_server}{remote_file_data.file_path}" + element_size = remote_file_data.buf.itemsize + begin_idx = offset // element_size + end_idx = (offset + size) // element_size + expected_buf = remote_file_data.buf[begin_idx:end_idx] + + actual_num_elements = size // np.dtype(remote_file_data.dtype).itemsize + with kvikio.defaults.set({"num_threads": num_threads, "task_size": task_size}): + handle = remote_file.RemoteFile.open_webhdfs(url) + result_buf = xp.zeros(actual_num_elements, dtype=remote_file_data.dtype) + fut = handle.pread(result_buf, size, offset) + read_size = fut.get() + + assert read_size == size + + result_buf_np = result_buf + if isinstance(result_buf, cp.ndarray): + result_buf_np = cp.asnumpy(result_buf) + assert np.array_equal(result_buf_np, expected_buf) + + +class TestWebHdfsErrors: + @pytest.fixture + def mock_bad_server( + self, remote_file_data: RemoteFileData + ) -> Generator[str, None, None]: + """Start a bad WebHDFS server that returns invalid JSON""" + + def run_bad_server(queue: Queue[int]) -> None: + class BadHandler(BaseHTTPRequestHandler): + def do_GET(self): + parsed = urllib.parse.urlparse(self.path) + query = urllib.parse.parse_qs(parsed.query) + + if query.get("op") == ["GETFILESTATUS"]: + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.end_headers() + # Missing "length" field + response = json.dumps({}) + self.wfile.write(response.encode()) + else: + self.send_response(400) + self.end_headers() + + def log_message(self, format, *args): + pass + + port = utils.find_free_port() + server = HTTPServer((utils.localhost(), port), BadHandler) + queue.put(port) + server.serve_forever() + + queue: Queue[int] = Queue() + server_process = Process(target=run_bad_server, args=(queue,), daemon=True) + server_process.start() + + port = queue.get(timeout=5) + + yield f"http://{utils.localhost()}:{port}" + + server_process.terminate() + server_process.join(timeout=1) + + def test_missing_file_size( + self, mock_bad_server: str, remote_file_data: RemoteFileData + ) -> None: + url = f"{mock_bad_server}{remote_file_data.file_path}" + + with pytest.raises( + RuntimeError, + match="Regular expression search failed. " + "Cannot extract file length from the JSON response.", + ): + handle = remote_file.RemoteFile.open_webhdfs(url) + handle.nbytes() diff --git a/python/kvikio/tests/test_s3_io.py b/python/kvikio/tests/test_s3_io.py index 58a73184a8..2256bc1284 100644 --- a/python/kvikio/tests/test_s3_io.py +++ b/python/kvikio/tests/test_s3_io.py @@ -2,11 +2,11 @@ # See file LICENSE for terms. 
import multiprocessing as mp -import socket import time from contextlib import contextmanager import pytest +import utils import kvikio import kvikio.defaults @@ -26,18 +26,13 @@ @pytest.fixture(scope="session") -def endpoint_ip(): - return "127.0.0.1" +def endpoint_ip() -> str: + return utils.localhost() @pytest.fixture(scope="session") def endpoint_port(): - # Return a free port per worker session. - sock = socket.socket() - sock.bind(("127.0.0.1", 0)) - port = sock.getsockname()[1] - sock.close() - return port + return utils.find_free_port() def start_s3_server(ip_address, port): diff --git a/python/kvikio/tests/utils.py b/python/kvikio/tests/utils.py new file mode 100644 index 0000000000..58d801efcf --- /dev/null +++ b/python/kvikio/tests/utils.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +import socket + + +def localhost() -> str: + return "127.0.0.1" + + +def find_free_port(host: str = localhost()) -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind((host, 0)) + _, port = s.getsockname() + return port From e20807ee11d2e8f7795c7ffd1c1bb5433e87a199 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 18 Aug 2025 13:50:31 -0700 Subject: [PATCH 052/103] Remove Python nvCOMP bindings and Zarr 2 support (#798) Removes the features that utilize nvCOMP - Python bindings and Zarr 2 support. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Gil Forsyth (https://github.com/gforsyth) - Tom Augspurger (https://github.com/TomAugspurger) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/798 --- README.md | 1 - docs/source/zarr.rst | 13 - notebooks/nvcomp_batch_codec.ipynb | 357 --- notebooks/nvcomp_vs_zarr_lz4.ipynb | 2832 ----------------- notebooks/zarr.ipynb | 364 --- python/kvikio/cli/gpu_compressor.py | 136 - python/kvikio/examples/zarr_cupy_nvcomp.py | 88 - python/kvikio/kvikio/_lib/CMakeLists.txt | 2 +- python/kvikio/kvikio/_lib/libnvcomp.pyx | 235 -- python/kvikio/kvikio/_lib/libnvcomp_ll.pyx | 1182 ------- python/kvikio/kvikio/_lib/nvcomp_cxx_api.pxd | 212 -- .../kvikio/kvikio/_lib/nvcomp_ll_cxx_api.pxd | 362 --- python/kvikio/kvikio/_nvcomp.py | 368 --- python/kvikio/kvikio/_nvcomp_codec.py | 228 -- .../kvikio/benchmarks/single_node_io.py | 34 +- python/kvikio/kvikio/benchmarks/zarr_io.py | 87 +- python/kvikio/kvikio/nvcomp.py | 20 - python/kvikio/kvikio/nvcomp_codec.py | 9 - python/kvikio/kvikio/zarr/__init__.py | 2 - python/kvikio/kvikio/zarr/_zarr_python_2.py | 400 --- python/kvikio/kvikio/zarr/_zarr_python_3.py | 12 +- python/kvikio/tests/conftest.py | 13 - python/kvikio/tests/test_benchmarks.py | 52 - python/kvikio/tests/test_examples.py | 13 - python/kvikio/tests/test_nvcomp.py | 444 --- python/kvikio/tests/test_nvcomp_codec.py | 243 -- python/kvikio/tests/test_zarr.py | 292 -- 27 files changed, 38 insertions(+), 7963 deletions(-) delete mode 100644 notebooks/nvcomp_batch_codec.ipynb delete mode 100644 notebooks/nvcomp_vs_zarr_lz4.ipynb delete mode 100644 notebooks/zarr.ipynb delete mode 100755 python/kvikio/cli/gpu_compressor.py delete mode 100644 python/kvikio/examples/zarr_cupy_nvcomp.py delete mode 100644 python/kvikio/kvikio/_lib/libnvcomp.pyx delete mode 100644 python/kvikio/kvikio/_lib/libnvcomp_ll.pyx delete mode 100644 python/kvikio/kvikio/_lib/nvcomp_cxx_api.pxd delete mode 100644 python/kvikio/kvikio/_lib/nvcomp_ll_cxx_api.pxd delete mode 100644 python/kvikio/kvikio/_nvcomp.py delete mode 100644 
python/kvikio/kvikio/_nvcomp_codec.py delete mode 100644 python/kvikio/kvikio/nvcomp.py delete mode 100644 python/kvikio/kvikio/nvcomp_codec.py delete mode 100644 python/kvikio/kvikio/zarr/_zarr_python_2.py delete mode 100644 python/kvikio/tests/test_nvcomp.py delete mode 100644 python/kvikio/tests/test_nvcomp_codec.py delete mode 100644 python/kvikio/tests/test_zarr.py diff --git a/README.md b/README.md index ac2ee2a4bd..549d8bcfc4 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,6 @@ KvikIO also works efficiently when GDS isn't available and can read/write both h * Concurrent reads and writes using an internal thread pool. * Non-blocking API. * Transparently handles reads and writes to/from memory on both host and device. -* (Deprecated) Provides Python bindings to [nvCOMP](https://docs.nvidia.com/cuda/nvcomp/py_api.html). ### Documentation diff --git a/docs/source/zarr.rst b/docs/source/zarr.rst index 019eff2767..f8e4564011 100644 --- a/docs/source/zarr.rst +++ b/docs/source/zarr.rst @@ -28,16 +28,3 @@ You can use any store, but KvikIO provides :py:class:`kvikio.zarr.GDSStore` to e ... ) >>> type(z[:10, :10]) cupy.ndarray - - - -Zarr Python 2.x ---------------- - - -The following uses zarr-python 2.x, and is an example of how to use the convenience function :py:meth:`kvikio.zarr.open_cupy_array` -to create a new Zarr array and how to open an existing Zarr array. - - -.. literalinclude:: ../../python/kvikio/examples/zarr_cupy_nvcomp.py - :language: python diff --git a/notebooks/nvcomp_batch_codec.ipynb b/notebooks/nvcomp_batch_codec.ipynb deleted file mode 100644 index f4f4689f3c..0000000000 --- a/notebooks/nvcomp_batch_codec.ipynb +++ /dev/null @@ -1,357 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "b543ae63", - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "import numcodecs\n", - "\n", - "import numpy as np\n", - "\n", - "import zarr\n", - "\n", - "from IPython.display import display\n", - "\n", - "np.set_printoptions(precision=4, suppress=True)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "1a8e55d5", - "metadata": {}, - "source": [ - "### Basic usage\n", - "\n", - "Get nvCOMP codec from numcodecs registry:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "75524650", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "NvCompBatchCodec(algorithm='lz4', options={})" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "NVCOMP_CODEC_ID = \"nvcomp_batch\"\n", - "\n", - "# Currently supported algorithms.\n", - "LZ4_ALGO = \"LZ4\"\n", - "GDEFLATE_ALGO = \"Gdeflate\"\n", - "SNAPPY_ALGO = \"snappy\"\n", - "ZSTD_ALGO = \"zstd\"\n", - "\n", - "codec = numcodecs.registry.get_codec(dict(id=NVCOMP_CODEC_ID, algorithm=LZ4_ALGO))\n", - "# To pass algorithm-specific options, use options parameter:\n", - "# codec = numcodecs.registry.get_codec(dict(id=NVCOMP_CODEC_ID, algo=LZ4_ALGO, options={\"data_type\": 1}))\n", - "\n", - "display(codec)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "46641ccb", - "metadata": {}, - "source": [ - "Create data:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "12a4fffd", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[ 1.6243, -0.6118, -0.5282, ..., 0.0436, -0.62 , 0.698 ],\n", - " [-0.4471, 1.2245, 0.4035, ..., 0.4203, 0.811 , 1.0444],\n", - " [-0.4009, 0.824 , -0.5623, ..., 0.7848, -0.9554, 0.5859],\n", - " ...,\n", - " [ 1.3797, 
0.1387, 1.2255, ..., 1.8051, 0.3722, 0.1253],\n", - " [ 0.7348, -0.7115, -0.1248, ..., -1.9533, -0.7684, -0.5345],\n", - " [ 0.2183, -0.8654, 0.8886, ..., -1.0141, -0.0627, -1.4379]],\n", - " dtype=float32)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
<table><tbody>
<tr><th>Type</th><td>zarr.core.Array</td></tr>
<tr><th>Data type</th><td>float32</td></tr>
<tr><th>Shape</th><td>(100, 100)</td></tr>
<tr><th>Chunk shape</th><td>(10, 10)</td></tr>
<tr><th>Order</th><td>C</td></tr>
<tr><th>Read-only</th><td>False</td></tr>
<tr><th>Compressor</th><td>NvCompBatchCodec(algorithm='lz4', options={})</td></tr>
<tr><th>Store type</th><td>zarr.storage.KVStore</td></tr>
<tr><th>No. bytes</th><td>40000 (39.1K)</td></tr>
<tr><th>No. bytes stored</th><td>41006 (40.0K)</td></tr>
<tr><th>Storage ratio</th><td>1.0</td></tr>
<tr><th>Chunks initialized</th><td>100/100</td></tr>
</tbody></table>
" - ], - "text/plain": [ - "Type : zarr.core.Array\n", - "Data type : float32\n", - "Shape : (100, 100)\n", - "Chunk shape : (10, 10)\n", - "Order : C\n", - "Read-only : False\n", - "Compressor : NvCompBatchCodec(algorithm='lz4', options={})\n", - "Store type : zarr.storage.KVStore\n", - "No. bytes : 40000 (39.1K)\n", - "No. bytes stored : 41006 (40.0K)\n", - "Storage ratio : 1.0\n", - "Chunks initialized : 100/100" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "shape = (100, 100)\n", - "chunks = (10, 10)\n", - "\n", - "np.random.seed(1)\n", - "\n", - "x = zarr.array(np.random.randn(*shape).astype(np.float32), chunks=chunks, compressor=codec)\n", - "display(x[:])\n", - "display(x.info)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "c15cbdff", - "metadata": {}, - "source": [ - "Store and load back the data:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "730cde85", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'chunks': [10, 10],\n", - " 'compressor': {'algorithm': 'lz4', 'id': 'nvcomp_batch', 'options': {}},\n", - " 'dtype': 'Typezarr.core.ArrayData typefloat32Shape(100, 100)Chunk shape(10, 10)OrderCRead-onlyFalseCompressorNvCompBatchCodec(algorithm='lz4', options={})Store typezarr.storage.KVStoreNo. bytes40000 (39.1K)No. bytes stored41006 (40.0K)Storage ratio1.0Chunks initialized100/100" - ], - "text/plain": [ - "Type : zarr.core.Array\n", - "Data type : float32\n", - "Shape : (100, 100)\n", - "Chunk shape : (10, 10)\n", - "Order : C\n", - "Read-only : False\n", - "Compressor : NvCompBatchCodec(algorithm='lz4', options={})\n", - "Store type : zarr.storage.KVStore\n", - "No. bytes : 40000 (39.1K)\n", - "No. bytes stored : 41006 (40.0K)\n", - "Storage ratio : 1.0\n", - "Chunks initialized : 100/100" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "y = zarr.open_array(zarr_store)\n", - "display(y.info)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "5b6cc2ca", - "metadata": {}, - "outputs": [], - "source": [ - "# Test the roundtrip.\n", - "np.testing.assert_equal(y[:], x[:])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "1a8eea79", - "metadata": {}, - "source": [ - "### CPU compression / GPU decompression\n", - "\n", - "Some algorithms, such as LZ4, can be used interchangeably on CPU and GPU. For example, the data might be created using CPU LZ4 codec and then decompressed using GPU version of LZ4 codec." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "87d25b76", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'chunks': [10, 10],\n", - " 'compressor': {'acceleration': 1, 'id': 'lz4'},\n", - " 'dtype': 'Typezarr.core.ArrayData typefloat32Shape(100, 100)Chunk shape(10, 10)OrderCRead-onlyFalseCompressorLZ4(acceleration=1)Store typezarr.storage.KVStoreNo. bytes40000 (39.1K)No. bytes stored40973 (40.0K)Storage ratio1.0Chunks initialized100/100" - ], - "text/plain": [ - "Type : zarr.core.Array\n", - "Data type : float32\n", - "Shape : (100, 100)\n", - "Chunk shape : (10, 10)\n", - "Order : C\n", - "Read-only : False\n", - "Compressor : LZ4(acceleration=1)\n", - "Store type : zarr.storage.KVStore\n", - "No. bytes : 40000 (39.1K)\n", - "No. bytes stored : 40973 (40.0K)\n", - "Storage ratio : 1.0\n", - "Chunks initialized : 100/100" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
<table><tbody>
<tr><th>Type</th><td>zarr.core.Array</td></tr>
<tr><th>Data type</th><td>float32</td></tr>
<tr><th>Shape</th><td>(100, 100)</td></tr>
<tr><th>Chunk shape</th><td>(10, 10)</td></tr>
<tr><th>Order</th><td>C</td></tr>
<tr><th>Read-only</th><td>False</td></tr>
<tr><th>Compressor</th><td>NvCompBatchCodec(algorithm='lz4', options={})</td></tr>
<tr><th>Store type</th><td>zarr.storage.KVStore</td></tr>
<tr><th>No. bytes</th><td>40000 (39.1K)</td></tr>
<tr><th>No. bytes stored</th><td>40883 (39.9K)</td></tr>
<tr><th>Storage ratio</th><td>1.0</td></tr>
<tr><th>Chunks initialized</th><td>100/100</td></tr>
</tbody></table>
" - ], - "text/plain": [ - "Type : zarr.core.Array\n", - "Data type : float32\n", - "Shape : (100, 100)\n", - "Chunk shape : (10, 10)\n", - "Order : C\n", - "Read-only : False\n", - "Compressor : NvCompBatchCodec(algorithm='lz4', options={})\n", - "Store type : zarr.storage.KVStore\n", - "No. bytes : 40000 (39.1K)\n", - "No. bytes stored : 40883 (39.9K)\n", - "Storage ratio : 1.0\n", - "Chunks initialized : 100/100" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Get default (CPU) implementation of LZ4 codec.\n", - "cpu_codec = numcodecs.registry.get_codec({\"id\": \"lz4\"})\n", - "\n", - "x = zarr.array(np.random.randn(*shape).astype(np.float32), chunks=chunks, compressor=cpu_codec)\n", - "# Define a simple, dictionary-based store. In real scenarios this can be a filesystem or some other persistent store.\n", - "store = {}\n", - "zarr.save_array(store, x, compressor=cpu_codec)\n", - "\n", - "# Check that the data was written by the expected codec.\n", - "meta = json.loads(store[\".zarray\"])\n", - "display(meta)\n", - "assert meta[\"compressor\"][\"id\"] == \"lz4\"\n", - "\n", - "# Change codec to GPU/nvCOMP-based.\n", - "meta[\"compressor\"] = {\"id\": NVCOMP_CODEC_ID, \"algorithm\": LZ4_ALGO}\n", - "store[\".zarray\"] = json.dumps(meta).encode()\n", - "\n", - "y = zarr.open_array(store, compressor=codec)\n", - "\n", - "display(x.info)\n", - "display(y.info)\n", - "\n", - "np.testing.assert_equal(x[:], y[:])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b9294992", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/nvcomp_vs_zarr_lz4.ipynb b/notebooks/nvcomp_vs_zarr_lz4.ipynb deleted file mode 100644 index 3b6d947ac3..0000000000 --- a/notebooks/nvcomp_vs_zarr_lz4.ipynb +++ /dev/null @@ -1,2832 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 42, - "id": "f95b6759-533a-470b-8f08-5f91ebcea625", - "metadata": {}, - "outputs": [], - "source": [ - "import cupy as cp\n", - "import numpy as np\n", - "import pandas as pd\n", - "import time\n", - "import zarr\n", - "\n", - "import kvikio.nvcomp\n" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "d1e60a9b-0bca-4c66-b2f0-829acc3b1ba2", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "# conda install -c conda-forge zarr" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "2cedb529-c0fa-4883-a2fd-78b1ad3c1a59", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2013929216, 1006964608, 503482304, 251741152, 125870576, 62935288, 31467644, 15733822, 7866911, 3933455, 1966727, 983363, 491681, 245840, 122920, 61460, 30730, 15365, 7682, 3841]\n" - ] - } - ], - "source": [ - "HOST_LZ4_MAX = 2013929216 # 2113929216\n", - "sizes = list(map(lambda x: HOST_LZ4_MAX//(2**x), np.arange(20)))\n", - "print(sizes)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "39483573-e79b-4dca-aee3-13bf392da3a7", - "metadata": {}, - "outputs": [], - "source": [ - "input_size = []\n", - 
"cascaded_size = []\n", - "cascaded_temp_size = []\n", - "cascaded_round_trip_time = []\n", - "lz4_gpu_size = []\n", - "lz4_gpu_temp_size = []\n", - "lz4_gpu_round_trip_time = []\n", - "bitcomp_gpu_size = []\n", - "bitcomp_gpu_temp_size = []\n", - "bitcomp_gpu_round_trip_time = []\n", - "lz4_size = []\n", - "lz4_round_trip_time = []" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "ccd9b1e7-b607-4948-8256-73bedf1ec7a8", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "821.61s - pydevd: Sending message related to process being replaced timed-out after 5 seconds\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2023-09-29 13:44:43-- http://textfiles.com/etext/NONFICTION/kjv10.txt\n", - "Resolving textfiles.com (textfiles.com)... 208.86.224.90\n", - "Connecting to textfiles.com (textfiles.com)|208.86.224.90|:80... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 4432803 (4.2M) [text/plain]\n", - "Saving to: ‘kjv10.txt.2’\n", - "\n", - "kjv10.txt.2 100%[===================>] 4.23M 316KB/s in 14s \n", - "\n", - "2023-09-29 13:44:58 (304 KB/s) - ‘kjv10.txt.2’ saved [4432803/4432803]\n", - "\n" - ] - } - ], - "source": [ - "!wget http://textfiles.com/etext/NONFICTION/kjv10.txt" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "4c9a5c4c-4c49-4834-8dc2-3e6fc11ea930", - "metadata": {}, - "outputs": [], - "source": [ - "text = open('kjv10.txt').read()\n", - "bib = np.frombuffer(bytes(text, 'utf-8'), dtype=np.int8)\n", - "data_buffer = np.tile(bib, 500)" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "74740819-b987-4012-ba6c-ed3d3b9afd60", - "metadata": {}, - "outputs": [], - "source": [ - "# One of the three below keys, this will set the arrangement of test data for a full run of the notebook.\n", - "TARGET = \"Ascending\"\n", - "DTYPE = cp.int32" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "0a1307ed-034c-4943-a7e1-36665cba8ad5", - "metadata": {}, - "outputs": [], - "source": [ - "data = {\n", - " \"Ascending\": np.arange(0, HOST_LZ4_MAX, dtype=np.int32),\n", - " \"Random\": np.random.randint(0, 100, HOST_LZ4_MAX, dtype=np.int32),\n", - " \"Text\": data_buffer\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "68adbb33-ddb7-4603-8863-fdd25b8bdc51", - "metadata": {}, - "outputs": [], - "source": [ - "def get_host_data(offset, dtype):\n", - " exemplar = np.array([1], dtype=dtype)\n", - " print(offset)\n", - " print(exemplar.itemsize)\n", - " print(data[TARGET].itemsize)\n", - " index = offset // data[TARGET].itemsize\n", - " index = index - (index % exemplar.itemsize)\n", - " print(index)\n", - " return data[TARGET][0:index].view(dtype)" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "f067cdc2-ee14-4258-b89d-0bb4a224c698", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2013929216\n", - "4\n", - "4\n", - "503482304\n", - "-----\n", - "Input size: 2013929216\n", - "Cascaded GPU compressor output size: 33434464\n", - "Cascaded GPU decompressor output size: 2013929216\n", - "Cascaded GPU compress/decompress round trip time: 0.1076362133026123\n", - "2021826967\n", - "Lz4 zarr time: 4.681669235229492\n", - "Lz4 compressed size: 2021826967\n", - "1006964608\n", - "4\n", - "4\n", - "251741152\n", - "-----\n", - "Input size: 1006964608\n", - "Cascaded GPU compressor output size: 16717276\n", - "Cascaded GPU 
decompressor output size: 1006964608\n", - "Cascaded GPU compress/decompress round trip time: 0.11769247055053711\n", - "1010913478\n", - "Lz4 zarr time: 2.57978892326355\n", - "Lz4 compressed size: 1010913478\n", - "503482304\n", - "4\n", - "4\n", - "125870576\n", - "-----\n", - "Input size: 503482304\n", - "Cascaded GPU compressor output size: 8358716\n", - "Cascaded GPU decompressor output size: 503482304\n", - "Cascaded GPU compress/decompress round trip time: 0.05775332450866699\n", - "505456734\n", - "Lz4 zarr time: 1.2365527153015137\n", - "Lz4 compressed size: 505456734\n", - "251741152\n", - "4\n", - "4\n", - "62935288\n", - "-----\n", - "Input size: 251741152\n", - "Cascaded GPU compressor output size: 4179436\n", - "Cascaded GPU decompressor output size: 251741152\n", - "Cascaded GPU compress/decompress round trip time: 0.0284881591796875\n", - "252728362\n", - "Lz4 zarr time: 0.5986642837524414\n", - "Lz4 compressed size: 252728362\n", - "125870576\n", - "4\n", - "4\n", - "31467644\n", - "-----\n", - "Input size: 125870576\n", - "Cascaded GPU compressor output size: 2089796\n", - "Cascaded GPU decompressor output size: 125870576\n", - "Cascaded GPU compress/decompress round trip time: 0.01472783088684082\n", - "126364175\n", - "Lz4 zarr time: 0.30330395698547363\n", - "Lz4 compressed size: 126364175\n", - "62935288\n", - "4\n", - "4\n", - "15733820\n", - "-----\n", - "Input size: 62935280\n", - "Cascaded GPU compressor output size: 1044976\n", - "Cascaded GPU decompressor output size: 62935280\n", - "Cascaded GPU compress/decompress round trip time: 0.007399559020996094\n", - "63182074\n", - "Lz4 zarr time: 0.1610257625579834\n", - "Lz4 compressed size: 63182074\n", - "31467644\n", - "4\n", - "4\n", - "7866908\n", - "-----\n", - "Input size: 31467632\n", - "Cascaded GPU compressor output size: 522532\n", - "Cascaded GPU decompressor output size: 31467632\n", - "Cascaded GPU compress/decompress round trip time: 0.004503726959228516\n", - "31591024\n", - "Lz4 zarr time: 0.1471562385559082\n", - "Lz4 compressed size: 31591024\n", - "15733822\n", - "4\n", - "4\n", - "3933452\n", - "-----\n", - "Input size: 15733808\n", - "Cascaded GPU compressor output size: 261344\n", - "Cascaded GPU decompressor output size: 15733808\n", - "Cascaded GPU compress/decompress round trip time: 0.0025734901428222656\n", - "15795499\n", - "Lz4 zarr time: 0.03436875343322754\n", - "Lz4 compressed size: 15795499\n", - "7866911\n", - "4\n", - "4\n", - "1966724\n", - "-----\n", - "Input size: 7866896\n", - "Cascaded GPU compressor output size: 130716\n", - "Cascaded GPU decompressor output size: 7866896\n", - "Cascaded GPU compress/decompress round trip time: 0.0018618106842041016\n", - "7897736\n", - "Lz4 zarr time: 0.010539531707763672\n", - "Lz4 compressed size: 7897736\n", - "3933455\n", - "4\n", - "4\n", - "983360\n", - "-----\n", - "Input size: 3933440\n", - "Cascaded GPU compressor output size: 65436\n", - "Cascaded GPU decompressor output size: 3933440\n", - "Cascaded GPU compress/decompress round trip time: 0.0017323493957519531\n", - "3948855\n", - "Lz4 zarr time: 0.028203964233398438\n", - "Lz4 compressed size: 3948855\n", - "1966727\n", - "4\n", - "4\n", - "491680\n", - "-----\n", - "Input size: 1966720\n", - "Cascaded GPU compressor output size: 32796\n", - "Cascaded GPU decompressor output size: 1966720\n", - "Cascaded GPU compress/decompress round trip time: 0.0020630359649658203\n", - "1974422\n", - "Lz4 zarr time: 0.002621889114379883\n", - "Lz4 compressed size: 1974422\n", - "983363\n", - 
"4\n", - "4\n", - "245840\n", - "-----\n", - "Input size: 983360\n", - "Cascaded GPU compressor output size: 16476\n", - "Cascaded GPU decompressor output size: 983360\n", - "Cascaded GPU compress/decompress round trip time: 0.0014410018920898438\n", - "987206\n", - "Lz4 zarr time: 0.0007197856903076172\n", - "Lz4 compressed size: 987206\n", - "491681\n", - "4\n", - "4\n", - "122920\n", - "-----\n", - "Input size: 491680\n", - "Cascaded GPU compressor output size: 8316\n", - "Cascaded GPU decompressor output size: 491680\n", - "Cascaded GPU compress/decompress round trip time: 0.0011644363403320312\n", - "493597\n", - "Lz4 zarr time: 0.000965118408203125\n", - "Lz4 compressed size: 493597\n", - "245840\n", - "4\n", - "4\n", - "61460\n", - "-----\n", - "Input size: 245840\n", - "Cascaded GPU compressor output size: 4236\n", - "Cascaded GPU decompressor output size: 245840\n", - "Cascaded GPU compress/decompress round trip time: 0.0015044212341308594\n", - "246793\n", - "Lz4 zarr time: 0.0004220008850097656\n", - "Lz4 compressed size: 246793\n", - "122920\n", - "4\n", - "4\n", - "30728\n", - "-----\n", - "Input size: 122912\n", - "Cascaded GPU compressor output size: 2184\n", - "Cascaded GPU decompressor output size: 122912\n", - "Cascaded GPU compress/decompress round trip time: 0.0011115074157714844\n", - "123383\n", - "Lz4 zarr time: 0.0002646446228027344\n", - "Lz4 compressed size: 123383\n", - "61460\n", - "4\n", - "4\n", - "15364\n", - "-----\n", - "Input size: 61456\n", - "Cascaded GPU compressor output size: 1148\n", - "Cascaded GPU decompressor output size: 61456\n", - "Cascaded GPU compress/decompress round trip time: 0.0009233951568603516\n", - "61678\n", - "Lz4 zarr time: 0.00020623207092285156\n", - "Lz4 compressed size: 61678\n", - "30730\n", - "4\n", - "4\n", - "7680\n", - "-----\n", - "Input size: 30720\n", - "Cascaded GPU compressor output size: 632\n", - "Cascaded GPU decompressor output size: 30720\n", - "Cascaded GPU compress/decompress round trip time: 0.001186370849609375\n", - "30822\n", - "Lz4 zarr time: 0.00011777877807617188\n", - "Lz4 compressed size: 30822\n", - "15365\n", - "4\n", - "4\n", - "3840\n", - "-----\n", - "Input size: 15360\n", - "Cascaded GPU compressor output size: 360\n", - "Cascaded GPU decompressor output size: 15360\n", - "Cascaded GPU compress/decompress round trip time: 0.001523733139038086\n", - "15401\n", - "Lz4 zarr time: 0.0003781318664550781\n", - "Lz4 compressed size: 15401\n", - "7682\n", - "4\n", - "4\n", - "1920\n", - "-----\n", - "Input size: 7680\n", - "Cascaded GPU compressor output size: 224\n", - "Cascaded GPU decompressor output size: 7680\n", - "Cascaded GPU compress/decompress round trip time: 0.0012781620025634766\n", - "7699\n", - "Lz4 zarr time: 0.0001780986785888672\n", - "Lz4 compressed size: 7699\n", - "3841\n", - "4\n", - "4\n", - "960\n", - "-----\n", - "Input size: 3840\n", - "Cascaded GPU compressor output size: 156\n", - "Cascaded GPU decompressor output size: 3840\n", - "Cascaded GPU compress/decompress round trip time: 0.001318216323852539\n", - "3852\n", - "Lz4 zarr time: 0.00019931793212890625\n", - "Lz4 compressed size: 3852\n" - ] - } - ], - "source": [ - "input_size = []\n", - "cascaded_size = []\n", - "cascaded_temp_size = []\n", - "cascaded_round_trip_time = []\n", - "lz4_gpu_size = []\n", - "lz4_gpu_temp_size = []\n", - "lz4_gpu_round_trip_time = []\n", - "lz4_size = []\n", - "lz4_round_trip_time = []\n", - "for size in sizes:\n", - " data_host = get_host_data(size, DTYPE)\n", - " data_gpu = 
cp.array(data_host)\n", - " \"\"\"Cascaded GPU\"\"\"\n", - " t_gpu = time.time()\n", - " compressor = kvikio.nvcomp.CascadedManager(dtype=data_gpu.dtype)\n", - " compressed = compressor.compress(data_gpu)\n", - " output_size = compressed.nbytes\n", - "\n", - " decompressed = compressor.decompress(compressed)\n", - " decompressed_size = decompressed.size * decompressed.itemsize\n", - " input_size.append(data_gpu.size * data_gpu.itemsize)\n", - " cascaded_round_trip_time.append(time.time() - t_gpu)\n", - " cascaded_size.append(output_size)\n", - " print('-----')\n", - " print('Input size: ', data_gpu.size * data_gpu.itemsize)\n", - " print('Cascaded GPU compressor output size: ', output_size)\n", - " print('Cascaded GPU decompressor output size: ', decompressed_size)\n", - " print('Cascaded GPU compress/decompress round trip time: ',time.time() - t_gpu)\n", - " \n", - " del compressor\n", - " \n", - " \"\"\"LZ4 Host\"\"\"\n", - " lz4 = zarr.LZ4()\n", - " t_host = time.time()\n", - " host_compressed = lz4.encode(data_gpu.get())\n", - " del data_gpu\n", - " print(len(host_compressed))\n", - " host_compressed = host_compressed[:2113929216]\n", - " host_decompressed = lz4.decode(host_compressed)\n", - " print('Lz4 zarr time: ', time.time() - t_host)\n", - " print('Lz4 compressed size: ', len(host_compressed))\n", - " lz4_size.append(len(host_compressed))\n", - " lz4_round_trip_time.append(time.time() - t_host)" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "c981e8bc-e96a-4af4-9fe1-414aa2ff4c99", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2013929216\n", - "4\n", - "4\n", - "503482304\n", - "lz4 GPU compressor output size: 2022340697\n", - "lz4 GPU decompressor output size: 2013929216\n", - "lz4 GPU compress/decompress round trip time: 0.7271463871002197\n", - "1006964608\n", - "4\n", - "4\n", - "251741152\n", - "lz4 GPU compressor output size: 1011170371\n", - "lz4 GPU decompressor output size: 1006964608\n", - "lz4 GPU compress/decompress round trip time: 0.36713171005249023\n", - "503482304\n", - "4\n", - "4\n", - "125870576\n", - "lz4 GPU compressor output size: 505585200\n", - "lz4 GPU decompressor output size: 503482304\n", - "lz4 GPU compress/decompress round trip time: 0.1900792121887207\n", - "251741152\n", - "4\n", - "4\n", - "62935288\n", - "lz4 GPU compressor output size: 252792621\n", - "lz4 GPU decompressor output size: 251741152\n", - "lz4 GPU compress/decompress round trip time: 0.09049177169799805\n", - "125870576\n", - "4\n", - "4\n", - "31467644\n", - "lz4 GPU compressor output size: 126396327\n", - "lz4 GPU decompressor output size: 125870576\n", - "lz4 GPU compress/decompress round trip time: 0.04643416404724121\n", - "62935288\n", - "4\n", - "4\n", - "15733820\n", - "lz4 GPU compressor output size: 63198181\n", - "lz4 GPU decompressor output size: 62935280\n", - "lz4 GPU compress/decompress round trip time: 0.02284073829650879\n", - "31467644\n", - "4\n", - "4\n", - "7866908\n", - "lz4 GPU compressor output size: 31599109\n", - "lz4 GPU decompressor output size: 31467632\n", - "lz4 GPU compress/decompress round trip time: 0.015845537185668945\n", - "15733822\n", - "4\n", - "4\n", - "3933452\n", - "lz4 GPU compressor output size: 15799573\n", - "lz4 GPU decompressor output size: 15733808\n", - "lz4 GPU compress/decompress round trip time: 0.009501934051513672\n", - "7866911\n", - "4\n", - "4\n", - "1966724\n", - "lz4 GPU compressor output size: 7899801\n", - "lz4 GPU decompressor output size: 
7866896\n", - "lz4 GPU compress/decompress round trip time: 0.011568546295166016\n", - "3933455\n", - "4\n", - "4\n", - "983360\n", - "lz4 GPU compressor output size: 3949915\n", - "lz4 GPU decompressor output size: 3933440\n", - "lz4 GPU compress/decompress round trip time: 0.00696110725402832\n", - "1966727\n", - "4\n", - "4\n", - "491680\n", - "lz4 GPU compressor output size: 1974981\n", - "lz4 GPU decompressor output size: 1966720\n", - "lz4 GPU compress/decompress round trip time: 0.012327194213867188\n", - "983363\n", - "4\n", - "4\n", - "245840\n", - "lz4 GPU compressor output size: 987514\n", - "lz4 GPU decompressor output size: 983360\n", - "lz4 GPU compress/decompress round trip time: 0.006538867950439453\n", - "491681\n", - "4\n", - "4\n", - "122920\n", - "lz4 GPU compressor output size: 493774\n", - "lz4 GPU decompressor output size: 491680\n", - "lz4 GPU compress/decompress round trip time: 0.012677907943725586\n", - "245840\n", - "4\n", - "4\n", - "61460\n", - "lz4 GPU compressor output size: 246904\n", - "lz4 GPU decompressor output size: 245840\n", - "lz4 GPU compress/decompress round trip time: 0.006706953048706055\n", - "122920\n", - "4\n", - "4\n", - "30728\n", - "lz4 GPU compressor output size: 123459\n", - "lz4 GPU decompressor output size: 122912\n", - "lz4 GPU compress/decompress round trip time: 0.010996580123901367\n", - "61460\n", - "4\n", - "4\n", - "15364\n", - "lz4 GPU compressor output size: 61745\n", - "lz4 GPU decompressor output size: 61456\n", - "lz4 GPU compress/decompress round trip time: 0.006911039352416992\n", - "30730\n", - "4\n", - "4\n", - "7680\n", - "lz4 GPU compressor output size: 30907\n", - "lz4 GPU decompressor output size: 30720\n", - "lz4 GPU compress/decompress round trip time: 0.004134178161621094\n", - "15365\n", - "4\n", - "4\n", - "3840\n", - "lz4 GPU compressor output size: 15498\n", - "lz4 GPU decompressor output size: 15360\n", - "lz4 GPU compress/decompress round trip time: 0.0048847198486328125\n", - "7682\n", - "4\n", - "4\n", - "1920\n", - "lz4 GPU compressor output size: 7787\n", - "lz4 GPU decompressor output size: 7680\n", - "lz4 GPU compress/decompress round trip time: 0.0031135082244873047\n", - "3841\n", - "4\n", - "4\n", - "960\n", - "lz4 GPU compressor output size: 3940\n", - "lz4 GPU decompressor output size: 3840\n", - "lz4 GPU compress/decompress round trip time: 0.0027506351470947266\n" - ] - } - ], - "source": [ - "lz4_gpu_size = []\n", - "lz4_gpu_temp_size = []\n", - "lz4_gpu_round_trip_time = []\n", - "for size in sizes:\n", - " data_host = get_host_data(size, DTYPE)\n", - " data_gpu = cp.array(data_host)\n", - "\n", - " \"\"\"LZ4 GPU\"\"\"\n", - " data_gpu = cp.array(data_host)\n", - " t_gpu = time.time()\n", - " compressor = kvikio.nvcomp.LZ4Manager(dtype=data_gpu.dtype)\n", - " compressed = compressor.compress(data_gpu)\n", - " output_size = compressed.nbytes\n", - "\n", - " decompressed = compressor.decompress(compressed)\n", - " decompressed_size = decompressed.size * decompressed.itemsize\n", - " lz4_gpu_round_trip_time.append(time.time() - t_gpu)\n", - " lz4_gpu_size.append(output_size)\n", - " print('lz4 GPU compressor output size: ', output_size)\n", - " print('lz4 GPU decompressor output size: ', decompressed_size)\n", - " print('lz4 GPU compress/decompress round trip time: ',time.time() - t_gpu)" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "0b9e6efb-439b-4d9e-b221-1a728adee7d6", - "metadata": {}, - "outputs": [], - "source": [ - "# zarr lz4 max buffer size is 264241152 
int64s\n", - "# zarr lz4 max buffer size is 2113929216 bytes\n", - "# cascaded max buffer size is 2147483640 bytes\n", - "# cascaded max buffer size is 268435456 int64s" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "id": "9cd69f83-88de-4929-b760-b8ebfb916b8f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2013929216, 1006964608, 503482304, 251741152, 125870576, 62935280, 31467632, 15733808, 7866896, 3933440, 1966720, 983360, 491680, 245840, 122912, 61456, 30720, 15360, 7680, 3840]\n", - "[33434464, 16717276, 8358716, 4179436, 2089796, 1044976, 522532, 261344, 130716, 65436, 32796, 16476, 8316, 4236, 2184, 1148, 632, 360, 224, 156]\n", - "[]\n", - "[0.10751104354858398, 0.11756682395935059, 0.05767321586608887, 0.028416156768798828, 0.014620304107666016, 0.007331132888793945, 0.004427194595336914, 0.0025060176849365234, 0.0017902851104736328, 0.0016641616821289062, 0.001974821090698242, 0.0013790130615234375, 0.0011060237884521484, 0.0014438629150390625, 0.0010533332824707031, 0.0008640289306640625, 0.001127481460571289, 0.0014081001281738281, 0.0011692047119140625, 0.0012063980102539062]\n", - "[2022340697, 1011170371, 505585200, 252792621, 126396327, 63198181, 31599109, 15799573, 7899801, 3949915, 1974981, 987514, 493774, 246904, 123459, 61745, 30907, 15498, 7787, 3940]\n", - "[]\n", - "[0.7270452976226807, 0.3670234680175781, 0.18999958038330078, 0.09043264389038086, 0.04634451866149902, 0.022789478302001953, 0.015785932540893555, 0.009443283081054688, 0.011508703231811523, 0.00690460205078125, 0.012271881103515625, 0.00648951530456543, 0.012626171112060547, 0.006663322448730469, 0.010945320129394531, 0.00687098503112793, 0.004094123840332031, 0.004844188690185547, 0.0030717849731445312, 0.0027098655700683594]\n", - "[2021826967, 1010913478, 505456734, 252728362, 126364175, 63182074, 31591024, 15795499, 7897736, 3948855, 1974422, 987206, 493597, 246793, 123383, 61678, 30822, 15401, 7699, 3852]\n", - "[4.681788921356201, 2.579982280731201, 1.2367866039276123, 0.5987403392791748, 0.3033754825592041, 0.16110515594482422, 0.1472797393798828, 0.03442859649658203, 0.010602712631225586, 0.028273344039916992, 0.0026633739471435547, 0.0007534027099609375, 0.0009970664978027344, 0.0004544258117675781, 0.0002968311309814453, 0.0002384185791015625, 0.00015044212341308594, 0.00044274330139160156, 0.00023889541625976562, 0.00026869773864746094]\n" - ] - } - ], - "source": [ - "print(input_size)\n", - "print(cascaded_size)\n", - "print(cascaded_temp_size)\n", - "print(cascaded_round_trip_time)\n", - "print(lz4_gpu_size)\n", - "print(lz4_gpu_temp_size)\n", - "print(lz4_gpu_round_trip_time)\n", - "print(lz4_size)\n", - "print(lz4_round_trip_time)\n", - "df = pd.DataFrame({\n", - " 'Input Size (Bytes)': input_size,\n", - " 'cascaded_size': cascaded_size,\n", - " 'cascaded_round_trip_time': cascaded_round_trip_time,\n", - " 'lz4_gpu_size': lz4_gpu_size,\n", - " 'lz4_gpu_round_trip_time': lz4_gpu_round_trip_time,\n", - " 'lz4_size': lz4_size,\n", - " 'lz4_round_trip_time': lz4_round_trip_time\n", - "})" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "id": "c7a23383-a073-4156-9be6-9da6b8c9026e", - "metadata": {}, - "outputs": [], - "source": [ - "### You'll need the following to display the upcoming plots. 
###\n", - "\n", - "# !conda install -c conda-forge plotly\n", - "# !npm install require" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "id": "8a7d2c60-79d7-4840-a5fb-c7e1eb42f829", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['Input Size (Bytes)', 'cascaded_size', 'cascaded_round_trip_time',\n", - " 'lz4_gpu_size', 'lz4_gpu_round_trip_time', 'lz4_size',\n", - " 'lz4_round_trip_time', 'Cascaded Compression Ratio',\n", - " 'Lz4 Gpu Compression Ratio', 'Lz4 Host Compression Ratio',\n", - " 'Cascaded Speedup', 'Lz4 Gpu Speedup'],\n", - " dtype='object')\n" - ] - } - ], - "source": [ - "df['Cascaded Compression Ratio'] = df['Input Size (Bytes)'] / df['cascaded_size']\n", - "df['Lz4 Gpu Compression Ratio'] = df['Input Size (Bytes)'] / df['lz4_gpu_size']\n", - "df['Lz4 Host Compression Ratio'] = df['Input Size (Bytes)'] / df['lz4_size']\n", - "df['Cascaded Speedup'] = df['lz4_round_trip_time'] / df['cascaded_round_trip_time']\n", - "df['Lz4 Gpu Speedup'] = df['lz4_round_trip_time'] / df['lz4_gpu_round_trip_time']\n", - "print(df.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "8c6f225a-61e6-42b2-a991-6eeab56aae48", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.plotly.v1+json": { - "config": { - "plotlyServerURL": "https://plot.ly" - }, - "data": [ - { - "hovertemplate": "variable=Cascaded Speedup
Input Size (Bytes)=%{x}
Multiple Faster=%{y}", - "legendgroup": "Cascaded Speedup", - "line": { - "color": "#636efa", - "dash": "solid" - }, - "marker": { - "symbol": "circle" - }, - "mode": "lines", - "name": "Cascaded Speedup", - "orientation": "v", - "showlegend": true, - "type": "scatter", - "x": [ - 2013929216, - 1006964608, - 503482304, - 251741152, - 125870576, - 62935280, - 31467632, - 15733808, - 7866896, - 3933440, - 1966720, - 983360, - 491680, - 245840, - 122912, - 61456, - 30720, - 15360, - 7680, - 3840 - ], - "xaxis": "x", - "y": [ - 43.54705123144407, - 21.94481567030547, - 21.444731065444667, - 21.07041934455389, - 20.75028537882, - 21.975478877361866, - 33.267057999892295, - 13.738369327371325, - 5.922359834864829, - 16.989541547277938, - 1.3486659422914402, - 0.5463347164591977, - 0.9014873895236042, - 0.31472919418758255, - 0.2818017202354006, - 0.27593818984547464, - 0.13343201522520617, - 0.3144260074500508, - 0.20432300163132136, - 0.22272727272727272 - ], - "yaxis": "y" - }, - { - "hovertemplate": "variable=Lz4 Gpu Speedup
Input Size (Bytes)=%{x}
Multiple Faster=%{y}", - "legendgroup": "Lz4 Gpu Speedup", - "line": { - "color": "#EF553B", - "dash": "solid" - }, - "marker": { - "symbol": "circle" - }, - "mode": "lines", - "name": "Lz4 Gpu Speedup", - "orientation": "v", - "showlegend": true, - "type": "scatter", - "x": [ - 2013929216, - 1006964608, - 503482304, - 251741152, - 125870576, - 62935280, - 31467632, - 15733808, - 7866896, - 3933440, - 1966720, - 983360, - 491680, - 245840, - 122912, - 61456, - 30720, - 15360, - 7680, - 3840 - ], - "xaxis": "x", - "y": [ - 6.439473491768513, - 7.029474967000302, - 6.5094175546732655, - 6.620843022182852, - 6.5460919936414195, - 7.069277927730003, - 9.329809246197762, - 3.645829125429206, - 0.9212777858341448, - 4.094854972375691, - 0.21703061858874728, - 0.11609537455453911, - 0.07896823898183467, - 0.06819808215256906, - 0.027119456303912173, - 0.03469933030292515, - 0.036745865362217564, - 0.09139679102273846, - 0.0777708786091276, - 0.09915537568185817 - ], - "yaxis": "y" - } - ], - "layout": { - "legend": { - "title": { - "text": "variable" - }, - "tracegroupgap": 0 - }, - "template": { - "data": { - "bar": [ - { - "error_x": { - "color": "#2a3f5f" - }, - "error_y": { - "color": "#2a3f5f" - }, - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "bar" - } - ], - "barpolar": [ - { - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "barpolar" - } - ], - "carpet": [ - { - "aaxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "baxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "type": "carpet" - } - ], - "choropleth": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "choropleth" - } - ], - "contour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "contour" - } - ], - "contourcarpet": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "contourcarpet" - } - ], - "heatmap": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmap" - } - ], - "heatmapgl": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 
0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmapgl" - } - ], - "histogram": [ - { - "marker": { - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "histogram" - } - ], - "histogram2d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2d" - } - ], - "histogram2dcontour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2dcontour" - } - ], - "mesh3d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "mesh3d" - } - ], - "parcoords": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "parcoords" - } - ], - "pie": [ - { - "automargin": true, - "type": "pie" - } - ], - "scatter": [ - { - "fillpattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - }, - "type": "scatter" - } - ], - "scatter3d": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter3d" - } - ], - "scattercarpet": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattercarpet" - } - ], - "scattergeo": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergeo" - } - ], - "scattergl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergl" - } - ], - "scattermapbox": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattermapbox" - } - ], - "scatterpolar": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolar" - } - ], - "scatterpolargl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolargl" - } - ], - "scatterternary": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterternary" - } - ], - "surface": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "surface" - } - ], - "table": [ - { - "cells": { - "fill": { - "color": "#EBF0F8" - }, - "line": { - 
"color": "white" - } - }, - "header": { - "fill": { - "color": "#C8D4E3" - }, - "line": { - "color": "white" - } - }, - "type": "table" - } - ] - }, - "layout": { - "annotationdefaults": { - "arrowcolor": "#2a3f5f", - "arrowhead": 0, - "arrowwidth": 1 - }, - "autotypenumbers": "strict", - "coloraxis": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "colorscale": { - "diverging": [ - [ - 0, - "#8e0152" - ], - [ - 0.1, - "#c51b7d" - ], - [ - 0.2, - "#de77ae" - ], - [ - 0.3, - "#f1b6da" - ], - [ - 0.4, - "#fde0ef" - ], - [ - 0.5, - "#f7f7f7" - ], - [ - 0.6, - "#e6f5d0" - ], - [ - 0.7, - "#b8e186" - ], - [ - 0.8, - "#7fbc41" - ], - [ - 0.9, - "#4d9221" - ], - [ - 1, - "#276419" - ] - ], - "sequential": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "sequentialminus": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ] - }, - "colorway": [ - "#636efa", - "#EF553B", - "#00cc96", - "#ab63fa", - "#FFA15A", - "#19d3f3", - "#FF6692", - "#B6E880", - "#FF97FF", - "#FECB52" - ], - "font": { - "color": "#2a3f5f" - }, - "geo": { - "bgcolor": "white", - "lakecolor": "white", - "landcolor": "#E5ECF6", - "showlakes": true, - "showland": true, - "subunitcolor": "white" - }, - "hoverlabel": { - "align": "left" - }, - "hovermode": "closest", - "mapbox": { - "style": "light" - }, - "paper_bgcolor": "white", - "plot_bgcolor": "#E5ECF6", - "polar": { - "angularaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "radialaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "scene": { - "xaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "yaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "zaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - } - }, - "shapedefaults": { - "line": { - "color": "#2a3f5f" - } - }, - "ternary": { - "aaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "baxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "caxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "title": { - "x": 0.05 - }, - "xaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - }, - "yaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": 
"white", - "zerolinewidth": 2 - } - } - }, - "title": { - "text": "Gpu Acceleration over Zarr Lz4 - Ascending " - }, - "xaxis": { - "anchor": "y", - "domain": [ - 0, - 1 - ], - "title": { - "text": "Input Size (Bytes)" - }, - "type": "category" - }, - "yaxis": { - "anchor": "x", - "domain": [ - 0, - 1 - ], - "title": { - "text": "Multiple Faster" - } - } - } - } - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import plotly.express as px\n", - "title = 'Gpu Acceleration over Zarr Lz4 - ' + TARGET + \" \" + str(DTYPE)\n", - "subtitle = 'Includes host->gpu copy time'\n", - "fig = px.line(df, x='Input Size (Bytes)',\n", - " y=['Cascaded Speedup', 'Lz4 Gpu Speedup'],\n", - " labels={'value': 'Multiple Faster'},\n", - " title=title)\n", - "fig.update_xaxes(type='category')\n", - "fig.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "id": "e3d57a90-ca86-41da-9747-696151d66184", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.plotly.v1+json": { - "config": { - "plotlyServerURL": "https://plot.ly" - }, - "data": [ - { - "hovertemplate": "variable=Lz4 Gpu Compression Ratio
Input Size (Bytes)=%{x}
Compression Factor=%{y}", - "legendgroup": "Lz4 Gpu Compression Ratio", - "line": { - "color": "#636efa", - "dash": "solid" - }, - "marker": { - "symbol": "circle" - }, - "mode": "lines", - "name": "Lz4 Gpu Compression Ratio", - "orientation": "v", - "showlegend": true, - "type": "scatter", - "x": [ - 2013929216, - 1006964608, - 503482304, - 251741152, - 125870576, - 62935280, - 31467632, - 15733808, - 7866896, - 3933440, - 1966720, - 983360, - 491680, - 245840, - 122912, - 61456, - 30720, - 15360, - 7680, - 3840 - ], - "xaxis": "x", - "y": [ - 0.9958407201059258, - 0.9958406979470327, - 0.995840669386683, - 0.9958405866601621, - 0.9958404566613712, - 0.9958400543205508, - 0.9958392181247895, - 0.9958375457362044, - 0.9958347052033336, - 0.9958290241688745, - 0.9958171749500375, - 0.9957934773582957, - 0.9957591934771778, - 0.9956906327965525, - 0.9955693793081104, - 0.9953194590655113, - 0.9939495907076067, - 0.9910956252419667, - 0.9862591498651598, - 0.9746192893401016 - ], - "yaxis": "y" - }, - { - "hovertemplate": "variable=Cascaded Compression Ratio
Input Size (Bytes)=%{x}
Compression Factor=%{y}", - "legendgroup": "Cascaded Compression Ratio", - "line": { - "color": "#EF553B", - "dash": "solid" - }, - "marker": { - "symbol": "circle" - }, - "mode": "lines", - "name": "Cascaded Compression Ratio", - "orientation": "v", - "showlegend": true, - "type": "scatter", - "x": [ - 2013929216, - 1006964608, - 503482304, - 251741152, - 125870576, - 62935280, - 31467632, - 15733808, - 7866896, - 3933440, - 1966720, - 983360, - 491680, - 245840, - 122912, - 61456, - 30720, - 15360, - 7680, - 3840 - ], - "xaxis": "x", - "y": [ - 60.23512792069884, - 60.23496938137529, - 60.23440729413465, - 60.23328315112374, - 60.231034990975196, - 60.226531518427215, - 60.2214448110355, - 60.20344067589078, - 60.183114538388566, - 60.11125374411639, - 59.96828881570923, - 59.68438941490653, - 59.12457912457913, - 58.035882908404155, - 56.27838827838828, - 53.53310104529617, - 48.607594936708864, - 42.666666666666664, - 34.285714285714285, - 24.615384615384617 - ], - "yaxis": "y" - }, - { - "hovertemplate": "variable=Lz4 Host Compression Ratio
Input Size (Bytes)=%{x}
Compression Factor=%{y}", - "legendgroup": "Lz4 Host Compression Ratio", - "line": { - "color": "#00cc96", - "dash": "solid" - }, - "marker": { - "symbol": "circle" - }, - "mode": "lines", - "name": "Lz4 Host Compression Ratio", - "orientation": "v", - "showlegend": true, - "type": "scatter", - "x": [ - 2013929216, - 1006964608, - 503482304, - 251741152, - 125870576, - 62935280, - 31467632, - 15733808, - 7866896, - 3933440, - 1966720, - 983360, - 491680, - 245840, - 122912, - 61456, - 30720, - 15360, - 7680, - 3840 - ], - "xaxis": "x", - "y": [ - 0.9960937552377597, - 0.9960937606571312, - 0.9960937705105339, - 0.9960937902173401, - 0.9960938375136782, - 0.9960939237290628, - 0.9960940803944817, - 0.9960943937257063, - 0.9960950834517639, - 0.996096336786233, - 0.9960991115374525, - 0.9961041565792752, - 0.9961162648881577, - 0.9961384642190013, - 0.9961826183509884, - 0.9964006615000487, - 0.996690675491532, - 0.9973378352055061, - 0.997532147032082, - 0.9968847352024922 - ], - "yaxis": "y" - } - ], - "layout": { - "legend": { - "title": { - "text": "variable" - }, - "tracegroupgap": 0 - }, - "template": { - "data": { - "bar": [ - { - "error_x": { - "color": "#2a3f5f" - }, - "error_y": { - "color": "#2a3f5f" - }, - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "bar" - } - ], - "barpolar": [ - { - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "barpolar" - } - ], - "carpet": [ - { - "aaxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "baxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "type": "carpet" - } - ], - "choropleth": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "choropleth" - } - ], - "contour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "contour" - } - ], - "contourcarpet": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "contourcarpet" - } - ], - "heatmap": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmap" - } - ], - "heatmapgl": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - 
"#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmapgl" - } - ], - "histogram": [ - { - "marker": { - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "histogram" - } - ], - "histogram2d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2d" - } - ], - "histogram2dcontour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2dcontour" - } - ], - "mesh3d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "mesh3d" - } - ], - "parcoords": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "parcoords" - } - ], - "pie": [ - { - "automargin": true, - "type": "pie" - } - ], - "scatter": [ - { - "fillpattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - }, - "type": "scatter" - } - ], - "scatter3d": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter3d" - } - ], - "scattercarpet": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattercarpet" - } - ], - "scattergeo": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergeo" - } - ], - "scattergl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergl" - } - ], - "scattermapbox": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattermapbox" - } - ], - "scatterpolar": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolar" - } - ], - "scatterpolargl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolargl" - } - ], - "scatterternary": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterternary" - } - ], - "surface": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "surface" - } - ], - "table": [ - { - "cells": { - "fill": { - "color": "#EBF0F8" - 
}, - "line": { - "color": "white" - } - }, - "header": { - "fill": { - "color": "#C8D4E3" - }, - "line": { - "color": "white" - } - }, - "type": "table" - } - ] - }, - "layout": { - "annotationdefaults": { - "arrowcolor": "#2a3f5f", - "arrowhead": 0, - "arrowwidth": 1 - }, - "autotypenumbers": "strict", - "coloraxis": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "colorscale": { - "diverging": [ - [ - 0, - "#8e0152" - ], - [ - 0.1, - "#c51b7d" - ], - [ - 0.2, - "#de77ae" - ], - [ - 0.3, - "#f1b6da" - ], - [ - 0.4, - "#fde0ef" - ], - [ - 0.5, - "#f7f7f7" - ], - [ - 0.6, - "#e6f5d0" - ], - [ - 0.7, - "#b8e186" - ], - [ - 0.8, - "#7fbc41" - ], - [ - 0.9, - "#4d9221" - ], - [ - 1, - "#276419" - ] - ], - "sequential": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "sequentialminus": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ] - }, - "colorway": [ - "#636efa", - "#EF553B", - "#00cc96", - "#ab63fa", - "#FFA15A", - "#19d3f3", - "#FF6692", - "#B6E880", - "#FF97FF", - "#FECB52" - ], - "font": { - "color": "#2a3f5f" - }, - "geo": { - "bgcolor": "white", - "lakecolor": "white", - "landcolor": "#E5ECF6", - "showlakes": true, - "showland": true, - "subunitcolor": "white" - }, - "hoverlabel": { - "align": "left" - }, - "hovermode": "closest", - "mapbox": { - "style": "light" - }, - "paper_bgcolor": "white", - "plot_bgcolor": "#E5ECF6", - "polar": { - "angularaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "radialaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "scene": { - "xaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "yaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "zaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - } - }, - "shapedefaults": { - "line": { - "color": "#2a3f5f" - } - }, - "ternary": { - "aaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "baxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "caxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "title": { - "x": 0.05 - }, - "xaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - }, - "yaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - 
"zerolinecolor": "white", - "zerolinewidth": 2 - } - } - }, - "title": { - "text": "Compression - Ascending " - }, - "xaxis": { - "anchor": "y", - "domain": [ - 0, - 1 - ], - "title": { - "text": "Input Size (Bytes)" - }, - "type": "category" - }, - "yaxis": { - "anchor": "x", - "domain": [ - 0, - 1 - ], - "title": { - "text": "Compression Factor" - } - } - } - } - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import plotly.express as px\n", - "title = 'Compression - ' + TARGET + \" \" + str(DTYPE)\n", - "fig = px.line(df, x='Input Size (Bytes)',\n", - " y=[\n", - " 'Lz4 Gpu Compression Ratio',\n", - " 'Cascaded Compression Ratio',\n", - " 'Lz4 Host Compression Ratio'\n", - " ],\n", - " labels={'value': 'Compression Factor'},\n", - " title=title)\n", - "fig.update_xaxes(type='category')\n", - "fig.show()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/zarr.ipynb b/notebooks/zarr.ipynb deleted file mode 100644 index 33a981ebf5..0000000000 --- a/notebooks/zarr.ipynb +++ /dev/null @@ -1,364 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 23, - "id": "7a060f7d-9a0c-4763-98df-7dc82409c6ba", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "In this tutorial, we will show how to use KvikIO to read and write GPU memory directly to/from Zarr files.\n", - "\"\"\"\n", - "import json\n", - "import shutil\n", - "import numpy\n", - "import cupy\n", - "import zarr\n", - "import kvikio\n", - "import kvikio.zarr\n", - "from kvikio.nvcomp_codec import NvCompBatchCodec\n", - "from numcodecs import LZ4" - ] - }, - { - "cell_type": "markdown", - "id": "99f4d25b-2006-4026-8629-1accafb338ef", - "metadata": {}, - "source": [ - "We need to set three Zarr arguments: \n", - " - `meta_array`: in order to make Zarr read into GPU memory (instead of CPU memory), we set the `meta_array` argument to an empty CuPy array. \n", - " - `store`: we need to use a GPU compatible Zarr Store, which will be KvikIO’s GDS store in our case. \n", - " - `compressor`: finally, we need to use a GPU compatible compressor (or `None`). KvikIO provides a nvCOMP compressor `kvikio.nvcomp_codec.NvCompBatchCodec` that we will use." 
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "id": "c179c24a-766e-4e09-83c5-349868042576",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(<zarr.core.Array (10,) int64>,\n",
-       " NvCompBatchCodec(algorithm='lz4', options={}),\n",
-       " <kvikio.zarr.GDSStore object at 0x...>)"
-      ]
-     },
-     "execution_count": 24,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Let's create a new Zarr array using KvikIO's GDS store and LZ4 compression\n",
-    "z = zarr.array(\n",
-    "    cupy.arange(10), \n",
-    "    chunks=2, \n",
-    "    store=kvikio.zarr.GDSStore(\"my-zarr-file.zarr\"), \n",
-    "    meta_array=cupy.empty(()),\n",
-    "    compressor=NvCompBatchCodec(\"lz4\"),\n",
-    "    overwrite=True,\n",
-    ")\n",
-    "z, z.compressor, z.store"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "cupy.ndarray"
-      ]
-     },
-     "execution_count": 25,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# And because we set the `meta_array` argument, reading the Zarr array returns a CuPy array\n",
-    "type(z[:])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "549ded39-1053-4f82-a8a7-5a2ee999a4a1",
-   "metadata": {},
-   "source": [
-    "From this point onwards, `z` can be used just like any other Zarr array."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "id": "8221742d-f15c-450a-9701-dc8c05326126",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([1, 2, 3, 4, 5, 6, 7, 8])"
-      ]
-     },
-     "execution_count": 26,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "z[1:9]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "id": "f0c451c1-a240-4b26-a5ef-6e70a5bbeb55",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([42, 43, 44, 45, 46, 47, 48, 49, 50, 51])"
-      ]
-     },
-     "execution_count": 27,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "z[:] + 42"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7797155f-40f4-4c50-b704-2356ca64cba3",
-   "metadata": {},
-   "source": [
-    "### GPU compression / CPU decompression"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a0029deb-19b9-4dbb-baf0-ce4b199605a5",
-   "metadata": {},
-   "source": [
-    "In order to read a GPU-written Zarr file into a NumPy array, we simply open that file **without** setting the `meta_array` argument:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "id": "399f23f7-4475-496a-a537-a7163a35c888",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(numpy.ndarray,\n",
-       " kvikio.nvcomp_codec.NvCompBatchCodec,\n",
-       " array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))"
-      ]
-     },
-     "execution_count": 28,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "z = zarr.open_array(kvikio.zarr.GDSStore(\"my-zarr-file.zarr\"))\n",
-    "type(z[:]), type(z.compressor), z[:]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8e9f31d5",
-   "metadata": {},
-   "source": [
-    "And we don't need to use `kvikio.zarr.GDSStore` either:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "id": "4b1f46b2",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(numpy.ndarray,\n",
-       " kvikio.nvcomp_codec.NvCompBatchCodec,\n",
-       " array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))"
-      ]
-     },
-     "execution_count": 29,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "z = zarr.open_array(\"my-zarr-file.zarr\")\n",
-    "type(z[:]), type(z.compressor), z[:]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
"f10fd704-35f7-46b7-aabe-ea68fb2bf88d", - "metadata": {}, - "source": [ - "However, the above use `NvCompBatchCodec(\"lz4\")` for decompression. In the following, we will show how to read Zarr file written and compressed using a GPU on the CPU.\n", - "\n", - "Some algorithms, such as LZ4, can be used interchangeably on CPU and GPU but Zarr will always use the compressor used to write the Zarr file. We are working with the Zarr team to fix this shortcoming but for now, we will use a workaround where we _patch_ the metadata manually." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "d980361a-e132-4f29-ab13-cbceec5bbbb5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(numpy.ndarray, numcodecs.lz4.LZ4, array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Read the Zarr metadata and replace the compressor with a CPU implementation of LZ4\n", - "store = zarr.DirectoryStore(\"my-zarr-file.zarr\") # We could also have used kvikio.zarr.GDSStore\n", - "meta = json.loads(store[\".zarray\"])\n", - "meta[\"compressor\"] = LZ4().get_config()\n", - "store[\".zarray\"] = json.dumps(meta).encode() # NB: this changes the Zarr metadata on disk\n", - "\n", - "# And then open the file as usually\n", - "z = zarr.open_array(store)\n", - "type(z[:]), type(z.compressor), z[:]" - ] - }, - { - "cell_type": "markdown", - "id": "8ea73705", - "metadata": {}, - "source": [ - "### CPU compression / GPU decompression\n", - "\n", - "Now, let's try the otherway around." - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "c9b2d56a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(,\n", - " LZ4(acceleration=1),\n", - " )" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import numcodecs\n", - "# Let's create a new Zarr array using the default compression.\n", - "z = zarr.array(\n", - " numpy.arange(10), \n", - " chunks=2, \n", - " store=\"my-zarr-file.zarr\", \n", - " overwrite=True,\n", - " # The default (CPU) implementation of LZ4 codec.\n", - " compressor=numcodecs.registry.get_codec({\"id\": \"lz4\"})\n", - ")\n", - "z, z.compressor, z.store" - ] - }, - { - "cell_type": "markdown", - "id": "dedd4623", - "metadata": {}, - "source": [ - "Again, we will use a workaround where we _patch_ the metadata manually." 
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 32,
-   "id": "ac3f30b1",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(cupy.ndarray,\n",
-       " kvikio.nvcomp_codec.NvCompBatchCodec,\n",
-       " array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))"
-      ]
-     },
-     "execution_count": 32,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Read the Zarr metadata and replace the compressor with a GPU implementation of LZ4\n",
-    "store = kvikio.zarr.GDSStore(\"my-zarr-file.zarr\") # We could also have used zarr.DirectoryStore\n",
-    "meta = json.loads(store[\".zarray\"])\n",
-    "meta[\"compressor\"] = NvCompBatchCodec(\"lz4\").get_config()\n",
-    "store[\".zarray\"] = json.dumps(meta).encode() # NB: this changes the Zarr metadata on disk\n",
-    "\n",
-    "# And then open the file as usual\n",
-    "z = zarr.open_array(store, meta_array=cupy.empty(()))\n",
-    "type(z[:]), type(z.compressor), z[:]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 33,
-   "id": "80682922-b7b0-4b08-b595-228c2b446a78",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Clean up\n",
-    "shutil.rmtree(\"my-zarr-file.zarr\", ignore_errors=True)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.11"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/python/kvikio/cli/gpu_compressor.py b/python/kvikio/cli/gpu_compressor.py
deleted file mode 100755
index ac34b15e8f..0000000000
--- a/python/kvikio/cli/gpu_compressor.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# NVIDIA 2022
-
-import argparse
-import os
-import sys
-import time
-
-import cupy
-
-import kvikio
-import kvikio.nvcomp as nvcomp
-
-
-def get_parser():
-    class NvcompParser(argparse.ArgumentParser):
-        """
-        Handle special case and show help on invalid argument
-        """
-
-        def error(self, message):
-            sys.stderr.write("\nERROR: {}\n\n".format(message))
-            self.print_help()
-            sys.exit(2)
-
-    parser = NvcompParser()
-    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose Output")
-    parser.add_argument(
-        "-o",
-        "--out_file",
-        action="store",
-        dest="out_file",
-        help="Output filename",
-    )
-    parser.add_argument(
-        "-c",
-        choices=["ans", "bitcomp", "cascaded", "gdeflate", "lz4", "snappy"],
-        action="store",
-        dest="compression",
-        help="Which GPU algorithm to use for compression.",
-    )
-    parser.add_argument(
-        "-d",
-        action="store_true",
-        help="Decompress the incoming file",
-    )
-    parser.add_argument(action="store", dest="filename", help="Relative Filename")
-    return parser
-
-
-def main():
-    parser = get_parser()
-    args = parser.parse_args()
-
-    print("GPU Compression Initialized") if args.verbose else None
-
-    file_size = os.path.getsize(args.filename)
-    """ test
-    data = cupy.arange(10000, dtype="uint8")
-    """
-    data = cupy.zeros(file_size, dtype=cupy.int8)
-    t = time.time()
-    f = kvikio.CuFile(args.filename, "r")
-    f.read(data)
-    f.close()
-    read_time = time.time() - t
-    print(f"File read time: {read_time:.3} seconds.") if args.verbose else None
-
-    if args.d:
-        compressor = nvcomp.ManagedDecompressionManager(data)
-    elif args.compression == "ans":
-        compressor = nvcomp.ANSManager()
-    elif args.compression == "bitcomp":
-        compressor = nvcomp.BitcompManager()
-    elif args.compression == "cascaded":
-        compressor = nvcomp.CascadedManager()
-    elif args.compression == "gdeflate":
-        compressor = nvcomp.GdeflateManager()
-    elif args.compression == "snappy":
-        compressor = nvcomp.SnappyManager()
-    else:
-        compressor = nvcomp.LZ4Manager(chunk_size=1 << 16)
-
-    if args.d is True:
-        print(f"Decompressing {file_size} bytes") if args.verbose else None
-        t = time.time()
-        converted = compressor.decompress(data)
-        decompress_time = time.time() - t
-        print(
-            f"Decompression time: {decompress_time:.3} seconds"
-        ) if args.verbose else None
-
-        if not args.out_file:
-            raise ValueError("Must specify filename with -o for decompression.")
-
-        t = time.time()
-        o = kvikio.CuFile(args.out_file, "w")
-        o.write(converted)
-        o.close()
-        io_time = time.time() - t
-        print(f"File write time: {io_time:.3} seconds") if args.verbose else None
-
-        print(
-            f"Decompressed file size {os.path.getsize(args.out_file)}"
-        ) if args.verbose else None
-    else:
-        file_size = os.path.getsize(args.filename)
-
-        print(f"Compressing {file_size} bytes") if args.verbose else None
-        t = time.time()
-        converted = compressor.compress(data)
-        compress_time = time.time() - t
-        print(f"Compression time: {compress_time:.3} seconds") if args.verbose else None
-
-        t = time.time()
-        if args.out_file:
-            o = kvikio.CuFile(args.out_file, "w")
-        else:
-            o = kvikio.CuFile(args.filename + ".gpc", "w")
-        o.write(converted)
-        o.close()
-        io_time = time.time() - t
-        print(f"File write time: {io_time:.3} seconds") if args.verbose else None
-
-        print(
-            f"Compressed file size {compressor.get_compressed_output_size(converted)}"
-        ) if args.verbose else None
-
-        if args.out_file:
-            end_name = args.out_file
-        else:
-            end_name = args.filename + ".gpc"
-        print(f"Created file {end_name}") if args.verbose else None
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/kvikio/examples/zarr_cupy_nvcomp.py b/python/kvikio/examples/zarr_cupy_nvcomp.py
deleted file mode 100644
index 9f05f7874a..0000000000
--- a/python/kvikio/examples/zarr_cupy_nvcomp.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-# See file LICENSE for terms.
-
-import cupy
-import numpy
-import zarr
-
-import kvikio
-import kvikio.zarr
-
-
-def main(path):
-    a = cupy.arange(20)
-
-    # Let's use KvikIO's convenience function `open_cupy_array()` to create
-    # a new Zarr file on disk. It has the same semantics as `zarr.open_array()`
-    # but uses a GDS file store, nvCOMP compression, and CuPy arrays.
-    z = kvikio.zarr.open_cupy_array(store=path, mode="w", shape=(20,), chunks=(5,))
-
-    # `z` is a regular Zarr Array that we can write to as usual
-    z[0:10] = numpy.arange(0, 10)
-    # but it also supports direct reads and writes of CuPy arrays
-    z[10:20] = cupy.arange(10, 20)
-
-    # Reading `z` returns a CuPy array
-    assert isinstance(z[:], cupy.ndarray)
-    assert (a == z[:]).all()
-
-    # Normally, we cannot assume that GPU and CPU compressors are compatible.
-    # E.g., `open_cupy_array()` uses nvCOMP's Snappy GPU compression by default,
-    # which, as far as we know, isn’t compatible with any CPU compressor. Thus,
-    # let's re-write our Zarr array using a CPU and GPU compatible compressor.
-    #
-    # Warning: it isn't possible to use `CompatCompressor` as a compressor argument
-    # in Zarr directly. It is only meant for `open_cupy_array()`. However,
-    # in an example further down, we show how to write using regular Zarr.
-    z = kvikio.zarr.open_cupy_array(
-        store=path,
-        mode="w",
-        shape=(20,),
-        chunks=(5,),
-        compressor=kvikio.zarr.CompatCompressor.lz4(),
-    )
-    z[:] = a
-
-    # Because we are using a CompatCompressor, it is now possible to open the file
-    # using Zarr's built-in LZ4 decompressor that uses the CPU.
-    z = zarr.open_array(path)
-    # `z` is now read as a regular NumPy array
-    assert isinstance(z[:], numpy.ndarray)
-    assert (a.get() == z[:]).all()
-    # and we can write to it as usual
-    z[:] = numpy.arange(20, 40)
-
-    # And we can read the Zarr file back into a CuPy array.
-    z = kvikio.zarr.open_cupy_array(store=path, mode="r")
-    assert isinstance(z[:], cupy.ndarray)
-    assert (cupy.arange(20, 40) == z[:]).all()
-
-    # Similarly, we can also open a file written by regular Zarr.
-    # Let's write the file without any compressor.
-    ary = numpy.arange(10)
-    z = zarr.open(store=path, mode="w", shape=ary.shape, compressor=None)
-    z[:] = ary
-    # This works as before where the file is read as a CuPy array
-    z = kvikio.zarr.open_cupy_array(store=path)
-    assert isinstance(z[:], cupy.ndarray)
-    assert (z[:] == cupy.asarray(ary)).all()
-
-    # Using a compressor is a bit more tricky since not all CPU compressors
-    # are GPU compatible. To make sure we use a compatible compressor, we use
-    # the CPU part of `CompatCompressor.lz4()`.
-    ary = numpy.arange(10)
-    z = zarr.open(
-        store=path,
-        mode="w",
-        shape=ary.shape,
-        compressor=kvikio.zarr.CompatCompressor.lz4().cpu,
-    )
-    z[:] = ary
-    # This works as before where the file is read as a CuPy array
-    z = kvikio.zarr.open_cupy_array(store=path)
-    assert isinstance(z[:], cupy.ndarray)
-    assert (z[:] == cupy.asarray(ary)).all()
-
-
-if __name__ == "__main__":
-    main("/tmp/zarr-cupy-nvcomp")
diff --git a/python/kvikio/kvikio/_lib/CMakeLists.txt b/python/kvikio/kvikio/_lib/CMakeLists.txt
index 7dfb47cc0f..5b454295c0 100644
--- a/python/kvikio/kvikio/_lib/CMakeLists.txt
+++ b/python/kvikio/kvikio/_lib/CMakeLists.txt
@@ -14,7 +14,7 @@
 
 # Set the list of Cython files to build, one .so per file
 set(cython_modules arr.pyx buffer.pyx defaults.pyx cufile_driver.pyx file_handle.pyx future.pyx
-    libnvcomp.pyx libnvcomp_ll.pyx mmap.pyx
+    mmap.pyx
 )
 
 if(KvikIO_REMOTE_SUPPORT)
diff --git a/python/kvikio/kvikio/_lib/libnvcomp.pyx b/python/kvikio/kvikio/_lib/libnvcomp.pyx
deleted file mode 100644
index dc5359e9b3..0000000000
--- a/python/kvikio/kvikio/_lib/libnvcomp.pyx
+++ /dev/null
@@ -1,235 +0,0 @@
-# Copyright (c) 2022 Carson Swope
-# Use, modification, and distribution is subject to the MIT License
-# https://github.com/carsonswope/py-nvcomp/blob/main/LICENSE)
-# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES.
-# All rights reserved.
-# SPDX-License-Identifier: MIT
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-from enum import Enum
-
-from libc.stdint cimport uint8_t, uintptr_t
-from libcpp cimport nullptr
-from libcpp.memory cimport make_shared, shared_ptr
-from libcpp.utility cimport move
-
-from kvikio._lib.arr cimport Array
-from kvikio._lib.nvcomp_cxx_api cimport (
-    ANSManager,
-    BitcompManager,
-    CascadedManager,
-    CompressionConfig,
-    DecompressionConfig,
-    GdeflateManager,
-    LZ4Manager,
-    SnappyManager,
-    create_manager,
-    nvcompBatchedANSDefaultOpts,
-    nvcompBatchedANSOpts_t,
-    nvcompBatchedBitcompFormatOpts,
-    nvcompBatchedCascadedDefaultOpts,
-    nvcompBatchedCascadedOpts_t,
-    nvcompBatchedGdeflateOpts_t,
-    nvcompBatchedLZ4Opts_t,
-    nvcompBatchedSnappyDefaultOpts,
-    nvcompBatchedSnappyOpts_t,
-    nvcompManagerBase,
-    nvcompType_t,
-)
-
-
-class pyNvcompType_t(Enum):
-    pyNVCOMP_TYPE_CHAR = nvcompType_t.NVCOMP_TYPE_CHAR
-    pyNVCOMP_TYPE_UCHAR = nvcompType_t.NVCOMP_TYPE_UCHAR
-    pyNVCOMP_TYPE_SHORT = nvcompType_t.NVCOMP_TYPE_SHORT
-    pyNVCOMP_TYPE_USHORT = nvcompType_t.NVCOMP_TYPE_USHORT
-    pyNVCOMP_TYPE_INT = nvcompType_t.NVCOMP_TYPE_INT
-    pyNVCOMP_TYPE_UINT = nvcompType_t.NVCOMP_TYPE_UINT
-    pyNVCOMP_TYPE_LONGLONG = nvcompType_t.NVCOMP_TYPE_LONGLONG
-    pyNVCOMP_TYPE_ULONGLONG = nvcompType_t.NVCOMP_TYPE_ULONGLONG
-    pyNVCOMP_TYPE_BITS = nvcompType_t.NVCOMP_TYPE_BITS
-
-
-cdef class _nvcompManager:
-    # Temporary storage for factory allocated manager to prevent cleanup
-    cdef shared_ptr[nvcompManagerBase] _mgr
-    cdef nvcompManagerBase* _impl
-    cdef shared_ptr[CompressionConfig] _compression_config
-    cdef shared_ptr[DecompressionConfig] _decompression_config
-
-    def __dealloc__(self):
-        # `ManagedManager` uses a temporary object, self._mgr
-        # to retain a reference count to the Manager created by
-        # create_manager. If it is present, then the `shared_ptr`
-        # system will free self._impl. Otherwise, we need to free
-        # self._impl
-        if self._mgr == nullptr:
-            del self._impl
-
-    def configure_compression(self, decomp_buffer_size):
-        cdef shared_ptr[CompressionConfig] partial = make_shared[
-            CompressionConfig](
-                self._impl.configure_compression(decomp_buffer_size)
-        )
-        self._compression_config = make_shared[CompressionConfig](
-            (move(partial.get()[0]))
-        )
-        cdef const CompressionConfig* compression_config_ptr = \
-            self._compression_config.get()
-        return {
-            "uncompressed_buffer_size": compression_config_ptr.
-            uncompressed_buffer_size,
-            "max_compressed_buffer_size": compression_config_ptr.
- max_compressed_buffer_size, - "num_chunks": compression_config_ptr.num_chunks - } - - def compress(self, Array decomp_buffer, Array comp_buffer): - cdef uintptr_t comp_buffer_ptr = comp_buffer.ptr - self._impl.compress( - decomp_buffer.ptr, - comp_buffer_ptr, - self._compression_config.get()[0] - ) - size = self._impl.get_compressed_output_size( - comp_buffer_ptr - ) - return size - - def configure_decompression_with_compressed_buffer( - self, - Array comp_buffer - ) -> dict: - cdef shared_ptr[DecompressionConfig] partial = make_shared[ - DecompressionConfig](self._impl.configure_decompression( - comp_buffer.ptr - ) - ) - self._decompression_config = make_shared[DecompressionConfig]( - (move(partial.get()[0])) - ) - cdef const DecompressionConfig* decompression_config_ptr = \ - self._decompression_config.get() - return { - "decomp_data_size": decompression_config_ptr.decomp_data_size, - "num_chunks": decompression_config_ptr.num_chunks - } - - def decompress( - self, - Array decomp_buffer, - Array comp_buffer, - ): - self._impl.decompress( - decomp_buffer.ptr, - comp_buffer.ptr, - self._decompression_config.get()[0] - ) - - def get_compressed_output_size(self, Array comp_buffer): - return self._impl.get_compressed_output_size( - comp_buffer.ptr - ) - - -cdef class _ANSManager(_nvcompManager): - def __cinit__( - self, - size_t uncomp_chunk_size, - ): - self._impl = new ANSManager( - uncomp_chunk_size, - nvcompBatchedANSDefaultOpts - ) - - -cdef class _BitcompManager(_nvcompManager): - def __cinit__( - self, - size_t uncomp_chunk_size, - nvcompType_t data_type, - int bitcomp_algo, - ): - cdef opts = nvcompBatchedBitcompFormatOpts(bitcomp_algo, data_type) - self._impl = new BitcompManager( - uncomp_chunk_size, - opts - ) - - -cdef class _CascadedManager(_nvcompManager): - def __cinit__( - self, - _options, - ): - self._impl = new CascadedManager( - _options["chunk_size"], - nvcompBatchedCascadedDefaultOpts - ) - - -cdef class _GdeflateManager(_nvcompManager): - def __cinit__( - self, - size_t uncomp_chunk_size, - int algo, - ): - cdef opts = nvcompBatchedGdeflateOpts_t(algo) - self._impl = new GdeflateManager( - uncomp_chunk_size, - opts - ) - - -cdef class _LZ4Manager(_nvcompManager): - def __cinit__( - self, - size_t uncomp_chunk_size, - nvcompType_t data_type, - ): - # TODO: Doesn't work with user specified streams passed down - # from anywhere up. I'm not going to rabbit hole on it until - # everything else works. - # cdef cudaStream_t stream = user_stream - cdef opts = nvcompBatchedLZ4Opts_t(data_type) - self._impl = new LZ4Manager( - uncomp_chunk_size, - opts - ) - - -cdef class _SnappyManager(_nvcompManager): - def __cinit__( - self, - size_t uncomp_chunk_size, - ): - # TODO: Doesn't work with user specified streams passed down - # from anywhere up. I'm not going to rabbit hole on it until - # everything else works. - self._impl = new SnappyManager( - uncomp_chunk_size, - nvcompBatchedSnappyDefaultOpts - ) - - -cdef class _ManagedManager(_nvcompManager): - def __init__(self, compressed_buffer): - cdef shared_ptr[nvcompManagerBase] _mgr = create_manager( - compressed_buffer.ptr - ) - self._mgr = _mgr - self._impl = move(_mgr).get() diff --git a/python/kvikio/kvikio/_lib/libnvcomp_ll.pyx b/python/kvikio/kvikio/_lib/libnvcomp_ll.pyx deleted file mode 100644 index 46c7b399a9..0000000000 --- a/python/kvikio/kvikio/_lib/libnvcomp_ll.pyx +++ /dev/null @@ -1,1182 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. 
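For reference, the manager bindings deleted above compose into a simple round trip: configure compression from the input size, compress, then configure decompression directly from the compressed buffer. A minimal sketch, assuming a pre-removal kvikio build that still ships kvikio._lib.libnvcomp and a CUDA-capable GPU (buffer sizes are illustrative):

    # Round trip through the deleted high-level LZ4 manager bindings.
    import cupy as cp
    from kvikio._lib.arr import asarray
    from kvikio._lib.libnvcomp import _LZ4Manager, pyNvcompType_t

    data = cp.arange(1 << 16, dtype=cp.uint8)
    mgr = _LZ4Manager(1 << 16, pyNvcompType_t.pyNVCOMP_TYPE_UCHAR.value)

    # Sizing info for the output buffer comes from configure_compression.
    cfg = mgr.configure_compression(data.nbytes)
    comp = cp.empty(cfg["max_compressed_buffer_size"], dtype=cp.uint8)
    nbytes = mgr.compress(asarray(data), asarray(comp))

    # Decompression is configured straight from the compressed buffer.
    dcfg = mgr.configure_decompression_with_compressed_buffer(asarray(comp[:nbytes]))
    out = cp.empty(dcfg["decomp_data_size"], dtype=cp.uint8)
    mgr.decompress(asarray(out), asarray(comp[:nbytes]))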
-
-from __future__ import annotations
-
-from abc import ABC, abstractmethod
-from enum import IntEnum
-
-from libc.stdint cimport uint32_t, uintptr_t
-
-from kvikio._lib.nvcomp_ll_cxx_api cimport cudaStream_t, nvcompStatus_t, nvcompType_t
-
-import cupy
-
-
-class nvCompStatus(IntEnum):
-    Success = nvcompStatus_t.nvcompSuccess,
-    ErrorInvalidValue = nvcompStatus_t.nvcompErrorInvalidValue,
-    ErrorNotSupported = nvcompStatus_t.nvcompErrorNotSupported,
-    ErrorCannotDecompress = nvcompStatus_t.nvcompErrorCannotDecompress,
-    ErrorBadChecksum = nvcompStatus_t.nvcompErrorBadChecksum,
-    ErrorCannotVerifyChecksums = nvcompStatus_t.nvcompErrorCannotVerifyChecksums,
-    ErrorCudaError = nvcompStatus_t.nvcompErrorCudaError,
-    ErrorInternal = nvcompStatus_t.nvcompErrorInternal,
-
-
-class nvCompType(IntEnum):
-    CHAR = nvcompType_t.NVCOMP_TYPE_CHAR
-    UCHAR = nvcompType_t.NVCOMP_TYPE_UCHAR
-    SHORT = nvcompType_t.NVCOMP_TYPE_SHORT
-    USHORT = nvcompType_t.NVCOMP_TYPE_USHORT
-    INT = nvcompType_t.NVCOMP_TYPE_INT
-    UINT = nvcompType_t.NVCOMP_TYPE_UINT
-    LONGLONG = nvcompType_t.NVCOMP_TYPE_LONGLONG
-    ULONGLONG = nvcompType_t.NVCOMP_TYPE_ULONGLONG
-    BITS = nvcompType_t.NVCOMP_TYPE_BITS
-
-
-class nvCompBatchAlgorithm(ABC):
-    """Abstract class that provides interface to nvCOMP batched algorithms."""
-
-    # TODO(akamenev): it might be possible to have a simpler implementation that
-    # eliminates the need to have a separate implementation class for each algorithm,
-    # potentially using fused types in Cython (similar to C++ templates),
-    # but I could not figure out how to do that (e.g. each algorithm API set has
-    # a different type for the options and so on).
-
-    def get_compress_temp_size(
-        self,
-        size_t batch_size,
-        size_t max_uncompressed_chunk_bytes,
-    ):
-        """Get temporary space required for compression.
-
-        Parameters
-        ----------
-        batch_size: int
-            The number of items in the batch.
-        max_uncompressed_chunk_bytes: int
-            The maximum size in bytes of a chunk in the batch.
-
-        Returns
-        -------
-        int
-            The size in bytes of the required GPU workspace for compression.
-        """
-        err, temp_size = self._get_comp_temp_size(
-            batch_size,
-            max_uncompressed_chunk_bytes
-        )
-        if err != nvcompStatus_t.nvcompSuccess:
-            raise RuntimeError(
-                f"Could not get compress temp buffer size, "
-                f"error: {nvCompStatus(err)!r}."
-            )
-        return temp_size
-
-    @abstractmethod
-    def _get_comp_temp_size(
-        self,
-        size_t batch_size,
-        size_t max_uncompressed_chunk_bytes,
-    ) -> tuple[nvcompStatus_t, size_t]:
-        """Algorithm-specific implementation."""
-        ...
-
-    def get_compress_chunk_size(self, size_t max_uncompressed_chunk_bytes):
-        """Get the maximum size any chunk could compress to in the batch.
-
-        Parameters
-        ----------
-        max_uncompressed_chunk_bytes: int
-            The maximum size in bytes of a chunk in the batch.
-
-        Returns
-        -------
-        int
-            The maximum compressed size in bytes of the largest chunk. That is,
-            the minimum amount of output memory required to be given to
-            the corresponding *CompressAsync function.
-        """
-        err, comp_chunk_size = self._get_comp_chunk_size(max_uncompressed_chunk_bytes)
-        if err != nvcompStatus_t.nvcompSuccess:
-            raise RuntimeError(
-                f"Could not get output buffer size, "
-                f"error: {nvCompStatus(err)!r}."
-            )
-        return comp_chunk_size
-
-    @abstractmethod
-    def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes):
-        """Algorithm-specific implementation."""
-        ...
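The public/abstract split above is a plain template-method pattern: the public wrapper converts any non-success nvcompStatus_t into a Python RuntimeError, and each algorithm only supplies the _get_* call. A dependency-free mirror of that pattern (the names here are illustrative, not part of the deleted module):

    from abc import ABC, abstractmethod

    NVCOMP_SUCCESS = 0  # matches nvcompStatus_t.nvcompSuccess


    class BatchAlgorithm(ABC):
        def get_compress_temp_size(self, batch_size, max_chunk_bytes):
            # Public wrapper: delegate, then turn a status code into an exception.
            err, temp_size = self._get_comp_temp_size(batch_size, max_chunk_bytes)
            if err != NVCOMP_SUCCESS:
                raise RuntimeError(f"Could not get temp buffer size, error: {err!r}.")
            return temp_size

        @abstractmethod
        def _get_comp_temp_size(self, batch_size, max_chunk_bytes):
            """Algorithm-specific implementation returning (status, temp_size)."""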
- - def compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream, - ): - """Perform compression. - - Parameters - ---------- - uncomp_chunks: cp.ndarray[uintp] - The pointers on the GPU, to uncompressed batched items. - uncomp_chunk_sizes: cp.ndarray[uint64] - The size in bytes of each uncompressed batch item on the GPU. - max_uncomp_chunk_bytes: int - The maximum size in bytes of the largest chunk in the batch. - batch_size: int - The number of chunks to compress. - temp_buf: cp.ndarray - The temporary GPU workspace. - comp_chunks: cp.ndarray[uintp] - (output) The list of pointers on the GPU, to the output location for each - compressed batch item. - comp_chunk_sizes: cp.ndarray[uint64] - (output) The compressed size in bytes of each chunk. - stream: cp.cuda.Stream - CUDA stream. - """ - - err = self._compress( - uncomp_chunks, - uncomp_chunk_sizes, - max_uncomp_chunk_bytes, - batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream, - ) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError(f"Compression failed, error: {nvCompStatus(err)!r}.") - - @abstractmethod - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - """Algorithm-specific implementation.""" - ... - - def get_decompress_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ): - """Get the amount of temp space required on the GPU for decompression. - - Parameters - ---------- - batch_size: int - The number of items in the batch. - max_uncompressed_chunk_bytes: int - The size in bytes of the largest chunk when uncompressed. - - Returns - ------- - int - The amount of temporary GPU space in bytes that will be - required to decompress. - """ - err, temp_size = self._get_decomp_temp_size( - batch_size, - max_uncompressed_chunk_bytes - ) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError( - f"Could not get decompress temp buffer size, " - f"error: {nvCompStatus(err)!r}." - ) - - return temp_size - - @abstractmethod - def _get_decomp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ): - """Algorithm-specific implementation.""" - ... - - def get_decompress_size( - self, - comp_chunks, - comp_chunk_sizes, - stream, - ): - """Get the amount of space required on the GPU for decompression. - - Parameters - ---------- - comp_chunks: cp.ndarray[uintp] - The pointers on the GPU, to compressed batched items. - comp_chunk_sizes: cp.ndarray[uint64] - The size in bytes of each compressed batch item. - stream: cp.cuda.Stream - CUDA stream. - - Returns - ------- - cp.ndarray[uint64] - The amount of GPU space in bytes that will be required - to decompress each chunk. - """ - - assert len(comp_chunks) == len(comp_chunk_sizes) - batch_size = len(comp_chunks) - - # nvCOMP requires all buffers to be in GPU memory. - uncomp_chunk_sizes = cupy.empty_like(comp_chunk_sizes) - - err = self._get_decomp_size( - comp_chunks, - comp_chunk_sizes, - batch_size, - uncomp_chunk_sizes, - stream, - ) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError( - f"Could not get decompress buffer size, error: {nvCompStatus(err)!r}." 
- ) - - return uncomp_chunk_sizes - - @abstractmethod - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - """Algorithm-specific implementation.""" - ... - - def decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - """Perform decompression. - - Parameters - ---------- - comp_chunks: cp.ndarray[uintp] - The pointers on the GPU, to compressed batched items. - comp_chunk_sizes: cp.ndarray[uint64] - The size in bytes of each compressed batch item. - batch_size: int - The number of chunks to decompress. - temp_buf: cp.ndarray - The temporary GPU workspace. - uncomp_chunks: cp.ndarray[uintp] - (output) The pointers on the GPU, to the output location for each - decompressed batch item. - uncomp_chunk_sizes: cp.ndarray[uint64] - The size in bytes of each decompress chunk location on the GPU. - actual_uncomp_chunk_sizes: cp.ndarray[uint64] - (output) The actual decompressed size in bytes of each chunk on the GPU. - statuses: cp.ndarray - (output) The status for each chunk of whether it was decompressed or not. - stream: cp.cuda.Stream - CUDA stream. - """ - - err = self._decompress( - comp_chunks, - comp_chunk_sizes, - batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError(f"Decompression failed, error: {nvCompStatus(err)!r}.") - - @abstractmethod - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - """Algorithm-specific implementation.""" - ... - - -cdef uintptr_t to_ptr(buf): - return buf.data.ptr - - -cdef cudaStream_t to_stream(stream): - return stream.ptr - - -# -# LZ4 algorithm. -# - -from kvikio._lib.nvcomp_ll_cxx_api cimport ( - nvcompBatchedLZ4CompressAsync, - nvcompBatchedLZ4CompressGetMaxOutputChunkSize, - nvcompBatchedLZ4CompressGetTempSize, - nvcompBatchedLZ4DecompressAsync, - nvcompBatchedLZ4DecompressGetTempSize, - nvcompBatchedLZ4DefaultOpts, - nvcompBatchedLZ4GetDecompressSizeAsync, - nvcompBatchedLZ4Opts_t, -) - - -class nvCompBatchAlgorithmLZ4(nvCompBatchAlgorithm): - """LZ4 algorithm implementation.""" - - algo_id: str = "lz4" - - options: nvcompBatchedLZ4Opts_t - - HEADER_SIZE_BYTES: size_t = sizeof(uint32_t) - - def __init__(self, data_type: int = None, has_header: bool = True): - """Initialize the codec. - - Parameters - ---------- - data_type: int or None - Source data type. If None, uses nvcomp default options. - has_header: bool - Whether the compressed data has a header. - This enables data compatibility between numcodecs LZ4 codec, - which has the header and nvCOMP LZ4 codec which does not - require the header. - """ - if data_type is None: - self.options = nvcompBatchedLZ4DefaultOpts - else: - self.options = nvcompBatchedLZ4Opts_t(data_type) - - self.has_header = has_header - - # Note on LZ4 header structure: numcodecs LZ4 codec prepends - # a 4-byte (uint32_t) header to each compressed chunk. - # The header stores the size of the original (uncompressed) data: - # https://github.com/zarr-developers/numcodecs/blob/cb155432e36536e17a2d054c8c24b7bf6f4a7347/numcodecs/lz4.pyx#L89 - # - # The following CUDA kernels read / write chunk header by - # casting the chunk pointer to a pointer to unsigned int. 
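The header note above is the crux of numcodecs interoperability: numcodecs' LZ4 codec prefixes every compressed chunk with a uint32 holding the uncompressed size, stored little-endian. A CPU-side sketch of the same framing (the GPU kernels below do the equivalent via a raw unsigned-int cast, which matches on little-endian platforms):

    import struct

    HEADER_SIZE_BYTES = 4

    def add_header(compressed: bytes, uncompressed_size: int) -> bytes:
        # numcodecs stores the original size as a little-endian uint32 prefix.
        return struct.pack("<I", uncompressed_size) + compressed

    def read_header(framed: bytes) -> int:
        (uncompressed_size,) = struct.unpack_from("<I", framed)
        return uncompressed_size

    framed = add_header(b"\x00" * 8, 100)
    assert read_header(framed) == 100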
- - # CUDA kernel that copies uncompressed chunk size from the chunk header. - self._get_size_from_header_kernel = cupy.ElementwiseKernel( - "uint64 comp_chunk_ptr", - "uint64 uncomp_chunk_size", - "uncomp_chunk_size = *((unsigned int *)comp_chunk_ptr)", - "get_size_from_header", - ) - - # CUDA kernel that copies uncompressed chunk size to the chunk header. - self._set_chunk_size_header_kernel = cupy.ElementwiseKernel( - "uint64 uncomp_chunk_size", - "uint64 comp_chunk_ptr", - "((unsigned int *)comp_chunk_ptr)[0] = (unsigned int)uncomp_chunk_size", - "set_chunk_size_header", - no_return=True, - ) - - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - cdef size_t temp_bytes = 0 - - err = nvcompBatchedLZ4CompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - self.options, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - cdef size_t max_compressed_bytes = 0 - - err = nvcompBatchedLZ4CompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, - self.options, - &max_compressed_bytes - ) - - # Add header size, if needed. - if err == nvcompStatus_t.nvcompSuccess and self.has_header: - max_compressed_bytes += self.HEADER_SIZE_BYTES - - return (err, max_compressed_bytes) - - def compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream, - ): - if self.has_header: - # If there is a header, we need to: - # 1. Copy the uncompressed chunk size to the compressed chunk header. - # 2. Update target pointers in comp_chunks to skip the header portion, - # which is not compressed. - # - self._set_chunk_size_header_kernel(uncomp_chunk_sizes, comp_chunks) - # Update chunk pointer to skip the header. - comp_chunks += self.HEADER_SIZE_BYTES - - super().compress( - uncomp_chunks, - uncomp_chunk_sizes, - max_uncomp_chunk_bytes, - batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream, - ) - - if self.has_header: - # Update chunk pointer and size to include the header. - comp_chunks -= self.HEADER_SIZE_BYTES - comp_chunk_sizes += self.HEADER_SIZE_BYTES - - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - # Cast buffer pointers that have Python int type to appropriate C types - # suitable for passing to nvCOMP API. 
- return nvcompBatchedLZ4CompressAsync( - to_ptr(uncomp_chunks), - to_ptr(uncomp_chunk_sizes), - max_uncomp_chunk_bytes, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - self.options, - to_stream(stream), - ) - - def _get_decomp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ): - cdef size_t temp_bytes = 0 - - err = nvcompBatchedLZ4DecompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - &temp_bytes - ) - - return (err, temp_bytes) - - def get_decompress_size( - self, - comp_chunks, - comp_chunk_sizes, - stream, - ): - if not self.has_header: - return super().get_decompress_size( - comp_chunks, - comp_chunk_sizes, - stream, - ) - - return self._get_size_from_header_kernel(comp_chunks) - - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - return nvcompBatchedLZ4GetDecompressSizeAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - batch_size, - to_stream(stream), - ) - - def decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - if self.has_header: - # Update chunk pointer and size to exclude the header. - comp_chunks += self.HEADER_SIZE_BYTES - comp_chunk_sizes -= self.HEADER_SIZE_BYTES - - super().decompress( - comp_chunks, - comp_chunk_sizes, - batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ) - - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - # Cast buffer pointers that have Python int type to appropriate C types - # suitable for passing to nvCOMP API. - return nvcompBatchedLZ4DecompressAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - NULL, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(uncomp_chunks), - NULL, - to_stream(stream), - ) - - def __repr__(self): - return f"{self.__class__.__name__}(data_type={self.options['data_type']})" - - -# -# Gdeflate algorithm. 
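Note how the LZ4 compress() and decompress() overrides above account for that header: comp_chunks is an array of raw device addresses (uintp), so skipping the 4-byte prefix is plain elementwise arithmetic on the pointer values. A NumPy stand-in for the same operation (CuPy behaves identically on device arrays; the addresses below are fabricated for illustration):

    import numpy as np

    HEADER_SIZE_BYTES = 4
    # Fabricated addresses standing in for device pointers to compressed chunks.
    comp_chunks = np.array([0x7F0000000000, 0x7F0000001000], dtype=np.uintp)

    payload_ptrs = comp_chunks + HEADER_SIZE_BYTES  # point past each 4-byte header
    assert (payload_ptrs - HEADER_SIZE_BYTES == comp_chunks).all()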
-# -from kvikio._lib.nvcomp_ll_cxx_api cimport ( - nvcompBatchedGdeflateCompressAsync, - nvcompBatchedGdeflateCompressGetMaxOutputChunkSize, - nvcompBatchedGdeflateCompressGetTempSize, - nvcompBatchedGdeflateDecompressAsync, - nvcompBatchedGdeflateDecompressGetTempSize, - nvcompBatchedGdeflateDefaultOpts, - nvcompBatchedGdeflateGetDecompressSizeAsync, - nvcompBatchedGdeflateOpts_t, -) - - -class nvCompBatchAlgorithmGdeflate(nvCompBatchAlgorithm): - """Gdeflate algorithm implementation.""" - - algo_id: str = "gdeflate" - - options: nvcompBatchedGdeflateOpts_t - - def __init__(self, algo: int = None): - if algo is None: - self.options = nvcompBatchedGdeflateDefaultOpts - else: - self.options = nvcompBatchedGdeflateOpts_t(algo) - - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - cdef size_t temp_bytes = 0 - - err = nvcompBatchedGdeflateCompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - self.options, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - cdef size_t max_compressed_bytes = 0 - - err = nvcompBatchedGdeflateCompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, - self.options, - &max_compressed_bytes - ) - - return (err, max_compressed_bytes) - - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - return nvcompBatchedGdeflateCompressAsync( - to_ptr(uncomp_chunks), - to_ptr(uncomp_chunk_sizes), - max_uncomp_chunk_bytes, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - self.options, - to_stream(stream), - ) - - def _get_decomp_temp_size( - self, - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - ): - cdef size_t temp_bytes = 0 - - err = nvcompBatchedGdeflateDecompressGetTempSize( - num_chunks, - max_uncompressed_chunk_bytes, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - return nvcompBatchedGdeflateGetDecompressSizeAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - batch_size, - to_stream(stream), - ) - - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - return nvcompBatchedGdeflateDecompressAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - NULL, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(uncomp_chunks), - NULL, - to_stream(stream), - ) - - def __repr__(self): - return f"{self.__class__.__name__}(algo={self.options['algo']})" - - -# -# zstd algorithm. 
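All of the batch algorithms in this file share one compress driver shape, the same one NvCompBatchCodec.encode_batch (later in this patch) uses: size the temp and output buffers, gather chunk pointers and sizes into device arrays, then launch on a stream. A hedged sketch, assuming a pre-removal kvikio build and a GPU:

    import cupy as cp
    from kvikio._lib.libnvcomp_ll import nvCompBatchAlgorithmGdeflate

    algo = nvCompBatchAlgorithmGdeflate()  # default gdeflate options
    bufs = [cp.arange(256, dtype=cp.uint8), cp.zeros(512, dtype=cp.uint8)]
    sizes = [b.nbytes for b in bufs]
    max_chunk = max(sizes)

    temp_buf = cp.empty(algo.get_compress_temp_size(len(bufs), max_chunk), dtype=cp.uint8)
    comp_chunks = cp.empty((len(bufs), algo.get_compress_chunk_size(max_chunk)), dtype=cp.uint8)

    # nvCOMP requires the pointer and size arrays to live in GPU memory.
    uncomp_ptrs = cp.array([b.data.ptr for b in bufs], dtype=cp.uintp)
    uncomp_sizes = cp.array(sizes, dtype=cp.uint64)
    comp_ptrs = cp.array([c.data.ptr for c in comp_chunks], dtype=cp.uintp)
    comp_sizes = cp.empty(len(bufs), dtype=cp.uint64)

    stream = cp.cuda.Stream.ptds
    algo.compress(uncomp_ptrs, uncomp_sizes, max_chunk, len(bufs),
                  temp_buf, comp_ptrs, comp_sizes, stream)
    stream.synchronize()  # comp_sizes is valid only after this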
-# -from kvikio._lib.nvcomp_ll_cxx_api cimport ( - nvcompBatchedZstdCompressAsync, - nvcompBatchedZstdCompressGetMaxOutputChunkSize, - nvcompBatchedZstdCompressGetTempSize, - nvcompBatchedZstdDecompressAsync, - nvcompBatchedZstdDecompressGetTempSize, - nvcompBatchedZstdDefaultOpts, - nvcompBatchedZstdGetDecompressSizeAsync, - nvcompBatchedZstdOpts_t, -) - - -class nvCompBatchAlgorithmZstd(nvCompBatchAlgorithm): - """zstd algorithm implementation.""" - - algo_id: str = "zstd" - - options: nvcompBatchedZstdOpts_t - - def __init__(self): - self.options = nvcompBatchedZstdDefaultOpts - - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - cdef size_t temp_bytes = 0 - - err = nvcompBatchedZstdCompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - self.options, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - cdef size_t max_compressed_bytes = 0 - - err = nvcompBatchedZstdCompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, - self.options, - &max_compressed_bytes - ) - - return (err, max_compressed_bytes) - - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - return nvcompBatchedZstdCompressAsync( - to_ptr(uncomp_chunks), - to_ptr(uncomp_chunk_sizes), - max_uncomp_chunk_bytes, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - self.options, - to_stream(stream), - ) - - def _get_decomp_temp_size( - self, - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - ): - cdef size_t temp_bytes = 0 - - err = nvcompBatchedZstdDecompressGetTempSize( - num_chunks, - max_uncompressed_chunk_bytes, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - return nvcompBatchedZstdGetDecompressSizeAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - batch_size, - to_stream(stream), - ) - - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - return nvcompBatchedZstdDecompressAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - to_ptr(actual_uncomp_chunk_sizes), - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(uncomp_chunks), - to_ptr(statuses), - to_stream(stream), - ) - - def __repr__(self): - return f"{self.__class__.__name__}()" - - -# -# Snappy algorithm. 
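One zstd-specific detail worth flagging: unlike the LZ4 path, which passes NULL for the actual-size and status outputs, the zstd _decompress above forwards actual_uncomp_chunk_sizes and statuses, so callers get per-chunk success reporting. A short sketch of the allocation and check, assuming nvcompStatus_t has a 32-bit enum layout (0 == nvcompSuccess, per the declarations later in this patch):

    import cupy as cp

    num_chunks = 4
    actual_sizes = cp.empty(num_chunks, dtype=cp.uint64)
    statuses = cp.empty(num_chunks, dtype=cp.int32)

    # ... after algo.decompress(..., actual_sizes, statuses, stream) and a sync ...
    def all_chunks_ok(statuses) -> bool:
        # nvcompSuccess == 0; any other value marks a failed chunk.
        return bool((statuses == 0).all())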
-# -from kvikio._lib.nvcomp_ll_cxx_api cimport ( - nvcompBatchedSnappyCompressAsync, - nvcompBatchedSnappyCompressGetMaxOutputChunkSize, - nvcompBatchedSnappyCompressGetTempSize, - nvcompBatchedSnappyDecompressAsync, - nvcompBatchedSnappyDecompressGetTempSize, - nvcompBatchedSnappyDefaultOpts, - nvcompBatchedSnappyGetDecompressSizeAsync, - nvcompBatchedSnappyOpts_t, -) - - -class nvCompBatchAlgorithmSnappy(nvCompBatchAlgorithm): - """Snappy algorithm implementation.""" - - algo_id: str = "snappy" - - options: nvcompBatchedSnappyOpts_t - - def __init__(self): - self.options = nvcompBatchedSnappyDefaultOpts - - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - cdef size_t temp_bytes = 0 - - err = nvcompBatchedSnappyCompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - self.options, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - cdef size_t max_compressed_bytes = 0 - - err = nvcompBatchedSnappyCompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, - self.options, - &max_compressed_bytes - ) - - return (err, max_compressed_bytes) - - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - return nvcompBatchedSnappyCompressAsync( - to_ptr(uncomp_chunks), - to_ptr(uncomp_chunk_sizes), - max_uncomp_chunk_bytes, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - self.options, - to_stream(stream), - ) - - def _get_decomp_temp_size( - self, - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - ): - cdef size_t temp_bytes = 0 - - err = nvcompBatchedSnappyDecompressGetTempSize( - num_chunks, - max_uncompressed_chunk_bytes, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - return nvcompBatchedSnappyGetDecompressSizeAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - batch_size, - to_stream(stream), - ) - - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - return nvcompBatchedSnappyDecompressAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - NULL, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(uncomp_chunks), - NULL, - to_stream(stream), - ) - - def __repr__(self): - return f"{self.__class__.__name__}()" - - -# -# Deflate algorithm. 
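On the Python side, batch inputs can arrive as host or device buffers; NvCompBatchCodec.decode_batch (below) probes the first buffer for the CUDA array interface and copies host data to the GPU before calling into these algorithms. A dependency-light sketch of that dispatch:

    import numpy as np

    def is_device_buffer(buf) -> bool:
        # CuPy, Numba and friends advertise device residence this way.
        return hasattr(buf, "__cuda_array_interface__")

    assert not is_device_buffer(np.zeros(4))  # host memory lacks the interface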
-# -from kvikio._lib.nvcomp_ll_cxx_api cimport ( - nvcompBatchedDeflateCompressAsync, - nvcompBatchedDeflateCompressGetMaxOutputChunkSize, - nvcompBatchedDeflateCompressGetTempSize, - nvcompBatchedDeflateDecompressAsync, - nvcompBatchedDeflateDecompressGetTempSize, - nvcompBatchedDeflateDefaultOpts, - nvcompBatchedDeflateGetDecompressSizeAsync, - nvcompBatchedDeflateOpts_t, -) - - -class nvCompBatchAlgorithmDeflate(nvCompBatchAlgorithm): - """Deflate algorithm implementation.""" - - algo_id: str = "deflate" - - options: nvcompBatchedDeflateOpts_t - - def __init__(self, algo: int = None): - if algo is None: - self.options = nvcompBatchedDeflateDefaultOpts - else: - self.options = nvcompBatchedDeflateOpts_t(algo) - - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - cdef size_t temp_bytes = 0 - - err = nvcompBatchedDeflateCompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - self.options, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - cdef size_t max_compressed_bytes = 0 - - err = nvcompBatchedDeflateCompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, - self.options, - &max_compressed_bytes - ) - - return (err, max_compressed_bytes) - - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - # Cast buffer pointers that have Python int type to appropriate C types - # suitable for passing to nvCOMP API. - return nvcompBatchedDeflateCompressAsync( - to_ptr(uncomp_chunks), - to_ptr(uncomp_chunk_sizes), - max_uncomp_chunk_bytes, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - self.options, - to_stream(stream), - ) - - def _get_decomp_temp_size( - self, - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - ): - cdef size_t temp_bytes = 0 - - err = nvcompBatchedDeflateDecompressGetTempSize( - num_chunks, - max_uncompressed_chunk_bytes, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - return nvcompBatchedDeflateGetDecompressSizeAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - batch_size, - to_stream(stream), - ) - - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - # Cast buffer pointers that have Python int type to appropriate C types - # suitable for passing to nvCOMP API. 
- return nvcompBatchedDeflateDecompressAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - to_ptr(actual_uncomp_chunk_sizes), - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(uncomp_chunks), - to_ptr(statuses), - to_stream(stream), - ) - - def __repr__(self): - return f"{self.__class__.__name__}(algo={self.options['algo']})" - - -SUPPORTED_ALGORITHMS = { - a.algo_id: a for a in [ - nvCompBatchAlgorithmLZ4, - nvCompBatchAlgorithmGdeflate, - nvCompBatchAlgorithmZstd, - nvCompBatchAlgorithmSnappy, - nvCompBatchAlgorithmDeflate, - ] -} diff --git a/python/kvikio/kvikio/_lib/nvcomp_cxx_api.pxd b/python/kvikio/kvikio/_lib/nvcomp_cxx_api.pxd deleted file mode 100644 index b86797a93f..0000000000 --- a/python/kvikio/kvikio/_lib/nvcomp_cxx_api.pxd +++ /dev/null @@ -1,212 +0,0 @@ -# Copyright (c) 2022 Carson Swope -# Use, modification, and distribution is subject to the MIT License -# https://github.com/carsonswope/py-nvcomp/blob/main/LICENSE) -# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: MIT -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. 
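Looking back at the SUPPORTED_ALGORITHMS registry that closed libnvcomp_ll.pyx above: it maps each algo_id string to its class, which is exactly how NvCompBatchCodec.__init__ (below) resolves its algorithm argument. A sketch, assuming a pre-removal build:

    from kvikio._lib.libnvcomp_ll import SUPPORTED_ALGORITHMS

    algo_t = SUPPORTED_ALGORITHMS.get("zstd")
    if algo_t is None:
        raise ValueError(f"Must be one of: {list(SUPPORTED_ALGORITHMS)}")
    algo = algo_t()  # zstd takes no options; e.g. lz4 accepts data_type/has_header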
- -from libc.stdint cimport uint8_t, uint32_t -from libcpp.memory cimport shared_ptr -from libcpp.vector cimport vector - - -cdef extern from "cuda_runtime.h": - ctypedef void* cudaStream_t - -cdef extern from "nvcomp.h": - ctypedef enum nvcompType_t: - NVCOMP_TYPE_CHAR = 0, # 1B - NVCOMP_TYPE_UCHAR = 1, # 1B - NVCOMP_TYPE_SHORT = 2, # 2B - NVCOMP_TYPE_USHORT = 3, # 2B - NVCOMP_TYPE_INT = 4, # 4B - NVCOMP_TYPE_UINT = 5, # 4B - NVCOMP_TYPE_LONGLONG = 6, # 8B - NVCOMP_TYPE_ULONGLONG = 7, # 8B - NVCOMP_TYPE_BITS = 0xff # 1b - - -cdef extern from "nvcomp/shared_types.h": - ctypedef enum nvcompStatus_t: - nvcompSuccess = 0, - nvcompErrorInvalidValue = 10, - nvcompErrorNotSupported = 11, - nvcompErrorCannotDecompress = 12, - nvcompErrorBadChecksum = 13, - nvcompErrorCannotVerifyChecksums = 14, - nvcompErrorCudaError = 1000, - nvcompErrorInternal = 10000, - -# Manager Factory -cdef extern from "nvcomp/nvcompManagerFactory.hpp" namespace 'nvcomp': - cdef shared_ptr[nvcompManagerBase] create_manager "nvcomp::create_manager"( - const uint8_t* comp_buffer - ) except + - - -# Compression Manager -cdef extern from "nvcomp/nvcompManager.hpp" namespace 'nvcomp': - cdef cppclass PinnedPtrPool[T]: - pass - - cdef cppclass CompressionConfig "nvcomp::CompressionConfig": - const size_t uncompressed_buffer_size - const size_t max_compressed_buffer_size - const size_t num_chunks - CompressionConfig( - PinnedPtrPool[nvcompStatus_t]* pool, - size_t uncompressed_buffer_size) except + - nvcompStatus_t* get_status() const - CompressionConfig(CompressionConfig& other) - CompressionConfig& operator=(const CompressionConfig& other) except + - # Commented as Cython doesn't support rvalues, but a user can call - # `move` with the existing operator and generate correct C++ code - # xref: https://github.com/cython/cython/issues/1445 - # CompressionConfig& operator=(CompressionConfig&& other) except + - - cdef cppclass DecompressionConfig "nvcomp::DecompressionConfig": - size_t decomp_data_size - uint32_t num_chunks - DecompressionConfig(PinnedPtrPool[nvcompStatus_t]& pool) except + - nvcompStatus_t* get_status() const - DecompressionConfig(DecompressionConfig& other) - DecompressionConfig& operator=(const DecompressionConfig& other) except + - # Commented as Cython doesn't support rvalues, but a user can call - # `move` with the existing operator and generate correct C++ code - # xref: https://github.com/cython/cython/issues/1445 - # DecompressionConfig& operator=(DecompressionConfig&& other) except + - - cdef cppclass nvcompManagerBase "nvcomp::nvcompManagerBase": - CompressionConfig configure_compression( - const size_t decomp_buffer_size) - void compress( - const uint8_t* decomp_buffer, - uint8_t* comp_buffer, - const CompressionConfig& comp_config) except + - DecompressionConfig configure_decompression( - const uint8_t* comp_buffer) - DecompressionConfig configure_decompression( - const CompressionConfig& comp_config) - void decompress( - uint8_t* decomp_buffer, - const uint8_t* comp_buffer, - const DecompressionConfig& decomp_config) - size_t get_compressed_output_size(uint8_t* comp_buffer) except + - - cdef cppclass PimplManager "nvcomp::PimplManager": - CompressionConfig configure_compression( - const size_t decomp_buffer_size) except + - void compress( - const uint8_t* decomp_buffer, - uint8_t* comp_buffer, - const CompressionConfig& comp_config) except + - DecompressionConfig configure_decompression( - const uint8_t* comp_buffer) - DecompressionConfig configure_decompression( - const CompressionConfig& 
comp_config) - void decompress( - uint8_t* decomp_buffer, - const uint8_t* comp_buffer, - const DecompressionConfig& decomp_config) except + - size_t get_compressed_output_size(uint8_t* comp_buffer) except + - -# C++ Concrete ANS Manager -cdef extern from "nvcomp/ans.h" nogil: - ctypedef enum nvcompANSType_t: - nvcomp_rANS = 0 - - ctypedef struct nvcompBatchedANSOpts_t: - nvcompANSType_t type - cdef nvcompBatchedANSOpts_t nvcompBatchedANSDefaultOpts - -cdef extern from "nvcomp/ans.hpp": - cdef cppclass ANSManager "nvcomp::ANSManager": - ANSManager( - size_t uncomp_chunk_size, - const nvcompBatchedANSOpts_t& format_opts, - ) except + - -# C++ Concrete Bitcomp Manager -cdef extern from "nvcomp/bitcomp.h" nogil: - ctypedef struct nvcompBatchedBitcompFormatOpts: - int algorithm_type - nvcompType_t data_type - cdef nvcompBatchedBitcompFormatOpts nvcompBatchedBitcompDefaultOpts - -cdef extern from "nvcomp/bitcomp.hpp": - cdef cppclass BitcompManager "nvcomp::BitcompManager": - BitcompManager( - size_t uncomp_chunk_size, - const nvcompBatchedBitcompFormatOpts& format_opts, - ) except + - -# C++ Concrete Cascaded Manager -cdef extern from "nvcomp/cascaded.h" nogil: - ctypedef struct nvcompBatchedCascadedOpts_t: - size_t chunk_size - nvcompType_t type - int num_RLEs - int num_deltas - int use_bp - cdef nvcompBatchedCascadedOpts_t nvcompBatchedCascadedDefaultOpts - -cdef extern from "nvcomp/cascaded.hpp" nogil: - cdef cppclass CascadedManager "nvcomp::CascadedManager": - CascadedManager( - size_t uncomp_chunk_size, - const nvcompBatchedCascadedOpts_t& options, - ) - -# C++ Concrete Gdeflate Manager -cdef extern from "nvcomp/gdeflate.h" nogil: - ctypedef struct nvcompBatchedGdeflateOpts_t: - int algo - cdef nvcompBatchedGdeflateOpts_t nvcompBatchedGdeflateDefaultOpts - -cdef extern from "nvcomp/gdeflate.hpp": - cdef cppclass GdeflateManager "nvcomp::GdeflateManager": - GdeflateManager( - int uncomp_chunk_size, - const nvcompBatchedGdeflateOpts_t& format_opts, - ) except + - -# C++ Concrete LZ4 Manager -cdef extern from "nvcomp/gdeflate.h" nogil: - ctypedef struct nvcompBatchedLZ4Opts_t: - nvcompType_t data_type - cdef nvcompBatchedLZ4Opts_t nvcompBatchedLZ4DefaultOpts - -cdef extern from "nvcomp/lz4.hpp": - cdef cppclass LZ4Manager "nvcomp::LZ4Manager": - LZ4Manager( - size_t uncomp_chunk_size, - const nvcompBatchedLZ4Opts_t& format_opts, - ) except + - -# C++ Concrete Snappy Manager -cdef extern from "nvcomp/snappy.h" nogil: - ctypedef struct nvcompBatchedSnappyOpts_t: - int reserved - cdef nvcompBatchedSnappyOpts_t nvcompBatchedSnappyDefaultOpts - -cdef extern from "nvcomp/snappy.hpp": - cdef cppclass SnappyManager "nvcomp::SnappyManager": - SnappyManager( - size_t uncomp_chunk_size, - const nvcompBatchedSnappyOpts_t& format_opts, - ) except + diff --git a/python/kvikio/kvikio/_lib/nvcomp_ll_cxx_api.pxd b/python/kvikio/kvikio/_lib/nvcomp_ll_cxx_api.pxd deleted file mode 100644 index 6a23eb5cd1..0000000000 --- a/python/kvikio/kvikio/_lib/nvcomp_ll_cxx_api.pxd +++ /dev/null @@ -1,362 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. 
- -# distutils: language = c++ -# cython: language_level=3 - -cdef extern from "cuda_runtime.h": - ctypedef void* cudaStream_t - - ctypedef enum cudaMemcpyKind: - cudaMemcpyHostToHost = 0, - cudaMemcpyHostToDevice = 1, - cudaMemcpyDeviceToHost = 2, - cudaMemcpyDeviceToDevice = 3, - cudaMemcpyDefault = 4 - -cdef extern from "nvcomp.h": - ctypedef enum nvcompType_t: - NVCOMP_TYPE_CHAR = 0, # 1B - NVCOMP_TYPE_UCHAR = 1, # 1B - NVCOMP_TYPE_SHORT = 2, # 2B - NVCOMP_TYPE_USHORT = 3, # 2B - NVCOMP_TYPE_INT = 4, # 4B - NVCOMP_TYPE_UINT = 5, # 4B - NVCOMP_TYPE_LONGLONG = 6, # 8B - NVCOMP_TYPE_ULONGLONG = 7, # 8B - NVCOMP_TYPE_BITS = 0xff # 1b - -cdef extern from "nvcomp/shared_types.h": - ctypedef enum nvcompStatus_t: - nvcompSuccess = 0, - nvcompErrorInvalidValue = 10, - nvcompErrorNotSupported = 11, - nvcompErrorCannotDecompress = 12, - nvcompErrorBadChecksum = 13, - nvcompErrorCannotVerifyChecksums = 14, - nvcompErrorCudaError = 1000, - nvcompErrorInternal = 10000, - -# nvCOMP Low-Level Interface. -# https://github.com/NVIDIA/nvcomp/blob/main/doc/lowlevel_c_quickstart.md - -# -# LZ4 batch compression/decompression API. -# -cdef extern from "nvcomp/lz4.h" nogil: - ctypedef struct nvcompBatchedLZ4Opts_t: - nvcompType_t data_type - - cdef nvcompBatchedLZ4Opts_t nvcompBatchedLZ4DefaultOpts - - # Compression API. - cdef nvcompStatus_t nvcompBatchedLZ4CompressGetTempSize( - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - nvcompBatchedLZ4Opts_t format_opts, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedLZ4CompressGetMaxOutputChunkSize( - size_t max_uncompressed_chunk_bytes, - nvcompBatchedLZ4Opts_t format_opts, - size_t* max_compressed_bytes - ) - - cdef nvcompStatus_t nvcompBatchedLZ4CompressAsync( - const void* const* device_uncompressed_ptrs, - const size_t* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - nvcompBatchedLZ4Opts_t format_opts, - cudaStream_t stream - ) - - # Decompression API. - cdef nvcompStatus_t nvcompBatchedLZ4DecompressGetTempSize( - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedLZ4GetDecompressSizeAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - size_t* device_uncompressed_bytes, - size_t batch_size, - cudaStream_t stream - ) - - nvcompStatus_t nvcompBatchedLZ4DecompressAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - const size_t* device_uncompressed_bytes, - size_t* device_actual_uncompressed_bytes, - size_t batch_size, - void* const device_temp_ptr, - size_t temp_bytes, - void* const* device_uncompressed_ptrs, - nvcompStatus_t* device_statuses, - cudaStream_t stream - ) - -# -# Gdeflate batch compression/decompression API. -# -cdef extern from "nvcomp/gdeflate.h" nogil: - ctypedef struct nvcompBatchedGdeflateOpts_t: - int algo - - cdef nvcompBatchedGdeflateOpts_t nvcompBatchedGdeflateDefaultOpts - - # Compression API. 
- cdef nvcompStatus_t nvcompBatchedGdeflateCompressGetTempSize( - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - nvcompBatchedGdeflateOpts_t format_opts, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedGdeflateCompressGetMaxOutputChunkSize( - size_t max_uncompressed_chunk_bytes, - nvcompBatchedGdeflateOpts_t format_opts, - size_t* max_compressed_bytes - ) - - cdef nvcompStatus_t nvcompBatchedGdeflateCompressAsync( - const void* const* device_uncompressed_ptrs, - const size_t* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - nvcompBatchedGdeflateOpts_t format_opts, - cudaStream_t stream - ) - - # Decompression API. - cdef nvcompStatus_t nvcompBatchedGdeflateDecompressGetTempSize( - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - size_t* temp_bytes - ) - - nvcompStatus_t nvcompBatchedGdeflateGetDecompressSizeAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - size_t* device_uncompressed_bytes, - size_t batch_size, - cudaStream_t stream - ) - - nvcompStatus_t nvcompBatchedGdeflateDecompressAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - const size_t* device_uncompressed_bytes, - size_t* device_actual_uncompressed_bytes, - size_t batch_size, - void* const device_temp_ptr, - size_t temp_bytes, - void* const* device_uncompressed_ptrs, - nvcompStatus_t* device_statuses, - cudaStream_t stream - ) - -# -# zstd batch compression/decompression API. -# -cdef extern from "nvcomp/zstd.h" nogil: - ctypedef struct nvcompBatchedZstdOpts_t: - int reserved - - cdef nvcompBatchedZstdOpts_t nvcompBatchedZstdDefaultOpts - - # Compression API. - cdef nvcompStatus_t nvcompBatchedZstdCompressGetTempSize( - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - nvcompBatchedZstdOpts_t format_opts, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedZstdCompressGetMaxOutputChunkSize( - size_t max_uncompressed_chunk_bytes, - nvcompBatchedZstdOpts_t format_opts, - size_t* max_compressed_bytes - ) - - cdef nvcompStatus_t nvcompBatchedZstdCompressAsync( - const void* const* device_uncompressed_ptrs, - const size_t* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - nvcompBatchedZstdOpts_t format_opts, - cudaStream_t stream - ) - - # Decompression API. - cdef nvcompStatus_t nvcompBatchedZstdDecompressGetTempSize( - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - size_t* temp_bytes - ) - - nvcompStatus_t nvcompBatchedZstdGetDecompressSizeAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - size_t* device_uncompressed_bytes, - size_t batch_size, - cudaStream_t stream - ) - - nvcompStatus_t nvcompBatchedZstdDecompressAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - const size_t* device_uncompressed_bytes, - size_t* device_actual_uncompressed_bytes, - size_t batch_size, - void* const device_temp_ptr, - size_t temp_bytes, - void* const* device_uncompressed_ptrs, - nvcompStatus_t* device_statuses, - cudaStream_t stream - ) - -# -# Snappy batch compression/decompression API. 
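A usage note that applies to every *Async entry point declared in these .pxd files: the calls only enqueue work on the given cudaStream_t, so outputs such as device_compressed_bytes are undefined until the stream is synchronized. That is why the codec below synchronizes before reading sizes on the host; a sketch of the pattern:

    import cupy as cp

    stream = cp.cuda.Stream.ptds  # per-thread default stream, as the codec uses
    comp_sizes = cp.empty(8, dtype=cp.uint64)
    # ... launch nvcompBatched*Async work on `stream` ...
    host_sizes = cp.asnumpy(comp_sizes, stream)  # D2H copy ordered on the stream
    stream.synchronize()  # host_sizes is safe to read only after this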
-# -cdef extern from "nvcomp/snappy.h" nogil: - ctypedef struct nvcompBatchedSnappyOpts_t: - int reserved - - cdef nvcompBatchedSnappyOpts_t nvcompBatchedSnappyDefaultOpts - - # Compression API. - cdef nvcompStatus_t nvcompBatchedSnappyCompressGetTempSize( - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - nvcompBatchedSnappyOpts_t format_opts, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedSnappyCompressGetMaxOutputChunkSize( - size_t max_uncompressed_chunk_bytes, - nvcompBatchedSnappyOpts_t format_opts, - size_t* max_compressed_bytes - ) - - cdef nvcompStatus_t nvcompBatchedSnappyCompressAsync( - const void* const* device_uncompressed_ptrs, - const size_t* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - nvcompBatchedSnappyOpts_t format_opts, - cudaStream_t stream - ) - - # Decompression API. - cdef nvcompStatus_t nvcompBatchedSnappyDecompressGetTempSize( - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - size_t* temp_bytes - ) - - nvcompStatus_t nvcompBatchedSnappyGetDecompressSizeAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - size_t* device_uncompressed_bytes, - size_t batch_size, - cudaStream_t stream - ) - - nvcompStatus_t nvcompBatchedSnappyDecompressAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - const size_t* device_uncompressed_bytes, - size_t* device_actual_uncompressed_bytes, - size_t batch_size, - void* const device_temp_ptr, - size_t temp_bytes, - void* const* device_uncompressed_ptrs, - nvcompStatus_t* device_statuses, - cudaStream_t stream - ) - - -# -# Deflate batch compression/decompression API. -# -cdef extern from "nvcomp/deflate.h" nogil: - ctypedef struct nvcompBatchedDeflateOpts_t: - int algo - - cdef nvcompBatchedDeflateOpts_t nvcompBatchedDeflateDefaultOpts - - # Compression API. - cdef nvcompStatus_t nvcompBatchedDeflateCompressGetTempSize( - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - nvcompBatchedDeflateOpts_t format_opts, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedDeflateCompressGetMaxOutputChunkSize( - size_t max_uncompressed_chunk_bytes, - nvcompBatchedDeflateOpts_t format_opts, - size_t* max_compressed_bytes - ) - - cdef nvcompStatus_t nvcompBatchedDeflateCompressAsync( - const void* const* device_uncompressed_ptrs, - const size_t* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - nvcompBatchedDeflateOpts_t format_opts, - cudaStream_t stream - ) - - # Decompression API. 
- cdef nvcompStatus_t nvcompBatchedDeflateDecompressGetTempSize( - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - size_t* temp_bytes - ) - - nvcompStatus_t nvcompBatchedDeflateGetDecompressSizeAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - size_t* device_uncompressed_bytes, - size_t batch_size, - cudaStream_t stream - ) - - nvcompStatus_t nvcompBatchedDeflateDecompressAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - const size_t* device_uncompressed_bytes, - size_t* device_actual_uncompressed_bytes, - size_t batch_size, - void* const device_temp_ptr, - size_t temp_bytes, - void* const* device_uncompressed_ptrs, - nvcompStatus_t* device_statuses, - cudaStream_t stream - ) diff --git a/python/kvikio/kvikio/_nvcomp.py b/python/kvikio/kvikio/_nvcomp.py deleted file mode 100644 index 5606ad5ce5..0000000000 --- a/python/kvikio/kvikio/_nvcomp.py +++ /dev/null @@ -1,368 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. - -from enum import Enum - -import cupy as cp -import numpy as np - -import kvikio._lib.libnvcomp as _lib -from kvikio._lib.arr import asarray - -_dtype_map = { - cp.dtype("int8"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_CHAR, - cp.dtype("uint8"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_UCHAR, - cp.dtype("int16"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_SHORT, - cp.dtype("uint16"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_USHORT, - cp.dtype("int32"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_INT, - cp.dtype("uint32"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_UINT, - cp.dtype("int64"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_LONGLONG, - cp.dtype("uint64"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_ULONGLONG, -} - - -def cp_to_nvcomp_dtype(in_type: cp.dtype) -> Enum: - """Convert np/cp dtypes to nvcomp integral dtypes. - - Parameters - ---------- - in_type - A type argument that can be used to initialize a cupy/numpy dtype. - - Returns - ------- - int - The value of the NVCOMP_TYPE for supported dtype. - """ - cp_type = cp.dtype(in_type) - return _dtype_map[cp_type] - - -class nvCompManager: - """Base class for nvComp Compression Managers. - - Compression managers compress uncompressed data and decompress the result. - - Child types of nvCompManager implement only their constructor, as they each - take different options to build. The rest of their implementation is - in nvCompManager. - - nvCompManager also keeps all of the options for its child types. - """ - - _manager: _lib._nvcompManager = None - config: dict = {} - decompression_config: dict = {} - - # This is a python option: What type was the data when it was passed in? - # This is used only for returning a decompressed view of the original - # datatype. Untested so far. - input_type = cp.int8 - - # Default options exist for every option type for every class that inherits - # from nvCompManager, which takes advantage of the below property-setting - # code. - chunk_size: int = 1 << 16 - data_type: _lib.pyNvcompType_t = _lib.pyNvcompType_t.pyNVCOMP_TYPE_UCHAR - # Some classes have this defined as type, some as data_type. - type: _lib.pyNvcompType_t = _lib.pyNvcompType_t.pyNVCOMP_TYPE_UCHAR - - # Bitcomp Defaults - bitcomp_algo: int = 0 - - # Gdeflate defaults - algo: int = 0 - - def __init__(self, kwargs): - """Stores the results of all input arguments as class members. - - This code does type correction, fixing inputs to have an expected - shape before calling one of the nvCompManager methods on a child - class. 
- - Special case: Convert data_type to a _lib.pyNvcompType_t - """ - # data_type will be passed in as a python object. Convert it to - # a C++ nvcompType_t here. - if kwargs.get("data_type"): - if not isinstance(kwargs["data_type"], _lib.pyNvcompType_t): - kwargs["input_type"] = kwargs.get("data_type") - kwargs["data_type"] = cp_to_nvcomp_dtype( - cp.dtype(kwargs["data_type"]).type - ) - # Special case: Convert type to a _lib.pyNvcompType_t - if kwargs.get("type"): - if not isinstance(kwargs["type"], _lib.pyNvcompType_t): - kwargs["input_type"] = kwargs.get("type") - kwargs["type"] = cp_to_nvcomp_dtype(cp.dtype(kwargs["type"]).type) - for k, v in kwargs.items(): - setattr(self, k, v) - - def compress(self, data: cp.ndarray) -> cp.ndarray: - """Compress a buffer. - - Parameters - ---------- - data: cp.ndarray - A GPU buffer of data to compress. - - Returns - ------- - cp.ndarray - A GPU buffer of compressed bytes. - """ - # TODO: An option: check if incoming data size matches the size of the - # last incoming data, and reuse temp and out buffer if so. - data_size = data.size * data.itemsize - self.config = self._manager.configure_compression(data_size) - self.compress_out_buffer = cp.empty( - self.config["max_compressed_buffer_size"], dtype="uint8" - ) - size = self._manager.compress(asarray(data), asarray(self.compress_out_buffer)) - return self.compress_out_buffer[0:size] - - def decompress(self, data: cp.ndarray) -> cp.ndarray: - """Decompress a GPU buffer. - - Parameters - ---------- - data: cp.ndarray - A GPU buffer of data to decompress. - - Returns - ------- - cp.ndarray - An array of `self.dtype` produced after decompressing the input argument. - """ - self.decompression_config = ( - self._manager.configure_decompression_with_compressed_buffer(asarray(data)) - ) - decomp_buffer = cp.empty( - self.decompression_config["decomp_data_size"], dtype="uint8" - ) - self._manager.decompress(asarray(decomp_buffer), asarray(data)) - return decomp_buffer.view(self.input_type) - - def configure_compression(self, data_size: int) -> dict: - """Return the compression configuration object. - - Parameters - ---------- - data_size: int - The size of the buffer that is staged to be compressed. - - Returns - ------- - dict { - "uncompressed_buffer_size": The size of the input data - "max_compressed_buffer_size": The maximum size of the compressed data. The - size of the buffer that must be allocated before calling compress. - "num_chunks": The number of configured chunks to compress the data over - } - """ - return self._manager.configure_compression(data_size) - - def configure_decompression_with_compressed_buffer( - self, data: cp.ndarray - ) -> cp.ndarray: - """Return the decompression configuration object. - - Parameters - ---------- - data: cp.ndarray - A GPU buffer of previously compressed data. - - Returns - ------- - dict { - "decomp_data_size": The size of each decompression chunk. - "num_chunks": The number of chunks that the decompressed data is returned - in. - } - """ - return self._manager.configure_decompression_with_compressed_buffer( - asarray(data) - ) - - def get_compressed_output_size(self, comp_buffer: cp.ndarray) -> int: - """Return the actual size of compression result. - - Returns the number of bytes that should be copied out of - `comp_buffer`. - - Parameters - ---------- - comp_buffer: cp.ndarray - A GPU buffer that has been previously compressed. 
-
-        Returns
-        -------
-        int
-        """
-        return self._manager.get_compressed_output_size(asarray(comp_buffer))
-
-
-class ANSManager(nvCompManager):
-    def __init__(self, **kwargs):
-        """Initialize an ANSManager object.
-
-        Used to compress and decompress GPU buffers.
-        All parameters are optional and will be set to usable defaults.
-
-        Parameters
-        ----------
-        chunk_size: int (optional)
-            Defaults to 4096.
-        """
-        super().__init__(kwargs)
-
-        self._manager = _lib._ANSManager(self.chunk_size)
-
-
-class BitcompManager(nvCompManager):
-    def __init__(self, **kwargs):
-        """Create a GPU BitcompCompressor object.
-
-        Used to compress and decompress GPU buffers.
-        All parameters are optional and will be set to usable defaults.
-
-        Parameters
-        ----------
-        chunk_size: int (optional)
-            Defaults to 4096.
-        """
-        super().__init__(kwargs)
-
-        self._manager = _lib._BitcompManager(
-            self.chunk_size,
-            self.data_type.value,
-            self.bitcomp_algo,
-        )
-
-
-class CascadedManager(nvCompManager):
-    def __init__(self, **kwargs):
-        """Initialize a CascadedManager for a specific dtype.
-
-        Used to compress and decompress GPU buffers.
-        All parameters are optional and will be set to usable defaults.
-
-        Parameters
-        ----------
-        chunk_size: int (optional)
-            Defaults to 4096 and can't currently be changed.
-        dtype: cp.dtype (optional)
-            The dtype of the input buffer to be compressed.
-        num_RLEs: int (optional)
-            Number of Run-Length Encoders to use, see [algorithms overview.md](
-            https://github.com/NVIDIA/nvcomp/blob/main/doc/algorithms_overview.md#run-length-encoding-rle) # noqa: E501
-        num_deltas: int (optional)
-            Number of Delta Encoders to use, see [algorithms overview.md](
-            https://github.com/NVIDIA/nvcomp/blob/main/doc/algorithms_overview.md#delta-encoding) # noqa: E501
-        use_bp: bool (optional)
-            Enable Bitpacking, see [algorithms overview.md](
-            https://github.com/NVIDIA/nvcomp/blob/main/doc/algorithms_overview.md#bitpacking) # noqa: E501
-        """
-        super().__init__(kwargs)
-        default_options = {
-            "chunk_size": 1 << 12,
-            "type": np.int32,
-            "num_RLEs": 2,
-            "num_deltas": 1,
-            "use_bp": True,
-        }
-        # Replace any options that may have been excluded, they are not optional.
-        for k, v in default_options.items():
-            try:
-                getattr(self, k)
-            except Exception:
-                setattr(self, k, v)
-
-        self.options = {
-            "chunk_size": self.chunk_size,
-            "type": self.type,
-            "num_RLEs": self.num_RLEs,
-            "num_deltas": self.num_deltas,
-            "use_bp": self.use_bp,
-        }
-        self._manager = _lib._CascadedManager(default_options)
-
-
-class GdeflateManager(nvCompManager):
-    def __init__(self, **kwargs):
-        """Create a GPU GdeflateCompressor object.
-
-        Used to compress and decompress GPU buffers.
-        All parameters are optional and will be set to usable defaults.
-
-        Parameters
-        ----------
-        chunk_size: int (optional)
-        algo: int (optional)
-            Integer in the range [0, 1, 2]. Only algorithm #0 is currently
-            supported.
-        """
-        super().__init__(kwargs)
-
-        self._manager = _lib._GdeflateManager(self.chunk_size, self.algo)
-
-
-class LZ4Manager(nvCompManager):
-    def __init__(self, **kwargs):
-        """Create a GPU LZ4Compressor object.
-
-        Used to compress and decompress GPU buffers of a specific dtype.
-        All parameters are optional and will be set to usable defaults.
-
-        Parameters
-        ----------
-        chunk_size: int (optional)
-            The size of each chunk of data to decompress independently with
-            LZ4. Must be within the range of [32768, 16777216]. Larger sizes will
-            result in higher compression, but with decreased parallelism. The
-            recommended size is 65536.
- Defaults to the recommended size. - data_type: pyNVCOMP_TYPE (optional) - The data type returned for decompression. - Defaults to pyNVCOMP_TYPE.UCHAR - """ - super().__init__(kwargs) - self._manager = _lib._LZ4Manager(self.chunk_size, self.data_type.value) - - -class SnappyManager(nvCompManager): - def __init__(self, **kwargs): - """Create a GPU SnappyCompressor object. - - Used to compress and decompress GPU buffers. - All parameters are optional and will be set to usable defaults. - - Parameters - ---------- - chunk_size: int (optional) - """ - super().__init__(kwargs) - self._manager = _lib._SnappyManager(self.chunk_size) - - -class ManagedDecompressionManager(nvCompManager): - def __init__(self, compressed_buffer): - """Create a Managed compressor using the - create_manager factory method. - - This function is used in order to automatically - identify which compression algorithm was used on - an input buffer. - - It returns a ManagedDecompressionManager that can - then be used normally to decompress the unknown - compressed binary data, or compress other data - into the same format. - - Parameters - ---------- - compressed_buffer: cp.ndarray - A buffer of compressed bytes of unknown origin. - """ - super().__init__({}) - self._manager = _lib._ManagedManager(asarray(compressed_buffer)) diff --git a/python/kvikio/kvikio/_nvcomp_codec.py b/python/kvikio/kvikio/_nvcomp_codec.py deleted file mode 100644 index dc60d9c7dc..0000000000 --- a/python/kvikio/kvikio/_nvcomp_codec.py +++ /dev/null @@ -1,228 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. - -from typing import Any, Mapping, Optional, Sequence - -import cupy as cp -import cupy.typing -from numcodecs.compat import ensure_contiguous_ndarray_like - -from kvikio._lib.libnvcomp_ll import SUPPORTED_ALGORITHMS -from kvikio.numcodecs import BufferLike, CudaCodec - - -class NvCompBatchCodec(CudaCodec): - """Codec that uses batch algorithms from nvCOMP library. - - An algorithm is selected using `algorithm` parameter. - If the algorithm takes additional options, they can be - passed to the algorithm using `options` dictionary. - """ - - # Header stores original uncompressed size. This is required to enable - # data compatibility between existing numcodecs codecs and NvCompBatchCodec. - HEADER_SIZE_BYTES: int = 4 - - codec_id: str = "nvcomp_batch" - algorithm: str - options: Mapping[str, Any] - - def __init__( - self, - algorithm: str, - options: Optional[Mapping[str, Any]] = None, - stream: Optional[cp.cuda.Stream] = None, - ) -> None: - algo_id = algorithm.lower() - algo_t = SUPPORTED_ALGORITHMS.get(algo_id, None) - if algo_t is None: - raise ValueError( - f"{algorithm} is not supported. " - f"Must be one of: {list(SUPPORTED_ALGORITHMS.keys())}" - ) - - self.algorithm = algo_id - self.options = dict(options) if options is not None else {} - - # Create an algorithm. - self._algo = algo_t(**self.options) - # Use default stream, if needed. - self._stream = stream if stream is not None else cp.cuda.Stream.ptds - - def encode(self, buf: BufferLike) -> cupy.typing.NDArray: - return self.encode_batch([buf])[0] - - def encode_batch(self, bufs: Sequence[Any]) -> Sequence[Any]: - """Encode data in `bufs` using nvCOMP. - - Parameters - ---------- - bufs : - Data to be encoded. Each buffer in the list may be any object - supporting the new-style buffer protocol. - - Returns - ------- - List of encoded buffers. Each buffer may be any object supporting - the new-style buffer protocol. 
- """ - num_chunks = len(bufs) - if num_chunks == 0: - return [] - - bufs = [cp.asarray(ensure_contiguous_ndarray_like(b)) for b in bufs] - buf_sizes = [b.size * b.itemsize for b in bufs] - - max_chunk_size = max(buf_sizes) - - # Get temp and output buffer sizes. - temp_size = self._algo.get_compress_temp_size(num_chunks, max_chunk_size) - comp_chunk_size = self._algo.get_compress_chunk_size(max_chunk_size) - - # Prepare data and size buffers. - # uncomp_chunks is used as a container that stores pointers to actual chunks. - # nvCOMP requires this and sizes buffers to be in GPU memory. - uncomp_chunks = cp.array([b.data.ptr for b in bufs], dtype=cp.uintp) - uncomp_chunk_sizes = cp.array(buf_sizes, dtype=cp.uint64) - - temp_buf = cp.empty(temp_size, dtype=cp.uint8) - - comp_chunks = cp.empty((num_chunks, comp_chunk_size), dtype=cp.uint8) - # Array of pointers to each compressed chunk. - comp_chunk_ptrs = cp.array([c.data.ptr for c in comp_chunks], dtype=cp.uintp) - # Resulting compressed chunk sizes. - comp_chunk_sizes = cp.empty(num_chunks, dtype=cp.uint64) - - self._algo.compress( - uncomp_chunks, - uncomp_chunk_sizes, - max_chunk_size, - num_chunks, - temp_buf, - comp_chunk_ptrs, - comp_chunk_sizes, - self._stream, - ) - - res = [] - # Copy to host to subsequently avoid many smaller D2H copies. - comp_chunks = cp.asnumpy(comp_chunks, self._stream) - comp_chunk_sizes = cp.asnumpy(comp_chunk_sizes, self._stream) - self._stream.synchronize() - - for i in range(num_chunks): - res.append(comp_chunks[i, : comp_chunk_sizes[i]].tobytes()) - return res - - def decode(self, buf: BufferLike, out: Optional[BufferLike] = None) -> BufferLike: - return self.decode_batch([buf], [out])[0] - - def decode_batch( - self, bufs: Sequence[Any], out: Optional[Sequence[Any]] = None - ) -> Sequence[Any]: - """Decode data in `bufs` using nvCOMP. - - Parameters - ---------- - bufs : - Encoded data. Each buffer in the list may be any object - supporting the new-style buffer protocol. - out : - List of writeable buffers to store decoded data. - N.B. if provided, each buffer must be exactly the right size - to store the decoded data. - - Returns - ------- - List of decoded buffers. Each buffer may be any object supporting - the new-style buffer protocol. - """ - num_chunks = len(bufs) - if num_chunks == 0: - return [] - - # TODO(akamenev): check only first buffer, assuming they are all - # of the same kind. - is_host_buffer = not hasattr(bufs[0], "__cuda_array_interface__") - if is_host_buffer: - bufs = [cp.asarray(ensure_contiguous_ndarray_like(b)) for b in bufs] - - # Prepare compressed chunks buffers. - comp_chunks = cp.array([b.data.ptr for b in bufs], dtype=cp.uintp) - comp_chunk_sizes = cp.array([b.size for b in bufs], dtype=cp.uint64) - - # Get uncompressed chunk sizes. - uncomp_chunk_sizes = self._algo.get_decompress_size( - comp_chunks, - comp_chunk_sizes, - self._stream, - ) - - # Check whether the uncompressed chunks are all the same size. - # cupy.unique returns sorted sizes. - sorted_chunk_sizes = cp.unique(uncomp_chunk_sizes) - max_chunk_size = sorted_chunk_sizes[-1].item() - is_equal_chunks = sorted_chunk_sizes.shape[0] == 1 - - # Get temp buffer size. - temp_size = self._algo.get_decompress_temp_size(num_chunks, max_chunk_size) - - temp_buf = cp.empty(temp_size, dtype=cp.uint8) - - # Prepare uncompressed chunks buffers. - # First, allocate chunks of max_chunk_size and then - # copy the pointers to a pointer array in GPU memory as required by nvCOMP. 
- # For performance reasons, we use max_chunk_size so we can create - # a rectangular array with the same pointer increments. - uncomp_chunks = cp.empty((num_chunks, max_chunk_size), dtype=cp.uint8) - p_start = uncomp_chunks.data.ptr - uncomp_chunk_ptrs = cp.uint64(p_start) + ( - cp.arange(0, num_chunks * max_chunk_size, max_chunk_size, dtype=cp.uint64) - ) - - # TODO(akamenev): currently we provide the following 2 buffers to decompress() - # but do not check/use them afterwards since some of the algos - # (e.g. LZ4 and Gdeflate) do not require it and run faster - # without those arguments passed, while other algos (e.g. zstd) require - # these buffers to be valid. - actual_uncomp_chunk_sizes = cp.empty(num_chunks, dtype=cp.uint64) - statuses = cp.empty(num_chunks, dtype=cp.int32) - - self._algo.decompress( - comp_chunks, - comp_chunk_sizes, - num_chunks, - temp_buf, - uncomp_chunk_ptrs, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - self._stream, - ) - - # If all chunks are the same size, we can just return uncomp_chunks. - if is_equal_chunks and out is None: - return cp.asnumpy(uncomp_chunks) if is_host_buffer else uncomp_chunks - - res = [] - uncomp_chunk_sizes = uncomp_chunk_sizes.get() - for i in range(num_chunks): - ret = uncomp_chunks[i, : uncomp_chunk_sizes[i]] - if out is None or out[i] is None: - res.append(cp.asnumpy(ret) if is_host_buffer else ret) - else: - o = ensure_contiguous_ndarray_like(out[i]) - if hasattr(o, "__cuda_array_interface__"): - cp.copyto(o, ret.view(dtype=o.dtype), casting="no") - else: - cp.asnumpy(ret.view(dtype=o.dtype), out=o, stream=self._stream) - res.append(o) - self._stream.synchronize() - - return res - - def __repr__(self): - return ( - f"{self.__class__.__name__}" - f"(algorithm={self.algorithm!r}, options={self.options!r})" - ) diff --git a/python/kvikio/kvikio/benchmarks/single_node_io.py b/python/kvikio/kvikio/benchmarks/single_node_io.py index e3b152cbaf..ddc6680167 100644 --- a/python/kvikio/kvikio/benchmarks/single_node_io.py +++ b/python/kvikio/kvikio/benchmarks/single_node_io.py @@ -8,7 +8,7 @@ import statistics import tempfile from time import perf_counter as clock -from typing import Any, ContextManager, Dict, Union +from typing import ContextManager, Union import cupy from dask.utils import format_bytes, parse_bytes @@ -19,21 +19,6 @@ from kvikio.benchmarks.utils import parse_directory, pprint_sys_info -def get_zarr_compressors() -> Dict[str, Any]: - """Returns a dict of available Zarr compressors""" - try: - import kvikio.zarr - except ImportError: - return {} - try: - compressors = kvikio.zarr.nvcomp_compressors - except AttributeError: - # zarr-python 3.x - return {} - else: - return {c.__name__.lower(): c for c in compressors} - - def create_data(nbytes): """Return a random uint8 cupy array""" return cupy.arange(nbytes, dtype="uint8") @@ -223,10 +208,6 @@ def run_zarr(args): if not kvikio.zarr.supported: raise RuntimeError(f"requires Zarr >={kvikio.zarr.MINIMUM_ZARR_VERSION}") - compressor = None - if args.zarr_compressor is not None: - compressor = get_zarr_compressors()[args.zarr_compressor]() - a = create_data(args.nbytes) shutil.rmtree(str(dir_path), ignore_errors=True) @@ -236,7 +217,6 @@ def run_zarr(args): z = zarr.array( a, chunks=False, - compressor=compressor, store=kvikio.zarr.GDSStore(dir_path), meta_array=cupy.empty(()), ) @@ -277,8 +257,6 @@ def main(args): print(f"directory | {args.dir}") print(f"nthreads | {args.nthreads}") print(f"nruns | {args.nruns}") - if args.zarr_compressor is not None: - 
print(f"Zarr compressor | {args.zarr_compressor}") print("==================================") # Run each benchmark using the requested APIs @@ -354,16 +332,6 @@ def pprint_api_res(name, samples): choices=tuple(API.keys()) + ("all",), help="List of APIs to use {%(choices)s}", ) - parser.add_argument( - "--zarr-compressor", - metavar="COMPRESSOR", - default=None, - choices=tuple(get_zarr_compressors().keys()), - help=( - "Set a nvCOMP compressor to use with Zarr " - "{%(choices)s} (default: %(default)s)" - ), - ) args = parser.parse_args() if "all" in args.api: diff --git a/python/kvikio/kvikio/benchmarks/zarr_io.py b/python/kvikio/kvikio/benchmarks/zarr_io.py index 7882fcad8c..c4d51a6d5f 100644 --- a/python/kvikio/kvikio/benchmarks/zarr_io.py +++ b/python/kvikio/kvikio/benchmarks/zarr_io.py @@ -13,9 +13,9 @@ from typing import ContextManager, Union import cupy -import numcodecs.blosc import numpy import zarr +import zarr.storage from dask.utils import format_bytes, parse_bytes import kvikio @@ -26,59 +26,47 @@ if not kvikio.zarr.supported: raise RuntimeError(f"requires Zarr >={kvikio.zarr.MINIMUM_ZARR_VERSION}") -compressors = { - "none": (None, None), - "lz4": (numcodecs.blosc.Blosc(cname="lz4"), kvikio.zarr.LZ4()), -} - def create_src_data(args): return cupy.random.random(args.nelem, dtype=args.dtype) def run_kvikio(args): - dir_path = args.dir / "kvikio" - shutil.rmtree(str(dir_path), ignore_errors=True) - - # Get the GPU compressor - compressor = compressors[args.compressor][1] - - src = create_src_data(args) - - # Write - if args.drop_vm_cache: - drop_vm_cache() - t0 = clock() - z = zarr.create( - shape=(args.nelem,), - chunks=(args.chunksize,), - dtype=args.dtype, - compressor=compressor, - store=kvikio.zarr.GDSStore(dir_path), - meta_array=cupy.empty(()), - ) - z[:] = src - os.sync() - write_time = clock() - t0 - - # Read - if args.drop_vm_cache: - drop_vm_cache() - t0 = clock() - res = z[:] - read_time = clock() - t0 - assert res.nbytes == args.nbytes - - return read_time, write_time + with zarr.config.enable_gpu(): + dir_path = args.dir / "kvikio" + shutil.rmtree(str(dir_path), ignore_errors=True) + + src = create_src_data(args) + + # Write + if args.drop_vm_cache: + drop_vm_cache() + t0 = clock() + z = zarr.create( + shape=(args.nelem,), + chunks=(args.chunksize,), + dtype=args.dtype, + store=kvikio.zarr.GDSStore(dir_path), + ) + z[:] = src + os.sync() + write_time = clock() - t0 + + # Read + if args.drop_vm_cache: + drop_vm_cache() + t0 = clock() + res = z[:] + read_time = clock() - t0 + assert res.nbytes == args.nbytes + + return read_time, write_time def run_posix(args): dir_path = args.dir / "posix" shutil.rmtree(str(dir_path), ignore_errors=True) - # Get the CPU compressor - compressor = compressors[args.compressor][0] - src = create_src_data(args) # Write @@ -89,9 +77,7 @@ def run_posix(args): shape=(args.nelem,), chunks=(args.chunksize,), dtype=args.dtype, - compressor=compressor, - store=zarr.DirectoryStore(dir_path), - meta_array=numpy.empty(()), + store=zarr.storage.LocalStore(dir_path), ) z[:] = src.get() os.sync() @@ -135,7 +121,6 @@ def main(args): print(f"directory | {args.dir}") print(f"nthreads | {args.nthreads}") print(f"nruns | {args.nruns}") - print(f"compressor | {args.compressor}") print("==================================") # Run each benchmark using the requested APIs @@ -226,16 +211,6 @@ def pprint_api_res(name, samples): choices=tuple(API.keys()) + ("all",), help="List of APIs to use {%(choices)s}", ) - parser.add_argument( - "--compressor", - 
metavar="COMPRESSOR", - default="none", - choices=tuple(compressors.keys()), - help=( - "Set a nvCOMP compressor to use with Zarr " - "{%(choices)s} (default: %(default)s)" - ), - ) parser.add_argument( "--drop-vm-cache", action="store_true", diff --git a/python/kvikio/kvikio/nvcomp.py b/python/kvikio/kvikio/nvcomp.py deleted file mode 100644 index 3b62e51e8c..0000000000 --- a/python/kvikio/kvikio/nvcomp.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. - - -from kvikio._nvcomp import ( # noqa: F401 - ANSManager, - BitcompManager, - CascadedManager, - GdeflateManager, - LZ4Manager, - ManagedDecompressionManager, - SnappyManager, - cp_to_nvcomp_dtype, - nvCompManager, -) -from kvikio.utils import kvikio_deprecate_module - -kvikio_deprecate_module( - "Use the official nvCOMP API from 'nvidia.nvcomp' instead.", since="25.06" -) diff --git a/python/kvikio/kvikio/nvcomp_codec.py b/python/kvikio/kvikio/nvcomp_codec.py deleted file mode 100644 index ded350cdd5..0000000000 --- a/python/kvikio/kvikio/nvcomp_codec.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. - -from kvikio._nvcomp_codec import NvCompBatchCodec # noqa: F401 -from kvikio.utils import kvikio_deprecate_module - -kvikio_deprecate_module( - "Use the official nvCOMP API from 'nvidia.nvcomp' instead.", since="25.06" -) diff --git a/python/kvikio/kvikio/zarr/__init__.py b/python/kvikio/kvikio/zarr/__init__.py index 7ec22c275a..758670ea21 100644 --- a/python/kvikio/kvikio/zarr/__init__.py +++ b/python/kvikio/kvikio/zarr/__init__.py @@ -6,5 +6,3 @@ if _parse(_metadata.version("zarr")) >= _Version("3.0.0"): from ._zarr_python_3 import * # noqa: F401,F403 -else: - from ._zarr_python_2 import * # type: ignore[assignment] # noqa: F401,F403 diff --git a/python/kvikio/kvikio/zarr/_zarr_python_2.py b/python/kvikio/kvikio/zarr/_zarr_python_2.py deleted file mode 100644 index bd1418e799..0000000000 --- a/python/kvikio/kvikio/zarr/_zarr_python_2.py +++ /dev/null @@ -1,400 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. -from __future__ import annotations - -import contextlib -import os -import os.path -from abc import abstractmethod -from typing import Any, Literal, Mapping, Optional, Sequence, Union - -import cupy -import cupy.typing -import numcodecs -import numpy -import numpy as np -import zarr -import zarr.creation -import zarr.errors -import zarr.storage -from numcodecs.abc import Codec -from numcodecs.compat import ensure_contiguous_ndarray_like -from numcodecs.registry import register_codec -from packaging.version import parse - -import kvikio -import kvikio._nvcomp -import kvikio._nvcomp_codec -import kvikio.zarr -from kvikio._nvcomp_codec import NvCompBatchCodec -from kvikio.numcodecs import BufferLike, CudaCodec - -MINIMUM_ZARR_VERSION = "2.15" - -# Is this version of zarr supported? We depend on the `Context` -# argument introduced in https://github.com/zarr-developers/zarr-python/pull/1131 -# in zarr v2.15. -supported = parse(zarr.__version__) >= parse(MINIMUM_ZARR_VERSION) - - -class GDSStore(zarr.storage.DirectoryStore): # type: ignore[name-defined] - """GPUDirect Storage (GDS) class using directories and files. - - This class works like `zarr.storage.DirectoryStore` but implements - getitems() in order to support direct reading into device memory. 
- It uses KvikIO for reads and writes, which in turn will use GDS - when applicable. - - Parameters - ---------- - path : string - Location of directory to use as the root of the storage hierarchy. - normalize_keys : bool, optional - If True, all store keys will be normalized to use lower case characters - (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be - useful to avoid potential discrepancies between case-sensitive and - case-insensitive file system. Default value is False. - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - compressor_config_overwrite - If not None, use this `Mapping` to specify what is written to the Zarr metadata - file on disk (`.zarray`). Normally, Zarr writes the configuration[1] given by - the `compressor` argument to the `.zarray` file. Use this argument to overwrite - the normal configuration and use the specified `Mapping` instead. - decompressor_config_overwrite - If not None, use this `Mapping` to specify what compressor configuration[1] is - used for decompressing no matter the configuration found in the Zarr metadata - on disk (the `.zarray` file). - - [1] https://github.com/zarr-developers/numcodecs/blob/cb155432/numcodecs/abc.py#L79 - - Notes - ----- - Atomic writes are used, which means that data are first written to a - temporary file, then moved into place when the write is successfully - completed. Files are only held open while they are being read or written and are - closed immediately afterwards, so there is no need to manually close any files. - - Safe to write in multiple threads or processes. - """ - - # The default output array type used by getitems(). - default_meta_array = numpy.empty(()) - - def __init__( - self, - path, - normalize_keys=False, - dimension_separator=None, - *, - compressor_config_overwrite: Optional[Mapping] = None, - decompressor_config_overwrite: Optional[Mapping] = None, - ) -> None: - if not kvikio.zarr.supported: - raise RuntimeError( - f"GDSStore requires Zarr >={kvikio.zarr.MINIMUM_ZARR_VERSION}" - ) - super().__init__( - path, normalize_keys=normalize_keys, dimension_separator=dimension_separator - ) - self.compressor_config_overwrite = compressor_config_overwrite - self.decompressor_config_overwrite = decompressor_config_overwrite - - def __eq__(self, other): - return isinstance(other, GDSStore) and self.path == other.path - - def _tofile(self, a, fn): - with kvikio.CuFile(fn, "w") as f: - written = f.write(a) - assert written == a.nbytes - - def __getitem__(self, key): - ret = super().__getitem__(key) - if self.decompressor_config_overwrite and key == ".zarray": - meta = self._metadata_class.decode_array_metadata(ret) - if meta["compressor"]: - meta["compressor"] = self.decompressor_config_overwrite - ret = self._metadata_class.encode_array_metadata(meta) - return ret - - def __setitem__(self, key, value): - if self.compressor_config_overwrite and key == ".zarray": - meta = self._metadata_class.decode_array_metadata(value) - if meta["compressor"]: - meta["compressor"] = self.compressor_config_overwrite - value = self._metadata_class.encode_array_metadata(meta) - super().__setitem__(key, value) - - def getitems( - self, - keys: Sequence[str], - *, - contexts: Mapping[str, Mapping] = {}, - ) -> Mapping[str, Any]: - """Retrieve data from multiple keys. - - Parameters - ---------- - keys : Iterable[str] - The keys to retrieve - contexts: Mapping[str, Context] - A mapping of keys to their context. 
-            Each context is a mapping of store specific information. If the
-            "meta_array" key exists, GDSStore uses its values as the output
-            array; otherwise GDSStore.default_meta_array is used.
-
-        Returns
-        -------
-        Mapping
-            A collection mapping the input keys to their results.
-        """
-        ret = {}
-        io_results = []
-
-        with contextlib.ExitStack() as stack:
-            for key in keys:
-                filepath = os.path.join(self.path, key)
-                if not os.path.isfile(filepath):
-                    continue
-                try:
-                    meta_array = contexts[key]["meta_array"]
-                except KeyError:
-                    meta_array = self.default_meta_array
-
-                nbytes = os.path.getsize(filepath)
-                f = stack.enter_context(kvikio.CuFile(filepath, "r"))
-                ret[key] = numpy.empty_like(meta_array, shape=(nbytes,), dtype="u1")
-                io_results.append((f.pread(ret[key]), nbytes))
-
-            for future, nbytes in io_results:
-                nbytes_read = future.get()
-                if nbytes_read != nbytes:
-                    raise RuntimeError(
-                        f"Incomplete read ({nbytes_read}) expected {nbytes}"
-                    )
-        return ret
-
-
-class NVCompCompressor(CudaCodec):
-    """Abstract base class for nvCOMP compressors
-
-    The derived classes must set `codec_id` and implement
-    `get_nvcomp_manager`
-    """
-
-    @abstractmethod
-    def get_nvcomp_manager(self) -> kvikio.nvcomp.nvCompManager:
-        """Abstract method that should return the nvCOMP compressor manager"""
-        pass  # TODO: cache Manager
-
-    def encode(self, buf: BufferLike) -> cupy.typing.NDArray:
-        buf = cupy.asarray(ensure_contiguous_ndarray_like(buf))
-        return self.get_nvcomp_manager().compress(buf)
-
-    def decode(self, buf: BufferLike, out: Optional[BufferLike] = None) -> BufferLike:
-        buf = ensure_contiguous_ndarray_like(buf)
-        is_host_buffer = not hasattr(buf, "__cuda_array_interface__")
-        if is_host_buffer:
-            buf = cupy.asarray(buf)
-
-        ret = self.get_nvcomp_manager().decompress(buf)
-
-        if is_host_buffer:
-            ret = cupy.asnumpy(ret)
-
-        if out is not None:
-            out = ensure_contiguous_ndarray_like(out)
-            if hasattr(out, "__cuda_array_interface__"):
-                cupy.copyto(out, ret.view(dtype=out.dtype), casting="no")
-            else:
-                np.copyto(out, cupy.asnumpy(ret.view(dtype=out.dtype)), casting="no")
-        return ret
-
-
-class ANS(NVCompCompressor):
-    codec_id = "nvcomp_ANS"
-
-    def get_nvcomp_manager(self):
-        return kvikio.nvcomp.ANSManager()
-
-
-class Bitcomp(NVCompCompressor):
-    codec_id = "nvcomp_Bitcomp"
-
-    def get_nvcomp_manager(self):
-        return kvikio.nvcomp.BitcompManager()
-
-
-class Cascaded(NVCompCompressor):
-    codec_id = "nvcomp_Cascaded"
-
-    def get_nvcomp_manager(self):
-        return kvikio.nvcomp.CascadedManager()
-
-
-class Gdeflate(NVCompCompressor):
-    codec_id = "nvcomp_Gdeflate"
-
-    def get_nvcomp_manager(self):
-        return kvikio.nvcomp.GdeflateManager()
-
-
-class LZ4(NVCompCompressor):
-    codec_id = "nvcomp_LZ4"
-
-    def get_nvcomp_manager(self):
-        return kvikio.nvcomp.LZ4Manager()
-
-
-class Snappy(NVCompCompressor):
-    codec_id = "nvcomp_Snappy"
-
-    def get_nvcomp_manager(self):
-        return kvikio.nvcomp.SnappyManager()
-
-
-# Expose a list of available nvCOMP compressors and register them as Zarr codecs
-nvcomp_compressors = [ANS, Bitcomp, Cascaded, Gdeflate, LZ4, Snappy]
-for c in nvcomp_compressors:
-    register_codec(c)
-
-
-class CompatCompressor:
-    """A pair of compatible compressors, one using the CPU and one using the GPU
-
-    Warning
-    -------
-    `CompatCompressor` is only supported by KvikIO's `open_cupy_array()` and
-    cannot be used as a compressor argument in Zarr functions like `open()`
-    and `open_array()` directly. However, it is possible to use its `.cpu`
-    like: `open(..., compressor=CompatCompressor.lz4().cpu)`.
-
-    Parameters
-    ----------
-    cpu
-        The CPU compressor.
-    gpu
-        The GPU compressor.
-    """
-
-    def __init__(self, cpu: Codec, gpu: CudaCodec) -> None:
-        self.cpu = cpu
-        self.gpu = gpu
-
-    @classmethod
-    def lz4(cls) -> CompatCompressor:
-        """A compatible pair of LZ4 compressors"""
-        return cls(cpu=numcodecs.LZ4(), gpu=NvCompBatchCodec("lz4"))
-
-
-def open_cupy_array(
-    store: Union[os.PathLike, str],
-    mode: Literal["r", "r+", "a", "w", "w-"] = "a",
-    compressor: Codec | CompatCompressor = Snappy(),
-    meta_array=cupy.empty(()),
-    **kwargs,
-) -> zarr.Array:
-    """Open a Zarr array as a CuPy-like array using file-mode-like semantics.
-
-    This function is a CUDA-friendly version of `zarr.open_array` that reads
-    and writes to CuPy arrays. Besides the arguments listed below, the arguments
-    have the same semantics as in `zarr.open_array`.
-
-    Parameters
-    ----------
-    store
-        Path to directory in file system. As opposed to `zarr.open_array`,
-        Store objects and paths to zip files aren't supported.
-    mode
-        Persistence mode: 'r' means read only (must exist); 'r+' means
-        read/write (must exist); 'a' means read/write (create if doesn't
-        exist); 'w' means create (overwrite if exists); 'w-' means create
-        (fail if exists).
-    compressor
-        The compressor used when creating a Zarr file or None if no compressor
-        is to be used. If a `CompatCompressor` is given, `CompatCompressor.gpu`
-        is used for compression and decompression; and `CompatCompressor.cpu`
-        is written as the compressor in the Zarr file metadata on disk.
-        This argument is ignored in "r" and "r+" mode. By default the
-        Snappy compressor by nvCOMP is used.
-    meta_array : array-like, optional
-        A CuPy-like array instance to use for determining arrays to create and
-        return to users. It must implement `__cuda_array_interface__`.
-    **kwargs
-        The rest of the arguments are forwarded to `zarr.open_array` as-is.
-
-    Returns
-    -------
-    Zarr array backed by a GDS file store, nvCOMP compression, and CuPy arrays.
-    """
-
-    if not isinstance(store, (str, os.PathLike)):
-        raise ValueError("store must be a path")
-    store = str(os.fspath(store))
-    if not hasattr(meta_array, "__cuda_array_interface__"):
-        raise ValueError("meta_array must implement __cuda_array_interface__")
-
-    if mode in ("r", "r+", "a"):
-        # In order to handle "a", we start by trying to open the file in read mode.
-        try:
-            ret = zarr.open_array(
-                store=kvikio.zarr.GDSStore(path=store),  # type: ignore[call-arg]
-                mode="r+",
-                meta_array=meta_array,
-                **kwargs,
-            )
-        except (
-            zarr.errors.ContainsGroupError,
-            zarr.errors.ArrayNotFoundError,  # type: ignore[attr-defined]
-        ):
-            # If we are reading, this is a genuine error.
-            if mode in ("r", "r+"):
-                raise
-        else:
-            if ret.compressor is None:
-                return ret
-            # If we are reading an LZ4-CPU compressed file, we overwrite the
-            # metadata on-the-fly to make Zarr use LZ4-GPU for both compression
-            # and decompression.
-            compat_lz4 = CompatCompressor.lz4()
-            if ret.compressor == compat_lz4.cpu:
-                ret = zarr.open_array(
-                    store=kvikio.zarr.GDSStore(  # type: ignore[call-arg]
-                        path=store,
-                        compressor_config_overwrite=compat_lz4.cpu.get_config(),
-                        decompressor_config_overwrite=compat_lz4.gpu.get_config(),
-                    ),
-                    mode=mode,
-                    meta_array=meta_array,
-                    **kwargs,
-                )
-            elif not isinstance(ret.compressor, CudaCodec):
-                raise ValueError(
-                    "The Zarr file was written using a non-CUDA compatible "
-                    f"compressor, {ret.compressor}, please use something "
-                    "like kvikio.zarr.CompatCompressor"
-                )
-            return ret
-
-    # At this point, we know that we are writing a new array
-    if mode not in ("w", "w-", "a"):
-        raise ValueError(f"Unknown mode: {mode}")
-
-    if isinstance(compressor, CompatCompressor):
-        compressor_config_overwrite = compressor.cpu.get_config()
-        decompressor_config_overwrite = compressor.gpu.get_config()
-        compressor = compressor.gpu
-    else:
-        compressor_config_overwrite = None
-        decompressor_config_overwrite = None
-
-    return zarr.open_array(
-        store=kvikio.zarr.GDSStore(  # type: ignore[call-arg]
-            path=store,
-            compressor_config_overwrite=compressor_config_overwrite,
-            decompressor_config_overwrite=decompressor_config_overwrite,
-        ),
-        mode=mode,
-        meta_array=meta_array,
-        compressor=compressor,
-        **kwargs,
-    )
diff --git a/python/kvikio/kvikio/zarr/_zarr_python_3.py b/python/kvikio/kvikio/zarr/_zarr_python_3.py
index 5305cd9b72..2f21be360a 100644
--- a/python/kvikio/kvikio/zarr/_zarr_python_3.py
+++ b/python/kvikio/kvikio/zarr/_zarr_python_3.py
@@ -8,6 +8,7 @@
 import packaging
 import zarr.storage
+from packaging.version import parse
 from zarr.abc.store import (
     ByteRequest,
     OffsetByteRequest,
@@ -23,6 +24,10 @@
 # at https://github.com/zarr-developers/zarr-python/blob/main/src/zarr/storage/_local.py
 # with differences coming from swapping in `cuFile` for the stdlib open file object.
 
+MINIMUM_ZARR_VERSION = "3"
+
+supported = parse(zarr.__version__) >= parse(MINIMUM_ZARR_VERSION)
+
 
 @functools.cache
 def _is_ge_zarr_3_0_7():
@@ -138,10 +143,3 @@
     async def _set(self, key: str, value: Buffer, exclusive: bool = False) -> None:
         path = self.root / key
 
         await asyncio.to_thread(_put, path, value, start=None, exclusive=exclusive)
-
-
-# Matching the check that zarr.__version__ > 2.15 that's
-# part of the public API for our zarr 2.x support
-# This module is behind a check that zarr.__version__ > 3
-# so we can just assume it's already checked and supported.
-supported = True diff --git a/python/kvikio/tests/conftest.py b/python/kvikio/tests/conftest.py index 07636095eb..06aef52ecc 100644 --- a/python/kvikio/tests/conftest.py +++ b/python/kvikio/tests/conftest.py @@ -57,19 +57,6 @@ def run_cmd(cmd: Iterable[str], cwd, verbose=True): p.kill() -@pytest.fixture() -def managers(): - libnvcomp = pytest.importorskip("kvikio.nvcomp") - return [ - libnvcomp.ANSManager, - libnvcomp.BitcompManager, - libnvcomp.CascadedManager, - libnvcomp.GdeflateManager, - libnvcomp.LZ4Manager, - libnvcomp.SnappyManager, - ] - - @pytest.fixture( params=[("cupy", False), ("cupy", True), ("numpy", False)], ids=["cupy", "cupy_async", "numpy"], diff --git a/python/kvikio/tests/test_benchmarks.py b/python/kvikio/tests/test_benchmarks.py index 8450fdfc25..584f719fc8 100644 --- a/python/kvikio/tests/test_benchmarks.py +++ b/python/kvikio/tests/test_benchmarks.py @@ -7,7 +7,6 @@ from pathlib import Path import pytest -from packaging.version import parse import kvikio @@ -26,25 +25,12 @@ "cufile-mfma", "cufile-mf", "cufile-ma", - "zarr", ], ) @pytest.mark.timeout(30, method="thread") def test_single_node_io(run_cmd, tmp_path, api): """Test benchmarks/single_node_io.py""" - if "zarr" in api: - kz = pytest.importorskip("kvikio.zarr") - import zarr - - if not kz.supported: - pytest.skip(f"requires Zarr >={kz.MINIMUM_ZARR_VERSION}") - - if parse(zarr.__version__) >= parse("3.0.0"): - pytest.skip( - "requires Zarr<3", - ) - retcode = run_cmd( cmd=[ sys.executable or "python", @@ -61,44 +47,6 @@ def test_single_node_io(run_cmd, tmp_path, api): assert retcode == 0 -@pytest.mark.parametrize( - "api", - [ - "kvikio", - "posix", - ], -) -@pytest.mark.timeout(30, method="thread") -def test_zarr_io(run_cmd, tmp_path, api): - """Test benchmarks/zarr_io.py""" - - kz = pytest.importorskip("kvikio.zarr") - import zarr - - if not kz.supported: - pytest.skip(f"requires Zarr >={kz.MINIMUM_ZARR_VERSION}") - - if parse(zarr.__version__) >= parse("3.0.0"): - pytest.skip( - "requires Zarr<3", - ) - - retcode = run_cmd( - cmd=[ - sys.executable or "python", - "zarr_io.py", - "-n", - "1MiB", - "-d", - str(tmp_path), - "--api", - api, - ], - cwd=benchmarks_path, - ) - assert retcode == 0 - - @pytest.mark.parametrize( "api", [ diff --git a/python/kvikio/tests/test_examples.py b/python/kvikio/tests/test_examples.py index f32485b6c4..1d08525a23 100644 --- a/python/kvikio/tests/test_examples.py +++ b/python/kvikio/tests/test_examples.py @@ -6,7 +6,6 @@ from pathlib import Path import pytest -from packaging.version import parse import kvikio @@ -21,18 +20,6 @@ def test_hello_world(tmp_path, monkeypatch): import_module("hello_world").main(tmp_path / "test-file") -def test_zarr_cupy_nvcomp(tmp_path, monkeypatch): - """Test examples/zarr_cupy_nvcomp.py""" - - # `examples/zarr_cupy_nvcomp.py` requires the Zarr submodule - zarr = pytest.importorskip("zarr") - if parse(zarr.__version__) >= parse("3.0.0"): - pytest.skip(reason="Requires zarr<3") - - monkeypatch.syspath_prepend(str(examples_path)) - import_module("zarr_cupy_nvcomp").main(tmp_path / "test-file") - - def test_http_io(tmp_path, monkeypatch): """Test examples/http_io.py""" diff --git a/python/kvikio/tests/test_nvcomp.py b/python/kvikio/tests/test_nvcomp.py deleted file mode 100644 index 356c5e77cd..0000000000 --- a/python/kvikio/tests/test_nvcomp.py +++ /dev/null @@ -1,444 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. 
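The 444 lines deleted below exercised the legacy `kvikio.nvcomp` manager API, deprecated earlier in this series in favor of `nvidia.nvcomp`. For readers without the git history, the round trip these tests covered looked roughly like this; a sketch of the pre-removal API, assuming CuPy and a CUDA-capable GPU, which will not run against current KvikIO:

    import cupy as cp
    import kvikio.nvcomp as nvcomp  # module removed in this patch series

    data = cp.arange(10_000, dtype="uint8")
    manager = nvcomp.LZ4Manager(data_type="uint8")
    compressed = manager.compress(data)        # GPU buffer of compressed bytes
    roundtrip = manager.decompress(compressed)
    assert (roundtrip == data).all()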
- -import pytest - -np = pytest.importorskip("numpy") -cupy = pytest.importorskip("cupy") -kvikio = pytest.importorskip("kvikio") -libnvcomp = pytest.importorskip("kvikio.nvcomp") - - -# TODO: don't hardcode the following expected values -LEN = { - "ANS": 11144, - "Bitcomp": 3208, - "Cascaded": 600, - "Gdeflate": 760, - "LZ4": 393, - "Snappy": 3548, -} - - -def assert_compression_size(actual, desired, rtol=0.1): - """Compression ratios might change slightly between library versions - - We mark a failure as "xfail" - """ - try: - np.testing.assert_allclose(actual, desired, rtol=rtol) - except AssertionError: - pytest.xfail("mismatch in compression ratios is acceptable") - raise - - -def managers(): - return [ - libnvcomp.ANSManager, - libnvcomp.BitcompManager, - libnvcomp.CascadedManager, - libnvcomp.GdeflateManager, - libnvcomp.LZ4Manager, - libnvcomp.SnappyManager, - ] - - -def dtypes(): - return [ - "uint8", - "uint16", - "uint32", - "int8", - "int16", - "int32", - ] - - -@pytest.mark.parametrize("manager, dtype", zip(managers(), dtypes())) -def test_round_trip_dtypes(manager, dtype): - length = 10000 - data = cupy.array( - np.arange( - 0, - length // cupy.dtype(dtype).type(0).itemsize, - dtype=dtype, - ) - ) - compressor_instance = manager(data_type=dtype) - compressed = compressor_instance.compress(data) - decompressed = compressor_instance.decompress(compressed) - assert (data == decompressed).all() - - -# -# ANS Options test -# -@pytest.mark.parametrize( - "inputs", - [ - {}, - { - "chunk_size": 1 << 16, - }, - { - "chunk_size": 1 << 16, - }, - ], -) -def test_ans_inputs(inputs): - size = 10000 - dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.ANSManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), LEN["ANS"]) - - -@pytest.mark.parametrize( - "inputs", - [ - {}, - { - "data_type": np.uint8, - "algo": 0, - }, - {"data_type": np.uint8}, - { - "algo": 0, - }, - ], -) -def test_bitcomp_inputs(inputs): - size = 10000 - dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.BitcompManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), LEN["Bitcomp"]) - - -@pytest.mark.parametrize( - "inputs, expected", - zip( - [ - {"algo": 0}, - {"algo": 1}, - {"algo": 2}, - ], - [LEN["Bitcomp"], LEN["Bitcomp"], LEN["Bitcomp"]], - ), -) -def test_bitcomp_algorithms(inputs, expected): - size = 10000 - dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.BitcompManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), expected) - - -@pytest.mark.parametrize( - "inputs", - [ - {}, - { - "options": { - "chunk_size": 1 << 12, - "type": np.uint32, - "num_RLEs": 2, - "num_deltas": 1, - "use_bp": True, - }, - }, - { - "options": { - "chunk_size": 1 << 12, - "type": np.uint32, - "num_RLEs": 2, - "num_deltas": 1, - "use_bp": True, - }, - "chunk_size": 1 << 16, - }, - { - "options": { - "chunk_size": 1 << 12, - "type": np.uint32, - "num_RLEs": 2, - "num_deltas": 1, - "use_bp": True, - }, - "data_type": np.uint8, - }, - { - "options": { - "chunk_size": 1 << 12, - "type": np.uint32, - "num_RLEs": 2, - "num_deltas": 1, - "use_bp": True, - }, - }, - ], -) -def 
test_cascaded_inputs(inputs): - size = 10000 - dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.CascadedManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), LEN["Cascaded"]) - - -@pytest.mark.parametrize( - "inputs", - [ - {}, - { - "chunk_size": 1 << 16, - "algo": 0, - }, - { - "chunk_size": 1 << 16, - }, - { - "algo": 0, - }, - ], -) -def test_gdeflate_inputs(inputs): - size = 10000 - dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.GdeflateManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), LEN["Gdeflate"]) - - -@pytest.mark.parametrize( - "inputs, expected", - zip( - [ - {"algo": 0}, - ], - [LEN["Gdeflate"]], - ), -) -def test_gdeflate_algorithms(inputs, expected): - size = 10000 - dtype = np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.GdeflateManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), expected) - - -@pytest.mark.xfail(raises=ValueError) -@pytest.mark.parametrize( - "inputs, expected", - zip([{"algo": 1}, {"algo": 2}], [LEN["Gdeflate"], LEN["Gdeflate"]]), -) -def test_gdeflate_algorithms_not_implemented(inputs, expected): - size = 10000 - dtype = np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.GdeflateManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), expected) - - -@pytest.mark.parametrize( - "inputs", - [ - {}, - { - "chunk_size": 1 << 16, - "data_type": np.uint8, - }, - { - "chunk_size": 1 << 16, - }, - { - "data_type": np.uint8, - }, - ], -) -def test_lz4_inputs(inputs): - size = 10000 - dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.LZ4Manager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), LEN["LZ4"]) - - -@pytest.mark.parametrize( - "inputs", - [ - {}, - { - "chunk_size": 1 << 16, - }, - { - "chunk_size": 1 << 16, - }, - {}, - ], -) -def test_snappy_inputs(inputs): - size = 10000 - dtype = np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.SnappyManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), LEN["Snappy"]) - - -@pytest.mark.parametrize( - "compressor_size", - zip( - managers(), - [ - { # ANS - "max_compressed_buffer_size": 89373, - "num_chunks": 1, - "uncompressed_buffer_size": 10000, - }, - { # Bitcomp - "max_compressed_buffer_size": 16432, - "num_chunks": 1, - "uncompressed_buffer_size": 10000, - }, - { # Cascaded - "max_compressed_buffer_size": 12460, - "num_chunks": 3, - "uncompressed_buffer_size": 10000, - }, - { # Gdeflate - "max_compressed_buffer_size": 131160, - "num_chunks": 1, - "uncompressed_buffer_size": 10000, - }, - { # LZ4 - "max_compressed_buffer_size": 65888, - "num_chunks": 1, - "uncompressed_buffer_size": 10000, - }, - { # Snappy - "max_compressed_buffer_size": 76575, - "num_chunks": 1, - "uncompressed_buffer_size": 10000, - }, - ], - ), -) -def test_get_compression_config_with_default_options(compressor_size): - compressor = compressor_size[0] - expected = compressor_size[1] - 
length = 10000 - dtype = cupy.uint8 - data = cupy.array( - np.arange( - 0, - length // cupy.dtype(dtype).type(0).itemsize, - dtype=dtype, - ) - ) - compressor_instance = compressor() - result = compressor_instance.configure_compression(len(data)) - assert_compression_size( - result.pop("max_compressed_buffer_size"), - expected.pop("max_compressed_buffer_size"), - ) - assert result == expected - - -@pytest.mark.parametrize( - "manager,expected", - zip( - managers(), - [ - { # ANS - "num_chunks": 1, - "decomp_data_size": 10000, - }, - { # Bitcomp - "num_chunks": 1, - "decomp_data_size": 10000, - }, - { # Cascaded - "num_chunks": 3, - "decomp_data_size": 10000, - }, - { # Gdeflate - "num_chunks": 1, - "decomp_data_size": 10000, - }, - { # LZ4 - "num_chunks": 1, - "decomp_data_size": 10000, - }, - { # Snappy - "num_chunks": 1, - "decomp_data_size": 10000, - }, - ], - ), -) -def test_get_decompression_config_with_default_options(manager, expected): - length = 10000 - dtype = cupy.uint8 - data = cupy.array( - np.arange( - 0, - length // cupy.dtype(dtype).type(0).itemsize, - dtype=dtype, - ) - ) - compressor_instance = manager() - compressed = compressor_instance.compress(data) - result = compressor_instance.configure_decompression_with_compressed_buffer( - compressed - ) - assert_compression_size( - result.pop("decomp_data_size"), expected.pop("decomp_data_size") - ) - assert result == expected - - -@pytest.mark.parametrize( - "manager, expected", - zip(managers(), list(LEN.values())), -) -def test_get_compressed_output_size(manager, expected): - length = 10000 - dtype = cupy.uint8 - data = cupy.array( - np.arange( - 0, - length // cupy.dtype(dtype).type(0).itemsize, - dtype=dtype, - ) - ) - compressor_instance = manager() - compressed = compressor_instance.compress(data) - buffer_size = compressor_instance.get_compressed_output_size(compressed) - assert_compression_size(buffer_size, expected) - - -@pytest.mark.parametrize("manager", managers()) -def test_managed_manager(manager): - length = 10000 - dtype = cupy.uint8 - data = cupy.array( - np.arange( - 0, - length // cupy.dtype(dtype).type(0).itemsize, - dtype=dtype, - ) - ) - compressor_instance = manager() - compressed = compressor_instance.compress(data) - manager = libnvcomp.ManagedDecompressionManager(compressed) - decompressed = manager.decompress(compressed) - assert len(decompressed) == 10000 diff --git a/python/kvikio/tests/test_nvcomp_codec.py b/python/kvikio/tests/test_nvcomp_codec.py deleted file mode 100644 index 29e50ad64b..0000000000 --- a/python/kvikio/tests/test_nvcomp_codec.py +++ /dev/null @@ -1,243 +0,0 @@ -# Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. 
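The codec tests deleted next covered `NvCompBatchCodec` in the same way. A condensed sketch of that round trip, again using the pre-removal API under the same assumptions:

    import numpy as np
    from kvikio.nvcomp_codec import NvCompBatchCodec  # removed in this patch series

    codec = NvCompBatchCodec("lz4")
    data = np.ones((16, 16), dtype=np.float32)
    comp = codec.encode(data)  # compresses on the GPU, returns host bytes
    out = codec.decode(comp).view(np.float32).reshape(data.shape)
    np.testing.assert_equal(out, data)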
- -import itertools as it -import json - -import cupy as cp -import numcodecs -import numpy as np -import packaging -import packaging.version -import pytest -import zarr -from numpy.testing import assert_equal - -from kvikio.nvcomp_codec import NvCompBatchCodec - -NVCOMP_CODEC_ID = "nvcomp_batch" - -LZ4_ALGO = "LZ4" -GDEFLATE_ALGO = "Gdeflate" -SNAPPY_ALGO = "snappy" -ZSTD_ALGO = "zstd" -DEFLATE_ALGO = "deflate" - -SUPPORTED_CODECS = [LZ4_ALGO, GDEFLATE_ALGO, SNAPPY_ALGO, ZSTD_ALGO, DEFLATE_ALGO] - - -def skip_if_zarr_v3(): - return pytest.mark.skipif( - packaging.version.parse(zarr.__version__) >= packaging.version.Version("3.0.0"), - reason="zarr 3.x not supported.", - ) - - -def _get_codec(algo: str, **kwargs): - codec_args = {"id": NVCOMP_CODEC_ID, "algorithm": algo, "options": kwargs} - return numcodecs.registry.get_codec(codec_args) - - -@pytest.fixture(params=[(32,), (8, 16), (16, 16)]) -def shape(request): - return request.param - - -# Separate fixture for combinations of shapes and chunks, since -# chunks array must have the same rank as data array. -@pytest.fixture( - params=it.chain( - it.product([(64,)], [(64,), (100,)]), - it.product([(16, 8), (16, 16)], [(8, 16), (16, 16), (40, 12)]), - ) -) -def shape_chunks(request): - return request.param - - -@pytest.mark.parametrize("algo", SUPPORTED_CODECS) -def test_codec_registry(algo: str): - codec = _get_codec(algo) - assert isinstance(codec, numcodecs.abc.Codec) - - -@pytest.mark.parametrize("algo", SUPPORTED_CODECS) -def test_basic(algo: str, shape): - codec = NvCompBatchCodec(algo) - - # Create data. - dtype = np.float32 - data = np.ones(shape, dtype=dtype) - # Do roundtrip. - comp_data = codec.encode(data) - # Decompress and cast to original data type/shape. - decomp_data = codec.decode(comp_data).view(dtype).reshape(shape) - - assert_equal(decomp_data, data) - - -@pytest.mark.parametrize("algo", SUPPORTED_CODECS) -@skip_if_zarr_v3() -def test_basic_zarr(algo: str, shape_chunks): - shape, chunks = shape_chunks - - codec = NvCompBatchCodec(algo) - - data = np.ones(shape, dtype=np.float32) - - # This will do the compression. - z = zarr.array(data, chunks=chunks, compressor=codec) - - # Test the decompression. 
- assert_equal(z[:], data[:]) - - -@pytest.mark.parametrize("algo", SUPPORTED_CODECS) -@pytest.mark.parametrize("chunk_sizes", [(100, 100), (100, 150)]) -@pytest.mark.parametrize("out", [None, "cpu", "gpu"]) -def test_batch_comp_decomp(algo: str, chunk_sizes, out: str): - codec = _get_codec(algo) - - np.random.seed(1) - - dtype = np.float32 - chunks = [np.random.randn(s).astype(dtype) for s in chunk_sizes] - out_buf = None - if out == "cpu": - out_buf = [np.empty_like(c) for c in chunks] - elif out == "gpu": - out_buf = [cp.empty_like(c) for c in chunks] - - comp_chunks = codec.encode_batch([c.tobytes() for c in chunks]) - assert len(comp_chunks) == 2 - - decomp_chunks = codec.decode_batch(comp_chunks, out=out_buf) - assert len(decomp_chunks) == 2 - - for i, dc in enumerate(decomp_chunks): - dc = dc.view(dtype=dtype) - if isinstance(dc, cp.ndarray): - dc = dc.get() - assert_equal(dc, chunks[i], f"{i=}") - - if out_buf is not None: - ob = out_buf[i] - if isinstance(ob, cp.ndarray): - ob = ob.get() - assert_equal(ob, chunks[i], f"{i=}") - - -@pytest.mark.parametrize("algo", SUPPORTED_CODECS) -@skip_if_zarr_v3() -def test_comp_decomp(algo: str, shape_chunks): - shape, chunks = shape_chunks - - codec = _get_codec(algo) - - np.random.seed(1) - - data = np.random.randn(*shape).astype(np.float32) - - z1 = zarr.array(data, chunks=chunks, compressor=codec) - - zarr_store = zarr.MemoryStore() - zarr.save_array(zarr_store, z1, compressor=codec) - # Check the store. - meta = json.loads(zarr_store[".zarray"]) - assert meta["compressor"]["id"] == NVCOMP_CODEC_ID - assert meta["compressor"]["algorithm"] == algo.lower() - - # Read back/decompress. - z2 = zarr.open_array(zarr_store) - - assert_equal(z1[:], z2[:]) - - -@pytest.mark.parametrize( - "algo, options", - [ - ("lz4", {"data_type": 4}), # NVCOMP_TYPE_INT data type. - ("gdeflate", {"algo": 1}), # low-throughput, high compression ratio algo - ], -) -@skip_if_zarr_v3() -def test_codec_options(algo, options): - codec = NvCompBatchCodec(algo, options) - - shape = (16, 16) - chunks = (8, 8) - - data = np.ones(shape, dtype=np.float32) - - z = zarr.array(data, chunks=chunks, compressor=codec) - - assert_equal(z[:], data[:]) - - -@skip_if_zarr_v3() -def test_codec_invalid_options(): - # There are currently only 3 supported algos in Gdeflate - codec = NvCompBatchCodec(GDEFLATE_ALGO, options={"algo": 10}) - - data = np.ones((16, 16), dtype=np.float32) - - with pytest.raises(RuntimeError): - zarr.array(data, compressor=codec) - - -@pytest.mark.parametrize( - "cpu_algo, gpu_algo", - [ - ("lz4", LZ4_ALGO), - ("zstd", ZSTD_ALGO), - ], -) -@skip_if_zarr_v3() -def test_cpu_comp_gpu_decomp(cpu_algo, gpu_algo): - cpu_codec = numcodecs.registry.get_codec({"id": cpu_algo}) - gpu_codec = _get_codec(gpu_algo) - - shape = (16, 16) - chunks = (8, 8) - - data = np.ones(shape, dtype=np.float32) - - z1 = zarr.array(data, chunks=chunks) - store = {} - zarr.save_array(store, z1, compressor=cpu_codec) - - meta = json.loads(store[".zarray"]) - assert meta["compressor"]["id"] == cpu_algo - - meta["compressor"] = {"id": NVCOMP_CODEC_ID, "algorithm": gpu_algo} - store[".zarray"] = json.dumps(meta).encode() - - z2 = zarr.open_array(store, compressor=gpu_codec) - - assert_equal(z1[:], z2[:]) - - -@skip_if_zarr_v3() -def test_lz4_codec_header(shape_chunks): - shape, chunks = shape_chunks - - # Test LZ4 nvCOMP codecs with and without the header. 
- codec_h = _get_codec(LZ4_ALGO, has_header=True) - codec_no_h = _get_codec(LZ4_ALGO, has_header=False) - - np.random.seed(1) - - data = np.random.randn(*shape).astype(np.float32) - - z_h = zarr.array(data, chunks=chunks, compressor=codec_h) - z_no_h = zarr.array(data, chunks=chunks, compressor=codec_no_h) - - # Result must be the same regardless of the header presence. - assert_equal(z_h[:], z_no_h[:]) - - -def test_empty_batch(): - codec = _get_codec(LZ4_ALGO) - - assert len(codec.encode_batch([])) == 0 - assert len(codec.decode_batch([])) == 0 diff --git a/python/kvikio/tests/test_zarr.py b/python/kvikio/tests/test_zarr.py deleted file mode 100644 index a793e2568e..0000000000 --- a/python/kvikio/tests/test_zarr.py +++ /dev/null @@ -1,292 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. - - -import math - -import numpy -import pytest -from packaging.version import parse - -cupy = pytest.importorskip("cupy") -zarr = pytest.importorskip("zarr") -kvikio_zarr = pytest.importorskip("kvikio.zarr") -kvikio_nvcomp_codec = pytest.importorskip("kvikio.nvcomp_codec") -numcodecs = pytest.importorskip("numcodecs") - -if not kvikio_zarr.supported: - pytest.skip( - f"requires Zarr >={kvikio_zarr.MINIMUM_ZARR_VERSION}", - allow_module_level=True, - ) - -if parse(zarr.__version__) >= parse("3.0.0"): - pytest.skip( - "requires Zarr<3", - allow_module_level=True, - ) - - -@pytest.fixture -def store(tmp_path): - """Fixture that creates a GDS Store""" - return kvikio_zarr.GDSStore(tmp_path / "test-file.zarr") - - -def test_direct_store_access(store, xp): - """Test accessing the GDS Store directly""" - - a = xp.arange(5, dtype="u1") - store["a"] = a - b = store["a"] - - # Notice, unless using getitems(), GDSStore always returns bytes - assert isinstance(b, bytes) - assert (xp.frombuffer(b, dtype="u1") == a).all() - - -@pytest.mark.parametrize("xp_write", ["numpy", "cupy"]) -@pytest.mark.parametrize("xp_read_a", ["numpy", "cupy"]) -@pytest.mark.parametrize("xp_read_b", ["numpy", "cupy"]) -def test_direct_store_access_getitems(store, xp_write, xp_read_a, xp_read_b): - """Test accessing the GDS Store directly using getitems()""" - - xp_read_a = pytest.importorskip(xp_read_a) - xp_read_b = pytest.importorskip(xp_read_b) - xp_write = pytest.importorskip(xp_write) - a = xp_write.arange(5, dtype="u1") - b = a * 2 - store["a"] = a - store["b"] = b - - res = store.getitems( - keys=["a", "b"], - contexts={ - "a": {"meta_array": xp_read_a.empty(())}, - "b": {"meta_array": xp_read_b.empty(())}, - }, - ) - assert isinstance(res["a"], xp_read_a.ndarray) - assert isinstance(res["b"], xp_read_b.ndarray) - cupy.testing.assert_array_equal(res["a"], a) - cupy.testing.assert_array_equal(res["b"], b) - - -def test_array(store, xp): - """Test Zarr array""" - - a = xp.arange(100) - z = zarr.array(a, chunks=10, compressor=None, store=store, meta_array=xp.empty(())) - assert isinstance(z.meta_array, type(a)) - assert a.shape == z.shape - assert a.dtype == z.dtype - assert isinstance(a, type(z[:])) - xp.testing.assert_array_equal(a, z[:]) - - -def test_group(store, xp): - """Test Zarr group""" - - g = zarr.open_group(store, meta_array=xp.empty(())) - g.ones("data", shape=(10, 11), dtype=int, compressor=None) - a = g["data"] - assert a.shape == (10, 11) - assert a.dtype == int - assert isinstance(a, zarr.Array) - assert isinstance(a.meta_array, xp.ndarray) - assert isinstance(a[:], xp.ndarray) - assert (a[:] == 1).all() - - -def test_open_array(store, xp): - """Test Zarr's 
open_array()""" - - a = xp.arange(10) - z = zarr.open_array( - store, - shape=a.shape, - dtype=a.dtype, - chunks=(10,), - compressor=None, - meta_array=xp.empty(()), - ) - z[:] = a - assert a.shape == z.shape - assert a.dtype == z.dtype - assert isinstance(a, type(z[:])) - xp.testing.assert_array_equal(a, z[:]) - - -@pytest.mark.parametrize("inline_array", [True, False]) -def test_dask_read(store, xp, inline_array): - """Test Zarr read in Dask""" - - da = pytest.importorskip("dask.array") - a = xp.arange(100) - z = zarr.array(a, chunks=10, compressor=None, store=store, meta_array=xp.empty(())) - d = da.from_zarr(z, inline_array=inline_array) - d += 1 - xp.testing.assert_array_equal(a + 1, d.compute()) - - -def test_dask_write(store, xp): - """Test Zarr write in Dask""" - - da = pytest.importorskip("dask.array") - - # Write dask array to disk using Zarr - a = xp.arange(100) - d = da.from_array(a, chunks=10) - da.to_zarr(d, store, compressor=None, meta_array=xp.empty(())) - - # Validate the written Zarr array - z = zarr.open_array(store) - xp.testing.assert_array_equal(a, z[:]) - - -@pytest.mark.parametrize("xp_read", ["numpy", "cupy"]) -@pytest.mark.parametrize("xp_write", ["numpy", "cupy"]) -@pytest.mark.parametrize("compressor", kvikio_zarr.nvcomp_compressors) -def test_compressor(store, xp_write, xp_read, compressor): - xp_read = pytest.importorskip(xp_read) - xp_write = pytest.importorskip(xp_write) - - shape = (10, 1) - chunks = (10, 1) - a = xp_write.arange(math.prod(shape)).reshape(shape) - z = zarr.creation.create( - shape=shape, - chunks=chunks, - compressor=compressor(), - store=store, - meta_array=xp_read.empty(()), - ) - z[:] = a - b = z[:] - assert isinstance(b, xp_read.ndarray) - cupy.testing.assert_array_equal(b, a) - - -@pytest.mark.parametrize("algo", ["lz4", "zstd"]) -def test_decompressor_config_overwrite(tmp_path, xp, algo): - cpu_codec = numcodecs.registry.get_codec({"id": algo}) - gpu_codec = kvikio_nvcomp_codec.NvCompBatchCodec(algo) - - # Write using Zarr's default file store and the `cpu_codec` compressor - z = zarr.open_array(tmp_path, mode="w", shape=(10,), compressor=cpu_codec) - z[:] = range(10) - assert z.compressor == cpu_codec - - # Open file using GDSStore and use `gpu_codec` as decompressor. - z = zarr.open_array( - kvikio_zarr.GDSStore( - tmp_path, - decompressor_config_overwrite=gpu_codec.get_config(), - ), - mode="r", - meta_array=xp.empty(()), - ) - assert z.compressor == gpu_codec - assert isinstance(z[:], xp.ndarray) - xp.testing.assert_array_equal(z[:], range(10)) - - -@pytest.mark.parametrize("algo", ["lz4"]) -def test_compressor_config_overwrite(tmp_path, xp, algo): - cpu_codec = numcodecs.registry.get_codec({"id": algo}) - gpu_codec = kvikio_nvcomp_codec.NvCompBatchCodec(algo) - - # Write file using GDSStore and the `gpu_codec` compressor. In order - # to make the file compatible with Zarr's builtin CPU decompressor, - # we set `cpu_codec` as the compressor in the meta file on disk. 
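    # NOTE (illustrative aside, not part of the original test): this is the
    # CompatCompressor mechanic in miniature. Writes are compressed by the GPU
    # codec while the on-disk `.zarray` metadata advertises the CPU codec, so
    # the plain `zarr.open_array(tmp_path, mode="r")` below can decompress the
    # data without kvikio or a GPU.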
-    z = zarr.open_array(
-        kvikio_zarr.GDSStore(
-            tmp_path,
-            compressor_config_overwrite=cpu_codec.get_config(),
-            decompressor_config_overwrite=gpu_codec.get_config(),
-        ),
-        mode="w",
-        shape=10,
-        compressor=gpu_codec,
-        meta_array=xp.empty(()),
-    )
-    assert z.compressor == gpu_codec
-    z[:] = xp.arange(10)
-
-    # We can now open the file using Zarr's builtin CPU decompressor
-    z = zarr.open_array(tmp_path, mode="r")
-    assert isinstance(z[:], numpy.ndarray)
-    numpy.testing.assert_array_equal(z[:], range(10))
-
-
-@pytest.mark.parametrize("write_mode", ["w", "w-", "a"])
-@pytest.mark.parametrize("read_mode", ["r", "r+", "a"])
-def test_open_cupy_array(tmp_path, write_mode, read_mode):
-    a = cupy.arange(10)
-    z = kvikio_zarr.open_cupy_array(
-        tmp_path,
-        mode=write_mode,
-        shape=a.shape,
-        dtype=a.dtype,
-        chunks=(2,),
-        compressor=kvikio_zarr.CompatCompressor.lz4(),
-    )
-    z[:] = a
-    assert a.shape == z.shape
-    assert a.dtype == z.dtype
-    assert isinstance(z[:], type(a))
-    assert z.compressor == kvikio_nvcomp_codec.NvCompBatchCodec("lz4")
-    cupy.testing.assert_array_equal(a, z[:])
-
-    z = kvikio_zarr.open_cupy_array(
-        tmp_path,
-        mode=read_mode,
-    )
-    assert a.shape == z.shape
-    assert a.dtype == z.dtype
-    assert isinstance(z[:], type(a))
-    assert z.compressor == kvikio_nvcomp_codec.NvCompBatchCodec("lz4")
-    cupy.testing.assert_array_equal(a, z[:])
-
-    z = zarr.open_array(tmp_path, mode=read_mode)
-    assert a.shape == z.shape
-    assert a.dtype == z.dtype
-    assert isinstance(z[:], numpy.ndarray)
-    assert z.compressor == kvikio_zarr.CompatCompressor.lz4().cpu
-    numpy.testing.assert_array_equal(a.get(), z[:])
-
-
-@pytest.mark.parametrize("compressor", [None, kvikio_zarr.CompatCompressor.lz4().cpu])
-def test_open_cupy_array_written_by_zarr(tmp_path, compressor):
-    data = numpy.arange(100)
-    z = zarr.open_array(
-        tmp_path,
-        shape=data.shape,
-        mode="w",
-        compressor=compressor,
-    )
-    z[:] = data
-
-    z = kvikio_zarr.open_cupy_array(tmp_path, mode="r")
-    assert isinstance(z[:], cupy.ndarray)
-    cupy.testing.assert_array_equal(z[:], data)
-
-
-@pytest.mark.parametrize("mode", ["r", "r+", "a"])
-def test_open_cupy_array_incompatible_compressor(tmp_path, mode):
-    zarr.create((10,), store=tmp_path, compressor=numcodecs.Blosc())
-
-    with pytest.raises(ValueError, match="non-CUDA compatible compressor"):
-        kvikio_zarr.open_cupy_array(tmp_path, mode=mode)
-
-
-def test_open_cupy_array_unknown_mode(tmp_path):
-    a = cupy.arange(10)
-    with pytest.raises(ValueError, match="Unknown mode: x"):
-        kvikio_zarr.open_cupy_array(
-            tmp_path,
-            mode="x",
-            shape=a.shape,
-            dtype=a.dtype,
-            chunks=(2,),
-        )

From c1391d696fa5eca90048c98ba9f29cd54a588393 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Mon, 18 Aug 2025 18:35:38 -0700
Subject: [PATCH 053/103] Remove remaining nvcomp references (#801)

https://github.com/rapidsai/kvikio/pull/798 removed usage of nvcomp but
left the linkage in place, so kvikio extension modules still relied on
nvcomp existing even though they didn't actually use any of its
functionality. That is now causing problems in #800. Removing the
linkage entirely here (while still revendoring manually until we can
move the vendoring to cudf) should resolve that.
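For reviewers who want to confirm the change locally, a linkage check along these lines works; this is an illustrative sketch, not part of the patch, and `kvikio._lib.defaults` stands in for whichever compiled extension module your build produced:

    import subprocess
    import kvikio._lib.defaults as ext  # any compiled kvikio extension module

    # `ldd` lists the shared libraries the extension actually links against;
    # after this patch, no libnvcomp entry should appear.
    out = subprocess.run(["ldd", ext.__file__], capture_output=True, text=True)
    assert "nvcomp" not in out.stdout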
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/801 --- cpp/CMakeLists.txt | 1 - python/kvikio/CMakeLists.txt | 4 --- python/kvikio/cmake/CMakeLists.txt | 15 --------- .../kvikio/cmake/thirdparty/get_nvcomp.cmake | 33 ------------------- python/kvikio/kvikio/_lib/CMakeLists.txt | 12 +------ python/kvikio/pyproject.toml | 7 ---- python/libkvikio/CMakeLists.txt | 29 +++++----------- python/libkvikio/libkvikio/load.py | 5 +-- 8 files changed, 12 insertions(+), 94 deletions(-) delete mode 100644 python/kvikio/cmake/CMakeLists.txt delete mode 100644 python/kvikio/cmake/thirdparty/get_nvcomp.cmake diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6107a0a795..9fa8e1f798 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -49,7 +49,6 @@ option(KvikIO_BUILD_EXAMPLES "Configure CMake to build examples" ON) option(KvikIO_BUILD_TESTS "Configure CMake to build tests" ON) option(KvikIO_REMOTE_SUPPORT "Configure CMake to build with remote IO support" ON) option(KvikIO_CUDA_SUPPORT "Configure CMake to build with CUDA support" ON) -option(KvikIO_EXPORT_NVCOMP "Export NVCOMP as a dependency" ON) # ################################################################################################## # * conda environment ------------------------------------------------------------------------------ diff --git a/python/kvikio/CMakeLists.txt b/python/kvikio/CMakeLists.txt index 3e5af8c857..480e485d2b 100644 --- a/python/kvikio/CMakeLists.txt +++ b/python/kvikio/CMakeLists.txt @@ -26,8 +26,6 @@ project( LANGUAGES CXX CUDA ) -option(USE_NVCOMP_RUNTIME_WHEEL "Use the nvcomp wheel at runtime instead of the system library" OFF) - find_package(kvikio REQUIRED "${RAPIDS_VERSION}") find_package(CUDAToolkit REQUIRED) @@ -35,8 +33,6 @@ find_package(CUDAToolkit REQUIRED) include(rapids-cython-core) rapids_cython_init() -add_subdirectory(cmake) - set(cython_lib_dir kvikio) add_subdirectory(kvikio/_lib) diff --git a/python/kvikio/cmake/CMakeLists.txt b/python/kvikio/cmake/CMakeLists.txt deleted file mode 100644 index d3882b5ab3..0000000000 --- a/python/kvikio/cmake/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -include(thirdparty/get_nvcomp.cmake) diff --git a/python/kvikio/cmake/thirdparty/get_nvcomp.cmake b/python/kvikio/cmake/thirdparty/get_nvcomp.cmake deleted file mode 100644 index a2c6326e76..0000000000 --- a/python/kvikio/cmake/thirdparty/get_nvcomp.cmake +++ /dev/null @@ -1,33 +0,0 @@ -# ============================================================================= -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(KVIKIO_USE_PROPRIETARY_BINARY ON) - -# This function finds nvcomp and sets any additional necessary environment variables. -function(find_and_configure_nvcomp) - - include(${rapids-cmake-dir}/cpm/nvcomp.cmake) - set(export_args) - if(KvikIO_EXPORT_NVCOMP) - set(export_args BUILD_EXPORT_SET kvikio-exports INSTALL_EXPORT_SET kvikio-exports) - endif() - rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ${KVIKIO_USE_PROPRIETARY_BINARY}) - - # Per-thread default stream - if(TARGET nvcomp AND PER_THREAD_DEFAULT_STREAM) - target_compile_definitions(nvcomp PRIVATE CUDA_API_PER_THREAD_DEFAULT_STREAM) - endif() -endfunction() - -find_and_configure_nvcomp() diff --git a/python/kvikio/kvikio/_lib/CMakeLists.txt b/python/kvikio/kvikio/_lib/CMakeLists.txt index 5b454295c0..b46d59c960 100644 --- a/python/kvikio/kvikio/_lib/CMakeLists.txt +++ b/python/kvikio/kvikio/_lib/CMakeLists.txt @@ -30,15 +30,5 @@ endif() rapids_cython_create_modules( CXX SOURCE_FILES "${cython_modules}" - LINKED_LIBRARIES kvikio::kvikio nvcomp::nvcomp + LINKED_LIBRARIES kvikio::kvikio ) -if(USE_NVCOMP_RUNTIME_WHEEL) - set(rpaths "$ORIGIN/../../nvidia/nvcomp") - foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) - set_property( - TARGET ${tgt} - PROPERTY INSTALL_RPATH ${rpaths} - APPEND - ) - endforeach() -endif() diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index 8baa9aa89c..234738fe5b 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -110,12 +110,6 @@ skip = [ [tool.mypy] ignore_missing_imports = true -exclude = [ - # we type check against zarr-python 3.x - # and ignore modules using 2.x - "python/kvikio/kvikio/zarr/_zarr_python_2.py", - "python/kvikio/tests/test_nvcomp_codec.py", -] [project.entry-points."numcodecs.codecs"] nvcomp_batch = "kvikio.nvcomp_codec:NvCompBatchCodec" @@ -160,7 +154,6 @@ filterwarnings = [ "error", "ignore:Jitify is performing a one-time only warm-up to populate the persistent cache", "ignore::DeprecationWarning:botocore.*", - "ignore:This module is deprecated since.*Use the official nvCOMP API from 'nvidia.nvcomp' instead.:FutureWarning:.*nvcomp|.*nvcomp_codec", ] markers = [ "cufile: tests to skip if cuFile isn't available e.g. 
run with `pytest -m 'not cufile'`", diff --git a/python/libkvikio/CMakeLists.txt b/python/libkvikio/CMakeLists.txt index ecde2dc288..231bebbd84 100644 --- a/python/libkvikio/CMakeLists.txt +++ b/python/libkvikio/CMakeLists.txt @@ -39,28 +39,15 @@ unset(kvikio_FOUND) set(KvikIO_BUILD_BENCHMARKS OFF) set(KvikIO_BUILD_EXAMPLES OFF) set(KvikIO_BUILD_TESTS OFF) -if(USE_NVCOMP_RUNTIME_WHEEL) - set(KvikIO_EXPORT_NVCOMP OFF) -else() - # vendor nvcomp but not the entire kvikio-export set because that's huge - include(cmake/thirdparty/get_nvcomp.cmake) - include(cmake/Modules/WheelHelpers.cmake) - install_aliased_imported_targets( - TARGETS nvcomp::nvcomp DESTINATION ${SKBUILD_PLATLIB_DIR}/libkvikio/lib64/ - ) -endif() + +# vendor nvcomp but not the entire kvikio-export set because that's huge TODO: Move nvcomp vendoring +# to libcudf wheel instead +include(cmake/thirdparty/get_nvcomp.cmake) +include(cmake/Modules/WheelHelpers.cmake) +install_aliased_imported_targets( + TARGETS nvcomp::nvcomp DESTINATION ${SKBUILD_PLATLIB_DIR}/libkvikio/lib64/ +) set(CUDA_STATIC_RUNTIME ON) add_subdirectory(../../cpp kvikio-cpp) - -if(USE_NVCOMP_RUNTIME_WHEEL) - set(rpaths "$ORIGIN/../../nvidia/nvcomp") - foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) - set_property( - TARGET ${tgt} - PROPERTY INSTALL_RPATH ${rpaths} - APPEND - ) - endforeach() -endif() diff --git a/python/libkvikio/libkvikio/load.py b/python/libkvikio/libkvikio/load.py index c790d2dd00..2e573a0889 100644 --- a/python/libkvikio/libkvikio/load.py +++ b/python/libkvikio/libkvikio/load.py @@ -44,8 +44,9 @@ def _load_wheel_installation(soname: str): def load_library(): - # TODO: remove this nvcomp load when `nvcomp` is re-de-vendored - # https://github.com/rapidsai/build-planning/issues/171 + # TODO: remove this nvcomp load when `nvcomp` is vendored into cudf instead. + # Currently this load only exists to ensure that libcudf wheels are not broken by + # prematurely removing the load _load_library("libnvcomp.so.4") return _load_library("libkvikio.so") From 0ff3c675c72e82d1120f38dfac9e66d0aedcffab Mon Sep 17 00:00:00 2001 From: Paul Taylor <178183+trxcllnt@users.noreply.github.com> Date: Tue, 19 Aug 2025 09:52:38 -0700 Subject: [PATCH 054/103] Use build cluster in devcontainers (#797) RAPIDS has deployed an autoscaling cloud build cluster that can be used to accelerate building large RAPIDS projects. This contributes to https://github.com/rapidsai/build-planning/issues/209. 
Authors: - Paul Taylor (https://github.com/trxcllnt) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/797 --- .devcontainer/Dockerfile | 34 +++++++++++++++++-- .../cuda12.9-conda/devcontainer.json | 4 ++- .devcontainer/cuda12.9-pip/devcontainer.json | 4 ++- .github/workflows/pr.yaml | 15 +++++--- .../cmake/thirdparty/get_nvcomp.cmake | 2 +- 5 files changed, 50 insertions(+), 9 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 5d1d536704..4775d28f23 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -13,6 +13,8 @@ ENV DEFAULT_CONDA_ENV=rapids FROM ${PYTHON_PACKAGE_MANAGER}-base +ARG TARGETARCH + ARG CUDA ENV CUDAARCHS="RAPIDS" ENV CUDA_VERSION="${CUDA_VERSION:-${CUDA}}" @@ -24,7 +26,35 @@ ENV PYTHONSAFEPATH="1" ENV PYTHONUNBUFFERED="1" ENV PYTHONDONTWRITEBYTECODE="1" +ENV HISTFILE="/home/coder/.cache/._bash_history" + +### +# sccache configuration +### +ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" ENV SCCACHE_REGION="us-east-2" ENV SCCACHE_BUCKET="rapids-sccache-devs" -ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" -ENV HISTFILE="/home/coder/.cache/._bash_history" +# 2hr (1 minute longer than sccache-dist request timeout) +ENV SCCACHE_IDLE_TIMEOUT=7200 + +### +# sccache-dist configuration +### +# Enable sccache-dist by default +ENV DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST=1 +# Compile locally if max retries exceeded +ENV SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=true +# Retry transient errors 4 times (for a total of 5 attempts) +ENV SCCACHE_DIST_MAX_RETRIES=4 +ENV SCCACHE_DIST_CONNECT_TIMEOUT=30 +ENV SCCACHE_DIST_CONNECTION_POOL=false +# 1hr 59min (to accommodate debug builds) +ENV SCCACHE_DIST_REQUEST_TIMEOUT=7140 +ENV SCCACHE_DIST_KEEPALIVE_ENABLED=true +ENV SCCACHE_DIST_KEEPALIVE_INTERVAL=20 +ENV SCCACHE_DIST_KEEPALIVE_TIMEOUT=600 +ENV SCCACHE_DIST_URL="https://${TARGETARCH}.linux.sccache.rapids.nvidia.com" + +# Build as much in parallel as possible +ENV INFER_NUM_DEVICE_ARCHITECTURES=1 +ENV MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL=20 diff --git a/.devcontainer/cuda12.9-conda/devcontainer.json b/.devcontainer/cuda12.9-conda/devcontainer.json index fc3e89da7b..175af7a73c 100644 --- a/.devcontainer/cuda12.9-conda/devcontainer.json +++ b/.devcontainer/cuda12.9-conda/devcontainer.json @@ -11,7 +11,9 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-conda", + "--ulimit", + "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { diff --git a/.devcontainer/cuda12.9-pip/devcontainer.json b/.devcontainer/cuda12.9-pip/devcontainer.json index 334b4b6d6c..ff13ba7e3c 100644 --- a/.devcontainer/cuda12.9-pip/devcontainer.json +++ b/.devcontainer/cuda12.9-pip/devcontainer.json @@ -11,7 +11,9 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-pip", + "--ulimit", + "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 7a3dada523..8db71539ab 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -144,13 +144,20 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.10 with: - 
arch: '["amd64"]' + arch: '["amd64", "arm64"]' cuda: '["12.9"]' + node_type: "cpu8" + rapids-aux-secret-1: GIST_REPO_READ_ORG_GITHUB_TOKEN + env: | + SCCACHE_DIST_MAX_RETRIES=inf + SCCACHE_SERVER_LOG=sccache=debug + SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=false + SCCACHE_DIST_AUTH_TOKEN_VAR=RAPIDS_AUX_SECRET_1 build_command: | - sccache -z; - build-all --verbose; + sccache --zero-stats; + build-all --verbose 2>&1 | tee telemetry-artifacts/build.log; python -c "import kvikio; print(kvikio.__version__)"; - sccache -s; + sccache --show-adv-stats | tee telemetry-artifacts/sccache-stats.txt; wheel-cpp-build: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 diff --git a/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake b/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake index 0901c1e349..fdaeeef407 100644 --- a/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake +++ b/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake @@ -21,7 +21,7 @@ function(find_and_configure_nvcomp) set(export_args) if(KvikIO_EXPORT_NVCOMP) # We're vendoring nvcomp and we only want `libnvcomp.so.4` - set(export_args BUILD_EXPORT_SET nvcomp) + set(export_args BUILD_EXPORT_SET nvcomp INSTALL_EXPORT_SET nvcomp) endif() rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ${KVIKIO_USE_PROPRIETARY_BINARY}) From 927b5e36f079f938262773901166515858066b65 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 19 Aug 2025 12:35:13 -0500 Subject: [PATCH 055/103] Remove more nvcomp packaging for conda (#804) Follow-up to #798 and #801. After libcudf wheels vendor libnvcomp, we can finalize removal of nvcomp from kvikio. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/kvikio/pull/804 --- conda/environments/all_cuda-129_arch-aarch64.yaml | 1 - conda/environments/all_cuda-129_arch-x86_64.yaml | 1 - conda/recipes/kvikio/conda_build_config.yaml | 3 --- conda/recipes/kvikio/recipe.yaml | 1 - dependencies.yaml | 10 ---------- python/kvikio/pyproject.toml | 3 --- 6 files changed, 19 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index f72a97faf3..0a9f52ca87 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -19,7 +19,6 @@ dependencies: - libcufile-dev - libcurl>=8.5.0,<9.0a0 - libnuma -- libnvcomp-dev==4.2.0.11 - moto>=4.0.8 - ninja - numcodecs !=0.12.0 diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 890bfc29d7..dd4b21ea32 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -19,7 +19,6 @@ dependencies: - libcufile-dev - libcurl>=8.5.0,<9.0a0 - libnuma -- libnvcomp-dev==4.2.0.11 - moto>=4.0.8 - ninja - numcodecs !=0.12.0 diff --git a/conda/recipes/kvikio/conda_build_config.yaml b/conda/recipes/kvikio/conda_build_config.yaml index 2694d62eac..f5f37a39a2 100644 --- a/conda/recipes/kvikio/conda_build_config.yaml +++ b/conda/recipes/kvikio/conda_build_config.yaml @@ -18,6 +18,3 @@ c_stdlib_version: libcurl_version: - "==8.5.0" - -nvcomp_version: - - "=4.2.0.11" diff --git a/conda/recipes/kvikio/recipe.yaml b/conda/recipes/kvikio/recipe.yaml index 2e9e411d9e..7ec50ec153 100644 --- a/conda/recipes/kvikio/recipe.yaml +++ b/conda/recipes/kvikio/recipe.yaml @@ -66,7 +66,6 @@ requirements: - cython >=3.0.0 - libcurl ${{ 
libcurl_version }} - libkvikio =${{ version }} - - libnvcomp-dev ${{ nvcomp_version }} - pip - python =${{ py_version }} - rapids-build-backend >=0.4.0,<0.5.0.dev0 diff --git a/dependencies.yaml b/dependencies.yaml index 92fdec6bd5..13b4f3c54d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -13,7 +13,6 @@ files: - cuda - cuda_version - depends_on_cupy - - depends_on_libnvcomp - docs - py_version - rapids_build_skbuild @@ -66,10 +65,6 @@ files: table: project includes: - depends_on_cupy - # TODO: restore runtime dependency when we no longer vendor nvcomp - # (when nvcomp ships C++ wheels) - # https://github.com/rapidsai/build-planning/issues/171 - # - depends_on_libnvcomp - depends_on_libkvikio - run py_rapids_build_libkvikio: @@ -220,11 +215,6 @@ dependencies: - matrix: # All CUDA 12 versions packages: - cupy-cuda12x>=12.0.0 - depends_on_libnvcomp: - common: - - output_types: conda - packages: - - libnvcomp-dev==4.2.0.11 depends_on_libkvikio: common: - output_types: conda diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index 234738fe5b..1d3191cc67 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -111,9 +111,6 @@ skip = [ [tool.mypy] ignore_missing_imports = true -[project.entry-points."numcodecs.codecs"] -nvcomp_batch = "kvikio.nvcomp_codec:NvCompBatchCodec" - [tool.rapids-build-backend] build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" From 8820d52067dd6fe4367278f6ec870954fb53cc50 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 19 Aug 2025 16:35:39 -0500 Subject: [PATCH 056/103] Optionally require zarr>=3.0.0 (#802) This makes zarr an optional dependency of kvikio. The `pyproject.toml` now includes an optional dependency group 'zarr' that requires zarr>=3.0.0. `zarr` is no longer present as a (required) dependency in the conda recipes. 
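The practical effect is that importing `kvikio.zarr` without a compatible zarr now fails with a clear error at import time, instead of zarr being pulled in unconditionally at install time. A minimal sketch of the new behavior (the error text matches the guard added to `kvikio/zarr/__init__.py` in this change):

```python
# Sketch: with the optional 'zarr' dependency missing, importing kvikio.zarr
# raises ImportError rather than failing in some less obvious way later.
try:
    import kvikio.zarr  # noqa: F401
except ImportError as err:
    print(err)  # "kvikio.zarr requires the optional 'zarr>=3' dependency"
```

Installing the new extra, e.g. `pip install "kvikio-cu12[zarr]"` (the suffixed package name is illustrative), pulls in `zarr>=3.0.0,<4.0.0` and makes the import succeed.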
Authors: - Tom Augspurger (https://github.com/TomAugspurger) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/802 --- .../all_cuda-129_arch-aarch64.yaml | 3 +- .../all_cuda-129_arch-x86_64.yaml | 3 +- conda/recipes/kvikio/recipe.yaml | 3 -- dependencies.yaml | 27 ++++++++++++-- docs/source/zarr.rst | 12 +++---- python/kvikio/kvikio/benchmarks/zarr_io.py | 3 -- python/kvikio/kvikio/zarr/__init__.py | 10 +++--- python/kvikio/kvikio/zarr/_zarr_python_3.py | 30 +++++++++------- python/kvikio/pyproject.toml | 6 ++-- python/kvikio/tests/test_zarr_missing.py | 36 +++++++++++++++++++ python/kvikio/tests/test_zarr_v3.py | 5 +-- 11 files changed, 97 insertions(+), 41 deletions(-) create mode 100644 python/kvikio/tests/test_zarr_missing.py diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 0a9f52ca87..cd8fcff024 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -21,7 +21,6 @@ dependencies: - libnuma - moto>=4.0.8 - ninja -- numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 - numpydoc - packaging @@ -39,5 +38,5 @@ dependencies: - sphinx-click - sphinx_rtd_theme - sysroot_linux-aarch64=2.28 -- zarr>=2.0.0,<4.0.0 +- zarr>=3.0.0,<4.0.0 name: all_cuda-129_arch-aarch64 diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index dd4b21ea32..3c18c5589c 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -21,7 +21,6 @@ dependencies: - libnuma - moto>=4.0.8 - ninja -- numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 - numpydoc - packaging @@ -39,5 +38,5 @@ dependencies: - sphinx-click - sphinx_rtd_theme - sysroot_linux-64=2.28 -- zarr>=2.0.0,<4.0.0 +- zarr>=3.0.0,<4.0.0 name: all_cuda-129_arch-x86_64 diff --git a/conda/recipes/kvikio/recipe.yaml b/conda/recipes/kvikio/recipe.yaml index 7ec50ec153..f4d8f07143 100644 --- a/conda/recipes/kvikio/recipe.yaml +++ b/conda/recipes/kvikio/recipe.yaml @@ -75,12 +75,9 @@ requirements: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - cupy >=12.0.0 - libkvikio =${{ version }} - # See https://github.com/zarr-developers/numcodecs/pull/475 - - numcodecs !=0.12.0 - numpy >=1.23,<3.0a0 - packaging - python - - zarr >=2.0.0,<4.0.0a0 - cuda-cudart ignore_run_exports: by_name: diff --git a/dependencies.yaml b/dependencies.yaml index 13b4f3c54d..5d70bdba18 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -86,6 +86,14 @@ files: - build-cpp - build-py-wrapper - build-use-libkvikio-wheel + py_optional_zarr: + output: pyproject + pyproject_dir: python/kvikio + extras: + table: project.optional-dependencies + key: zarr + includes: + - zarr py_optional_test: output: pyproject pyproject_dir: python/kvikio @@ -94,6 +102,7 @@ files: key: test includes: - test_python + - zarr test_java: output: none includes: @@ -240,6 +249,7 @@ dependencies: - output_types: [conda, requirements] packages: - numpydoc + - zarr>=3.0.0,<4.0.0 - sphinx - sphinx-click - sphinx_rtd_theme @@ -285,10 +295,12 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - numpy>=1.23,<3.0a0 - - zarr>=2.0.0,<4.0.0 - # See https://github.com/zarr-developers/numcodecs/pull/475 - - numcodecs !=0.12.0 - packaging + zarr: + common: + - output_types: [requirements, pyproject] + packages: + - "zarr>=3.0.0,<4.0.0; python_version >= '3.11'" 
test_libkvikio: common: - output_types: conda @@ -324,6 +336,15 @@ dependencies: packages: - moto>=4.0.8 specific: + - output_types: [conda] + matrices: + # zarr 3 is not supported on Python 3.10 + - matrix: + py: "3.1[123]" + packages: + - zarr>=3.0.0,<4.0.0 + - matrix: + packages: - output_types: [conda, requirements, pyproject] matrices: - matrix: # All CUDA 12 versions diff --git a/docs/source/zarr.rst b/docs/source/zarr.rst index f8e4564011..baa48fa3e2 100644 --- a/docs/source/zarr.rst +++ b/docs/source/zarr.rst @@ -3,18 +3,16 @@ Zarr `Zarr `_ is a binary file format for chunked, compressed, N-Dimensional array. It is used throughout the PyData ecosystem and especially for climate and biological science applications. - `Zarr-Python `_ is the official Python package for reading and writing Zarr arrays. Its main feature is a NumPy-like array that translates array operations into file IO seamlessly. KvikIO provides a GPU backend to Zarr-Python that enables `GPUDirect Storage (GDS) `_ seamlessly. -KvikIO supports either zarr-python 2.x or zarr-python 3.x. -However, the API provided in :mod:`kvikio.zarr` differs based on which version of zarr you have, following the differences between zarr-python 2.x and zarr-python 3.x. - +If the optional zarr-python dependency is installed, then ``kvikio.zarr`` will be available. +KvikIO supports zarr-python 3.x. -Zarr Python 3.x ---------------- +Usage +----- -Zarr-python includes native support for reading Zarr chunks into device memory if you `configure Zarr `__ to use GPUs. +Zarr-Python includes native support for reading Zarr chunks into device memory if you `configure Zarr `__ to use GPUs. You can use any store, but KvikIO provides :py:class:`kvikio.zarr.GDSStore` to efficiently load data directly into GPU memory. .. code-block:: python diff --git a/python/kvikio/kvikio/benchmarks/zarr_io.py b/python/kvikio/kvikio/benchmarks/zarr_io.py index c4d51a6d5f..14c9fb38d3 100644 --- a/python/kvikio/kvikio/benchmarks/zarr_io.py +++ b/python/kvikio/kvikio/benchmarks/zarr_io.py @@ -23,9 +23,6 @@ import kvikio.zarr from kvikio.benchmarks.utils import drop_vm_cache, parse_directory, pprint_sys_info -if not kvikio.zarr.supported: - raise RuntimeError(f"requires Zarr >={kvikio.zarr.MINIMUM_ZARR_VERSION}") - def create_src_data(args): return cupy.random.random(args.nelem, dtype=args.dtype) diff --git a/python/kvikio/kvikio/zarr/__init__.py b/python/kvikio/kvikio/zarr/__init__.py index 758670ea21..ca6cfde7e9 100644 --- a/python/kvikio/kvikio/zarr/__init__.py +++ b/python/kvikio/kvikio/zarr/__init__.py @@ -1,8 +1,8 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
-from importlib import metadata as _metadata +try: + from ._zarr_python_3 import GDSStore +except ImportError as e: + raise ImportError("kvikio.zarr requires the optional 'zarr>=3' dependency") from e -from packaging.version import Version as _Version, parse as _parse - -if _parse(_metadata.version("zarr")) >= _Version("3.0.0"): - from ._zarr_python_3 import * # noqa: F401,F403 +__all__ = ["GDSStore"] diff --git a/python/kvikio/kvikio/zarr/_zarr_python_3.py b/python/kvikio/kvikio/zarr/_zarr_python_3.py index 2f21be360a..b12ff93125 100644 --- a/python/kvikio/kvikio/zarr/_zarr_python_3.py +++ b/python/kvikio/kvikio/zarr/_zarr_python_3.py @@ -6,32 +6,38 @@ import os from pathlib import Path -import packaging -import zarr.storage -from packaging.version import parse -from zarr.abc.store import ( +import packaging.version +import zarr + +_zarr_version = packaging.version.parse(zarr.__version__) + +if _zarr_version < packaging.version.parse("3.0.0"): + # We include this runtime package checking to help users who relied on + # installing kvikio to also include zarr, which is not an optional dependency. + raise ImportError( + f"'zarr>=3' is required, but 'zarr=={_zarr_version}' is installed." + ) + +import zarr.storage # noqa: E402 +from zarr.abc.store import ( # noqa: E402 ByteRequest, OffsetByteRequest, RangeByteRequest, SuffixByteRequest, ) -from zarr.core.buffer import Buffer, BufferPrototype -from zarr.core.buffer.core import default_buffer_prototype +from zarr.core.buffer import Buffer, BufferPrototype # noqa: E402 +from zarr.core.buffer.core import default_buffer_prototype # noqa: E402 -import kvikio +import kvikio # noqa: E402 # The GDSStore implementation follows the `LocalStore` implementation # at https://github.com/zarr-developers/zarr-python/blob/main/src/zarr/storage/_local.py # with differences coming swapping in `cuFile` for the stdlib open file object. -MINIMUM_ZARR_VERSION = "3" - -supported = parse(zarr.__version__) >= parse(MINIMUM_ZARR_VERSION) - @functools.cache def _is_ge_zarr_3_0_7(): - return packaging.version.parse(zarr.__version__) >= packaging.version.parse("3.0.7") + return _zarr_version >= packaging.version.parse("3.0.7") def _get( diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index 1d3191cc67..51ff2419de 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -21,10 +21,8 @@ requires-python = ">=3.10" dependencies = [ "cupy-cuda12x>=12.0.0", "libkvikio==25.10.*,>=0.0.0a0", - "numcodecs !=0.12.0", "numpy>=1.23,<3.0a0", "packaging", - "zarr>=2.0.0,<4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -49,6 +47,10 @@ test = [ "pytest-timeout", "rangehttpserver", "rapids-dask-dependency==25.10.*,>=0.0.0a0", + "zarr>=3.0.0,<4.0.0; python_version >= '3.11'", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +zarr = [ + "zarr>=3.0.0,<4.0.0; python_version >= '3.11'", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
[project.urls] diff --git a/python/kvikio/tests/test_zarr_missing.py b/python/kvikio/tests/test_zarr_missing.py new file mode 100644 index 0000000000..ef25b41dab --- /dev/null +++ b/python/kvikio/tests/test_zarr_missing.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +import sys + +import pytest + + +def test_zarr_missing_raises(monkeypatch): + modules = list(sys.modules) + for module in modules: + pkg = module.split(".")[0] + if pkg == "kvikio": + # remove from the import cache + monkeypatch.delitem(sys.modules, module, raising=False) + elif pkg == "zarr": + # force an ImportError + monkeypatch.setitem(sys.modules, module, None) + + with pytest.raises(ImportError): + import kvikio.zarr # noqa: F401 + + +def test_zarr_2_installed_raises(monkeypatch): + modules = list(sys.modules) + zarr = pytest.importorskip("zarr") + monkeypatch.setattr(zarr, "__version__", "2.0.0") + + for module in modules: + pkg = module.split(".")[0] + if pkg == "kvikio": + # remove from the import cache + monkeypatch.delitem(sys.modules, module, raising=False) + + with pytest.raises(ImportError): + import kvikio.zarr # noqa: F401 diff --git a/python/kvikio/tests/test_zarr_v3.py b/python/kvikio/tests/test_zarr_v3.py index e44de36d22..59331d5513 100644 --- a/python/kvikio/tests/test_zarr_v3.py +++ b/python/kvikio/tests/test_zarr_v3.py @@ -6,16 +6,17 @@ import cupy as cp import pytest -import kvikio.zarr - pytest.importorskip("zarr", minversion="3.0.0") +# these must follow the pytest.importorskip import zarr.core.buffer # noqa: E402 import zarr.storage # noqa: E402 from zarr.core.buffer.gpu import Buffer # noqa: E402 from zarr.testing.store import StoreTests # noqa: E402 +import kvikio.zarr # noqa: E402 + @pytest.mark.asyncio async def test_basic(tmp_path: pathlib.Path) -> None: From e43b3d196f2e1b3b802499b08d6e62c07fa3c126 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 19 Aug 2025 17:11:26 -0700 Subject: [PATCH 057/103] Upgrade to nvCOMP 5.0.0.6 (#800) Upgrade the nvCOMP dependency to 5.0.0.6. This library is not used directly, but it's still vendored and used in libcudf wheels. Future changes will completely remove the dependency in KvikIO.
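Since the change is just a soname bump from the consumer's point of view, a built package can be sanity-checked by loading the new major version directly. A minimal sketch, assuming the vendored library is resolvable on the loader path (as `load_library()` below arranges):

```python
# Sketch: the vendored soname moves from libnvcomp.so.4 to libnvcomp.so.5;
# ctypes.CDLL raises OSError if the new soname cannot be resolved.
import ctypes

ctypes.CDLL("libnvcomp.so.5")
```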
Depends on https://github.com/rapidsai/rapids-cmake/pull/896 Authors: - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/800 --- ci/build_wheel_python.sh | 2 +- python/libkvikio/cmake/thirdparty/get_nvcomp.cmake | 2 +- python/libkvikio/libkvikio/load.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/build_wheel_python.sh b/ci/build_wheel_python.sh index 55ee0c05a0..ed2df80542 100755 --- a/ci/build_wheel_python.sh +++ b/ci/build_wheel_python.sh @@ -22,7 +22,7 @@ export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" python -m auditwheel repair \ --exclude libkvikio.so \ - --exclude libnvcomp.so.4 \ + --exclude libnvcomp.so.5 \ -w "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" \ ${package_dir}/dist/* diff --git a/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake b/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake index fdaeeef407..9d62dfda38 100644 --- a/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake +++ b/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake @@ -20,7 +20,7 @@ function(find_and_configure_nvcomp) include(${rapids-cmake-dir}/cpm/nvcomp.cmake) set(export_args) if(KvikIO_EXPORT_NVCOMP) - # We're vendoring nvcomp and we only want `libnvcomp.so.4` + # We're vendoring nvcomp and we only want `libnvcomp.so.5` set(export_args BUILD_EXPORT_SET nvcomp INSTALL_EXPORT_SET nvcomp) endif() diff --git a/python/libkvikio/libkvikio/load.py b/python/libkvikio/libkvikio/load.py index 2e573a0889..7a432d95f8 100644 --- a/python/libkvikio/libkvikio/load.py +++ b/python/libkvikio/libkvikio/load.py @@ -47,7 +47,7 @@ def load_library(): # TODO: remove this nvcomp load when `nvcomp` is vendored into cudf instead. # Currently this load only exists to ensure that libcudf wheels are not broken by # prematurely removing the load - _load_library("libnvcomp.so.4") + _load_library("libnvcomp.so.5") return _load_library("libkvikio.so") From c7740795e5904289b721a7ad5d536a9a59654e19 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 20 Aug 2025 16:02:50 -0500 Subject: [PATCH 058/103] Build and test with CUDA 13.0.0 (#803) Contributes to https://github.com/rapidsai/build-planning/issues/208 * uses CUDA 13.0.0 to build and test Contributes to https://github.com/rapidsai/build-planning/issues/68 * updates to CUDA 13 dependencies in fallback entries in `dependencies.yaml` matrices (i.e., the ones that get written to `pyproject.toml` in source control) ## Notes for Reviewers This switches GitHub Actions workflows to the `cuda13.0` branch from here: https://github.com/rapidsai/shared-workflows/pull/413 A future round of PRs will revert that back to `branch-25.10`, once all of RAPIDS supports CUDA 13. 
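Besides the CI changes, the C++ sources gain compile-time guards such as `CUDA_VERSION < 13000` around `cuMemcpyBatchAsync`, whose parameter list changed in CUDA 13. For reference, a runtime analogue of that check from Python, sketched under the assumption that the installed cuda-python exposes the `cuda.bindings` namespace:

```python
# Sketch: query the driver-reported CUDA version, the runtime counterpart of
# the CUDA_VERSION >= 13000 compile-time guards used in this patch.
from cuda.bindings import driver

err, version = driver.cuDriverGetVersion()
assert err == driver.CUresult.CUDA_SUCCESS
print(f"driver supports up to CUDA {version // 1000}.{(version % 1000) // 10}")
```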
Authors: - James Lamb (https://github.com/jameslamb) - Bradley Dice (https://github.com/bdice) - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/803 --- .../cuda13.0-conda/devcontainer.json | 44 ++++++++++++ .devcontainer/cuda13.0-pip/devcontainer.json | 43 ++++++++++++ .github/workflows/build.yaml | 16 ++--- .github/workflows/pr.yaml | 28 ++++---- .github/workflows/test.yaml | 6 +- .../trigger-breaking-change-alert.yaml | 2 +- .../all_cuda-129_arch-aarch64.yaml | 4 +- .../all_cuda-129_arch-x86_64.yaml | 4 +- .../all_cuda-130_arch-aarch64.yaml | 42 ++++++++++++ .../all_cuda-130_arch-x86_64.yaml | 42 ++++++++++++ conda/recipes/kvikio/recipe.yaml | 2 +- cpp/doxygen/main_page.md | 9 +++ cpp/src/mmap.cpp | 2 + cpp/src/shim/cuda.cpp | 2 +- dependencies.yaml | 67 ++++++++++++------- docs/source/install.rst | 18 +++-- python/kvikio/pyproject.toml | 4 +- 17 files changed, 271 insertions(+), 64 deletions(-) create mode 100644 .devcontainer/cuda13.0-conda/devcontainer.json create mode 100644 .devcontainer/cuda13.0-pip/devcontainer.json create mode 100644 conda/environments/all_cuda-130_arch-aarch64.yaml create mode 100644 conda/environments/all_cuda-130_arch-x86_64.yaml diff --git a/.devcontainer/cuda13.0-conda/devcontainer.json b/.devcontainer/cuda13.0-conda/devcontainer.json new file mode 100644 index 0000000000..21e7bf83c4 --- /dev/null +++ b/.devcontainer/cuda13.0-conda/devcontainer.json @@ -0,0 +1,44 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "13.0", + "PYTHON_PACKAGE_MANAGER": "conda", + "BASE": "rapidsai/devcontainers:25.10-cpp-mambaforge" + } + }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda13.0-conda", + "--ulimit", + "nofile=500000" + ], + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda13.0-envs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/kvikio,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda13.0-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.devcontainer/cuda13.0-pip/devcontainer.json b/.devcontainer/cuda13.0-pip/devcontainer.json new file mode 100644 index 0000000000..3c035eee05 --- /dev/null +++ b/.devcontainer/cuda13.0-pip/devcontainer.json @@ -0,0 +1,43 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "13.0", + "PYTHON_PACKAGE_MANAGER": "pip", + "BASE": "rapidsai/devcontainers:25.10-cpp-cuda13.0" + } + }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda13.0-pip", + "--ulimit", + "nofile=500000" + ], + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda13.0-venvs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/kvikio,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda13.0-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 0070274f37..e119ba8ac1 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -34,7 +34,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -44,7 +44,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -54,7 +54,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -76,7 +76,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cpp: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: ${{ inputs.build_type || 'branch' }} @@ -89,7 +89,7 @@ jobs: wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -101,7 +101,7 @@ jobs: wheel-publish-cpp: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -112,7 +112,7 @@ jobs: wheel-publish-python: needs: wheel-build-python secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 + 
uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 8db71539ab..4afc5ca548 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -27,7 +27,7 @@ jobs: - wheel-python-tests - telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda13.0 if: always() with: needs: ${{ toJSON(needs) }} @@ -43,7 +43,7 @@ jobs: repo: kvikio changed-files: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@cuda13.0 with: files_yaml: | test_cpp: @@ -86,20 +86,20 @@ jobs: checks: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda13.0 with: ignored_pr_jobs: telemetry-summarize conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda13.0 with: build_type: pull-request script: ci/build_cpp.sh conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda13.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request @@ -107,7 +107,7 @@ jobs: conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -117,14 +117,14 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda13.0 with: build_type: pull-request script: ci/build_python.sh conda-python-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda13.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -132,7 +132,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -142,10 +142,10 @@ jobs: devcontainer: needs: telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@cuda13.0 with: arch: '["amd64", "arm64"]' - cuda: '["12.9"]' + cuda: '["13.0"]' node_type: "cpu8" rapids-aux-secret-1: GIST_REPO_READ_ORG_GITHUB_TOKEN env: | @@ -160,7 +160,7 @@ jobs: sccache --show-adv-stats | tee telemetry-artifacts/sccache-stats.txt; 
wheel-cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: pull-request @@ -170,7 +170,7 @@ jobs: wheel-python-build: needs: wheel-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 with: build_type: pull-request script: ci/build_wheel_python.sh @@ -179,7 +179,7 @@ jobs: wheel-python-tests: needs: [wheel-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda13.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 5f618498b3..4c4ff60385 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -25,7 +25,7 @@ on: jobs: cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda13.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda13.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -43,7 +43,7 @@ jobs: sha: ${{ inputs.sha }} conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 48bf37afc4..72751d071b 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@cuda13.0 with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index cd8fcff024..279b130e55 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -9,9 +9,9 @@ dependencies: - c-compiler - cmake>=3.30.4 - cuda-nvcc -- cuda-python>=12.6.2,<13.0a0 +- cuda-python>=12.9.2,<13.0a0 - cuda-version=12.9 -- cupy>=12.0.0 +- cupy>=13.6.0 - cxx-compiler - cython>=3.0.0 - doxygen=1.9.1 diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 3c18c5589c..5460d73ab7 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ 
b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -9,9 +9,9 @@ dependencies: - c-compiler - cmake>=3.30.4 - cuda-nvcc -- cuda-python>=12.6.2,<13.0a0 +- cuda-python>=12.9.2,<13.0a0 - cuda-version=12.9 -- cupy>=12.0.0 +- cupy>=13.6.0 - cxx-compiler - cython>=3.0.0 - doxygen=1.9.1 diff --git a/conda/environments/all_cuda-130_arch-aarch64.yaml b/conda/environments/all_cuda-130_arch-aarch64.yaml new file mode 100644 index 0000000000..1b752cbe8e --- /dev/null +++ b/conda/environments/all_cuda-130_arch-aarch64.yaml @@ -0,0 +1,42 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +channels: +- rapidsai +- rapidsai-nightly +- conda-forge +dependencies: +- boto3>=1.21.21 +- c-compiler +- cmake>=3.30.4 +- cuda-nvcc +- cuda-python>=13.0.1,<14.0a0 +- cuda-version=13.0 +- cupy>=13.6.0 +- cxx-compiler +- cython>=3.0.0 +- doxygen=1.9.1 +- gcc_linux-aarch64=14.* +- libcufile-dev +- libcurl>=8.5.0,<9.0a0 +- libnuma +- moto>=4.0.8 +- ninja +- numpy>=1.23,<3.0a0 +- numpydoc +- packaging +- pre-commit +- pytest +- pytest-asyncio +- pytest-cov +- pytest-timeout +- python>=3.10,<3.14 +- rangehttpserver +- rapids-build-backend>=0.4.0,<0.5.0.dev0 +- rapids-dask-dependency==25.10.*,>=0.0.0a0 +- scikit-build-core>=0.10.0 +- sphinx +- sphinx-click +- sphinx_rtd_theme +- sysroot_linux-aarch64=2.28 +- zarr>=3.0.0,<4.0.0 +name: all_cuda-130_arch-aarch64 diff --git a/conda/environments/all_cuda-130_arch-x86_64.yaml b/conda/environments/all_cuda-130_arch-x86_64.yaml new file mode 100644 index 0000000000..812bfdad5e --- /dev/null +++ b/conda/environments/all_cuda-130_arch-x86_64.yaml @@ -0,0 +1,42 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +channels: +- rapidsai +- rapidsai-nightly +- conda-forge +dependencies: +- boto3>=1.21.21 +- c-compiler +- cmake>=3.30.4 +- cuda-nvcc +- cuda-python>=13.0.1,<14.0a0 +- cuda-version=13.0 +- cupy>=13.6.0 +- cxx-compiler +- cython>=3.0.0 +- doxygen=1.9.1 +- gcc_linux-64=14.* +- libcufile-dev +- libcurl>=8.5.0,<9.0a0 +- libnuma +- moto>=4.0.8 +- ninja +- numpy>=1.23,<3.0a0 +- numpydoc +- packaging +- pre-commit +- pytest +- pytest-asyncio +- pytest-cov +- pytest-timeout +- python>=3.10,<3.14 +- rangehttpserver +- rapids-build-backend>=0.4.0,<0.5.0.dev0 +- rapids-dask-dependency==25.10.*,>=0.0.0a0 +- scikit-build-core>=0.10.0 +- sphinx +- sphinx-click +- sphinx_rtd_theme +- sysroot_linux-64=2.28 +- zarr>=3.0.0,<4.0.0 +name: all_cuda-130_arch-x86_64 diff --git a/conda/recipes/kvikio/recipe.yaml b/conda/recipes/kvikio/recipe.yaml index f4d8f07143..c4c52b540c 100644 --- a/conda/recipes/kvikio/recipe.yaml +++ b/conda/recipes/kvikio/recipe.yaml @@ -73,7 +73,7 @@ requirements: - cuda-cudart-dev run: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - - cupy >=12.0.0 + - cupy >=13.6.0 - libkvikio =${{ version }} - numpy >=1.23,<3.0a0 - packaging diff --git a/cpp/doxygen/main_page.md b/cpp/doxygen/main_page.md index 8d9d0e9320..cd9fe05557 100644 --- a/cpp/doxygen/main_page.md +++ b/cpp/doxygen/main_page.md @@ -30,9 +30,14 @@ For convenience we release Conda packages that makes it easy to include KvikIO i We strongly recommend using [mamba](https://github.com/mamba-org/mamba) in place of conda, which we will do throughout the documentation. 
Install the **stable release** from the ``rapidsai`` channel with the following: + ```sh # Install in existing environment mamba install -c rapidsai -c conda-forge libkvikio + +# Create new environment (CUDA 13) +mamba create -n libkvikio-env -c rapidsai -c conda-forge cuda-version=13.0 libkvikio + # Create new environment (CUDA 12) mamba create -n libkvikio-env -c rapidsai -c conda-forge cuda-version=12.9 libkvikio ``` @@ -42,6 +47,10 @@ Install the **nightly release** from the ``rapidsai-nightly`` channel with the f ```sh # Install in existing environment mamba install -c rapidsai-nightly -c conda-forge libkvikio + +# Create new environment (CUDA 13) +mamba create -n libkvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=13.0 libkvikio + # Create new environment (CUDA 12) mamba create -n libkvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=12.9 libkvikio ``` diff --git a/cpp/src/mmap.cpp b/cpp/src/mmap.cpp index 968b96aa42..ee41a55d3d 100644 --- a/cpp/src/mmap.cpp +++ b/cpp/src/mmap.cpp @@ -218,7 +218,9 @@ void read_impl(void* dst_buf, &attrs, attrs_idxs, static_cast(1) /* num_attrs */, +#if CUDA_VERSION < 13000 static_cast(nullptr), +#endif stream)); } else { // Fall back to the conventional H2D copy if the batch copy API is not available. diff --git a/cpp/src/shim/cuda.cpp b/cpp/src/shim/cuda.cpp index 8053ea152a..ae69a495aa 100644 --- a/cpp/src/shim/cuda.cpp +++ b/cpp/src/shim/cuda.cpp @@ -56,7 +56,7 @@ cudaAPI::cudaAPI() CUDA_DRIVER_TRY(DriverGetVersion(&driver_version)); #if CUDA_VERSION >= 12080 - // cuMemcpyBatchAsync was introduced in CUDA 12.8. + // cuMemcpyBatchAsync was introduced in CUDA 12.8, and its parameters were changed in CUDA 13.0. try { decltype(cuMemcpyBatchAsync)* fp; get_symbol(fp, lib, KVIKIO_STRINGIFY(cuMemcpyBatchAsync)); diff --git a/dependencies.yaml b/dependencies.yaml index 5d70bdba18..ce1b6f1b8d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -3,7 +3,7 @@ files: all: output: conda matrix: - cuda: ["12.9"] + cuda: ["12.9", "13.0"] arch: [aarch64, x86_64] includes: - build-universal @@ -128,6 +128,7 @@ dependencies: - output_types: conda packages: - c-compiler + - cuda-nvcc - cxx-compiler - libcurl>=8.5.0,<9.0a0 specific: @@ -135,22 +136,14 @@ dependencies: matrices: - matrix: arch: x86_64 - cuda: "12.*" packages: - gcc_linux-64=14.* - sysroot_linux-64=2.28 - matrix: arch: aarch64 - cuda: "12.*" packages: - gcc_linux-aarch64=14.* - sysroot_linux-aarch64=2.28 - - output_types: conda - matrices: - - matrix: - cuda: "12.*" - packages: - - cuda-nvcc build-use-libkvikio-wheel: common: - output_types: conda @@ -164,6 +157,11 @@ dependencies: cuda_suffixed: "true" packages: - libkvikio-cu12==25.10.*,>=0.0.0a0 + - matrix: + cuda: "13.*" + cuda_suffixed: "true" + packages: + - libkvikio-cu13==25.10.*,>=0.0.0a0 - {matrix: null, packages: *libkvikio_packages} build-py-wrapper: common: @@ -199,31 +197,35 @@ dependencies: cuda: "12.9" packages: - cuda-version=12.9 - cuda: - specific: - - output_types: conda - matrices: - matrix: - cuda: "12.*" + cuda: "13.0" packages: + - cuda-version=13.0 + cuda: + common: - output_types: conda - matrices: - - matrix: - cuda: "12.*" - packages: - - libcufile-dev - - libnuma + packages: + - libcufile-dev + - libnuma depends_on_cupy: common: - output_types: conda packages: - - cupy>=12.0.0 + - &cupy_unsuffixed cupy>=13.6.0 specific: - output_types: [requirements, pyproject] matrices: - - matrix: # All CUDA 12 versions + - matrix: + cuda: "12.*" + packages: + - cupy-cuda12x>=13.6.0 + - 
matrix: + cuda: "13.*" + packages: + - &cupy_cu13 cupy-cuda13x>=13.6.0 + - matrix: packages: - - cupy-cuda12x>=12.0.0 + - *cupy_cu13 depends_on_libkvikio: common: - output_types: conda @@ -232,7 +234,7 @@ dependencies: - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file - # This index is needed for libkvikio-cu12. + # This index is needed for libkvikio-cu{12,13}. - --extra-index-url=https://pypi.nvidia.com - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: @@ -243,6 +245,11 @@ dependencies: cuda_suffixed: "true" packages: - libkvikio-cu12==25.10.*,>=0.0.0a0 + - matrix: + cuda: "13.*" + cuda_suffixed: "true" + packages: + - libkvikio-cu13==25.10.*,>=0.0.0a0 - {matrix: null, packages: [*libkvikio_unsuffixed]} docs: common: @@ -347,9 +354,17 @@ dependencies: packages: - output_types: [conda, requirements, pyproject] matrices: - - matrix: # All CUDA 12 versions + - matrix: + cuda: "12.*" + packages: + - cuda-python>=12.9.2,<13.0a0 + - matrix: + cuda: "13.*" + packages: + - &cuda_python_cu13 cuda-python>=13.0.1,<14.0a0 + - matrix: packages: - - cuda-python>=12.6.2,<13.0a0 + - *cuda_python_cu13 test_java: common: - output_types: conda diff --git a/docs/source/install.rst b/docs/source/install.rst index 5642b137af..91cc39af1f 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -15,6 +15,10 @@ Install the **stable release** from the ``rapidsai`` channel like: # Install in existing environment mamba install -c rapidsai -c conda-forge kvikio + + # Create new environment (CUDA 13) + mamba create -n kvikio-env -c rapidsai -c conda-forge python=3.13 cuda-version=13.0 kvikio + # Create new environment (CUDA 12) mamba create -n kvikio-env -c rapidsai -c conda-forge python=3.13 cuda-version=12.9 kvikio @@ -24,10 +28,13 @@ Install the **nightly release** from the ``rapidsai-nightly`` channel like: # Install in existing environment mamba install -c rapidsai-nightly -c conda-forge kvikio + + # Create new environment (CUDA 13) + mamba create -n kvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=13.0 kvikio + # Create new environment (CUDA 12) mamba create -n kvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=12.9 kvikio - .. note:: If the nightly install doesn't work, set ``channel_priority: flexible`` in your ``.condarc``. @@ -40,8 +47,11 @@ KvikIO is also available on PyPI. Install the latest release like: .. code-block:: - pip install kvikio-cu12 # for CUDA 12 + pip install kvikio-cu13 # for CUDA 13 + +.. code-block:: + pip install kvikio-cu12 # for CUDA 12 Build from source ----------------- @@ -50,8 +60,8 @@ In order to setup a development environment, we recommend Conda: .. 
code-block:: - # CUDA 12 - mamba env create --name kvikio-dev --file conda/environments/all_cuda-128_arch-x86_64.yaml + # CUDA 13 + mamba env create --name kvikio-dev --file conda/environments/all_cuda-130_arch-x86_64.yaml The Python library depends on the C++ library, thus we build and install both: diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index 51ff2419de..8f214c31d9 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ - "cupy-cuda12x>=12.0.0", + "cupy-cuda13x>=13.6.0", "libkvikio==25.10.*,>=0.0.0a0", "numpy>=1.23,<3.0a0", "packaging", @@ -39,7 +39,7 @@ classifiers = [ [project.optional-dependencies] test = [ "boto3>=1.21.21", - "cuda-python>=12.6.2,<13.0a0", + "cuda-python>=13.0.1,<14.0a0", "moto[server]>=4.0.8", "pytest", "pytest-asyncio", From 7841a2529e1d37f5526f18ea75a06586140d49be Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 22 Aug 2025 18:11:50 -0500 Subject: [PATCH 059/103] Devendor libnvcomp from libkvikio (#805) This fully devendors libnvcomp from libkvikio wheels. A complementary PR is needed to vendor libnvcomp.so.* inside of libcudf wheels: https://github.com/rapidsai/cudf/pull/19743 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Matthew Murray (https://github.com/Matt711) - Mike Sarahan (https://github.com/msarahan) URL: https://github.com/rapidsai/kvikio/pull/805 --- ci/build_wheel_cpp.sh | 1 - ci/build_wheel_python.sh | 2 - python/libkvikio/CMakeLists.txt | 8 --- .../cmake/Modules/WheelHelpers.cmake | 59 ------------------- .../cmake/thirdparty/get_nvcomp.cmake | 35 ----------- python/libkvikio/libkvikio/load.py | 4 -- python/libkvikio/pyproject.toml | 4 +- 7 files changed, 1 insertion(+), 112 deletions(-) delete mode 100644 python/libkvikio/cmake/Modules/WheelHelpers.cmake delete mode 100644 python/libkvikio/cmake/thirdparty/get_nvcomp.cmake diff --git a/ci/build_wheel_cpp.sh b/ci/build_wheel_cpp.sh index 8107f24160..61654c66b5 100755 --- a/ci/build_wheel_cpp.sh +++ b/ci/build_wheel_cpp.sh @@ -27,7 +27,6 @@ rapids-pip-retry install \ # 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735) export PIP_NO_BUILD_ISOLATION=0 -export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=OFF" ./ci/build_wheel.sh "${package_name}" "${package_dir}" python -m auditwheel repair \ diff --git a/ci/build_wheel_python.sh b/ci/build_wheel_python.sh index ed2df80542..312fedc0e7 100755 --- a/ci/build_wheel_python.sh +++ b/ci/build_wheel_python.sh @@ -17,12 +17,10 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" LIBKVIKIO_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="libkvikio_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-github cpp) echo "libkvikio-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo "${LIBKVIKIO_WHEELHOUSE}"/libkvikio_*.whl)" >> "${PIP_CONSTRAINT}" -export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" ./ci/build_wheel.sh "${package_name}" "${package_dir}" python -m auditwheel repair \ --exclude libkvikio.so \ - --exclude libnvcomp.so.5 \ -w "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" \ ${package_dir}/dist/* diff --git a/python/libkvikio/CMakeLists.txt b/python/libkvikio/CMakeLists.txt index 231bebbd84..fe0019bf58 100644 --- a/python/libkvikio/CMakeLists.txt +++ b/python/libkvikio/CMakeLists.txt @@ -40,14 +40,6 @@ set(KvikIO_BUILD_BENCHMARKS OFF) set(KvikIO_BUILD_EXAMPLES OFF) 
set(KvikIO_BUILD_TESTS OFF) -# vendor nvcomp but not the entire kvikio-export set because that's huge TODO: Move nvcomp vendoring -# to libcudf wheel instead -include(cmake/thirdparty/get_nvcomp.cmake) -include(cmake/Modules/WheelHelpers.cmake) -install_aliased_imported_targets( - TARGETS nvcomp::nvcomp DESTINATION ${SKBUILD_PLATLIB_DIR}/libkvikio/lib64/ -) - set(CUDA_STATIC_RUNTIME ON) add_subdirectory(../../cpp kvikio-cpp) diff --git a/python/libkvikio/cmake/Modules/WheelHelpers.cmake b/python/libkvikio/cmake/Modules/WheelHelpers.cmake deleted file mode 100644 index abdde95298..0000000000 --- a/python/libkvikio/cmake/Modules/WheelHelpers.cmake +++ /dev/null @@ -1,59 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= -include_guard(GLOBAL) - -# Making libraries available inside wheels by installing the associated targets. -function(install_aliased_imported_targets) - list(APPEND CMAKE_MESSAGE_CONTEXT "install_aliased_imported_targets") - - set(options "") - set(one_value "DESTINATION") - set(multi_value "TARGETS") - cmake_parse_arguments(_ "${options}" "${one_value}" "${multi_value}" ${ARGN}) - - message(VERBOSE "Installing targets '${__TARGETS}' into lib_dir '${__DESTINATION}'") - - foreach(target IN LISTS __TARGETS) - - if(NOT TARGET ${target}) - message(VERBOSE "No target named ${target}") - continue() - endif() - - get_target_property(alias_target ${target} ALIASED_TARGET) - if(alias_target) - set(target ${alias_target}) - endif() - - get_target_property(is_imported ${target} IMPORTED) - if(NOT is_imported) - # If the target isn't imported, install it into the wheel - install(TARGETS ${target} DESTINATION ${__DESTINATION}) - message(VERBOSE "install(TARGETS ${target} DESTINATION ${__DESTINATION})") - else() - # If the target is imported, make sure it's global - get_target_property(type ${target} TYPE) - if(${type} STREQUAL "UNKNOWN_LIBRARY") - install(FILES $ DESTINATION ${__DESTINATION}) - message(VERBOSE "install(FILES $ DESTINATION ${__DESTINATION})") - else() - install(IMPORTED_RUNTIME_ARTIFACTS ${target} DESTINATION ${__DESTINATION}) - message( - VERBOSE - "install(IMPORTED_RUNTIME_ARTIFACTS $ DESTINATION ${__DESTINATION})" - ) - endif() - endif() - endforeach() -endfunction() diff --git a/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake b/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake deleted file mode 100644 index 9d62dfda38..0000000000 --- a/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake +++ /dev/null @@ -1,35 +0,0 @@ -# ============================================================================= -# Copyright (c) 2021-2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(KVIKIO_USE_PROPRIETARY_BINARY ON) - -# This function finds nvcomp and sets any additional necessary environment variables. -function(find_and_configure_nvcomp) - - include(${rapids-cmake-dir}/cpm/nvcomp.cmake) - set(export_args) - if(KvikIO_EXPORT_NVCOMP) - # We're vendoring nvcomp and we only want `libnvcomp.so.5` - set(export_args BUILD_EXPORT_SET nvcomp INSTALL_EXPORT_SET nvcomp) - endif() - - rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ${KVIKIO_USE_PROPRIETARY_BINARY}) - - # Per-thread default stream - if(TARGET nvcomp AND PER_THREAD_DEFAULT_STREAM) - target_compile_definitions(nvcomp PRIVATE CUDA_API_PER_THREAD_DEFAULT_STREAM) - endif() -endfunction() - -find_and_configure_nvcomp() diff --git a/python/libkvikio/libkvikio/load.py b/python/libkvikio/libkvikio/load.py index 7a432d95f8..2fd25e642c 100644 --- a/python/libkvikio/libkvikio/load.py +++ b/python/libkvikio/libkvikio/load.py @@ -44,10 +44,6 @@ def _load_wheel_installation(soname: str): def load_library(): - # TODO: remove this nvcomp load when `nvcomp` is vendored into cudf instead. - # Currently this load only exists to ensure that libcudf wheels are not broken by - # prematurely removing the load - _load_library("libnvcomp.so.5") return _load_library("libkvikio.so") diff --git a/python/libkvikio/pyproject.toml b/python/libkvikio/pyproject.toml index 43b659aec8..3239d8c651 100644 --- a/python/libkvikio/pyproject.toml +++ b/python/libkvikio/pyproject.toml @@ -67,6 +67,4 @@ select = [ ] # PyPI limit is 100 MiB, fail CI before we get too close to that -# TODO: drop this to 75M after we re-de-vendor nvcomp -# https://github.com/rapidsai/build-planning/issues/171 -max_allowed_size_compressed = '90M' +max_allowed_size_compressed = '75M' From 46fa7dd127994df2f8b816eebea0aef6488c508a Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 25 Aug 2025 12:03:50 -0400 Subject: [PATCH 060/103] Add a unified remote I/O interface that infers the endpoint type from URL (1/2): C++ implementation (#793) This PR adds a new remote I/O utility function `RemoteHandle::open(url)` that infers the remote endpoint type from the URL to facilitate `RemoteHandle` creation. - Supported endpoint types include S3, S3 with presigned URL, WebHDFS, and generic HTTP/HTTPS. - Optionally, instead of letting `open` figure it out, users can explicitly specify the endpoint type by passing an enum argument `RemoteEndpointType`. - Optionally, users can provide an allowlist that restricts the endpoint candidates - Optionally, users can specify the expected file size. This design is to fully support the existing constructor overload `RemoteHandle(endpoint, nbytes)`. A byproduct of this PR is an internal utility class `UrlParser` that uses the idiomatic libcurl URL API to validate the URL against "[RFC 3986 plus](https://curl.se/docs/url-syntax.html)". ## This PR depends on - [x] #791 - [x] #788 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/793 --- cpp/CMakeLists.txt | 2 +- cpp/include/kvikio/detail/url.hpp | 199 +++++++++++++++++++++++ cpp/include/kvikio/hdfs.hpp | 8 + cpp/include/kvikio/remote_handle.hpp | 136 ++++++++++++++++ cpp/src/detail/url.cpp | 142 ++++++++++++++++ cpp/src/hdfs.cpp | 15 +- cpp/src/remote_handle.cpp | 197 +++++++++++++++++++++- cpp/tests/CMakeLists.txt | 1 + cpp/tests/test_defaults.cpp | 2 +- cpp/tests/test_remote_handle.cpp | 233 ++++++++++++++++++++++++++- cpp/tests/test_url.cpp | 64 ++++++++ 11 files changed, 991 insertions(+), 8 deletions(-) create mode 100644 cpp/include/kvikio/detail/url.hpp create mode 100644 cpp/src/detail/url.cpp create mode 100644 cpp/tests/test_url.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9fa8e1f798..6f9c249cfa 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -163,7 +163,7 @@ set(SOURCES if(KvikIO_REMOTE_SUPPORT) list(APPEND SOURCES "src/hdfs.cpp" "src/remote_handle.cpp" "src/detail/remote_handle.cpp" - "src/shim/libcurl.cpp" + "src/detail/url.cpp" "src/shim/libcurl.cpp" ) endif() diff --git a/cpp/include/kvikio/detail/url.hpp b/cpp/include/kvikio/detail/url.hpp new file mode 100644 index 0000000000..e57d2c4c94 --- /dev/null +++ b/cpp/include/kvikio/detail/url.hpp @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +namespace kvikio::detail { +/** + * @brief RAII wrapper for libcurl's URL handle (CURLU) + * + * This class provides automatic resource management for libcurl URL handles, + * ensuring proper cleanup when the handle goes out of scope. The class is + * move-only to prevent accidental sharing of the underlying resource. + */ +class CurlUrlHandle { + private: + CURLU* _handle{nullptr}; + + public: + /** + * @brief Create a new libcurl URL handle + * + * @exception std::runtime_error if libcurl cannot allocate the handle (usually due to out of + * memory) + */ + CurlUrlHandle(); + + /** + * @brief Clean up the underlying URL handle + */ + ~CurlUrlHandle() noexcept; + + CurlUrlHandle(CurlUrlHandle const&) = delete; + CurlUrlHandle& operator=(CurlUrlHandle const&) = delete; + + CurlUrlHandle(CurlUrlHandle&& other) noexcept; + CurlUrlHandle& operator=(CurlUrlHandle&& other) noexcept; + + /** + * @brief Get the underlying libcurl URL handle + * + * @return Pointer to the underlying libcurl URL handle + * @note The returned pointer should not be freed manually as it is managed by this class + */ + CURLU* get() const; +}; + +/** + * @brief URL parsing utility using libcurl's URL API + * + * This class provides static methods for parsing URLs into their constituent + * components (scheme, host, port, path, query, fragment). + * + * @note This class uses libcurl's URL parsing which follows RFC 3986 plus. 
See + * https://curl.se/docs/url-syntax.html + * + * Example: + * @code{.cpp} + * auto components = UrlParser::parse("https://example.com:8080/path?query=1#frag"); + * if (components.scheme.has_value()) { + * std::cout << "Scheme: " << components.scheme.value() << std::endl; + * } + * if (components.host.has_value()) { + * std::cout << "Host: " << components.host.value() << std::endl; + * } + * @endcode + */ +class UrlParser { + public: + /** + * @brief Container for parsed URL components + */ + struct UrlComponents { + /** + * @brief The URL scheme (e.g., "http", "https", "ftp"). May be empty for scheme-relative URLs + * or paths. + */ + std::optional scheme; + + /** + * @brief The hostname or IP address. May be empty for URLs without an authority component + * (e.g., "file:///path"). + */ + std::optional host; + + /** + * @brief The port number as a string. Will be empty if no explicit port is specified in the + * URL. + * @note Default ports (e.g., 80 for HTTP, 443 for HTTPS) are not automatically filled in. + */ + std::optional port; + + /** + * @brief The path component of the URL. Libcurl ensures that the path component is always + * present, even if empty (will be "/" for URLs like "http://example.com"). + */ + std::optional path; + + /** + * @brief The query string (without the leading "?"). Empty if no query parameters are present. + */ + std::optional query; + + /** + * @brief The fragment identifier (without the leading "#"). Empty if no fragment is present. + */ + std::optional fragment; + }; + + /** + * @brief Parses the given URL according to RFC 3986 plus and extracts its components. + * + * @param url The URL string to parse + * @param bitmask_url_flags Optional flags for URL parsing. Common flags include: + * - CURLU_DEFAULT_SCHEME: Allows URLs without schemes + * - CURLU_NON_SUPPORT_SCHEME: Accept non-supported schemes + * - CURLU_URLENCODE: URL encode the path + * @param bitmask_component_flags Optional flags for component extraction. 
Common flags include: + * - CURLU_URLDECODE: URL decode the component + * - CURLU_PUNYCODE: Return host as punycode + * + * @return UrlComponents structure containing the parsed URL components + * + * @throw std::runtime_error if the URL cannot be parsed or if component extraction fails + * + * Example: + * @code{.cpp} + * // Basic parsing + * auto components = UrlParser::parse("https://api.example.com/v1/users?page=1"); + * + * // Parsing with URL decoding + * auto decoded = UrlParser::parse( + * "https://example.com/hello%20world", + * std::nullopt, + * CURLU_URLDECODE + * ); + * + * // Allow non-standard schemes + * auto custom = UrlParser::parse( + * "myscheme://example.com", + * CURLU_NON_SUPPORT_SCHEME + * ); + * @endcode + */ + static UrlComponents parse(std::string const& url, + std::optional bitmask_url_flags = std::nullopt, + std::optional bitmask_component_flags = std::nullopt); + + /** + * @brief Extract a specific component from a CurlUrlHandle + * + * @param handle The CurlUrlHandle containing the parsed URL + * @param part The URL part to extract (e.g., CURLUPART_SCHEME) + * @param bitmask_component_flags Flags controlling extraction behavior + * @param allowed_err_code Optional error code to treat as valid (e.g., CURLUE_NO_SCHEME) + * @return The extracted component as a string, or std::nullopt if not present + * @throw std::runtime_error if extraction fails with an unexpected error + */ + static std::optional extract_component( + CurlUrlHandle const& handle, + CURLUPart part, + std::optional bitmask_component_flags = std::nullopt, + std::optional allowed_err_code = std::nullopt); + + /** + * @brief Extract a specific component from a URL string + * + * @param url The URL string from which to extract a component + * @param part The URL part to extract + * @param bitmask_url_flags Optional flags for URL parsing. + * @param bitmask_component_flags Flags controlling extraction behavior + * @param allowed_err_code Optional error code to treat as valid + * @return The extracted component as a string, or std::nullopt if not present + * @throw std::runtime_error if extraction fails with an unexpected error + */ + static std::optional extract_component( + std::string const& url, + CURLUPart part, + std::optional bitmask_url_flags = std::nullopt, + std::optional bitmask_component_flags = std::nullopt, + std::optional allowed_err_code = std::nullopt); +}; +} // namespace kvikio::detail diff --git a/cpp/include/kvikio/hdfs.hpp b/cpp/include/kvikio/hdfs.hpp index 0b20d658bd..345051bcbd 100644 --- a/cpp/include/kvikio/hdfs.hpp +++ b/cpp/include/kvikio/hdfs.hpp @@ -58,5 +58,13 @@ class WebHdfsEndpoint : public RemoteEndpoint { std::string str() const override; std::size_t get_file_size() override; void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; + + /** + * @brief Whether the given URL is valid for the WebHDFS endpoints. + * + * @param url A URL. + * @return Boolean answer. + */ + static bool is_url_valid(std::string const& url) noexcept; }; } // namespace kvikio diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index b2e2d1d0ff..0d56231d03 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -34,6 +34,18 @@ namespace kvikio { class CurlHandle; // Prototype +/** + * @brief Type of remote file. + */ +enum class RemoteEndpointType : uint8_t { + AUTO, ///< Let KvikIO infer the type of remote file from the URL and create a proper endpoint. 
+ S3, ///< AWS S3 (based on HTTP/HTTPS protocols). + S3_PRESIGNED_URL, ///< AWS S3 presigned URL (based on HTTP/HTTPS protocols). + WEBHDFS, ///< Apache Hadoop WebHDFS (based on HTTP/HTTPS protocols). + HTTP, ///< Generic HTTP/HTTPS, excluding all the specific types listed above that use HTTP/HTTPS + ///< protocols. +}; + /** * @brief Abstract base class for remote endpoints. * @@ -43,6 +55,10 @@ class CurlHandle; // Prototype * its own ctor that takes communication protocol specific arguments. */ class RemoteEndpoint { + protected: + RemoteEndpointType _remote_endpoint_type{RemoteEndpointType::AUTO}; + RemoteEndpoint(RemoteEndpointType remote_endpoint_type); + public: virtual ~RemoteEndpoint() = default; @@ -74,6 +90,13 @@ class RemoteEndpoint { * size. */ virtual void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) = 0; + + /** + * @brief Get the type of the remote file. + * + * @return The type of the remote file. + */ + [[nodiscard]] RemoteEndpointType remote_endpoint_type() const noexcept; }; /** @@ -96,6 +119,14 @@ class HttpEndpoint : public RemoteEndpoint { std::string str() const override; std::size_t get_file_size() override; void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; + + /** + * @brief Whether the given URL is valid for HTTP/HTTPS endpoints. + * + * @param url A URL. + * @return Boolean answer. + */ + static bool is_url_valid(std::string const& url) noexcept; }; /** @@ -206,6 +237,14 @@ class S3Endpoint : public RemoteEndpoint { std::string str() const override; std::size_t get_file_size() override; void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; + + /** + * @brief Whether the given URL is valid for S3 endpoints (excluding presigned URL). + * + * @param url A URL. + * @return Boolean answer. + */ + static bool is_url_valid(std::string const& url) noexcept; }; /** @@ -224,6 +263,14 @@ class S3EndpointWithPresignedUrl : public RemoteEndpoint { std::string str() const override; std::size_t get_file_size() override; void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; + + /** + * @brief Whether the given URL is valid for S3 endpoints with presigned URL. + * + * @param url A URL. + * @return Boolean answer. + */ + static bool is_url_valid(std::string const& url) noexcept; }; /** @@ -235,6 +282,88 @@ class RemoteHandle { std::size_t _nbytes; public: + /** + * @brief Create a remote file handle from a URL. + * + * This function creates a RemoteHandle for reading data from various remote endpoints + * including HTTP/HTTPS servers, AWS S3 buckets, S3 presigned URLs, and WebHDFS. + * The endpoint type can be automatically detected from the URL or explicitly specified. + * + * @param url The URL of the remote file. Supported formats include: + * - S3 with credentials + * - S3 presigned URL + * - WebHDFS + * - HTTP/HTTPS + * @param remote_endpoint_type The type of remote endpoint. Default is RemoteEndpointType::AUTO + * which automatically detects the endpoint type from the URL. Can be explicitly set to + * RemoteEndpointType::S3, RemoteEndpointType::S3_PRESIGNED_URL, RemoteEndpointType::WEBHDFS, or + * RemoteEndpointType::HTTP to force a specific endpoint type. + * @param allow_list Optional list of allowed endpoint types. If provided: + * - If remote_endpoint_type is RemoteEndpointType::AUTO, Types are tried in the exact order + * specified until a match is found. 
+ * - In explicit mode, the specified type must be in this list, otherwise an exception is + * thrown. + * + * If not provided, defaults to all supported types in this order: RemoteEndpointType::S3, + * RemoteEndpointType::S3_PRESIGNED_URL, RemoteEndpointType::WEBHDFS, and + * RemoteEndpointType::HTTP. + * @param nbytes Optional file size in bytes. If not provided, the function sends additional + * request to the server to query the file size. + * @return A RemoteHandle object that can be used to read data from the remote file. + * @exception std::runtime_error If: + * - If the URL is malformed or missing required components. + * - RemoteEndpointType::AUTO mode is used and the URL doesn't match any supported endpoint + * type. + * - The specified endpoint type is not in the `allow_list`. + * - The URL is invalid for the specified endpoint type. + * - Unable to connect to the remote server or determine file size (when nbytes not provided). + * + * Example: + * - Auto-detect endpoint type from URL + * @code{.cpp} + * auto handle = kvikio::RemoteHandle::open( + * "https://bucket.s3.amazonaws.com/object?X-Amz-Algorithm=AWS4-HMAC-SHA256" + * "&X-Amz-Credential=...&X-Amz-Signature=..." + * ); + * @endcode + * + * - Open S3 file with explicit endpoint type + * @code{.cpp} + * + * auto handle = kvikio::RemoteHandle::open( + * "https://my-bucket.s3.us-east-1.amazonaws.com/data.bin", + * kvikio::RemoteEndpointType::S3 + * ); + * @endcode + * + * - Restrict endpoint type candidates + * @code{.cpp} + * std::vector allow_list = { + * kvikio::RemoteEndpointType::HTTP, + * kvikio::RemoteEndpointType::S3_PRESIGNED_URL + * }; + * auto handle = kvikio::RemoteHandle::open( + * user_provided_url, + * kvikio::RemoteEndpointType::AUTO, + * allow_list + * ); + * @endcode + * + * - Provide known file size to skip HEAD request + * @code{.cpp} + * auto handle = kvikio::RemoteHandle::open( + * "https://example.com/large-file.bin", + * kvikio::RemoteEndpointType::HTTP, + * std::nullopt, + * 1024 * 1024 * 100 // 100 MB + * ); + * @endcode + */ + static RemoteHandle open(std::string url, + RemoteEndpointType remote_endpoint_type = RemoteEndpointType::AUTO, + std::optional> allow_list = std::nullopt, + std::optional nbytes = std::nullopt); + /** * @brief Create a new remote handle from an endpoint and a file size. * @@ -258,6 +387,13 @@ class RemoteHandle { RemoteHandle(RemoteHandle const&) = delete; RemoteHandle& operator=(RemoteHandle const&) = delete; + /** + * @brief Get the type of the remote file. + * + * @return The type of the remote file. + */ + [[nodiscard]] RemoteEndpointType remote_endpoint_type() const noexcept; + /** * @brief Get the file size. * diff --git a/cpp/src/detail/url.cpp b/cpp/src/detail/url.cpp new file mode 100644 index 0000000000..64f5b8fde9 --- /dev/null +++ b/cpp/src/detail/url.cpp @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include + +#define CHECK_CURL_URL_ERR(err_code) check_curl_url_err(err_code, __LINE__, __FILE__) + +namespace kvikio::detail { +namespace { +void check_curl_url_err(CURLUcode err_code, int line_number, char const* filename) +{ + if (err_code == CURLUcode::CURLUE_OK) { return; } + + std::stringstream ss; + ss << "KvikIO detects an URL error at: " << filename << ":" << line_number << ": "; + char const* msg = curl_url_strerror(err_code); + if (msg == nullptr) { + ss << "(no message)"; + } else { + ss << msg; + } + throw std::runtime_error(ss.str()); +} +} // namespace + +CurlUrlHandle::CurlUrlHandle() : _handle(curl_url()) +{ + KVIKIO_EXPECT(_handle != nullptr, + "Libcurl is unable to allocate a URL handle (likely out of memory)."); +} + +CurlUrlHandle::~CurlUrlHandle() noexcept +{ + if (_handle) { curl_url_cleanup(_handle); } +} + +CurlUrlHandle::CurlUrlHandle(CurlUrlHandle&& other) noexcept + : _handle{std::exchange(other._handle, nullptr)} +{ +} + +CurlUrlHandle& CurlUrlHandle::operator=(CurlUrlHandle&& other) noexcept +{ + if (this != &other) { + if (_handle) { curl_url_cleanup(_handle); } + _handle = std::exchange(other._handle, nullptr); + } + + return *this; +} + +CURLU* CurlUrlHandle::get() const { return _handle; } + +std::optional UrlParser::extract_component( + CurlUrlHandle const& handle, + CURLUPart part, + std::optional bitmask_component_flags, + std::optional allowed_err_code) +{ + if (!bitmask_component_flags.has_value()) { bitmask_component_flags = 0U; } + + char* value{}; + auto err_code = curl_url_get(handle.get(), part, &value, bitmask_component_flags.value()); + + if (err_code == CURLUcode::CURLUE_OK && value != nullptr) { + std::string result{value}; + curl_free(value); + return result; + } + + if (allowed_err_code.has_value() && allowed_err_code.value() == err_code) { return std::nullopt; } + + // Throws an exception and explains the reason. 
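+  // (At this point err_code is either an unexpected error, in which case
+  // CHECK_CURL_URL_ERR below throws, or -- defensively -- CURLUE_OK with a
+  // null value, in which case the final return reports the component as
+  // absent.)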
+ CHECK_CURL_URL_ERR(err_code); + return std::nullopt; +} + +std::optional UrlParser::extract_component( + std::string const& url, + CURLUPart part, + std::optional bitmask_url_flags, + std::optional bitmask_component_flags, + std::optional allowed_err_code) +{ + if (!bitmask_url_flags.has_value()) { bitmask_url_flags = 0U; } + if (!bitmask_component_flags.has_value()) { bitmask_component_flags = 0U; } + + CurlUrlHandle handle; + CHECK_CURL_URL_ERR( + curl_url_set(handle.get(), CURLUPART_URL, url.c_str(), bitmask_url_flags.value())); + + return extract_component(handle, part, bitmask_component_flags, allowed_err_code); +} + +UrlParser::UrlComponents UrlParser::parse(std::string const& url, + std::optional bitmask_url_flags, + std::optional bitmask_component_flags) +{ + if (!bitmask_url_flags.has_value()) { bitmask_url_flags = 0U; } + if (!bitmask_component_flags.has_value()) { bitmask_component_flags = 0U; } + + CurlUrlHandle handle; + CHECK_CURL_URL_ERR( + curl_url_set(handle.get(), CURLUPART_URL, url.c_str(), bitmask_url_flags.value())); + + UrlComponents components; + CURLUcode err_code{}; + + components.scheme = extract_component( + handle, CURLUPART_SCHEME, bitmask_component_flags.value(), CURLUcode::CURLUE_NO_SCHEME); + components.host = extract_component( + handle, CURLUPART_HOST, bitmask_component_flags.value(), CURLUcode::CURLUE_NO_HOST); + components.port = extract_component( + handle, CURLUPART_PORT, bitmask_component_flags.value(), CURLUcode::CURLUE_NO_PORT); + components.path = extract_component(handle, CURLUPART_PATH, bitmask_component_flags.value()); + components.query = extract_component( + handle, CURLUPART_QUERY, bitmask_component_flags.value(), CURLUcode::CURLUE_NO_QUERY); + components.fragment = extract_component( + handle, CURLUPART_FRAGMENT, bitmask_component_flags.value(), CURLUcode::CURLUE_NO_FRAGMENT); + + return components; +} +} // namespace kvikio::detail diff --git a/cpp/src/hdfs.cpp b/cpp/src/hdfs.cpp index 12455b3a26..2e032a1af7 100644 --- a/cpp/src/hdfs.cpp +++ b/cpp/src/hdfs.cpp @@ -25,7 +25,7 @@ namespace kvikio { -WebHdfsEndpoint::WebHdfsEndpoint(std::string url) +WebHdfsEndpoint::WebHdfsEndpoint(std::string url) : RemoteEndpoint{RemoteEndpointType::WEBHDFS} { // todo: Use libcurl URL API for more secure and idiomatic parsing. // Split the URL into two parts: one without query and one with. @@ -64,7 +64,7 @@ WebHdfsEndpoint::WebHdfsEndpoint(std::string host, std::string port, std::string file_path, std::optional username) - : _username{std::move(username)} + : RemoteEndpoint{RemoteEndpointType::WEBHDFS}, _username{std::move(username)} { std::stringstream ss; ss << "http://" << host << ":" << port << "/webhdfs/v1" << file_path; @@ -128,4 +128,15 @@ void WebHdfsEndpoint::setup_range_request(CurlHandle& curl, ss << "op=OPEN&offset=" << file_offset << "&length=" << size; curl.setopt(CURLOPT_URL, ss.str().c_str()); } + +bool WebHdfsEndpoint::is_url_valid(std::string const& url) noexcept +{ + try { + std::regex const pattern(R"(^https?://[^/]+:\d+/webhdfs/v1/.+$)", std::regex_constants::icase); + std::smatch match_result; + return std::regex_match(url, match_result, pattern); + } catch (...) { + return false; + } +} } // namespace kvikio diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index 23cf5c6305..3cf2acc862 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include #include #include #include @@ -26,7 +27,9 @@ #include #include +#include #include +#include #include #include #include @@ -177,9 +180,69 @@ void setup_range_request_impl(CurlHandle& curl, std::size_t file_offset, std::si curl.setopt(CURLOPT_RANGE, byte_range.c_str()); } +/** + * @brief Whether the given URL is compatible with the S3 endpoint (including the credential-based + * access and presigned URL) which uses HTTP/HTTPS. + * + * @param url A URL. + * @return Boolean answer. + */ +bool url_has_aws_s3_http_format(std::string const& url) +{ + // Currently KvikIO supports the following AWS S3 HTTP URL formats: + static std::array const s3_patterns = { + // Virtual host style: https://.s3..amazonaws.com/ + std::regex(R"(https?://[^/]+\.s3\.[^.]+\.amazonaws\.com/.+$)", std::regex_constants::icase), + + // Path style (deprecated but still popular): + // https://s3..amazonaws.com// + std::regex(R"(https?://s3\.[^.]+\.amazonaws\.com/[^/]+/.+$)", std::regex_constants::icase), + + // Legacy global endpoint: no region code + std::regex(R"(https?://[^/]+\.s3\.amazonaws\.com/.+$)", std::regex_constants::icase), + std::regex(R"(https?://s3\.amazonaws\.com/[^/]+/.+$)", std::regex_constants::icase), + + // Legacy regional endpoint: s3 and region code are delimited by - instead of . + std::regex(R"(https?://[^/]+\.s3-[^.]+\.amazonaws\.com/.+$)", std::regex_constants::icase), + std::regex(R"(https?://s3-[^.]+\.amazonaws\.com/[^/]+/.+$)", std::regex_constants::icase)}; + + return std::any_of(s3_patterns.begin(), s3_patterns.end(), [&url = url](auto const& pattern) { + std::smatch match_result; + return std::regex_match(url, match_result, pattern); + }); +} + +char const* get_remote_endpoint_type_name(RemoteEndpointType remote_endpoint_type) +{ + switch (remote_endpoint_type) { + case RemoteEndpointType::S3: return "S3"; + case RemoteEndpointType::S3_PRESIGNED_URL: return "S3 with presigned URL"; + case RemoteEndpointType::WEBHDFS: return "WebHDFS"; + case RemoteEndpointType::HTTP: return "HTTP"; + case RemoteEndpointType::AUTO: return "AUTO"; + default: + // Unreachable + KVIKIO_FAIL("Unknown RemoteEndpointType: " + + std::to_string(static_cast(remote_endpoint_type))); + return "UNKNOWN"; + } +} } // namespace -HttpEndpoint::HttpEndpoint(std::string url) : _url{std::move(url)} {} +RemoteEndpoint::RemoteEndpoint(RemoteEndpointType remote_endpoint_type) + : _remote_endpoint_type{remote_endpoint_type} +{ +} + +RemoteEndpointType RemoteEndpoint::remote_endpoint_type() const noexcept +{ + return _remote_endpoint_type; +} + +HttpEndpoint::HttpEndpoint(std::string url) + : RemoteEndpoint{RemoteEndpointType::HTTP}, _url{std::move(url)} +{ +} std::string HttpEndpoint::str() const { return _url; } @@ -194,6 +257,19 @@ void HttpEndpoint::setup_range_request(CurlHandle& curl, std::size_t file_offset setup_range_request_impl(curl, file_offset, size); } +bool HttpEndpoint::is_url_valid(std::string const& url) noexcept +{ + try { + auto parsed_url = detail::UrlParser::parse(url); + if ((parsed_url.scheme != "http") && (parsed_url.scheme != "https")) { return false; }; + + // Check whether the file path exists, excluding the leading "/" + return parsed_url.path->length() > 1; + } catch (...) 
{ + return false; + } +} + void HttpEndpoint::setopt(CurlHandle& curl) { curl.setopt(CURLOPT_URL, _url.c_str()); } void S3Endpoint::setopt(CurlHandle& curl) @@ -256,7 +332,7 @@ S3Endpoint::S3Endpoint(std::string url, std::optional aws_access_key, std::optional aws_secret_access_key, std::optional aws_session_token) - : _url{std::move(url)} + : RemoteEndpoint{RemoteEndpointType::S3}, _url{std::move(url)} { KVIKIO_NVTX_FUNC_RANGE(); // Regular expression to match http[s]:// @@ -348,8 +424,29 @@ void S3Endpoint::setup_range_request(CurlHandle& curl, std::size_t file_offset, setup_range_request_impl(curl, file_offset, size); } +bool S3Endpoint::is_url_valid(std::string const& url) noexcept +{ + try { + auto parsed_url = detail::UrlParser::parse(url, CURLU_NON_SUPPORT_SCHEME); + + if (parsed_url.scheme == "s3") { + if (!parsed_url.host.has_value()) { return false; } + if (!parsed_url.path.has_value()) { return false; } + + // Check whether the S3 object key exists + std::regex const pattern(R"(^/[^/]+$)", std::regex::icase); + std::smatch match_result; + return std::regex_search(parsed_url.path.value(), match_result, pattern); + } else if ((parsed_url.scheme == "http") || (parsed_url.scheme == "https")) { + return url_has_aws_s3_http_format(url) && !S3EndpointWithPresignedUrl::is_url_valid(url); + } + } catch (...) { + } + return false; +} + S3EndpointWithPresignedUrl::S3EndpointWithPresignedUrl(std::string presigned_url) - : _url{std::move(presigned_url)} + : RemoteEndpoint{RemoteEndpointType::S3_PRESIGNED_URL}, _url{std::move(presigned_url)} { } @@ -439,6 +536,95 @@ void S3EndpointWithPresignedUrl::setup_range_request(CurlHandle& curl, setup_range_request_impl(curl, file_offset, size); } +bool S3EndpointWithPresignedUrl::is_url_valid(std::string const& url) noexcept +{ + try { + if (!url_has_aws_s3_http_format(url)) { return false; } + + auto parsed_url = detail::UrlParser::parse(url); + if (!parsed_url.query.has_value()) { return false; } + + // Reference: https://docs.aws.amazon.com/AmazonS3/latest/API/sigv4-query-string-auth.html + return parsed_url.query->find("X-Amz-Algorithm") != std::string::npos && + parsed_url.query->find("X-Amz-Signature") != std::string::npos; + } catch (...) 
{ + return false; + } +} + +RemoteHandle RemoteHandle::open(std::string url, + RemoteEndpointType remote_endpoint_type, + std::optional> allow_list, + std::optional nbytes) +{ + if (!allow_list.has_value()) { + allow_list = {RemoteEndpointType::S3, + RemoteEndpointType::S3_PRESIGNED_URL, + RemoteEndpointType::WEBHDFS, + RemoteEndpointType::HTTP}; + } + + auto const scheme = + detail::UrlParser::extract_component(url, CURLUPART_SCHEME, CURLU_NON_SUPPORT_SCHEME); + KVIKIO_EXPECT(scheme.has_value(), "Missing scheme in URL."); + + // Helper to create endpoint based on type + auto create_endpoint = + [&url = url, &scheme = scheme](RemoteEndpointType type) -> std::unique_ptr { + switch (type) { + case RemoteEndpointType::S3: + if (!S3Endpoint::is_url_valid(url)) { return nullptr; } + if (scheme.value() == "s3") { + auto const [bucket, object] = S3Endpoint::parse_s3_url(url); + return std::make_unique(std::pair{bucket, object}); + } + return std::make_unique(url); + + case RemoteEndpointType::S3_PRESIGNED_URL: + if (!S3EndpointWithPresignedUrl::is_url_valid(url)) { return nullptr; } + return std::make_unique(url); + + case RemoteEndpointType::WEBHDFS: + if (!WebHdfsEndpoint::is_url_valid(url)) { return nullptr; } + return std::make_unique(url); + + case RemoteEndpointType::HTTP: + if (!HttpEndpoint::is_url_valid(url)) { return nullptr; } + return std::make_unique(url); + + default: return nullptr; + } + }; + + std::unique_ptr endpoint; + + if (remote_endpoint_type == RemoteEndpointType::AUTO) { + // Try each allowed type in the order of allowlist + for (auto const& type : allow_list.value()) { + endpoint = create_endpoint(type); + if (endpoint) { break; } + } + KVIKIO_EXPECT(endpoint.get() != nullptr, "Unsupported endpoint URL.", std::runtime_error); + } else { + // Validate it is in the allow list + KVIKIO_EXPECT( + std::find(allow_list->begin(), allow_list->end(), remote_endpoint_type) != allow_list->end(), + std::string{get_remote_endpoint_type_name(remote_endpoint_type)} + + " is not in the allowlist.", + std::runtime_error); + + // Create the specific type + endpoint = create_endpoint(remote_endpoint_type); + KVIKIO_EXPECT(endpoint.get() != nullptr, + std::string{"Invalid URL for "} + + get_remote_endpoint_type_name(remote_endpoint_type) + " endpoint", + std::runtime_error); + } + + return nbytes.has_value() ? 
RemoteHandle(std::move(endpoint), nbytes.value()) + : RemoteHandle(std::move(endpoint)); +} + RemoteHandle::RemoteHandle(std::unique_ptr endpoint, std::size_t nbytes) : _endpoint{std::move(endpoint)}, _nbytes{nbytes} { @@ -452,6 +638,11 @@ RemoteHandle::RemoteHandle(std::unique_ptr endpoint) _endpoint = std::move(endpoint); } +RemoteEndpointType RemoteHandle::remote_endpoint_type() const noexcept +{ + return _endpoint->remote_endpoint_type(); +} + std::size_t RemoteHandle::nbytes() const noexcept { return _nbytes; } RemoteEndpoint const& RemoteHandle::endpoint() const noexcept { return *_endpoint; } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 41de4bb6fa..a6fd2c67e4 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -79,6 +79,7 @@ kvikio_add_test(NAME MMAP_TEST SOURCES test_mmap.cpp) if(KvikIO_REMOTE_SUPPORT) kvikio_add_test(NAME REMOTE_HANDLE_TEST SOURCES test_remote_handle.cpp utils/env.cpp) kvikio_add_test(NAME HDFS_TEST SOURCES test_hdfs.cpp utils/hdfs_helper.cpp) + kvikio_add_test(NAME URL_TEST SOURCES test_url.cpp) endif() rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/tests/libkvikio) diff --git a/cpp/tests/test_defaults.cpp b/cpp/tests/test_defaults.cpp index 89bbe7399c..a74f38c86f 100644 --- a/cpp/tests/test_defaults.cpp +++ b/cpp/tests/test_defaults.cpp @@ -19,9 +19,9 @@ #include #include +#include #include -#include "kvikio/compat_mode.hpp" #include "utils/env.hpp" using ::testing::HasSubstr; diff --git a/cpp/tests/test_remote_handle.cpp b/cpp/tests/test_remote_handle.cpp index 918479b0f0..ffb7c82266 100644 --- a/cpp/tests/test_remote_handle.cpp +++ b/cpp/tests/test_remote_handle.cpp @@ -14,12 +14,101 @@ * limitations under the License. */ +#include +#include +#include +#include +#include + +#include #include +#include #include #include "utils/env.hpp" -TEST(RemoteHandleTest, s3_endpoint_constructor) +using ::testing::HasSubstr; +using ::testing::ThrowsMessage; + +class RemoteHandleTest : public testing::Test { + protected: + void SetUp() override + { + _sample_urls = { + // Endpoint type: S3 + {"s3://bucket-name/object-key-name", kvikio::RemoteEndpointType::S3}, + {"https://bucket-name.s3.region-code.amazonaws.com/object-key-name", + kvikio::RemoteEndpointType::S3}, + {"https://s3.region-code.amazonaws.com/bucket-name/object-key-name", + kvikio::RemoteEndpointType::S3}, + {"https://bucket-name.s3.amazonaws.com/object-key-name", kvikio::RemoteEndpointType::S3}, + {"https://s3.amazonaws.com/bucket-name/object-key-name", kvikio::RemoteEndpointType::S3}, + {"https://bucket-name.s3-region-code.amazonaws.com/object-key-name", + kvikio::RemoteEndpointType::S3}, + {"https://s3-region-code.amazonaws.com/bucket-name/object-key-name", + kvikio::RemoteEndpointType::S3}, + + // Endpoint type: S3 presigned URL + {"https://bucket-name.s3.region-code.amazonaws.com/" + "object-key-name?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Signature=sig&X-Amz-Credential=cred&" + "X-Amz-SignedHeaders=host", + kvikio::RemoteEndpointType::S3_PRESIGNED_URL}, + + // Endpoint type: WebHDFS + {"https://host:1234/webhdfs/v1/data.bin", kvikio::RemoteEndpointType::WEBHDFS}, + }; + } + + void TearDown() override {} + + void test_helper(kvikio::RemoteEndpointType expected_endpoint_type, + std::function url_validity_checker) + { + for (auto const& [url, endpoint_type] : _sample_urls) { + if (endpoint_type == expected_endpoint_type) { + // Given that the URL is the expected endpoint type + + // Test URL validity checker + 
EXPECT_TRUE(url_validity_checker(url)); + + // Test unified interface + { + // Here we pass the 1-byte argument to RemoteHandle::open. This prevents the endpoint + // constructor from querying the file size and sending requests to the server, thus + // allowing us to use dummy URLs for testing purpose. + auto remote_handle = + kvikio::RemoteHandle::open(url, kvikio::RemoteEndpointType::AUTO, std::nullopt, 1); + EXPECT_EQ(remote_handle.remote_endpoint_type(), expected_endpoint_type); + } + + // Test explicit endpoint type specification + { + EXPECT_NO_THROW({ + auto remote_handle = + kvikio::RemoteHandle::open(url, expected_endpoint_type, std::nullopt, 1); + }); + } + } else { + // Given that the URL is NOT the expected endpoint type + + // Test URL validity checker + EXPECT_FALSE(url_validity_checker(url)); + + // Test explicit endpoint type specification + { + EXPECT_ANY_THROW({ + auto remote_handle = + kvikio::RemoteHandle::open(url, expected_endpoint_type, std::nullopt, 1); + }); + } + } + } + } + + std::vector> _sample_urls; +}; + +TEST_F(RemoteHandleTest, s3_endpoint_constructor) { kvikio::test::EnvVarContext env_var_ctx{{"AWS_DEFAULT_REGION", "my_aws_default_region"}, {"AWS_ACCESS_KEY_ID", "my_aws_access_key_id"}, @@ -37,3 +126,145 @@ TEST(RemoteHandleTest, s3_endpoint_constructor) EXPECT_EQ(s1.str(), s2.str()); } + +TEST_F(RemoteHandleTest, test_http_url) +{ + // Invalid URLs + { + std::vector const invalid_urls{// Incorrect scheme + "s3://example.com", + "hdfs://example.com", + // Missing file path + "http://example.com"}; + for (auto const& invalid_url : invalid_urls) { + EXPECT_FALSE(kvikio::HttpEndpoint::is_url_valid(invalid_url)); + } + } +} + +TEST_F(RemoteHandleTest, test_s3_url) +{ + kvikio::test::EnvVarContext env_var_ctx{{"AWS_DEFAULT_REGION", "my_aws_default_region"}, + {"AWS_ACCESS_KEY_ID", "my_aws_access_key_id"}, + {"AWS_SECRET_ACCESS_KEY", "my_aws_secrete_access_key"}}; + + { + test_helper(kvikio::RemoteEndpointType::S3, kvikio::S3Endpoint::is_url_valid); + } + + // Invalid URLs + { + std::vector const invalid_urls{ + // Lack object-name + "s3://bucket-name", + "https://bucket-name.s3.region-code.amazonaws.com", + // Presigned URL + "https://bucket-name.s3.region-code.amazonaws.com/" + "object-key-name?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Signature=sig&X-Amz-Credential=" + "cred&" + "X-Amz-SignedHeaders=host"}; + for (auto const& invalid_url : invalid_urls) { + EXPECT_FALSE(kvikio::S3Endpoint::is_url_valid(invalid_url)); + } + } +} + +TEST_F(RemoteHandleTest, test_s3_url_with_presigned_url) +{ + { + test_helper(kvikio::RemoteEndpointType::S3_PRESIGNED_URL, + kvikio::S3EndpointWithPresignedUrl::is_url_valid); + } + + // Invalid URLs + { + std::vector const invalid_urls{ + // Presigned URL should not use S3 scheme + "s3://bucket-name/object-key-name", + + // Completely missing query + "https://bucket-name.s3.region-code.amazonaws.com/object-key-name", + + // Missing key parameters ("X-Amz-..."") in query + "https://bucket-name.s3.region-code.amazonaws.com/object-key-name?k0=v0&k1=v2"}; + for (auto const& invalid_url : invalid_urls) { + EXPECT_FALSE(kvikio::S3EndpointWithPresignedUrl::is_url_valid(invalid_url)); + } + } +} + +TEST_F(RemoteHandleTest, test_webhdfs_url) +{ + { + test_helper(kvikio::RemoteEndpointType::WEBHDFS, kvikio::WebHdfsEndpoint::is_url_valid); + } + + // Invalid URLs + { + std::vector const invalid_urls{// Missing file + "https://host:1234/webhdfs/v1", + "https://host:1234/webhdfs/v1/", + + // Missing WebHDFS identifier + 
"https://host:1234/data.bin", + + // Missing port number + "https://host/webhdfs/v1/data.bin"}; + for (auto const& invalid_url : invalid_urls) { + EXPECT_FALSE(kvikio::WebHdfsEndpoint::is_url_valid(invalid_url)); + } + } +} + +TEST_F(RemoteHandleTest, test_open) +{ + // Missing scheme + { + std::vector const urls{ + "example.com/path", "example.com:8080/path", "//example.com/path", "://example.com/path"}; + for (auto const& url : urls) { + EXPECT_THROW( + { kvikio::RemoteHandle::open(url, kvikio::RemoteEndpointType::AUTO, std::nullopt, 1); }, + std::runtime_error); + } + } + + // Unsupported type + { + std::string const url{"unsupported://example.com/path"}; + EXPECT_THAT( + [&] { kvikio::RemoteHandle::open(url, kvikio::RemoteEndpointType::AUTO, std::nullopt, 1); }, + ThrowsMessage(HasSubstr("Unsupported endpoint URL"))); + } + + // Specified URL not in the allowlist + { + std::string const url{"https://host:1234/webhdfs/v1/data.bin"}; + std::vector> const wrong_allowlists{ + {}, + {kvikio::RemoteEndpointType::S3}, + }; + for (auto const& wrong_allowlist : wrong_allowlists) { + EXPECT_THAT( + [&] { + kvikio::RemoteHandle::open(url, kvikio::RemoteEndpointType::WEBHDFS, wrong_allowlist, 1); + }, + ThrowsMessage(HasSubstr("is not in the allowlist"))); + } + } + + // Invalid URLs + { + std::vector> const invalid_urls{ + {"s3://bucket-name", kvikio::RemoteEndpointType::S3}, + {"https://bucket-name.s3.region-code.amazonaws.com/object-key-name", + kvikio::RemoteEndpointType::S3_PRESIGNED_URL}, + {"https://host:1234/webhdfs/v1", kvikio::RemoteEndpointType::WEBHDFS}, + {"http://example.com", kvikio::RemoteEndpointType::HTTP}, + }; + for (auto const& [invalid_url, endpoint_type] : invalid_urls) { + EXPECT_THAT([&] { kvikio::RemoteHandle::open(invalid_url, endpoint_type, std::nullopt, 1); }, + ThrowsMessage(HasSubstr("Invalid URL"))); + } + } +} diff --git a/cpp/tests/test_url.cpp b/cpp/tests/test_url.cpp new file mode 100644 index 0000000000..ce419ed5a5 --- /dev/null +++ b/cpp/tests/test_url.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +using ::testing::HasSubstr; +using ::testing::ThrowsMessage; + +TEST(UrlTest, parse_scheme) +{ + { + std::vector invalid_scheme_urls{ + "invalid_scheme://host", + // The S3 scheme is not supported by libcurl. Without the CURLU_NON_SUPPORT_SCHEME flag, an + // exception is expected. + "s3://host"}; + + for (auto const& invalid_scheme_url : invalid_scheme_urls) { + EXPECT_THAT([&] { kvikio::detail::UrlParser::parse(invalid_scheme_url); }, + ThrowsMessage(HasSubstr("KvikIO detects an URL error"))); + } + } + + // With the CURLU_NON_SUPPORT_SCHEME flag, the S3 scheme is now accepted. 
+ { + std::vector schemes{"s3", "S3"}; + for (auto const& scheme : schemes) { + auto parsed_url = + kvikio::detail::UrlParser::parse(scheme + "://host", CURLU_NON_SUPPORT_SCHEME); + EXPECT_EQ(parsed_url.scheme.value(), "s3"); // Lowercase due to CURL's normalization + } + } +} + +TEST(UrlTest, parse_host) +{ + std::vector invalid_host_urls{"http://host with spaces.com", + "http://host[brackets].com", + "http://host{braces}.com", + "http://host.com", + R"(http://host\backslash.com)", + "http://host^caret.com", + "http://host`backtick.com"}; + for (auto const& invalid_host_url : invalid_host_urls) { + EXPECT_THROW({ kvikio::detail::UrlParser::parse(invalid_host_url); }, std::runtime_error); + } +} From ecfe488b6f5c9d18efefb0a6cf0ce7c1616e7cd1 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Tue, 26 Aug 2025 16:24:46 -0400 Subject: [PATCH 061/103] Update rapids-dependency-file-generator (#809) This PR updates the rapids-dependency-file-generator hook to get https://github.com/rapidsai/dependency-file-generator/pull/163. Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/kvikio/pull/809 --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f095cc2807..17ccb948fc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -95,10 +95,10 @@ repos: - id: verify-codeowners args: [--fix, --project-prefix=kvikio] - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.19.0 + rev: v1.20.0 hooks: - id: rapids-dependency-file-generator - args: ["--clean"] + args: ["--clean", "--warn-all", "--strict"] - repo: https://github.com/shellcheck-py/shellcheck-py rev: v0.10.0.1 hooks: From fed9abdf09d4fa727278d7fac265b91e132fb153 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Wed, 27 Aug 2025 09:22:11 -0400 Subject: [PATCH 062/103] Fix an S3 parsing bug in the open function. Improve regex usage (#810) AWS S3 provides a non-standard S3 scheme for internal use (such as for AWS CLI). The URL takes the form `s3:///`, where `` may contain `/` characters indicating subdirectories. The newly added `open` function for remote I/O currently uses an incorrect regular expression, causing object names containing subdirectories to be rejected. This PR fixes this bug. This PR also improves the usage of regular expression by making the pattern constant `static`. Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/810 --- cpp/src/hdfs.cpp | 12 ++++++------ cpp/src/http_status_codes.cpp | 4 ++-- cpp/src/remote_handle.cpp | 11 +++++------ cpp/tests/test_remote_handle.cpp | 1 + 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/cpp/src/hdfs.cpp b/cpp/src/hdfs.cpp index 2e032a1af7..2aa5f53278 100644 --- a/cpp/src/hdfs.cpp +++ b/cpp/src/hdfs.cpp @@ -29,7 +29,7 @@ WebHdfsEndpoint::WebHdfsEndpoint(std::string url) : RemoteEndpoint{RemoteEndpoin { // todo: Use libcurl URL API for more secure and idiomatic parsing. // Split the URL into two parts: one without query and one with. - std::regex const pattern{R"(^([^?]+)\?([^#]*))"}; + std::regex static const pattern{R"(^([^?]+)\?([^#]*))"}; // Regex meaning: // ^: From the start of the line // [^?]+: Matches non-question-mark characters one or more times. 
The question mark ushers in the
+ // query and is not included in either of the two parts.
@@ -51,7 +51,7 @@ WebHdfsEndpoint::WebHdfsEndpoint(std::string url) : RemoteEndpoint{RemoteEndpoin
 {
   // Extract user name if provided. In WebHDFS, user name is specified as the key=value pair in
   // the query
-  std::regex const pattern{R"(user.name=([^&]+))"};
+  std::regex static const pattern{R"(user.name=([^&]+))"};
   // Regex meaning:
   // [^&]+: Matches the non-ampersand character one or more times. The ampersand delimits
   // different parameters.
@@ -104,7 +104,7 @@ std::size_t WebHdfsEndpoint::get_file_size()
   KVIKIO_EXPECT(http_status_code == 200, "HTTP response is not successful.");
   // The response is in JSON format. The file size is given by `"length":`.
-  std::regex const pattern{R"("length"\s*:\s*(\d+)[^\d])"};
+  std::regex static const pattern{R"("length"\s*:\s*(\d+)[^\d])"};
   // Regex meaning:
   // \s*: Matches the space character zero or more times.
   // \d+: Matches the digit one or more times.
@@ -132,9 +132,9 @@ void WebHdfsEndpoint::setup_range_request(CurlHandle& curl,
 bool WebHdfsEndpoint::is_url_valid(std::string const& url) noexcept
 {
   try {
-    std::regex const pattern(R"(^https?://[^/]+:\d+/webhdfs/v1/.+$)", std::regex_constants::icase);
-    std::smatch match_result;
-    return std::regex_match(url, match_result, pattern);
+    std::regex static const pattern(R"(^https?://[^/]+:\d+/webhdfs/v1/.+$)",
+                                    std::regex_constants::icase);
+    return std::regex_match(url, pattern);
   } catch (...) {
     return false;
   }
diff --git a/cpp/src/http_status_codes.cpp b/cpp/src/http_status_codes.cpp
index 9b9cd3d793..da4a4cc379 100644
--- a/cpp/src/http_status_codes.cpp
+++ b/cpp/src/http_status_codes.cpp
@@ -31,13 +31,13 @@ std::vector<int> parse_http_status_codes(std::string_view env_var_name,
                                          std::string const& status_codes)
 {
   // Ensure `status_codes` consists only of 3-digit integers separated by commas, allowing spaces.
-  std::regex const check_pattern(R"(^\s*\d{3}\s*(\s*,\s*\d{3}\s*)*$)");
+  std::regex static const check_pattern(R"(^\s*\d{3}\s*(\s*,\s*\d{3}\s*)*$)");
   KVIKIO_EXPECT(std::regex_match(status_codes, check_pattern),
                 std::string{env_var_name} + ": invalid format, expected comma-separated integers.",
                 std::invalid_argument);
   // Match every integer in `status_codes`.
-  std::regex const number_pattern(R"(\d+)");
+  std::regex static const number_pattern(R"(\d+)");
   // For each match, we push_back `std::stoi(match.str())` into `ret`.
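  // (Note: the `static const` patterns above are compiled once per process;
  // C++11 guarantees thread-safe initialization of function-local statics,
  // and matching against a fully constructed const std::regex is safe across
  // threads.)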
std::vector ret; diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index 3cf2acc862..a173804321 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -320,7 +320,7 @@ std::pair S3Endpoint::parse_s3_url(std::string const& { KVIKIO_NVTX_FUNC_RANGE(); // Regular expression to match s3:/// - std::regex const pattern{R"(^s3://([^/]+)/(.+))", std::regex_constants::icase}; + std::regex static const pattern{R"(^s3://([^/]+)/(.+))", std::regex_constants::icase}; std::smatch matches; if (std::regex_match(s3_url, matches, pattern)) { return {matches[1].str(), matches[2].str()}; } KVIKIO_FAIL("Input string does not match the expected S3 URL format.", std::invalid_argument); @@ -336,7 +336,7 @@ S3Endpoint::S3Endpoint(std::string url, { KVIKIO_NVTX_FUNC_RANGE(); // Regular expression to match http[s]:// - std::regex pattern{R"(^https?://.*)", std::regex_constants::icase}; + std::regex static const pattern{R"(^https?://.*)", std::regex_constants::icase}; KVIKIO_EXPECT(std::regex_search(_url, pattern), "url must start with http:// or https://", std::invalid_argument); @@ -434,9 +434,8 @@ bool S3Endpoint::is_url_valid(std::string const& url) noexcept if (!parsed_url.path.has_value()) { return false; } // Check whether the S3 object key exists - std::regex const pattern(R"(^/[^/]+$)", std::regex::icase); - std::smatch match_result; - return std::regex_search(parsed_url.path.value(), match_result, pattern); + std::regex static const pattern(R"(^/.+$)"); + return std::regex_search(parsed_url.path.value(), pattern); } else if ((parsed_url.scheme == "http") || (parsed_url.scheme == "https")) { return url_has_aws_s3_http_format(url) && !S3EndpointWithPresignedUrl::is_url_valid(url); } @@ -485,7 +484,7 @@ std::size_t callback_header(char* data, std::size_t size, std::size_t num_bytes, // Content-Range: / // Content-Range: /* // Content-Range: */ - std::regex const pattern(R"(Content-Range:[^/]+/(.*))", std::regex::icase); + std::regex static const pattern(R"(Content-Range:[^/]+/(.*))", std::regex::icase); std::smatch match_result; bool found = std::regex_search(header_line, match_result, pattern); if (found) { diff --git a/cpp/tests/test_remote_handle.cpp b/cpp/tests/test_remote_handle.cpp index ffb7c82266..a7e3d1e829 100644 --- a/cpp/tests/test_remote_handle.cpp +++ b/cpp/tests/test_remote_handle.cpp @@ -37,6 +37,7 @@ class RemoteHandleTest : public testing::Test { _sample_urls = { // Endpoint type: S3 {"s3://bucket-name/object-key-name", kvikio::RemoteEndpointType::S3}, + {"s3://bucket-name/object-key-name-dir/object-key-name-file", kvikio::RemoteEndpointType::S3}, {"https://bucket-name.s3.region-code.amazonaws.com/object-key-name", kvikio::RemoteEndpointType::S3}, {"https://s3.region-code.amazonaws.com/bucket-name/object-key-name", From 6efd22dc6ae3389caea7d3e736c7f954b9db0619 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Wed, 27 Aug 2025 09:44:43 -0400 Subject: [PATCH 063/103] Add a unified remote I/O interface that infers the endpoint type from URL (2/2): Python binding (#808) This PR adds Python binding to https://github.com/rapidsai/kvikio/pull/793 Closes #807 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/808 --- cpp/include/kvikio/remote_handle.hpp | 22 ++- docs/source/api.rst | 2 + python/kvikio/kvikio/__init__.py | 3 +- python/kvikio/kvikio/_lib/remote_handle.pyx | 86 +++++++++- python/kvikio/kvikio/remote_file.py | 170 ++++++++++++++++++++ python/kvikio/tests/test_s3_io.py | 61 +++++++ 6 files changed, 333 insertions(+), 11 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 0d56231d03..1c6c887281 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -35,15 +35,23 @@ namespace kvikio { class CurlHandle; // Prototype /** - * @brief Type of remote file. + * @brief Types of remote file endpoints supported by KvikIO. + * + * This enum defines the different protocols and services that can be used to access remote files. + * It is used to specify or detect the type of remote endpoint when opening files. */ enum class RemoteEndpointType : uint8_t { - AUTO, ///< Let KvikIO infer the type of remote file from the URL and create a proper endpoint. - S3, ///< AWS S3 (based on HTTP/HTTPS protocols). - S3_PRESIGNED_URL, ///< AWS S3 presigned URL (based on HTTP/HTTPS protocols). - WEBHDFS, ///< Apache Hadoop WebHDFS (based on HTTP/HTTPS protocols). - HTTP, ///< Generic HTTP/HTTPS, excluding all the specific types listed above that use HTTP/HTTPS - ///< protocols. + AUTO, ///< Automatically detect the endpoint type from the URL. KvikIO will attempt to infer the + ///< appropriate protocol based on the URL format. + S3, ///< AWS S3 endpoint using credentials-based authentication. Requires AWS environment + ///< variables (such as AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION) to be + ///< set. + S3_PRESIGNED_URL, ///< AWS S3 endpoint using a presigned URL. No credentials required as + ///< authentication is embedded in the URL with time-limited access. + WEBHDFS, ///< Apache Hadoop WebHDFS (Web-based Hadoop Distributed File System) endpoint for + ///< accessing files stored in HDFS over HTTP/HTTPS. + HTTP, ///< Generic HTTP or HTTPS endpoint for accessing files from web servers. This is used for + ///< standard web resources that do not fit the other specific categories. }; /** diff --git a/docs/source/api.rst b/docs/source/api.rst index 7ae724dec5..e11f4cf55b 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -51,6 +51,8 @@ RemoteFile ---------- .. currentmodule:: kvikio.remote_file +.. autoclass:: RemoteEndpointType + .. 
autoclass:: RemoteFile :members: diff --git a/python/kvikio/kvikio/__init__.py b/python/kvikio/kvikio/__init__.py index 5c6d8b6353..124698206e 100644 --- a/python/kvikio/kvikio/__init__.py +++ b/python/kvikio/kvikio/__init__.py @@ -16,7 +16,7 @@ from kvikio._version import __git_commit__, __version__ from kvikio.cufile import CuFile, clear_page_cache, get_page_cache_info from kvikio.mmap import Mmap -from kvikio.remote_file import RemoteFile, is_remote_file_available +from kvikio.remote_file import RemoteEndpointType, RemoteFile, is_remote_file_available __all__ = [ "__git_commit__", @@ -26,5 +26,6 @@ "Mmap", "get_page_cache_info", "is_remote_file_available", + "RemoteEndpointType", "RemoteFile", ] diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx index 17222685ae..edfd59e78d 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -7,17 +7,25 @@ from typing import Optional from cython.operator cimport dereference as deref -from libc.stdint cimport uintptr_t +from libc.stdint cimport uint8_t, uintptr_t from libcpp.memory cimport make_unique, unique_ptr +from libcpp.optional cimport nullopt, optional from libcpp.pair cimport pair from libcpp.string cimport string from libcpp.utility cimport move, pair +from libcpp.vector cimport vector from kvikio._lib.arr cimport parse_buffer_argument from kvikio._lib.future cimport IOFuture, _wrap_io_future, future -cdef extern from "" nogil: +cdef extern from "" namespace "kvikio" nogil: + cpdef enum class RemoteEndpointType(uint8_t): + AUTO = 0 + S3 = 1 + S3_PRESIGNED_URL = 2 + WEBHDFS = 3 + HTTP = 4 cdef cppclass cpp_RemoteEndpoint "kvikio::RemoteEndpoint": string str() except + @@ -40,7 +48,8 @@ cdef extern from "" nogil: unique_ptr[cpp_RemoteEndpoint] endpoint, size_t nbytes ) except + cpp_RemoteHandle(unique_ptr[cpp_RemoteEndpoint] endpoint) except + - size_t nbytes() except + + RemoteEndpointType remote_endpoint_type() noexcept + size_t nbytes() noexcept const cpp_RemoteEndpoint& endpoint() except + size_t read( void* buf, @@ -53,6 +62,14 @@ cdef extern from "" nogil: size_t file_offset ) except + + @staticmethod + cpp_RemoteHandle cpp_easy_open "open"( + string url, + RemoteEndpointType remote_endpoint_type, + optional[vector[RemoteEndpointType]] allow_list, + optional[size_t] nbytes + ) except + + cdef extern from "" nogil: cdef cppclass cpp_WebHdfsEndpoint "kvikio::WebHdfsEndpoint"(cpp_RemoteEndpoint): cpp_WebHdfsEndpoint(string url) except + @@ -80,6 +97,28 @@ cdef extern from * nogil: """ cdef unique_ptr[cpp_RemoteEndpoint] cast_to_remote_endpoint[T](T handle) except + +# Helper function for the cpp_RemoteHandle.open method to return +# unique_ptr[cpp_RemoteHandle] instead of cpp_RemoteHandle. Due to lack of a nullary +# constructor, cpp_RemoteHandle cannot be created as a stack variable in Cython. 
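+# The triple-quoted block passed to `cdef extern from *` below is verbatim C++
+# that Cython emits into the generated source: the helper heap-allocates the
+# RemoteHandle returned by open() so that only a unique_ptr ever crosses the
+# language boundary.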
+cdef extern from * nogil: + """ + inline std::unique_ptr create_remote_handle_from_open( + std::string url, + kvikio::RemoteEndpointType remote_endpoint_type, + std::optional> allow_list, + std::optional nbytes) + { + return std::make_unique( + kvikio::RemoteHandle::open(url, remote_endpoint_type, allow_list, nbytes) + ); + } + """ + cdef unique_ptr[cpp_RemoteHandle] create_remote_handle_from_open( + string url, + RemoteEndpointType remote_endpoint_type, + optional[vector[RemoteEndpointType]] allow_list, + optional[size_t] nbytes + ) except + cdef class RemoteFile: cdef unique_ptr[cpp_RemoteHandle] _handle @@ -209,12 +248,53 @@ cdef class RemoteFile: nbytes ) + @staticmethod + def open( + url: str, + remote_endpoint_type: RemoteEndpointType, + allow_list: Optional[list], + nbytes: Optional[int] + ): + cdef optional[vector[RemoteEndpointType]] cpp_allow_list + cdef vector[RemoteEndpointType] vec_allow_list + if allow_list is None: + cpp_allow_list = nullopt + else: + for allow_item in allow_list: + vec_allow_list.push_back(allow_item.value) + cpp_allow_list = vec_allow_list + + cdef optional[size_t] cpp_nbytes + if nbytes is None: + cpp_nbytes = nullopt + else: + cpp_nbytes = nbytes + + cdef RemoteFile ret = RemoteFile() + cdef unique_ptr[cpp_RemoteHandle] cpp_handle + cdef string cpp_url = _to_string(url) + with nogil: + cpp_handle = create_remote_handle_from_open( + cpp_url, + remote_endpoint_type, + cpp_allow_list, + cpp_nbytes) + ret._handle = move(cpp_handle) + + return ret + def __str__(self) -> str: cdef string ep_str with nogil: ep_str = deref(self._handle).endpoint().str() return f'<{self.__class__.__name__} "{ep_str.decode()}">' + def remote_endpoint_type(self) -> RemoteEndpointType: + cdef RemoteEndpointType result + with nogil: + result = deref(self._handle).remote_endpoint_type() + return result + def nbytes(self) -> int: cdef size_t result with nogil: diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index 105c42e438..a3f73d271a 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -3,6 +3,7 @@ from __future__ import annotations +import enum import functools import urllib.parse from typing import Optional @@ -10,6 +11,50 @@ from kvikio.cufile import IOFuture +class RemoteEndpointType(enum.Enum): + """ + Types of remote file endpoints supported by KvikIO. + + This enum defines the different protocols and services that can be used + to access remote files. It is used to specify or detect the type of + remote endpoint when opening files. + + Attributes + ---------- + AUTO : int + Automatically detect the endpoint type from the URL. KvikIO will + attempt to infer the appropriate protocol based on the URL format. + S3 : int + AWS S3 endpoint using credentials-based authentication. Requires + AWS environment variables (such as AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, + AWS_DEFAULT_REGION) to be set. + S3_PRESIGNED_URL : int + AWS S3 endpoint using a presigned URL. No credentials required as + authentication is embedded in the URL with time-limited access. + WEBHDFS : int + Apache Hadoop WebHDFS (Web-based Hadoop Distributed File System) + endpoint for accessing files stored in HDFS over HTTP/HTTPS. + HTTP : int + Generic HTTP or HTTPS endpoint for accessing files from web servers. + This is used for standard web resources that do not fit the other + specific categories. + + See Also + -------- + RemoteFile.open : Factory method that uses this enum to specify endpoint types. 
+ """ + + AUTO = 0 + S3 = 1 + S3_PRESIGNED_URL = 2 + WEBHDFS = 3 + HTTP = 4 + + @staticmethod + def _map_to_internal(remote_endpoint_type: RemoteEndpointType): + return _get_remote_module().RemoteEndpointType[remote_endpoint_type.name] + + @functools.cache def is_remote_file_available() -> bool: """Check if the remote module is available""" @@ -185,6 +230,122 @@ def open_webhdfs( """ return RemoteFile(_get_remote_module().RemoteFile.open_webhdfs(url, nbytes)) + @classmethod + def open( + cls, + url: str, + remote_endpoint_type: RemoteEndpointType = RemoteEndpointType.AUTO, + allow_list: Optional[list] = None, + nbytes: Optional[int] = None, + ) -> RemoteFile: + """ + Create a remote file handle from a URL. + + This function creates a RemoteFile for reading data from various remote + endpoints including HTTP/HTTPS servers, AWS S3 buckets, S3 presigned URLs, + and WebHDFS. The endpoint type can be automatically detected from the URL + or explicitly specified. + + Parameters + ---------- + url : str + The URL of the remote file. Supported formats include: + + - S3 with credentials + - S3 presigned URL + - WebHDFS + - HTTP/HTTPS + remote_endpoint_type : RemoteEndpointType, optional + The type of remote endpoint. Default is :class:`RemoteEndpointType.AUTO` + which automatically detects the endpoint type from the URL. Can be + explicitly set to :class:`RemoteEndpointType.S3`, + :class:`RemoteEndpointType.S3_PRESIGNED_URL`, + :class:`RemoteEndpointType.WEBHDFS`, or :class:`RemoteEndpointType.HTTP` + to force a specific endpoint type. + allow_list : list of RemoteEndpointType, optional + List of allowed endpoint types. If provided: + + - If remote_endpoint_type is :class:`RemoteEndpointType.AUTO`, types are + tried in the exact order specified until a match is found. + - In explicit mode, the specified type must be in this list, otherwise an + exception is thrown. + + If not provided, defaults to all supported types in this order: + :class:`RemoteEndpointType.S3`, + :class:`RemoteEndpointType.S3_PRESIGNED_URL`, + :class:`RemoteEndpointType.WEBHDFS`, and :class:`RemoteEndpointType.HTTP`. + nbytes : int, optional + File size in bytes. If not provided, the function sends an additional + request to the server to query the file size. + + Returns + ------- + RemoteFile + A RemoteFile object that can be used to read data from the remote file. + + Raises + ------ + RuntimeError + - If the URL is malformed or missing required components. + - :class:`RemoteEndpointType.AUTO` mode is used and the URL does not match + any supported endpoint type. + - The specified endpoint type is not in the `allow_list`. + - The URL is invalid for the specified endpoint type. + - Unable to connect to the remote server or determine file size + (when nbytes not provided). + + Examples + -------- + - Auto-detect endpoint type from URL: + + .. code-block:: + + handle = RemoteFile.open( + "https://bucket.s3.amazonaws.com/object?X-Amz-Algorithm=AWS4-HMAC-SHA256" + "&X-Amz-Credential=...&X-Amz-Signature=..." + ) + + - Open S3 file with explicit endpoint type: + + .. code-block:: + + handle = RemoteFile.open( + "https://my-bucket.s3.us-east-1.amazonaws.com/data.bin", + remote_endpoint_type=RemoteEndpointType.S3 + ) + + - Restrict endpoint type candidates: + + .. code-block:: + + handle = RemoteFile.open( + user_provided_url, + remote_endpoint_type=RemoteEndpointType.AUTO, + allow_list=[ + RemoteEndpointType.HTTP, + RemoteEndpointType.S3_PRESIGNED_URL + ] + ) + + - Provide known file size to skip HEAD request: + + .. 
code-block:: + + handle = RemoteFile.open( + "https://example.com/large-file.bin", + remote_endpoint_type=RemoteEndpointType.HTTP, + nbytes=1024 * 1024 * 100 # 100 MB + ) + """ + return RemoteFile( + _get_remote_module().RemoteFile.open( + url, + RemoteEndpointType._map_to_internal(remote_endpoint_type), + allow_list, + nbytes, + ) + ) + def close(self) -> None: """Close the file""" pass @@ -198,6 +359,15 @@ def __exit__(self, exc_type, exc_val, exc_tb) -> None: def __str__(self) -> str: return str(self._handle) + def remote_endpoint_type(self) -> RemoteEndpointType: + """Get the type of the remote file. + + Returns + ------- + The type of the remote file. + """ + return RemoteEndpointType[self._handle.remote_endpoint_type().name] + def nbytes(self) -> int: """Get the file size. diff --git a/python/kvikio/tests/test_s3_io.py b/python/kvikio/tests/test_s3_io.py index 2256bc1284..510940d284 100644 --- a/python/kvikio/tests/test_s3_io.py +++ b/python/kvikio/tests/test_s3_io.py @@ -156,3 +156,64 @@ def test_read_with_file_offset(s3_base, xp, start, end): b = xp.zeros(shape=(end - start,), dtype=xp.int64) assert f.read(b, file_offset=start * a.itemsize) == b.nbytes xp.testing.assert_array_equal(a[start:end], b) + + +@pytest.mark.parametrize("scheme", ["S3"]) +@pytest.mark.parametrize( + "remote_endpoint_type", + [kvikio.RemoteEndpointType.S3.AUTO, kvikio.RemoteEndpointType.S3], +) +@pytest.mark.parametrize("allow_list", [None, [kvikio.RemoteEndpointType.S3]]) +@pytest.mark.parametrize("nbytes", [None, 1]) +def test_open_valid(s3_base, scheme, remote_endpoint_type, allow_list, nbytes): + bucket_name = "bucket_name" + object_name = "object_name" + data = b"file content" + with s3_context( + s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(data)} + ) as server_address: + if scheme == "S3": + url = f"{scheme}://{bucket_name}/{object_name}" + else: + url = f"{server_address}/{bucket_name}/{object_name}" + + if nbytes is None: + expected_file_size = len(data) + else: + expected_file_size = nbytes + + with kvikio.RemoteFile.open(url, remote_endpoint_type, allow_list, nbytes) as f: + assert f.nbytes() == expected_file_size + assert f.remote_endpoint_type() == kvikio.RemoteEndpointType.S3 + + +def test_open_invalid(s3_base): + bucket_name = "bucket_name" + object_name = "object_name" + data = b"file content" + with s3_context( + s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(data)} + ) as server_address: + # Missing scheme + url = f"://{bucket_name}/{object_name}" + with pytest.raises(RuntimeError, match="Bad scheme"): + kvikio.RemoteFile.open(url) + + # Unsupported type + url = f"unsupported://{bucket_name}/{object_name}" + with pytest.raises(RuntimeError, match="Unsupported endpoint URL"): + kvikio.RemoteFile.open(url) + + # Specified URL not in the allowlist + url = f"{server_address}/{bucket_name}/{object_name}" + with pytest.raises(RuntimeError, match="not in the allowlist"): + kvikio.RemoteFile.open( + url, kvikio.RemoteEndpointType.S3, [kvikio.RemoteEndpointType.WEBHDFS] + ) + + # Invalid URLs + url = f"s3://{bucket_name}" + with pytest.raises(RuntimeError, match="Unsupported endpoint URL"): + kvikio.RemoteFile.open(url) + with pytest.raises(RuntimeError, match="Invalid URL"): + kvikio.RemoteFile.open(url, kvikio.RemoteEndpointType.S3) From 2ec42b4a565872a781765f7690b91b41535b1897 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 5 Sep 2025 14:54:32 -0500 Subject: [PATCH 064/103] Use branch-25.10 again (#812) Contributes to 
https://github.com/rapidsai/build-planning/issues/208 Now that https://github.com/rapidsai/shared-workflows/pull/413 is merged, this converts all GitHub Actions references from `@cuda13.0` back to `branch-25.10`. ## Notes for Reviewers This is safe to admin-merge because the change is a no-op... configs on those 2 branches are identical. --- .github/workflows/build.yaml | 16 ++++++------ .github/workflows/pr.yaml | 26 +++++++++---------- .github/workflows/test.yaml | 6 ++--- .../trigger-breaking-change-alert.yaml | 2 +- 4 files changed, 25 insertions(+), 25 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index e119ba8ac1..0070274f37 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -34,7 +34,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -44,7 +44,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -54,7 +54,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -76,7 +76,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cpp: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: ${{ inputs.build_type || 'branch' }} @@ -89,7 +89,7 @@ jobs: wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -101,7 +101,7 @@ jobs: wheel-publish-cpp: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -112,7 +112,7 @@ jobs: wheel-publish-python: needs: wheel-build-python secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml 
index 4afc5ca548..f610dce204 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -27,7 +27,7 @@ jobs: - wheel-python-tests - telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.10 if: always() with: needs: ${{ toJSON(needs) }} @@ -43,7 +43,7 @@ jobs: repo: kvikio changed-files: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.10 with: files_yaml: | test_cpp: @@ -86,20 +86,20 @@ jobs: checks: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.10 with: ignored_pr_jobs: telemetry-summarize conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 with: build_type: pull-request script: ci/build_cpp.sh conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request @@ -107,7 +107,7 @@ jobs: conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -117,14 +117,14 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 with: build_type: pull-request script: ci/build_python.sh conda-python-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -132,7 +132,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -142,7 +142,7 @@ jobs: devcontainer: needs: telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.10 with: arch: '["amd64", "arm64"]' cuda: '["13.0"]' @@ -160,7 +160,7 @@ jobs: sccache --show-adv-stats | tee telemetry-artifacts/sccache-stats.txt; wheel-cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: 
pull-request @@ -170,7 +170,7 @@ jobs: wheel-python-build: needs: wheel-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: build_type: pull-request script: ci/build_wheel_python.sh @@ -179,7 +179,7 @@ jobs: wheel-python-tests: needs: [wheel-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4c4ff60385..5f618498b3 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -25,7 +25,7 @@ on: jobs: cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -43,7 +43,7 @@ jobs: sha: ${{ inputs.sha }} conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 72751d071b..48bf37afc4 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.10 with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} From f4e022e9fcb768421c6e6ab5dddb0611b746fac8 Mon Sep 17 00:00:00 2001 From: ahoyle-nvidia Date: Tue, 9 Sep 2025 06:30:04 -0700 Subject: [PATCH 065/103] Skip max_device_cache_size setter when BAR1 memory isn't present on the GPUs in the system (#814) We've seen multiple issues over the months from DGX Spark users when it comes to this specific file. This PR address these issues by applying a skip for the max_device_cache_size (cuFileDriverSetMaxCacheSize) setter by examining the output of nvidia-smi. Authors: - https://github.com/ahoyle-nvidia - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/814 --- python/kvikio/tests/test_cufile_driver.py | 34 +++++++++++++++++------ 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/python/kvikio/tests/test_cufile_driver.py b/python/kvikio/tests/test_cufile_driver.py index d85cd35d3e..ed715b859a 100644 --- a/python/kvikio/tests/test_cufile_driver.py +++ b/python/kvikio/tests/test_cufile_driver.py @@ -1,11 +1,21 @@ # Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. # See file LICENSE for terms. +import subprocess + import pytest import kvikio.cufile_driver +def has_bar_memory() -> bool: + try: + output = subprocess.check_output(["nvidia-smi"], text=True) + return "Not Supported" not in output + except Exception: + return False + + def test_version(): major, minor = kvikio.cufile_driver.libcufile_version() assert major >= 0 @@ -56,16 +66,22 @@ def test_property_accessor(): max_device_cache_size_default = kvikio.cufile_driver.get( "max_device_cache_size" ) - with kvikio.cufile_driver.set( - {"poll_mode": True, "max_device_cache_size": 2048} - ): - assert kvikio.cufile_driver.get("poll_mode") and ( - kvikio.cufile_driver.get("max_device_cache_size") == 2048 + if has_bar_memory(): + with kvikio.cufile_driver.set( + {"poll_mode": True, "max_device_cache_size": 2048} + ): + assert kvikio.cufile_driver.get("poll_mode") and ( + kvikio.cufile_driver.get("max_device_cache_size") == 2048 + ) + assert (kvikio.cufile_driver.get("poll_mode") == poll_mode_default) and ( + kvikio.cufile_driver.get("max_device_cache_size") + == max_device_cache_size_default ) - assert (kvikio.cufile_driver.get("poll_mode") == poll_mode_default) and ( - kvikio.cufile_driver.get("max_device_cache_size") - == max_device_cache_size_default - ) + else: + with kvikio.cufile_driver.set("poll_mode", True): + assert kvikio.cufile_driver.get("poll_mode") + assert kvikio.cufile_driver.get("poll_mode") == poll_mode_default + except RuntimeError as e: if "KvikIO not compiled with cuFile.h" in str(e): pytest.skip("KvikIO not compiled with cuFile.h, skipping cuFile tests") From c87a02201829053019f033a8afb7aa99f7518987 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 10 Sep 2025 18:11:52 -0500 Subject: [PATCH 066/103] Added KVIKIO_REMOTE_VERBOSE option (#815) Our HTTP library, libcurl, includes a [`CURLOPT_VERBOSE`](https://curl.se/libcurl/c/CURLOPT_VERBOSE.html) setting that can be useful for debugging. To help our users debug things, I've added a new `KVIKIO_REMOTE_VERBOSE` option that configures this. By default, it's off (no change). If the user sets `KVIKIO_REMOTE_VERBOSE=1` then information from the HTTP requests and responses will be printed to stderr. 
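As a rough sketch of what this looks like from Python (the URL below is hypothetical; `RemoteFile.open` is the unified entry point added in #808):

```python
import os

import kvikio

# KVIKIO_REMOTE_VERBOSE is read when each transfer sets up its curl handle,
# so setting it before the first request is sufficient.
os.environ["KVIKIO_REMOTE_VERBOSE"] = "1"

# Opening without an explicit `nbytes` already issues a file-size request,
# which libcurl now traces to stderr.
with kvikio.RemoteFile.open("https://example.com/data.bin") as f:
    print(f.nbytes())
```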
Authors: - Tom Augspurger (https://github.com/TomAugspurger) Approvers: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/815 --- cpp/doxygen/main_page.md | 7 +++++++ cpp/src/shim/libcurl.cpp | 4 ++++ docs/source/runtime_settings.rst | 11 +++++++++++ python/kvikio/kvikio/defaults.py | 3 ++- 4 files changed, 24 insertions(+), 1 deletion(-) diff --git a/cpp/doxygen/main_page.md b/cpp/doxygen/main_page.md index cd9fe05557..a74bad75d6 100644 --- a/cpp/doxygen/main_page.md +++ b/cpp/doxygen/main_page.md @@ -137,6 +137,13 @@ Note that if you're reading a large file that has been split into multiple reads These settings can also be controlled by `defaults::http_max_attempts()`, `defaults::http_max_attempts_reset()`, `defaults::http_status_codes()`, and `defaults::http_status_codes_reset()`. +#### Remote Verbose (KVIKIO_REMOTE_VERBOSE) +For debugging HTTP requests, you can enable verbose output that shows detailed information about HTTP communication including headers, request/response bodies, connection details, and SSL handshake information. + +Set the environment variable `KVIKIO_REMOTE_VERBOSE` to `true`, `on`, `yes`, or `1` (case-insensitive) to enable verbose output. Otherwise, verbose output is disabled by default. + +**Warning** this may show sensitive contents from headers and data. + ## Example ```cpp diff --git a/cpp/src/shim/libcurl.cpp b/cpp/src/shim/libcurl.cpp index 613dad32f8..feddf33254 100644 --- a/cpp/src/shim/libcurl.cpp +++ b/cpp/src/shim/libcurl.cpp @@ -112,6 +112,10 @@ CurlHandle::CurlHandle(LibCurl::UniqueHandlePtr handle, // Make requests time out after `value` seconds. setopt(CURLOPT_TIMEOUT, kvikio::defaults::http_timeout()); + + // Optionally enable verbose output if it's configured. + auto const verbose = getenv_or("KVIKIO_REMOTE_VERBOSE", false); + if (verbose) { setopt(CURLOPT_VERBOSE, 1L); } } CurlHandle::~CurlHandle() noexcept { LibCurl::instance().retain_handle(std::move(_handle)); } diff --git a/docs/source/runtime_settings.rst b/docs/source/runtime_settings.rst index bb347ba23c..0e304c9ed5 100644 --- a/docs/source/runtime_settings.rst +++ b/docs/source/runtime_settings.rst @@ -53,3 +53,14 @@ KvikIO will retry a request should any of the HTTP status code in ``KVIKIO_HTTP_ The maximum number of attempts to make before throwing an exception is controlled by ``KVIKIO_HTTP_MAX_ATTEMPTS``. The default value is 3. This setting can also be accessed using :py:func:`kvikio.defaults.http_max_attempts` (getter) and :py:func:`kvikio.defaults.set` (setter). The maximum duration of each HTTP request is controlled by ``KVIKIO_HTTP_TIMEOUT``. The default value is 60, which is the duration in seconds to allow. This setting can also be accessed using :py:func:`kvikio.defaults.http_timoeout` (getter) and :py:func:`kvikio.defaults.set` (setter). + +HTTP Verbose ``KVIKIO_REMOTE_VERBOSE`` +-------------------------------------- + +For debugging HTTP requests, you can enable verbose output that shows detailed information about HTTP communication including headers, request/response bodies, connection details, and SSL handshake information. + +Set the environment variable ``KVIKIO_REMOTE_VERBOSE`` to ``true``, ``on``, ``yes``, or ``1`` (case-insensitive) to enable verbose output. Otherwise, verbose output is disabled by default. + +.. warning:: + + This may show sensitive contents from headers and data. 
diff --git a/python/kvikio/kvikio/defaults.py b/python/kvikio/kvikio/defaults.py index be57d2739c..ee0ebf5f95 100644 --- a/python/kvikio/kvikio/defaults.py +++ b/python/kvikio/kvikio/defaults.py @@ -124,7 +124,7 @@ def set(*config) -> ConfigContextManager: - ``"bounce_buffer_size"`` - ``"http_max_attempts"`` - ``"http_status_codes"`` - - ``*http_timeout*`` + - ``"http_timeout"`` Returns ------- @@ -167,6 +167,7 @@ def get(config_name: str) -> Any: - ``"bounce_buffer_size"`` - ``"http_max_attempts"`` - ``"http_status_codes"`` + - ``"http_timeout"`` Returns ------- From d637c6ecb0d32259e9a5e2d8137f875c7de569e2 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 12 Sep 2025 09:15:32 -0400 Subject: [PATCH 067/103] Use C++20 for KvikIO main library (#819) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous PR https://github.com/rapidsai/kvikio/pull/749 forgets to bring the entrée to the table: Only the C++ code in tests and benchmarks use C++20, but not the main library. This PR fixes this oversight. Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/819 --- cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6f9c249cfa..f94409c57e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -205,7 +205,7 @@ set_target_properties( kvikio PROPERTIES BUILD_RPATH "\$ORIGIN" INSTALL_RPATH "\$ORIGIN" - CXX_STANDARD 17 + CXX_STANDARD 20 CXX_STANDARD_REQUIRED ON POSITION_INDEPENDENT_CODE ON INTERFACE_POSITION_INDEPENDENT_CODE ON From dde14359d9461707da865d1c65bebc268954a03e Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 15 Sep 2025 09:02:55 -0400 Subject: [PATCH 068/103] Explicitly manage TLS/SSL CA paths for remote I/O (#817) ## Background `libcurl` have two path parameters related to the certificate authority (CA): - `CURLOPT_CAINFO`, which specifies the CA bundle file path. - `CURLOPT_CAPATH`, which specifies the directory of individual CA certificates with hash-based naming. The default paths are determined at compile-time, which can cause issues if the Linux distributions where `libcurl` is built and run are different (e.g. on Rocky Linux vs Ubuntu as in our CI vs our lab system), and the certificates files are likely at different locations. This problem has been observed in KvikIO's wheel distribution, where HTTPS would fail with the message: >error setting certificate verify locations: CAfile: /etc/pki/tls/certs/ca-bundle.crt CApath: /etc/ssl/certs ## This PR This PR addresses this problem. The certificate path is now explicitly searched for in the following order. The compile-time parameters, if any, are still used but treated with lowest priority. - CA bundle file: Check env vars `CURL_CA_BUNDLE`, and `SSL_CERT_FILE` - CA directory: Check env vars `SSL_CERT_DIR` - CA bundle file: Search a set of distribution-specific locations for accessible bundle - CA directory: Search a set of distribution-specific locations for accessible directory - CA bundle file: Check if the compile-time path is given and accessible - CA directory: Check if the compile-time parameter is given and accessible Depends on https://github.com/rapidsai/kvikio/pull/819 for the use of `static` structured binding which is only available in C++ >=20 Closes #711 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/817 --- cpp/CMakeLists.txt | 2 +- cpp/include/kvikio/detail/tls.hpp | 58 ++++++++++++ cpp/src/detail/tls.cpp | 150 ++++++++++++++++++++++++++++++ cpp/src/shim/libcurl.cpp | 3 + cpp/tests/CMakeLists.txt | 1 + cpp/tests/test_tls.cpp | 48 ++++++++++ docs/source/runtime_settings.rst | 10 ++ 7 files changed, 271 insertions(+), 1 deletion(-) create mode 100644 cpp/include/kvikio/detail/tls.hpp create mode 100644 cpp/src/detail/tls.cpp create mode 100644 cpp/tests/test_tls.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f94409c57e..9c66e51875 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -163,7 +163,7 @@ set(SOURCES if(KvikIO_REMOTE_SUPPORT) list(APPEND SOURCES "src/hdfs.cpp" "src/remote_handle.cpp" "src/detail/remote_handle.cpp" - "src/detail/url.cpp" "src/shim/libcurl.cpp" + "src/detail/tls.cpp" "src/detail/url.cpp" "src/shim/libcurl.cpp" ) endif() diff --git a/cpp/include/kvikio/detail/tls.hpp b/cpp/include/kvikio/detail/tls.hpp new file mode 100644 index 0000000000..be40eaf5e5 --- /dev/null +++ b/cpp/include/kvikio/detail/tls.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include + +namespace kvikio::detail { +/** + * @brief Search for the CA bundle file and directory paths + * + * This function searches for the Certificate Authority (CA) paths required for TLS/SSL verification + * in libcurl. The search is performed in the following priority order, returning as soon as either + * a bundle file or a directory is found: + * - CA bundle file: Check env vars CURL_CA_BUNDLE, SSL_CERT_FILE + * - CA directory: Check env var SSL_CERT_DIR + * - CA bundle file: Search distribution-specific locations for accessible bundle + * - CA directory: Search distribution-specific locations for accessible directory + * - CA bundle file: Check if curl's compile-time default bundle path is accessible + * - CA directory: Check if curl's compile-time default directory path is accessible + * + * @return Result containing CA bundle file and CA certificate directory + * + * @exception std::runtime_error if neither CA bundle nor directory is found + * + * @note Environment Variables: + * - CURL_CA_BUNDLE: Override CA bundle file location (curl-specific) + * - SSL_CERT_FILE: Override CA bundle file location (OpenSSL-compatible) + * - SSL_CERT_DIR: Override CA directory location (OpenSSL-compatible) + */ +std::pair, std::optional> get_ca_paths(); + +/** + * @brief Configure curl handle with discovered CA certificate paths + * + * As a performance optimization, the discovered CA certificate paths are cached to avoid repeated + * searching. 
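+ *
+ * Because the discovered paths are kept in a function-local static, the search
+ * runs once per process; later changes to the environment variables have no
+ * effect on subsequent requests.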
+ * + * @param curl Curl handle to configure with CA certificate paths + */ +void set_up_ca_paths(CurlHandle& curl); +} // namespace kvikio::detail diff --git a/cpp/src/detail/tls.cpp b/cpp/src/detail/tls.cpp new file mode 100644 index 0000000000..0dccb6b612 --- /dev/null +++ b/cpp/src/detail/tls.cpp @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace kvikio::detail { + +namespace { +/** + * @brief Search for a CA certificate path using environment variables + * + * @param env_vars Environment variable names to check in order + * @return Path string if found in any environment variable, std::nullopt otherwise + */ +std::optional find_ca_path_from_env_var(std::vector const& env_vars) +{ + for (auto const& env_var : env_vars) { + auto const* path = std::getenv(env_var.data()); + if (path != nullptr) { return path; } + } + + return std::nullopt; +} + +/** + * @brief Search for a CA certificate path in standard system locations + * + * @param system_paths file system paths to check in order + * @return First accessible path if found, std::nullopt otherwise + */ +std::optional find_ca_path_in_system_locations( + std::vector const& system_paths) +{ + for (auto const& path : system_paths) { + // Check whether the file/directory exists, and whether it grants read permission to the calling + // process's real UID and GID. If the path is a symbolic link, it is dereferenced. 
+ auto const result = access(path.data(), R_OK); + + if (result != -1) { return path; } + } + + return std::nullopt; +} + +/** + * @brief Get CA certificate path from curl's compile-time defaults + * + * @param default_path Path provided by curl_version_info (may be nullptr) + * @return Path string if accessible, std::nullopt otherwise + */ +std::optional get_ca_path_from_curl_defaults(char const* default_path) +{ + if (default_path != nullptr && access(default_path, R_OK) != -1) { return default_path; } + + return std::nullopt; +} +} // namespace + +std::pair, std::optional> get_ca_paths() +{ + auto* version_info = curl_version_info(::CURLVERSION_NOW); + KVIKIO_EXPECT(version_info != nullptr, "Failed to get curl version info", std::runtime_error); + + std::optional ca_bundle_file; + std::optional ca_directory; + + // Priority 1: CA bundle file from environment variables + ca_bundle_file = find_ca_path_from_env_var({ + "CURL_CA_BUNDLE", // curl program + "SSL_CERT_FILE" // OpenSSL + }); + if (ca_bundle_file.has_value()) { return {ca_bundle_file, ca_directory}; } + + // Priority 2: CA directory from environment variables + ca_directory = find_ca_path_from_env_var({ + "SSL_CERT_DIR" // OpenSSL + }); + if (ca_directory.has_value()) { return {ca_bundle_file, ca_directory}; } + + // Priority 3: CA bundle file from system locations + ca_bundle_file = find_ca_path_in_system_locations( + {"/etc/ssl/certs/ca-certificates.crt", // Debian/Ubuntu, Arch, Alpine, Gentoo + "/etc/pki/tls/certs/ca-bundle.crt", // RHEL/CentOS/Rocky/AlmaLinux, Fedora + "/etc/ssl/ca-bundle.pem", // OpenSUSE/SLES + "/etc/pki/tls/cert.pem", // RHEL-based (symlink to ca-bundle.crt) + "/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem", // Fedora 28+, RHEL 8+ + + // Additional locations mentioned by libcurl: + // https://github.com/curl/curl/blob/master/CMakeLists.txt + "/usr/share/ssl/certs/ca-bundle.crt", + "/usr/local/share/certs/ca-root-nss.crt", + "/etc/ssl/cert.pem"}); + if (ca_bundle_file.has_value()) { return {ca_bundle_file, ca_directory}; } + + // Priority 4: CA directory from system locations + ca_directory = find_ca_path_in_system_locations({ + "/etc/ssl/certs", // Debian/Ubuntu, Arch, Alpine, OpenSUSE, Gentoo + "/etc/pki/tls/certs" // RHEL/CentOS/Rocky/AlmaLinux, Fedora + }); + if (ca_directory.has_value()) { return {ca_bundle_file, ca_directory}; } + + // Priority 5: CA bundle file from curl compile-time defaults + ca_bundle_file = get_ca_path_from_curl_defaults(version_info->cainfo); + if (ca_bundle_file.has_value()) { return {ca_bundle_file, ca_directory}; } + + // Priority 6: CA directory from curl compile-time defaults + ca_directory = get_ca_path_from_curl_defaults(version_info->capath); + if (ca_directory.has_value()) { return {ca_bundle_file, ca_directory}; } + + // At least one path must exist + KVIKIO_EXPECT(ca_bundle_file.has_value() || ca_directory.has_value(), + "Failed to find accessible CA certificates.", + std::runtime_error); + return {ca_bundle_file, ca_directory}; +} + +void set_up_ca_paths(CurlHandle& curl) +{ + static auto const [ca_bundle_file, ca_directory] = get_ca_paths(); + + if (ca_bundle_file.has_value()) { + curl.setopt(CURLOPT_CAINFO, ca_bundle_file->c_str()); + curl.setopt(CURLOPT_CAPATH, nullptr); + } else if (ca_directory.has_value()) { + curl.setopt(CURLOPT_CAINFO, nullptr); + curl.setopt(CURLOPT_CAPATH, ca_directory->c_str()); + } +} +} // namespace kvikio::detail diff --git a/cpp/src/shim/libcurl.cpp b/cpp/src/shim/libcurl.cpp index feddf33254..a78fb33d30 100644 --- 
a/cpp/src/shim/libcurl.cpp +++ b/cpp/src/shim/libcurl.cpp @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -116,6 +117,8 @@ CurlHandle::CurlHandle(LibCurl::UniqueHandlePtr handle, // Optionally enable verbose output if it's configured. auto const verbose = getenv_or("KVIKIO_REMOTE_VERBOSE", false); if (verbose) { setopt(CURLOPT_VERBOSE, 1L); } + + detail::set_up_ca_paths(*this); } CurlHandle::~CurlHandle() noexcept { LibCurl::instance().retain_handle(std::move(_handle)); } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a6fd2c67e4..afa7e8d97b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -79,6 +79,7 @@ kvikio_add_test(NAME MMAP_TEST SOURCES test_mmap.cpp) if(KvikIO_REMOTE_SUPPORT) kvikio_add_test(NAME REMOTE_HANDLE_TEST SOURCES test_remote_handle.cpp utils/env.cpp) kvikio_add_test(NAME HDFS_TEST SOURCES test_hdfs.cpp utils/hdfs_helper.cpp) + kvikio_add_test(NAME TLS_TEST SOURCES test_tls.cpp utils/env.cpp) kvikio_add_test(NAME URL_TEST SOURCES test_url.cpp) endif() diff --git a/cpp/tests/test_tls.cpp b/cpp/tests/test_tls.cpp new file mode 100644 index 0000000000..4b04f10296 --- /dev/null +++ b/cpp/tests/test_tls.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include "utils/env.hpp" + +TEST(TlsTest, get_ca_paths) +{ + std::string const expected_ca_bundle_path{"ca_bundle_path"}; + std::string const expected_ca_directory{"ca_directory"}; + { + // Env var CURL_CA_BUNDLE has the highest priority. Both SSL_CERT_FILE and SSL_CERT_DIR shall be + // skipped + kvikio::test::EnvVarContext env_var_ctx{{"CURL_CA_BUNDLE", expected_ca_bundle_path}, + {"SSL_CERT_FILE", "another_ca_bundle_path"}, + {"SSL_CERT_DIR", expected_ca_directory}}; + auto const& [ca_bundle_file, ca_directory] = kvikio::detail::get_ca_paths(); + + EXPECT_EQ(ca_bundle_file, expected_ca_bundle_path); + EXPECT_EQ(ca_directory, std::nullopt); + } + + { + // Env var CURL_CA_BUNDLE and SSL_CERT_FILE are not specified, SSL_CERT_DIR shall be used + kvikio::test::EnvVarContext env_var_ctx{{"SSL_CERT_DIR", expected_ca_directory}}; + auto const& [ca_bundle_file, ca_directory] = kvikio::detail::get_ca_paths(); + + EXPECT_EQ(ca_bundle_file, std::nullopt); + EXPECT_EQ(ca_directory, expected_ca_directory); + } +} diff --git a/docs/source/runtime_settings.rst b/docs/source/runtime_settings.rst index 0e304c9ed5..e707031720 100644 --- a/docs/source/runtime_settings.rst +++ b/docs/source/runtime_settings.rst @@ -64,3 +64,13 @@ Set the environment variable ``KVIKIO_REMOTE_VERBOSE`` to ``true``, ``on``, ``ye .. warning:: This may show sensitive contents from headers and data. 
+ +CA bundle file and CA directory ``CURL_CA_BUNDLE``, ``SSL_CERT_FILE``, ``SSL_CERT_DIR`` +--------------------------------------------------------------------------------------- + +The Certificate Authority (CA) paths required for TLS/SSL verification in ``libcurl`` can be explicitly specified using the following environment variables in order of overriding priority: + + * ``CURL_CA_BUNDLE`` (also used in the ``curl`` program) or ``SSL_CERT_FILE`` (also used in OpenSSL): Specifies the CA certificate bundle file location. + * ``SSL_CERT_DIR`` (also used in OpenSSL): Specifies the CA certificate directory. + +When neither is specified, KvikIO searches several standard system locations for the CA file and directory, and if the search fails falls back to the libcurl compile-time defaults. From b8626abbe9cfd341e7542b23e1d19bd13a685873 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 15 Sep 2025 08:27:11 -0700 Subject: [PATCH 069/103] Reduce duplication between compat manager and defaults (#816) Some of these APIs were identical but presumably duplicated due to otherwise creating a circular include dependency. Moving the manager out of the compat_mode header resolves that and allows us to remove the duplication. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/816 --- cpp/CMakeLists.txt | 1 + cpp/include/kvikio/compat_mode.hpp | 102 +------------------ cpp/include/kvikio/compat_mode_manager.hpp | 106 ++++++++++++++++++++ cpp/include/kvikio/file_handle.hpp | 4 +- cpp/src/compat_mode.cpp | 99 ------------------- cpp/src/compat_mode_manager.cpp | 110 +++++++++++++++++++++ 6 files changed, 219 insertions(+), 203 deletions(-) create mode 100644 cpp/include/kvikio/compat_mode_manager.hpp create mode 100644 cpp/src/compat_mode_manager.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9c66e51875..5db5fa6f50 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -144,6 +144,7 @@ set(SOURCES "src/bounce_buffer.cpp" "src/buffer.cpp" "src/compat_mode.cpp" + "src/compat_mode_manager.cpp" "src/http_status_codes.cpp" "src/cufile/config.cpp" "src/cufile/driver.cpp" diff --git a/cpp/include/kvikio/compat_mode.hpp b/cpp/include/kvikio/compat_mode.hpp index 03ed443fe0..d25f0ffaef 100644 --- a/cpp/include/kvikio/compat_mode.hpp +++ b/cpp/include/kvikio/compat_mode.hpp @@ -17,9 +17,7 @@ #pragma once #include -#include - -#include +#include namespace kvikio { /** @@ -49,102 +47,4 @@ CompatMode parse_compat_mode_str(std::string_view compat_mode_str); } // namespace detail -// Forward declaration. -class FileHandle; - -/** - * @brief Store and manage the compatibility mode data associated with a FileHandle. - */ -class CompatModeManager { - private: - CompatMode _compat_mode_requested{CompatMode::AUTO}; - bool _is_compat_mode_preferred{true}; - bool _is_compat_mode_preferred_for_async{true}; - - public: - /** - * @brief Construct an empty compatibility mode manager. - */ - CompatModeManager() noexcept = default; - - /** - * @brief Construct a compatibility mode manager associated with a FileHandle. - * - * According to the file path, requested compatibility mode, and the system configuration, the - * compatibility manager: - * - Infers the final compatibility modes for synchronous and asynchronous I/O paths, - * respectively. - * - Initializes the file wrappers and cuFile handle associated with a FileHandle. 
- * - * @param file_path Refer to - * FileHandle::FileHandle(std::string const&, std::string const&, mode_t, CompatMode). - * @param flags Same as above. - * @param mode Same as above. - * @param compat_mode_requested Same as above. - * @param file_handle Pointer to the FileHandle object that owns this compatibility mode manager. - */ - CompatModeManager(std::string const& file_path, - std::string const& flags, - mode_t mode, - CompatMode compat_mode_requested, - FileHandle* file_handle); - - ~CompatModeManager() noexcept = default; - CompatModeManager(const CompatModeManager&) = default; - CompatModeManager& operator=(const CompatModeManager&) = default; - CompatModeManager(CompatModeManager&&) noexcept = default; - CompatModeManager& operator=(CompatModeManager&&) noexcept = default; - - /** - * @brief Functionally identical to defaults::infer_compat_mode_if_auto(CompatMode). - * - * @param compat_mode Compatibility mode. - * @return If the given compatibility mode is CompatMode::AUTO, infer the final compatibility - * mode. - */ - CompatMode infer_compat_mode_if_auto(CompatMode compat_mode) noexcept; - - /** - * @brief Functionally identical to defaults::is_compat_mode_preferred(CompatMode). - * - * @param compat_mode Compatibility mode. - * @return Boolean answer. - */ - bool is_compat_mode_preferred(CompatMode compat_mode) noexcept; - - /** - * @brief Check if the compatibility mode for synchronous I/O of the associated FileHandle is - * expected to be CompatMode::ON. - * - * @return Boolean answer. - */ - bool is_compat_mode_preferred() const noexcept; - - /** - * @brief Check if the compatibility mode for asynchronous I/O of the associated FileHandle is - * expected to be CompatMode::ON. - * - * @return Boolean answer. - */ - bool is_compat_mode_preferred_for_async() const noexcept; - - /** - * @brief Retrieve the original compatibility mode requested. - * - * @return The original compatibility mode requested. - */ - CompatMode compat_mode_requested() const noexcept; - - /** - * @brief Determine if asynchronous I/O can be performed or not (throw exceptions) - * according to the existing compatibility mode data in the manager. - * - * Asynchronous I/O cannot be performed, for instance, when compat_mode_requested() is - * CompatMode::OFF, is_compat_mode_preferred() is CompatMode::OFF, but - * is_compat_mode_preferred_for_async() is CompatMode::ON (due to missing cuFile stream API or - * cuFile configuration file). - */ - void validate_compat_mode_for_async() const; -}; - } // namespace kvikio diff --git a/cpp/include/kvikio/compat_mode_manager.hpp b/cpp/include/kvikio/compat_mode_manager.hpp new file mode 100644 index 0000000000..4d15f2ba9f --- /dev/null +++ b/cpp/include/kvikio/compat_mode_manager.hpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace kvikio { + +// Forward declaration. 
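+// FileHandle owns a CompatModeManager and its header includes this one, so only
+// a forward declaration is possible here without creating a circular include.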
+class FileHandle; + +/** + * @brief Store and manage the compatibility mode data associated with a FileHandle. + */ +class CompatModeManager { + private: + CompatMode _compat_mode_requested{CompatMode::AUTO}; + bool _is_compat_mode_preferred{true}; + bool _is_compat_mode_preferred_for_async{true}; + + public: + /** + * @brief Construct an empty compatibility mode manager. + */ + CompatModeManager() noexcept = default; + + /** + * @brief Construct a compatibility mode manager associated with a FileHandle. + * + * According to the file path, requested compatibility mode, and the system configuration, the + * compatibility manager: + * - Infers the final compatibility modes for synchronous and asynchronous I/O paths, + * respectively. + * - Initializes the file wrappers and cuFile handle associated with a FileHandle. + * + * @param file_path Refer to + * FileHandle::FileHandle(std::string const&, std::string const&, mode_t, CompatMode). + * @param flags Same as above. + * @param mode Same as above. + * @param compat_mode_requested Same as above. + * @param file_handle Pointer to the FileHandle object that owns this compatibility mode manager. + */ + CompatModeManager(std::string const& file_path, + std::string const& flags, + mode_t mode, + CompatMode compat_mode_requested, + FileHandle* file_handle); + + ~CompatModeManager() noexcept = default; + CompatModeManager(const CompatModeManager&) = default; + CompatModeManager& operator=(const CompatModeManager&) = default; + CompatModeManager(CompatModeManager&&) noexcept = default; + CompatModeManager& operator=(CompatModeManager&&) noexcept = default; + + /** + * @brief Check if the compatibility mode for synchronous I/O of the associated FileHandle is + * expected to be CompatMode::ON. + * + * @return Boolean answer. + */ + bool is_compat_mode_preferred() const noexcept; + + /** + * @brief Check if the compatibility mode for asynchronous I/O of the associated FileHandle is + * expected to be CompatMode::ON. + * + * @return Boolean answer. + */ + bool is_compat_mode_preferred_for_async() const noexcept; + + /** + * @brief Retrieve the original compatibility mode requested. + * + * @return The original compatibility mode requested. + */ + CompatMode compat_mode_requested() const noexcept; + + /** + * @brief Determine if asynchronous I/O can be performed or not (throw exceptions) + * according to the existing compatibility mode data in the manager. + * + * Asynchronous I/O cannot be performed, for instance, when compat_mode_requested() is + * CompatMode::OFF, is_compat_mode_preferred() is CompatMode::OFF, but + * is_compat_mode_preferred_for_async() is CompatMode::ON (due to missing cuFile stream API or + * cuFile configuration file). 
+ */ + void validate_compat_mode_for_async() const; +}; + +} // namespace kvikio diff --git a/cpp/include/kvikio/file_handle.hpp b/cpp/include/kvikio/file_handle.hpp index 50e1c679c3..78e493c9b0 100644 --- a/cpp/include/kvikio/file_handle.hpp +++ b/cpp/include/kvikio/file_handle.hpp @@ -20,12 +20,10 @@ #include #include -#include -#include -#include #include #include +#include #include #include #include diff --git a/cpp/src/compat_mode.cpp b/cpp/src/compat_mode.cpp index 78a96c66be..d8346253bf 100644 --- a/cpp/src/compat_mode.cpp +++ b/cpp/src/compat_mode.cpp @@ -15,15 +15,11 @@ */ #include -#include #include #include -#include #include -#include #include -#include namespace kvikio { @@ -50,99 +46,4 @@ CompatMode parse_compat_mode_str(std::string_view compat_mode_str) } // namespace detail -CompatMode CompatModeManager::infer_compat_mode_if_auto(CompatMode compat_mode) noexcept -{ - KVIKIO_NVTX_FUNC_RANGE(); - if (compat_mode == CompatMode::AUTO) { - return is_cufile_available() ? CompatMode::OFF : CompatMode::ON; - } - return compat_mode; -} - -bool CompatModeManager::is_compat_mode_preferred(CompatMode compat_mode) noexcept -{ - return compat_mode == CompatMode::ON || - (compat_mode == CompatMode::AUTO && - infer_compat_mode_if_auto(compat_mode) == CompatMode::ON); -} - -bool CompatModeManager::is_compat_mode_preferred() const noexcept -{ - return _is_compat_mode_preferred; -} - -bool CompatModeManager::is_compat_mode_preferred_for_async() const noexcept -{ - return _is_compat_mode_preferred_for_async; -} - -CompatMode CompatModeManager::compat_mode_requested() const noexcept -{ - return _compat_mode_requested; -} - -CompatModeManager::CompatModeManager(std::string const& file_path, - std::string const& flags, - mode_t mode, - CompatMode compat_mode_requested_v, - FileHandle* file_handle) -{ - KVIKIO_NVTX_FUNC_RANGE(); - KVIKIO_EXPECT(file_handle != nullptr, - "The compatibility mode manager does not have a proper owning file handle.", - std::invalid_argument); - - file_handle->_file_direct_off.open(file_path, flags, false, mode); - _is_compat_mode_preferred = is_compat_mode_preferred(compat_mode_requested_v); - - // Nothing to do in compatibility mode - if (_is_compat_mode_preferred) { return; } - - try { - file_handle->_file_direct_on.open(file_path, flags, true, mode); - } catch (...) { - // Try to open the file with the O_DIRECT flag. Fall back to compatibility mode, if it fails. - if (compat_mode_requested_v == CompatMode::AUTO) { - _is_compat_mode_preferred = true; - } else { // CompatMode::OFF - throw; - } - } - - if (_is_compat_mode_preferred) { return; } - - auto error_code = file_handle->_cufile_handle.register_handle(file_handle->_file_direct_on.fd()); - assert(error_code.has_value()); - - // For the AUTO mode, if the first cuFile API call fails, fall back to the compatibility - // mode. 
- if (compat_mode_requested_v == CompatMode::AUTO && error_code.value().err != CU_FILE_SUCCESS) { - _is_compat_mode_preferred = true; - } else { - CUFILE_TRY(error_code.value()); - } - - // Check cuFile async API - static bool const is_extra_symbol_available = is_stream_api_available(); - static bool const is_config_path_empty = config_path().empty(); - _is_compat_mode_preferred_for_async = - _is_compat_mode_preferred || !is_extra_symbol_available || is_config_path_empty; -} - -void CompatModeManager::validate_compat_mode_for_async() const -{ - KVIKIO_NVTX_FUNC_RANGE(); - if (!_is_compat_mode_preferred && _is_compat_mode_preferred_for_async && - _compat_mode_requested == CompatMode::OFF) { - std::string err_msg; - if (!is_stream_api_available()) { err_msg += "Missing the cuFile stream api."; } - - // When checking for availability, we also check if cuFile's config file exists. This is - // because even when the stream API is available, it doesn't work if no config file exists. - if (config_path().empty()) { err_msg += " Missing cuFile configuration file."; } - - KVIKIO_FAIL(err_msg, std::runtime_error); - } -} - } // namespace kvikio diff --git a/cpp/src/compat_mode_manager.cpp b/cpp/src/compat_mode_manager.cpp new file mode 100644 index 0000000000..569e9e73e7 --- /dev/null +++ b/cpp/src/compat_mode_manager.cpp @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace kvikio { + +bool CompatModeManager::is_compat_mode_preferred() const noexcept +{ + return _is_compat_mode_preferred; +} + +bool CompatModeManager::is_compat_mode_preferred_for_async() const noexcept +{ + return _is_compat_mode_preferred_for_async; +} + +CompatMode CompatModeManager::compat_mode_requested() const noexcept +{ + return _compat_mode_requested; +} + +CompatModeManager::CompatModeManager(std::string const& file_path, + std::string const& flags, + mode_t mode, + CompatMode compat_mode_requested_v, + FileHandle* file_handle) +{ + KVIKIO_NVTX_FUNC_RANGE(); + KVIKIO_EXPECT(file_handle != nullptr, + "The compatibility mode manager does not have a proper owning file handle.", + std::invalid_argument); + + _compat_mode_requested = compat_mode_requested_v; + file_handle->_file_direct_off.open(file_path, flags, false, mode); + _is_compat_mode_preferred = defaults::is_compat_mode_preferred(compat_mode_requested_v); + + // Nothing to do in compatibility mode + if (_is_compat_mode_preferred) { return; } + + try { + file_handle->_file_direct_on.open(file_path, flags, true, mode); + } catch (...) { + // Try to open the file with the O_DIRECT flag. Fall back to compatibility mode, if it fails. 
+    if (compat_mode_requested_v == CompatMode::AUTO) {
+      _is_compat_mode_preferred = true;
+    } else {  // CompatMode::OFF
+      throw;
+    }
+  }
+
+  if (_is_compat_mode_preferred) { return; }
+
+  auto error_code = file_handle->_cufile_handle.register_handle(file_handle->_file_direct_on.fd());
+  assert(error_code.has_value());
+
+  // For the AUTO mode, if the first cuFile API call fails, fall back to the compatibility
+  // mode.
+  if (compat_mode_requested_v == CompatMode::AUTO && error_code.value().err != CU_FILE_SUCCESS) {
+    _is_compat_mode_preferred = true;
+  } else {
+    CUFILE_TRY(error_code.value());
+  }
+
+  // Check cuFile async API
+  static bool const is_extra_symbol_available = is_stream_api_available();
+  static bool const is_config_path_empty = config_path().empty();
+  _is_compat_mode_preferred_for_async =
+    _is_compat_mode_preferred || !is_extra_symbol_available || is_config_path_empty;
+}
+
+void CompatModeManager::validate_compat_mode_for_async() const
+{
+  KVIKIO_NVTX_FUNC_RANGE();
+  if (!_is_compat_mode_preferred && _is_compat_mode_preferred_for_async &&
+      _compat_mode_requested == CompatMode::OFF) {
+    std::string err_msg;
+    if (!is_stream_api_available()) { err_msg += "Missing the cuFile stream api."; }
+
+    // When checking for availability, we also check if cuFile's config file exists. This is
+    // because even when the stream API is available, it doesn't work if no config file exists.
+    if (config_path().empty()) { err_msg += " Missing cuFile configuration file."; }
+
+    KVIKIO_FAIL(err_msg, std::runtime_error);
+  }
+}
+
+} // namespace kvikio

From b69d9aeae79fdae990801c05ba21a2762821af63 Mon Sep 17 00:00:00 2001
From: Tianyu Liu
Date: Tue, 16 Sep 2025 10:13:01 -0400
Subject: [PATCH 070/103] Support access to public S3 (#820)

## Background

KvikIO supports access to private S3 objects that require AWS credentials:

```python
# Method 1
kvikio.RemoteFile.open_s3(bucket, key)

# Method 2
kvikio.RemoteFile.open_s3_url(url)

# Method 3
kvikio.RemoteFile.open(url)
```

For public S3 objects, these functions throw the following exception:

> S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set.

A workaround is to simply use the generic HTTP/HTTPS endpoint:

```python
# Method 1
kvikio.RemoteFile.open_http(http_url)

# Method 2
kvikio.RemoteFile.open(url, RemoteEndpointType.HTTP)
```

However, this workaround loses S3 URL syntax checking.

## This PR

- Adds support for accessing public S3 objects in C++ and Python by adding a new endpoint type `S3PublicEndpoint`. This endpoint does not require AWS credentials.
- Updates the unified interface `open(url)`, which can automatically infer the endpoint type. Under `AUTO` mode, for a syntactically valid S3 URL using the HTTP/HTTPS protocol, KvikIO now checks connectivity using a private S3 endpoint and, if that fails, falls back to a public S3 endpoint.
- Updates the comments on each endpoint to further improve clarity.
- Adjusts the Python APIs `kvikio.RemoteFile.open_*` from class methods to static methods (which is a breaking change).
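
With the new endpoint type, a public object can be opened either explicitly or via automatic
detection. A minimal sketch (the bucket/object URLs below are placeholders):

```python
import kvikio

# Explicitly use the public S3 endpoint; no AWS credentials are needed.
f = kvikio.RemoteFile.open_s3_public("https://bucket-name.s3.amazonaws.com/object-key-name")

# Or let AUTO mode infer the endpoint type: if the credential-based S3 endpoint
# cannot reach the object, KvikIO falls back to the public S3 endpoint.
f = kvikio.RemoteFile.open("s3://bucket-name/object-key-name")
```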
Closes https://github.com/rapidsai/kvikio/issues/806

Authors:
  - Tianyu Liu (https://github.com/kingcrimsontianyu)

Approvers:
  - Mads R. B. Kristensen (https://github.com/madsbk)

URL: https://github.com/rapidsai/kvikio/pull/820
---
 cpp/include/kvikio/hdfs.hpp                 |  5 +-
 cpp/include/kvikio/remote_handle.hpp        | 47 ++++++++++++++--
 cpp/src/remote_handle.cpp                   | 62 ++++++++++++++++++++-
 cpp/tests/test_remote_handle.cpp            | 44 +++++++++++----
 python/kvikio/kvikio/_lib/remote_handle.pyx | 30 ++++++++--
 python/kvikio/kvikio/remote_file.py         | 56 ++++++++++++-------
 6 files changed, 201 insertions(+), 43 deletions(-)

diff --git a/cpp/include/kvikio/hdfs.hpp b/cpp/include/kvikio/hdfs.hpp
index 345051bcbd..9d89d4d2e1 100644
--- a/cpp/include/kvikio/hdfs.hpp
+++ b/cpp/include/kvikio/hdfs.hpp
@@ -24,8 +24,9 @@ namespace kvikio {
 /**
  * @brief A remote endpoint for Apache Hadoop WebHDFS.
  *
- * If KvikIO is run within a Docker, the argument `--network host` needs to be passed to the `docker
- * run` command.
+ * This endpoint is for accessing HDFS files via the WebHDFS REST API over HTTP/HTTPS. If KvikIO is
+ * run within Docker, pass `--network host` to the `docker run` command to ensure proper name node
+ * connectivity.
  */
 class WebHdfsEndpoint : public RemoteEndpoint {
  private:
diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp
index 1c6c887281..c48e84e8fb 100644
--- a/cpp/include/kvikio/remote_handle.hpp
+++ b/cpp/include/kvikio/remote_handle.hpp
@@ -46,6 +46,9 @@ enum class RemoteEndpointType : uint8_t {
   S3,  ///< AWS S3 endpoint using credentials-based authentication. Requires AWS environment
        ///< variables (such as AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION) to be
        ///< set.
+  S3_PUBLIC,  ///< AWS S3 endpoint for publicly accessible objects. No credentials required as the
+              ///< objects have public read permissions enabled. Used for open datasets and public
+              ///< buckets.
   S3_PRESIGNED_URL,  ///< AWS S3 endpoint using a presigned URL. No credentials required as
                      ///< authentication is embedded in the URL with time-limited access.
   WEBHDFS,  ///< Apache Hadoop WebHDFS (Web-based Hadoop Distributed File System) endpoint for
@@ -108,7 +111,10 @@ class RemoteEndpoint {
 };

 /**
- * @brief A remote endpoint using http.
+ * @brief A remote endpoint for HTTP/HTTPS resources.
+ *
+ * This endpoint is for accessing files via standard HTTP/HTTPS protocols without any specialized
+ * authentication.
  */
 class HttpEndpoint : public RemoteEndpoint {
  private:
@@ -138,7 +144,10 @@ class HttpEndpoint : public RemoteEndpoint {
 };

 /**
- * @brief A remote endpoint using AWS's S3 protocol.
+ * @brief A remote endpoint for AWS S3 storage requiring credentials.
+ *
+ * This endpoint is for accessing private S3 objects using AWS credentials (access key, secret key,
+ * region, and optional session token).
  */
 class S3Endpoint : public RemoteEndpoint {
  private:
@@ -256,8 +265,38 @@ class S3Endpoint : public RemoteEndpoint {
 };

 /**
- * @brief A remote endpoint using AWS's S3 protocol and expecting a presigned URL. File access via
- * this type of URL is time-limited and does not require AWS credentials.
+ * @brief A remote endpoint for publicly accessible S3 objects without authentication.
+ *
+ * This endpoint is for accessing S3 objects configured with public read permissions,
+ * requiring no authentication. Supports AWS S3 services with anonymous access enabled.
+ */ +class S3PublicEndpoint : public RemoteEndpoint { + private: + std::string _url; + + public: + explicit S3PublicEndpoint(std::string url); + + ~S3PublicEndpoint() override = default; + void setopt(CurlHandle& curl) override; + std::string str() const override; + std::size_t get_file_size() override; + void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; + + /** + * @brief Whether the given URL is valid for S3 public endpoints. + * + * @param url A URL. + * @return Boolean answer. + */ + static bool is_url_valid(std::string const& url) noexcept; +}; + +/** + * @brief A remote endpoint for AWS S3 storage using presigned URLs. + * + * This endpoint is for accessing S3 objects via presigned URLs, which provide time-limited access + * without requiring AWS credentials on the client side. */ class S3EndpointWithPresignedUrl : public RemoteEndpoint { private: diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index a173804321..cee6bdb700 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -216,6 +216,7 @@ char const* get_remote_endpoint_type_name(RemoteEndpointType remote_endpoint_typ { switch (remote_endpoint_type) { case RemoteEndpointType::S3: return "S3"; + case RemoteEndpointType::S3_PUBLIC: return "S3 public"; case RemoteEndpointType::S3_PRESIGNED_URL: return "S3 with presigned URL"; case RemoteEndpointType::WEBHDFS: return "WebHDFS"; case RemoteEndpointType::HTTP: return "HTTP"; @@ -308,7 +309,9 @@ std::string S3Endpoint::url_from_bucket_and_object(std::string bucket_name, unwrap_or_default(std::move(aws_region), "AWS_DEFAULT_REGION", "S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set."); - // We default to the official AWS url scheme. + // "s3" is a non-standard URI scheme used by AWS CLI and AWS SDK, and cannot be identified by + // libcurl. A valid HTTP/HTTPS URL needs to be constructed for use in libcurl. Here the AWS + // virtual host style is used. ss << "https://" << bucket_name << ".s3." 
<< region << ".amazonaws.com/" << object_name;
   } else {
     ss << endpoint_url << "/" << bucket_name << "/" << object_name;
@@ -444,6 +447,34 @@ bool S3Endpoint::is_url_valid(std::string const& url) noexcept
   return false;
 }

+S3PublicEndpoint::S3PublicEndpoint(std::string url)
+  : RemoteEndpoint{RemoteEndpointType::S3_PUBLIC}, _url{std::move(url)}
+{
+}
+
+void S3PublicEndpoint::setopt(CurlHandle& curl) { curl.setopt(CURLOPT_URL, _url.c_str()); }
+
+std::string S3PublicEndpoint::str() const { return _url; }
+
+std::size_t S3PublicEndpoint::get_file_size()
+{
+  KVIKIO_NVTX_FUNC_RANGE();
+  return get_file_size_using_head_impl(*this, _url);
+}
+
+void S3PublicEndpoint::setup_range_request(CurlHandle& curl,
+                                           std::size_t file_offset,
+                                           std::size_t size)
+{
+  KVIKIO_NVTX_FUNC_RANGE();
+  setup_range_request_impl(curl, file_offset, size);
+}
+
+bool S3PublicEndpoint::is_url_valid(std::string const& url) noexcept
+{
+  return S3Endpoint::is_url_valid(url);
+}
+
 S3EndpointWithPresignedUrl::S3EndpointWithPresignedUrl(std::string presigned_url)
   : RemoteEndpoint{RemoteEndpointType::S3_PRESIGNED_URL}, _url{std::move(presigned_url)}
 {
@@ -558,6 +589,7 @@ RemoteHandle RemoteHandle::open(std::string url,
 {
   if (!allow_list.has_value()) {
     allow_list = {RemoteEndpointType::S3,
+                  RemoteEndpointType::S3_PUBLIC,
                   RemoteEndpointType::S3_PRESIGNED_URL,
                   RemoteEndpointType::WEBHDFS,
                   RemoteEndpointType::HTTP};
@@ -579,6 +611,10 @@ RemoteHandle RemoteHandle::open(std::string url,
       }
       return std::make_unique<S3Endpoint>(url);

+    case RemoteEndpointType::S3_PUBLIC:
+      if (!S3PublicEndpoint::is_url_valid(url)) { return nullptr; }
+      return std::make_unique<S3PublicEndpoint>(url);
+
     case RemoteEndpointType::S3_PRESIGNED_URL:
       if (!S3EndpointWithPresignedUrl::is_url_valid(url)) { return nullptr; }
       return std::make_unique<S3EndpointWithPresignedUrl>(url);
@@ -601,7 +637,29 @@ RemoteHandle RemoteHandle::open(std::string url,
     // Try each allowed type in the order of allowlist
     for (auto const& type : allow_list.value()) {
       endpoint = create_endpoint(type);
-      if (endpoint) { break; }
+      if (endpoint == nullptr) { continue; }
+
+      // If the credential-based S3 endpoint cannot be used to access the URL, try the S3 public
+      // endpoint instead, if it is in the allowlist.
+      if (endpoint->remote_endpoint_type() == RemoteEndpointType::S3) {
+        try {
+          // Check connectivity for the credential-based S3 endpoint; an exception is thrown on
+          // failure.
+          endpoint->get_file_size();
+        } catch (...) {
+          auto it =
+            std::find(allow_list->begin(), allow_list->end(), RemoteEndpointType::S3_PUBLIC);
+          if (it != allow_list->end()) {
+            // The S3 public endpoint is in the allowlist; use it and end the search.
+            endpoint = std::make_unique<S3PublicEndpoint>(url);
+          } else {
+            continue;
+          }
+        }
+      }
+
+      // At this point, a matching endpoint has been found.
+      break;
     }
     KVIKIO_EXPECT(endpoint.get() != nullptr, "Unsupported endpoint URL.", std::runtime_error);
   } else {
diff --git a/cpp/tests/test_remote_handle.cpp b/cpp/tests/test_remote_handle.cpp
index a7e3d1e829..f1e25ea34d 100644
--- a/cpp/tests/test_remote_handle.cpp
+++ b/cpp/tests/test_remote_handle.cpp
@@ -36,18 +36,21 @@ class RemoteHandleTest : public testing::Test {
   {
     _sample_urls = {
       // Endpoint type: S3
-      {"s3://bucket-name/object-key-name", kvikio::RemoteEndpointType::S3},
-      {"s3://bucket-name/object-key-name-dir/object-key-name-file", kvikio::RemoteEndpointType::S3},
+      {"s3://bucket-name/object-key-name", kvikio::RemoteEndpointType::S3_PUBLIC},
+      {"s3://bucket-name/object-key-name-dir/object-key-name-file",
+       kvikio::RemoteEndpointType::S3_PUBLIC},
       {"https://bucket-name.s3.region-code.amazonaws.com/object-key-name",
-       kvikio::RemoteEndpointType::S3},
+       kvikio::RemoteEndpointType::S3_PUBLIC},
       {"https://s3.region-code.amazonaws.com/bucket-name/object-key-name",
-       kvikio::RemoteEndpointType::S3},
-      {"https://bucket-name.s3.amazonaws.com/object-key-name", kvikio::RemoteEndpointType::S3},
-      {"https://s3.amazonaws.com/bucket-name/object-key-name", kvikio::RemoteEndpointType::S3},
+       kvikio::RemoteEndpointType::S3_PUBLIC},
+      {"https://bucket-name.s3.amazonaws.com/object-key-name",
+       kvikio::RemoteEndpointType::S3_PUBLIC},
+      {"https://s3.amazonaws.com/bucket-name/object-key-name",
+       kvikio::RemoteEndpointType::S3_PUBLIC},
       {"https://bucket-name.s3-region-code.amazonaws.com/object-key-name",
-       kvikio::RemoteEndpointType::S3},
+       kvikio::RemoteEndpointType::S3_PUBLIC},
       {"https://s3-region-code.amazonaws.com/bucket-name/object-key-name",
-       kvikio::RemoteEndpointType::S3},
+       kvikio::RemoteEndpointType::S3_PUBLIC},

       // Endpoint type: S3 presigned URL
       {"https://bucket-name.s3.region-code.amazonaws.com/"
@@ -74,9 +77,13 @@ class RemoteHandleTest : public testing::Test {

     // Test unified interface
     {
-      // Here we pass the 1-byte argument to RemoteHandle::open. This prevents the endpoint
-      // constructor from querying the file size and sending requests to the server, thus
-      // allowing us to use dummy URLs for testing purpose.
+      // Here we pass the 1-byte argument to RemoteHandle::open. For all endpoints except
+      // kvikio::RemoteEndpointType::S3, this prevents the endpoint constructor from querying
+      // the file size and sending requests to the server, thus allowing us to use dummy URLs
+      // for testing purposes.
+      // For kvikio::RemoteEndpointType::S3, RemoteHandle::open sends a HEAD request as a
+      // connectivity check and will fail on the syntactically valid dummy URL.
+      // kvikio::RemoteEndpointType::S3_PUBLIC will then be used as the endpoint type.
      auto remote_handle =
        kvikio::RemoteHandle::open(url, kvikio::RemoteEndpointType::AUTO, std::nullopt, 1);
      EXPECT_EQ(remote_handle.remote_endpoint_type(), expected_endpoint_type);
@@ -150,7 +157,7 @@ TEST_F(RemoteHandleTest, test_s3_url)
                                    {"AWS_SECRET_ACCESS_KEY", "my_aws_secrete_access_key"}};

   {
-    test_helper(kvikio::RemoteEndpointType::S3, kvikio::S3Endpoint::is_url_valid);
+    test_helper(kvikio::RemoteEndpointType::S3_PUBLIC, kvikio::S3Endpoint::is_url_valid);
   }

   // Invalid URLs
@@ -168,6 +175,19 @@ TEST_F(RemoteHandleTest, test_s3_url)
       EXPECT_FALSE(kvikio::S3Endpoint::is_url_valid(invalid_url));
     }
   }
+
+  // S3_PUBLIC is not in the allowlist. So when the connectivity check fails on the dummy URL,
+  // KvikIO cannot fall back to S3_PUBLIC.
+  {
+    EXPECT_ANY_THROW({
+      kvikio::RemoteHandle::open(
+        "s3://bucket-name/object-key-name",
+        kvikio::RemoteEndpointType::AUTO,
+        std::vector<kvikio::RemoteEndpointType>{kvikio::RemoteEndpointType::S3,
+                                                kvikio::RemoteEndpointType::HTTP},
+        1);
+    });
+  }
 }

 TEST_F(RemoteHandleTest, test_s3_url_with_presigned_url)
diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx
index edfd59e78d..8fae78c534 100644
--- a/python/kvikio/kvikio/_lib/remote_handle.pyx
+++ b/python/kvikio/kvikio/_lib/remote_handle.pyx
@@ -23,9 +23,10 @@ cdef extern from "<kvikio/remote_handle.hpp>" namespace "kvikio" nogil:
     cpdef enum class RemoteEndpointType(uint8_t):
         AUTO = 0
         S3 = 1
-        S3_PRESIGNED_URL = 2
-        WEBHDFS = 3
-        HTTP = 4
+        S3_PUBLIC = 2
+        S3_PRESIGNED_URL = 3
+        WEBHDFS = 4
+        HTTP = 5

     cdef cppclass cpp_RemoteEndpoint "kvikio::RemoteEndpoint":
         string str() except +
@@ -39,6 +40,9 @@ cdef extern from "<kvikio/remote_handle.hpp>" namespace "kvikio" nogil:
     pair[string, string] cpp_parse_s3_url \
         "kvikio::S3Endpoint::parse_s3_url"(string url) except +

+    cdef cppclass cpp_S3PublicEndpoint "kvikio::S3PublicEndpoint" (cpp_RemoteEndpoint):
+        cpp_S3PublicEndpoint(string url) except +
+
     cdef cppclass cpp_S3EndpointWithPresignedUrl "kvikio::S3EndpointWithPresignedUrl" \
             (cpp_RemoteEndpoint):
         cpp_S3EndpointWithPresignedUrl(string presigned_url) except +
@@ -219,7 +223,25 @@ cdef class RemoteFile:
        )

    @staticmethod
-    def open_s3_from_http_presigned_url(
+    def open_s3_public(
+        url: str,
+        nbytes: Optional[int],
+    ):
+        cdef string cpp_url = _to_string(url)
+        cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint
+
+        with nogil:
+            cpp_endpoint = cast_to_remote_endpoint(
+                make_unique[cpp_S3PublicEndpoint](cpp_url)
+            )
+
+        return RemoteFile._from_endpoint(
+            move(cpp_endpoint),
+            nbytes
+        )
+
+    @staticmethod
+    def open_s3_presigned_url(
        presigned_url: str,
        nbytes: Optional[int],
    ):
diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py
index a3f73d271a..2064320914 100644
--- a/python/kvikio/kvikio/remote_file.py
+++ b/python/kvikio/kvikio/remote_file.py
@@ -28,6 +28,10 @@ class RemoteEndpointType(enum.Enum):
         AWS S3 endpoint using credentials-based authentication. Requires AWS
         environment variables (such as AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY,
         AWS_DEFAULT_REGION) to be set.
+    S3_PUBLIC : int
+        AWS S3 endpoint for publicly accessible objects. No credentials required as
+        the objects have public read permissions enabled. Used for open datasets and
+        public buckets.
     S3_PRESIGNED_URL : int
         AWS S3 endpoint using a presigned URL. No credentials required as
         authentication is embedded in the URL with time-limited access.
@@ -46,9 +50,10 @@ class RemoteEndpointType(enum.Enum): AUTO = 0 S3 = 1 - S3_PRESIGNED_URL = 2 - WEBHDFS = 3 - HTTP = 4 + S3_PUBLIC = 2 + S3_PRESIGNED_URL = 3 + WEBHDFS = 4 + HTTP = 5 @staticmethod def _map_to_internal(remote_endpoint_type: RemoteEndpointType): @@ -102,7 +107,7 @@ def open_http( url: str, nbytes: Optional[int] = None, ) -> RemoteFile: - """Open a http file. + """Open a HTTP/HTTPS file. Parameters ---------- @@ -112,7 +117,7 @@ def open_http( The size of the file. If None, KvikIO will ask the server for the file size. """ - return RemoteFile(_get_remote_module().RemoteFile.open_http(url, nbytes)) + return cls(_get_remote_module().RemoteFile.open_http(url, nbytes)) @classmethod def open_s3( @@ -142,7 +147,7 @@ def open_s3( The size of the file. If None, KvikIO will ask the server for the file size. """ - return RemoteFile( + return cls( _get_remote_module().RemoteFile.open_s3(bucket_name, object_name, nbytes) ) @@ -178,15 +183,27 @@ def open_s3_url( """ parsed_result = urllib.parse.urlparse(url.lower()) if parsed_result.scheme in ("http", "https"): - return RemoteFile( + return cls( _get_remote_module().RemoteFile.open_s3_from_http_url(url, nbytes) ) if parsed_result.scheme == "s3": - return RemoteFile( - _get_remote_module().RemoteFile.open_s3_from_s3_url(url, nbytes) - ) + return cls(_get_remote_module().RemoteFile.open_s3_from_s3_url(url, nbytes)) raise ValueError(f"Unsupported protocol: {url}") + @classmethod + def open_s3_public(cls, url: str, nbytes: Optional[int] = None) -> RemoteFile: + """Open a publicly accessible AWS S3 file. + + Parameters + ---------- + url + URL to the remote file. + nbytes + The size of the file. If None, KvikIO will ask the server + for the file size. + """ + return cls(_get_remote_module().RemoteFile.open_s3_public(url, nbytes)) + @classmethod def open_s3_presigned_url( cls, @@ -203,10 +220,8 @@ def open_s3_presigned_url( The size of the file. If None, KvikIO will ask the server for the file size. """ - return RemoteFile( - _get_remote_module().RemoteFile.open_s3_from_http_presigned_url( - presigned_url, nbytes - ) + return cls( + _get_remote_module().RemoteFile.open_s3_presigned_url(presigned_url, nbytes) ) @classmethod @@ -228,7 +243,7 @@ def open_webhdfs( The size of the file. If None, KvikIO will ask the server for the file size. """ - return RemoteFile(_get_remote_module().RemoteFile.open_webhdfs(url, nbytes)) + return cls(_get_remote_module().RemoteFile.open_webhdfs(url, nbytes)) @classmethod def open( @@ -242,9 +257,9 @@ def open( Create a remote file handle from a URL. This function creates a RemoteFile for reading data from various remote - endpoints including HTTP/HTTPS servers, AWS S3 buckets, S3 presigned URLs, - and WebHDFS. The endpoint type can be automatically detected from the URL - or explicitly specified. + endpoints including HTTP/HTTPS servers, AWS S3 buckets, S3 for public access, + S3 presigned URLs, and WebHDFS. The endpoint type can be automatically detected + from the URL or explicitly specified. Parameters ---------- @@ -252,6 +267,7 @@ def open( The URL of the remote file. Supported formats include: - S3 with credentials + - S3 for public access - S3 presigned URL - WebHDFS - HTTP/HTTPS @@ -259,6 +275,7 @@ def open( The type of remote endpoint. Default is :class:`RemoteEndpointType.AUTO` which automatically detects the endpoint type from the URL. 
Can be explicitly set to :class:`RemoteEndpointType.S3`, + :class:`RemoteEndpointType.S3_PUBLIC`, :class:`RemoteEndpointType.S3_PRESIGNED_URL`, :class:`RemoteEndpointType.WEBHDFS`, or :class:`RemoteEndpointType.HTTP` to force a specific endpoint type. @@ -272,6 +289,7 @@ def open( If not provided, defaults to all supported types in this order: :class:`RemoteEndpointType.S3`, + :class:`RemoteEndpointType.S3_PUBLIC`, :class:`RemoteEndpointType.S3_PRESIGNED_URL`, :class:`RemoteEndpointType.WEBHDFS`, and :class:`RemoteEndpointType.HTTP`. nbytes : int, optional @@ -337,7 +355,7 @@ def open( nbytes=1024 * 1024 * 100 # 100 MB ) """ - return RemoteFile( + return cls( _get_remote_module().RemoteFile.open( url, RemoteEndpointType._map_to_internal(remote_endpoint_type), From 4dea5c4c41c1ade2b8bfccc22c8afa276ca45d19 Mon Sep 17 00:00:00 2001 From: Jake Awe Date: Tue, 16 Sep 2025 13:37:05 -0500 Subject: [PATCH 071/103] configuring repo for automatic release notes generation --- .github/release.yml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 .github/release.yml diff --git a/.github/release.yml b/.github/release.yml new file mode 100644 index 0000000000..2c9a85805c --- /dev/null +++ b/.github/release.yml @@ -0,0 +1,27 @@ +# GitHub Auto-Generated Release Notes Configuration for RAPIDS +# This file configures how GitHub automatically generates release notes + +changelog: + exclude: + labels: + - ignore-for-release + - dependencies + authors: + - rapids-bot[bot] + - dependabot[bot] + categories: + - title: 🚨 Breaking Changes + labels: + - breaking + - title: 🐛 Bug Fixes + labels: + - bug + - title: 📖 Documentation + labels: + - doc + - title: 🚀 New Features + labels: + - feature request + - title: 🛠️ Improvements + labels: + - improvement From b5b42326a69cd4f754a019a687fdecfa030d641b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 17 Sep 2025 01:05:42 -0500 Subject: [PATCH 072/103] Fix CUDA 13 handling of libcufile on aarch64 (#827) Fixes an issue where CUDA 13 packages named like `linux-aarch64/libkvikio-25.10.00a43-cuda13_0_250916_b69d9aea.conda` were getting dependencies on `cuda-version >=12.2.0a0,<14.0a0`, which allowed them to be used in CUDA 12 environments. That is not desired and could cause problems. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/827 --- conda/recipes/libkvikio/recipe.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/libkvikio/recipe.yaml b/conda/recipes/libkvikio/recipe.yaml index f4164c9611..3b3a0bc1bf 100644 --- a/conda/recipes/libkvikio/recipe.yaml +++ b/conda/recipes/libkvikio/recipe.yaml @@ -93,7 +93,7 @@ outputs: - cuda-version =${{ cuda_version }} - libcurl ==${{ libcurl_version }} run: - - if: x86_64 + - if: x86_64 or (aarch64 and cuda_version >= "13.0") then: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} else: From 37e5270da6091269bd7a0ac3d1ad11e0314ab172 Mon Sep 17 00:00:00 2001 From: Jake Awe Date: Thu, 18 Sep 2025 16:26:46 +0000 Subject: [PATCH 073/103] DOC v25.12 Updates [skip ci] --- .../cuda12.9-conda/devcontainer.json | 6 ++-- .devcontainer/cuda12.9-pip/devcontainer.json | 6 ++-- .../cuda13.0-conda/devcontainer.json | 6 ++-- .devcontainer/cuda13.0-pip/devcontainer.json | 6 ++-- .github/workflows/build.yaml | 18 +++++------ .github/workflows/pr.yaml | 30 +++++++++---------- .github/workflows/test.yaml | 8 ++--- .../trigger-breaking-change-alert.yaml | 2 +- RAPIDS_BRANCH | 2 +- VERSION | 2 +- .../all_cuda-129_arch-aarch64.yaml | 2 +- .../all_cuda-129_arch-x86_64.yaml | 2 +- .../all_cuda-130_arch-aarch64.yaml | 2 +- .../all_cuda-130_arch-x86_64.yaml | 2 +- .../downstream/cmake/get_kvikio.cmake | 2 +- dependencies.yaml | 22 +++++++------- java/pom.xml | 2 +- python/kvikio/pyproject.toml | 6 ++-- 18 files changed, 63 insertions(+), 63 deletions(-) diff --git a/.devcontainer/cuda12.9-conda/devcontainer.json b/.devcontainer/cuda12.9-conda/devcontainer.json index 175af7a73c..b220017a03 100644 --- a/.devcontainer/cuda12.9-conda/devcontainer.json +++ b/.devcontainer/cuda12.9-conda/devcontainer.json @@ -5,19 +5,19 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.10-cpp-mambaforge" + "BASE": "rapidsai/devcontainers:25.12-cpp-mambaforge" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-conda", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.12-cuda12.9-conda", "--ulimit", "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.9-pip/devcontainer.json b/.devcontainer/cuda12.9-pip/devcontainer.json index ff13ba7e3c..e15fee7294 100644 --- a/.devcontainer/cuda12.9-pip/devcontainer.json +++ b/.devcontainer/cuda12.9-pip/devcontainer.json @@ -5,19 +5,19 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.10-cpp-cuda12.9" + "BASE": "rapidsai/devcontainers:25.12-cpp-cuda12.9" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-pip", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.12-cuda12.9-pip", "--ulimit", "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff 
--git a/.devcontainer/cuda13.0-conda/devcontainer.json b/.devcontainer/cuda13.0-conda/devcontainer.json index 21e7bf83c4..4788a99058 100644 --- a/.devcontainer/cuda13.0-conda/devcontainer.json +++ b/.devcontainer/cuda13.0-conda/devcontainer.json @@ -5,19 +5,19 @@ "args": { "CUDA": "13.0", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.10-cpp-mambaforge" + "BASE": "rapidsai/devcontainers:25.12-cpp-mambaforge" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda13.0-conda", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.12-cuda13.0-conda", "--ulimit", "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda13.0-pip/devcontainer.json b/.devcontainer/cuda13.0-pip/devcontainer.json index 3c035eee05..ff862c5ecf 100644 --- a/.devcontainer/cuda13.0-pip/devcontainer.json +++ b/.devcontainer/cuda13.0-pip/devcontainer.json @@ -5,19 +5,19 @@ "args": { "CUDA": "13.0", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.10-cpp-cuda13.0" + "BASE": "rapidsai/devcontainers:25.12-cpp-cuda13.0" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda13.0-pip", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.12-cuda13.0-pip", "--ulimit", "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 0070274f37..0bac6ac934 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -34,7 +34,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -44,7 +44,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -54,7 +54,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -64,19 +64,19 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.12 with: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci-conda:25.08-latest" + 
container_image: "rapidsai/ci-conda:25.12-latest" date: ${{ inputs.date }} node_type: "gpu-l4-latest-1" script: "ci/build_docs.sh" sha: ${{ inputs.sha }} wheel-build-cpp: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.12 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: ${{ inputs.build_type || 'branch' }} @@ -89,7 +89,7 @@ jobs: wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -101,7 +101,7 @@ jobs: wheel-publish-cpp: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -112,7 +112,7 @@ jobs: wheel-publish-python: needs: wheel-build-python secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index f610dce204..188b165222 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -27,7 +27,7 @@ jobs: - wheel-python-tests - telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.12 if: always() with: needs: ${{ toJSON(needs) }} @@ -43,7 +43,7 @@ jobs: repo: kvikio changed-files: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.12 with: files_yaml: | test_cpp: @@ -86,20 +86,20 @@ jobs: checks: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.12 with: ignored_pr_jobs: telemetry-summarize conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.12 with: build_type: pull-request script: ci/build_cpp.sh conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.12 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request @@ -107,24 +107,24 @@ jobs: conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.12 with: build_type: pull-request node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:25.08-latest" + 
container_image: "rapidsai/ci-conda:25.12-latest" script: "ci/test_java.sh" conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.12 with: build_type: pull-request script: ci/build_python.sh conda-python-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.12 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -132,17 +132,17 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.12 with: build_type: pull-request node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:25.08-latest" + container_image: "rapidsai/ci-conda:25.12-latest" script: "ci/build_docs.sh" devcontainer: needs: telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.12 with: arch: '["amd64", "arm64"]' cuda: '["13.0"]' @@ -160,7 +160,7 @@ jobs: sccache --show-adv-stats | tee telemetry-artifacts/sccache-stats.txt; wheel-cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.12 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: pull-request @@ -170,7 +170,7 @@ jobs: wheel-python-build: needs: wheel-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.12 with: build_type: pull-request script: ci/build_wheel_python.sh @@ -179,7 +179,7 @@ jobs: wheel-python-tests: needs: [wheel-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.12 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 5f618498b3..b8914deff2 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -25,7 +25,7 @@ on: jobs: cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.12 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.12 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -43,7 +43,7 @@ jobs: sha: ${{ inputs.sha }} conda-java-tests: secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.12 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -51,5 +51,5 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:25.08-latest" + container_image: "rapidsai/ci-conda:25.12-latest" script: "ci/test_java.sh" diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 48bf37afc4..c3cce1c34f 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.12 with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/RAPIDS_BRANCH b/RAPIDS_BRANCH index 9b1c52d941..a2b5a91fc4 100644 --- a/RAPIDS_BRANCH +++ b/RAPIDS_BRANCH @@ -1 +1 @@ -branch-25.10 +branch-25.12 diff --git a/VERSION b/VERSION index 296e35288d..7924af6192 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -25.10.00 +25.12.00 diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 279b130e55..d2c1dd9041 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -32,7 +32,7 @@ dependencies: - python>=3.10,<3.14 - rangehttpserver - rapids-build-backend>=0.4.0,<0.5.0.dev0 -- rapids-dask-dependency==25.10.*,>=0.0.0a0 +- rapids-dask-dependency==25.12.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 5460d73ab7..9b3292a72c 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -32,7 +32,7 @@ dependencies: - python>=3.10,<3.14 - rangehttpserver - rapids-build-backend>=0.4.0,<0.5.0.dev0 -- rapids-dask-dependency==25.10.*,>=0.0.0a0 +- rapids-dask-dependency==25.12.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click diff --git a/conda/environments/all_cuda-130_arch-aarch64.yaml b/conda/environments/all_cuda-130_arch-aarch64.yaml index 1b752cbe8e..d1db3c2e9a 100644 --- a/conda/environments/all_cuda-130_arch-aarch64.yaml +++ b/conda/environments/all_cuda-130_arch-aarch64.yaml @@ -32,7 +32,7 @@ dependencies: - python>=3.10,<3.14 - rangehttpserver - rapids-build-backend>=0.4.0,<0.5.0.dev0 -- rapids-dask-dependency==25.10.*,>=0.0.0a0 +- rapids-dask-dependency==25.12.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click diff --git a/conda/environments/all_cuda-130_arch-x86_64.yaml b/conda/environments/all_cuda-130_arch-x86_64.yaml index 812bfdad5e..f37bf67268 100644 --- a/conda/environments/all_cuda-130_arch-x86_64.yaml +++ b/conda/environments/all_cuda-130_arch-x86_64.yaml @@ -32,7 +32,7 @@ dependencies: - python>=3.10,<3.14 - rangehttpserver - rapids-build-backend>=0.4.0,<0.5.0.dev0 -- rapids-dask-dependency==25.10.*,>=0.0.0a0 +- rapids-dask-dependency==25.12.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click diff --git a/cpp/examples/downstream/cmake/get_kvikio.cmake 
b/cpp/examples/downstream/cmake/get_kvikio.cmake index 4e03834e56..d163fcf5cf 100644 --- a/cpp/examples/downstream/cmake/get_kvikio.cmake +++ b/cpp/examples/downstream/cmake/get_kvikio.cmake @@ -30,4 +30,4 @@ function(find_and_configure_kvikio MIN_VERSION) endfunction() -find_and_configure_kvikio("25.10") +find_and_configure_kvikio("25.12") diff --git a/dependencies.yaml b/dependencies.yaml index ce1b6f1b8d..541708ecfc 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -148,7 +148,7 @@ dependencies: common: - output_types: conda packages: &libkvikio_packages - - libkvikio==25.10.*,>=0.0.0a0 + - libkvikio==25.12.*,>=0.0.0a0 specific: - output_types: [requirements, pyproject] matrices: @@ -156,12 +156,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libkvikio-cu12==25.10.*,>=0.0.0a0 + - libkvikio-cu12==25.12.*,>=0.0.0a0 - matrix: cuda: "13.*" cuda_suffixed: "true" packages: - - libkvikio-cu13==25.10.*,>=0.0.0a0 + - libkvikio-cu13==25.12.*,>=0.0.0a0 - {matrix: null, packages: *libkvikio_packages} build-py-wrapper: common: @@ -230,7 +230,7 @@ dependencies: common: - output_types: conda packages: - - &libkvikio_unsuffixed libkvikio==25.10.*,>=0.0.0a0 + - &libkvikio_unsuffixed libkvikio==25.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -244,12 +244,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libkvikio-cu12==25.10.*,>=0.0.0a0 + - libkvikio-cu12==25.12.*,>=0.0.0a0 - matrix: cuda: "13.*" cuda_suffixed: "true" packages: - - libkvikio-cu13==25.10.*,>=0.0.0a0 + - libkvikio-cu13==25.12.*,>=0.0.0a0 - {matrix: null, packages: [*libkvikio_unsuffixed]} docs: common: @@ -312,14 +312,14 @@ dependencies: common: - output_types: conda packages: - - libkvikio==25.10.*,>=0.0.0a0 - - libkvikio-tests==25.10.*,>=0.0.0a0 + - libkvikio==25.12.*,>=0.0.0a0 + - libkvikio-tests==25.12.*,>=0.0.0a0 test_kvikio: common: - output_types: conda packages: - - libkvikio==25.10.*,>=0.0.0a0 - - kvikio==25.10.*,>=0.0.0a0 + - libkvikio==25.12.*,>=0.0.0a0 + - kvikio==25.12.*,>=0.0.0a0 test_cpp: common: - output_types: conda @@ -329,7 +329,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - rapids-dask-dependency==25.10.*,>=0.0.0a0 + - rapids-dask-dependency==25.12.*,>=0.0.0a0 - pytest - pytest-asyncio - pytest-cov diff --git a/java/pom.xml b/java/pom.xml index 7e00c88661..d2c44cab63 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ ai.rapids.kvikio cufile - 25.10.0-SNAPSHOT + 25.12.0-SNAPSHOT cuFile diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index 8f214c31d9..78723f4a7e 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -20,7 +20,7 @@ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "cupy-cuda13x>=13.6.0", - "libkvikio==25.10.*,>=0.0.0a0", + "libkvikio==25.12.*,>=0.0.0a0", "numpy>=1.23,<3.0a0", "packaging", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -46,7 +46,7 @@ test = [ "pytest-cov", "pytest-timeout", "rangehttpserver", - "rapids-dask-dependency==25.10.*,>=0.0.0a0", + "rapids-dask-dependency==25.12.*,>=0.0.0a0", "zarr>=3.0.0,<4.0.0; python_version >= '3.11'", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
zarr = [ @@ -120,7 +120,7 @@ matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" requires = [ "cmake>=3.30.4", "cython>=3.0.0", - "libkvikio==25.10.*,>=0.0.0a0", + "libkvikio==25.12.*,>=0.0.0a0", "ninja", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From 580182f5a1e902561963fffd919092d987773da3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 18 Sep 2025 11:01:14 -0700 Subject: [PATCH 074/103] Remove ability to build kvikio without CUDA (#829) This change supersedes #822 and makes CUDA a requirement for building kvikio. We have no compelling use case for this support outside of [legate](https://docs.nvidia.com/legate/latest/api/cpp/generated/group/group__io-kvikio.html), and after discussion with the team they are happy to sunset this support as well. Resolves #736 Resolves #485 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/829 --- cpp/CMakeLists.txt | 137 +++++++++---------- cpp/include/kvikio/nvtx.hpp | 27 ---- cpp/include/kvikio/shim/cuda.hpp | 6 +- cpp/include/kvikio/shim/cuda_h_wrapper.hpp | 95 ------------- cpp/include/kvikio/shim/cufile_h_wrapper.hpp | 2 +- cpp/include/kvikio/utils.hpp | 4 - cpp/src/nvtx.cpp | 14 -- cpp/src/shim/cuda.cpp | 7 - cpp/src/utils.cpp | 4 - 9 files changed, 66 insertions(+), 230 deletions(-) delete mode 100644 cpp/include/kvikio/shim/cuda_h_wrapper.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5db5fa6f50..0a67b6e677 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -48,7 +48,6 @@ option(KvikIO_BUILD_BENCHMARKS "Configure CMake to build benchmarks" ON) option(KvikIO_BUILD_EXAMPLES "Configure CMake to build examples" ON) option(KvikIO_BUILD_TESTS "Configure CMake to build tests" ON) option(KvikIO_REMOTE_SUPPORT "Configure CMake to build with remote IO support" ON) -option(KvikIO_CUDA_SUPPORT "Configure CMake to build with CUDA support" ON) # ################################################################################################## # * conda environment ------------------------------------------------------------------------------ @@ -72,66 +71,65 @@ if(KvikIO_REMOTE_SUPPORT) endif() endif() -set(cuFile_FOUND 0) -if(KvikIO_CUDA_SUPPORT) - rapids_find_package( - CUDAToolkit REQUIRED - BUILD_EXPORT_SET kvikio-exports - INSTALL_EXPORT_SET kvikio-exports +# CUDA is now required +rapids_find_package( + CUDAToolkit REQUIRED + BUILD_EXPORT_SET kvikio-exports + INSTALL_EXPORT_SET kvikio-exports +) +include(cmake/thirdparty/get_nvtx.cmake) + +if(NOT TARGET CUDA::cuFile) + set(cuFile_FOUND 0) + message( + WARNING "Cannot find cuFile - KvikIO will still work but won't use GPUDirect Storage (GDS)" ) - include(cmake/thirdparty/get_nvtx.cmake) - - if(NOT TARGET CUDA::cuFile) - message( - WARNING "Cannot find cuFile - KvikIO will still work but won't use GPUDirect Storage (GDS)" - ) - else() - set(cuFile_FOUND 1) - - # Check API support - try_compile( - cuFile_BATCH_API_FOUND SOURCE_FROM_CONTENT - batch.cpp - [[#include - int main() { - cuFileBatchIOSetUp(nullptr, 0); - return 0; - } - ]] - LINK_LIBRARIES CUDA::cuFile rt ${CMAKE_DL_LIBS} - OUTPUT_VARIABLE batch_output - ) - message(STATUS "Found cuFile Batch API: ${cuFile_BATCH_API_FOUND}") - try_compile( - cuFile_STREAM_API_FOUND SOURCE_FROM_CONTENT - stream.cpp - [[#include - int main() { - CUfileHandle_t fh; - CUstream stream; - cuFileReadAsync(fh, 
nullptr, nullptr, nullptr, nullptr, nullptr, stream); - return 0; - } - ]] - LINK_LIBRARIES CUDA::cuFile rt ${CMAKE_DL_LIBS} - OUTPUT_VARIABLE stream_output - ) - message(STATUS "Found cuFile Stream API: ${cuFile_STREAM_API_FOUND}") - try_compile( - cuFile_VERSION_API_FOUND SOURCE_FROM_CONTENT - version.cpp - [[#include - int main() { - int version; - cuFileGetVersion(&version); - return 0; - } - ]] - LINK_LIBRARIES CUDA::cuFile rt ${CMAKE_DL_LIBS} - OUTPUT_VARIABLE version_output - ) - message(STATUS "Found cuFile Version API: ${cuFile_VERSION_API_FOUND}") - endif() +else() + set(cuFile_FOUND 1) + + # Check API support + try_compile( + cuFile_BATCH_API_FOUND SOURCE_FROM_CONTENT + batch.cpp + [[#include + int main() { + cuFileBatchIOSetUp(nullptr, 0); + return 0; + } + ]] + LINK_LIBRARIES CUDA::cuFile rt ${CMAKE_DL_LIBS} + OUTPUT_VARIABLE batch_output + ) + message(STATUS "Found cuFile Batch API: ${cuFile_BATCH_API_FOUND}") + try_compile( + cuFile_STREAM_API_FOUND SOURCE_FROM_CONTENT + stream.cpp + [[#include + int main() { + CUfileHandle_t fh; + CUstream stream; + cuFileReadAsync(fh, nullptr, nullptr, nullptr, nullptr, nullptr, stream); + return 0; + } + ]] + LINK_LIBRARIES CUDA::cuFile rt ${CMAKE_DL_LIBS} + OUTPUT_VARIABLE stream_output + ) + message(STATUS "Found cuFile Stream API: ${cuFile_STREAM_API_FOUND}") + try_compile( + cuFile_VERSION_API_FOUND SOURCE_FROM_CONTENT + version.cpp + [[#include + int main() { + int version; + cuFileGetVersion(&version); + return 0; + } + ]] + LINK_LIBRARIES CUDA::cuFile rt ${CMAKE_DL_LIBS} + OUTPUT_VARIABLE version_output + ) + message(STATUS "Found cuFile Version API: ${cuFile_VERSION_API_FOUND}") endif() include(cmake/thirdparty/get_thread_pool.cmake) @@ -181,7 +179,7 @@ add_library(kvikio::kvikio ALIAS kvikio) target_include_directories( kvikio PUBLIC "$" - "$:${CUDAToolkit_INCLUDE_DIRS}>>" + "$" INTERFACE "$" ) @@ -195,7 +193,6 @@ target_link_libraries( target_compile_definitions( kvikio PUBLIC $<$:KVIKIO_LIBCURL_FOUND> - $<$:KVIKIO_CUDA_FOUND> $<$:KVIKIO_CUFILE_FOUND> $<$:KVIKIO_CUFILE_BATCH_API_FOUND> $<$:KVIKIO_CUFILE_STREAM_API_FOUND> @@ -231,10 +228,7 @@ if(KvikIO_BUILD_EXAMPLES) add_subdirectory(examples) endif() -if(CUDAToolkit_FOUND - AND KvikIO_BUILD_TESTS - AND CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME -) +if(KvikIO_BUILD_TESTS AND CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) include(cmake/thirdparty/get_gtest.cmake) # include CTest module -- automatically calls enable_testing() @@ -271,7 +265,6 @@ Provide targets for KvikIO. 
set(final_code_string " -set(KvikIO_CUDA_SUPPORT [=[${KvikIO_CUDA_SUPPORT}]=]) set(KvikIO_CUFILE_SUPPORT [=[${cuFile_FOUND}]=]) set(KvikIO_REMOTE_SUPPORT [=[${KvikIO_REMOTE_SUPPORT}]=]) " @@ -280,13 +273,11 @@ string( APPEND final_code_string [=[ -if(KvikIO_CUDA_SUPPORT) - find_package(CUDAToolkit REQUIRED QUIET) - target_include_directories(kvikio::kvikio INTERFACE ${CUDAToolkit_INCLUDE_DIRS}) +find_package(CUDAToolkit REQUIRED QUIET) +target_include_directories(kvikio::kvikio INTERFACE ${CUDAToolkit_INCLUDE_DIRS}) - if(KvikIO_CUFILE_SUPPORT AND NOT TARGET CUDA::cuFile) - message(FATAL_ERROR "Compiled with cuFile support but cuFile not found") - endif() +if(KvikIO_CUFILE_SUPPORT AND NOT TARGET CUDA::cuFile) + message(FATAL_ERROR "Compiled with cuFile support but cuFile not found") endif() ]=] ) diff --git a/cpp/include/kvikio/nvtx.hpp b/cpp/include/kvikio/nvtx.hpp index 7fff8e4d47..902283f07d 100644 --- a/cpp/include/kvikio/nvtx.hpp +++ b/cpp/include/kvikio/nvtx.hpp @@ -17,16 +17,13 @@ #include -#ifdef KVIKIO_CUDA_FOUND #include -#endif #include #include namespace kvikio { -#ifdef KVIKIO_CUDA_FOUND /** * @brief Tag type for libkvikio's NVTX domain. */ @@ -100,13 +97,7 @@ using nvtx_registered_string_type = nvtx3::registered_string_in(nvtx3::event_attributes{ \ KVIKIO_REGISTER_STRING(message), nvtx3::payload{kvikio::convert_to_64bit(payload_v)}}) -#endif - -#ifdef KVIKIO_CUDA_FOUND using nvtx_color_type = nvtx3::color; -#else -using nvtx_color_type = int; -#endif /** * @brief Utility singleton class for NVTX annotation. @@ -181,13 +172,7 @@ class NvtxManager { * } * ``` */ -#ifdef KVIKIO_CUDA_FOUND #define KVIKIO_NVTX_FUNC_RANGE(...) KVIKIO_NVTX_FUNC_RANGE_IMPL(__VA_ARGS__) -#else -#define KVIKIO_NVTX_FUNC_RANGE(...) \ - do { \ - } while (0) -#endif /** * @brief Convenience macro for generating an NVTX scoped range in the `libkvikio` domain to @@ -206,13 +191,7 @@ class NvtxManager { * } * ``` */ -#ifdef KVIKIO_CUDA_FOUND #define KVIKIO_NVTX_SCOPED_RANGE(...) KVIKIO_NVTX_SCOPED_RANGE_IMPL(__VA_ARGS__) -#else -#define KVIKIO_NVTX_SCOPED_RANGE(message, payload, ...) \ - do { \ - } while (0) -#endif /** * @brief Convenience macro for generating an NVTX marker in the `libkvikio` domain to annotate a @@ -232,12 +211,6 @@ class NvtxManager { * } * ``` */ -#ifdef KVIKIO_CUDA_FOUND #define KVIKIO_NVTX_MARKER(message, payload) KVIKIO_NVTX_MARKER_IMPL(message, payload) -#else -#define KVIKIO_NVTX_MARKER(message, payload) \ - do { \ - } while (0) -#endif } // namespace kvikio diff --git a/cpp/include/kvikio/shim/cuda.hpp b/cpp/include/kvikio/shim/cuda.hpp index 4ebf243fbb..c0196cec1a 100644 --- a/cpp/include/kvikio/shim/cuda.hpp +++ b/cpp/include/kvikio/shim/cuda.hpp @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include @@ -134,10 +134,6 @@ class cudaAPI { * * @return The boolean answer */ -#ifdef KVIKIO_CUDA_FOUND bool is_cuda_available(); -#else -constexpr bool is_cuda_available() { return false; } -#endif } // namespace kvikio diff --git a/cpp/include/kvikio/shim/cuda_h_wrapper.hpp b/cpp/include/kvikio/shim/cuda_h_wrapper.hpp deleted file mode 100644 index ca6ab5fa59..0000000000 --- a/cpp/include/kvikio/shim/cuda_h_wrapper.hpp +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -/** - * In order to support compilation when `cuda.h` isn't available, we - * wrap all use of cuda in a `#ifdef KVIKIO_CUDA_FOUND` guard. - * - * The motivation here is to make KvikIO work in all circumstances so - * that libraries doesn't have to implement there own fallback solutions. - */ -#ifdef KVIKIO_CUDA_FOUND -#include -#else - -// If CUDA isn't defined, we define some of the data types here. -// Notice, the functions and constant values don't need to match the CUDA -// definitions, but the types *do*, since downstream libraries dlsym()-ing -// the symbols at runtime rely on accurate type definitions. If we mismatch -// here, then those libraries will get "mismatched type alias redefinition" -// errors when they include our headers. - -#if defined(_WIN64) || defined(__LP64__) -// Don't use uint64_t, we want to match the driver headers exactly -using CUdeviceptr = unsigned long long; -#else -using CUdeviceptr = unsigned int; -#endif -static_assert(sizeof(CUdeviceptr) == sizeof(void*)); - -using CUresult = int; -using CUdevice = int; -using CUcontext = struct CUctx_st*; -using CUstream = struct CUstream_st*; - -#define CUDA_ERROR_STUB_LIBRARY 0 -#define CUDA_SUCCESS 0 -#define CUDA_ERROR_INVALID_VALUE 0 -#define CU_POINTER_ATTRIBUTE_CONTEXT 0 -#define CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL 0 -#define CU_POINTER_ATTRIBUTE_DEVICE_POINTER 0 -#define CU_MEMHOSTALLOC_PORTABLE 0 -#define CU_STREAM_DEFAULT 0 - -CUresult cuInit(...); -CUresult cuMemHostAlloc(...); -CUresult cuMemFreeHost(...); -CUresult cuMemcpyHtoDAsync(...); -CUresult cuMemcpyDtoHAsync(...); -CUresult cuMemcpyBatchAsync(...); -CUresult cuPointerGetAttribute(...); -CUresult cuPointerGetAttributes(...); -CUresult cuCtxPushCurrent(...); -CUresult cuCtxPopCurrent(...); -CUresult cuCtxGetCurrent(...); -CUresult cuCtxGetDevice(...); -CUresult cuMemGetAddressRange(...); -CUresult cuGetErrorName(...); -CUresult cuGetErrorString(...); -CUresult cuDeviceGet(...); -CUresult cuDeviceGetCount(...); -CUresult cuDeviceGetAttribute(...); -CUresult cuDevicePrimaryCtxRetain(...); -CUresult cuDevicePrimaryCtxRelease(...); -CUresult cuStreamCreate(...); -CUresult cuStreamDestroy(...); -CUresult cuStreamSynchronize(...); -CUresult cuDriverGetVersion(...); - -enum CUdevice_attribute { - CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, -}; - -enum CUmemcpySrcAccessOrder_enum { - CU_MEMCPY_SRC_ACCESS_ORDER_STREAM, -}; - -struct CUmemcpyAttributes { - int srcAccessOrder; -}; - -#endif diff --git a/cpp/include/kvikio/shim/cufile_h_wrapper.hpp b/cpp/include/kvikio/shim/cufile_h_wrapper.hpp index 66f5adbaf3..886a6e12aa 100644 --- a/cpp/include/kvikio/shim/cufile_h_wrapper.hpp +++ b/cpp/include/kvikio/shim/cufile_h_wrapper.hpp @@ -17,7 +17,7 @@ #include -#include +#include /** * In order to support compilation when `cufile.h` isn't available, we diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index 7724fe92a6..7b84b3bb77 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -73,11 +73,7 @@ template >* = nullptr> * @param ptr Memory pointer to query 
* @return The boolean answer */ -#ifdef KVIKIO_CUDA_FOUND bool is_host_memory(void const* ptr); -#else -constexpr bool is_host_memory(void const* ptr) { return true; } -#endif /** * @brief Return the device owning the pointer diff --git a/cpp/src/nvtx.cpp b/cpp/src/nvtx.cpp index b9d7c3e146..e62ba2eac7 100644 --- a/cpp/src/nvtx.cpp +++ b/cpp/src/nvtx.cpp @@ -18,9 +18,7 @@ #include #include -#ifdef KVIKIO_CUDA_FOUND #include -#endif #include @@ -34,18 +32,12 @@ NvtxManager& NvtxManager::instance() noexcept const nvtx_color_type& NvtxManager::default_color() noexcept { -#ifdef KVIKIO_CUDA_FOUND static nvtx_color_type default_color{nvtx3::argb{0, 255, 255, 255}}; return default_color; -#else - static nvtx_color_type dummy{}; - return dummy; -#endif } const nvtx_color_type& NvtxManager::get_color_by_index(std::uint64_t idx) noexcept { -#ifdef KVIKIO_CUDA_FOUND constexpr std::size_t num_color{16}; static_assert((num_color & (num_color - 1)) == 0); // Is power of 2 static std::array color_palette = {nvtx3::rgb{106, 192, 67}, @@ -66,15 +58,10 @@ const nvtx_color_type& NvtxManager::get_color_by_index(std::uint64_t idx) noexce nvtx3::rgb{122, 50, 49}}; auto safe_idx = idx & (num_color - 1); // idx % num_color return color_palette[safe_idx]; -#else - static nvtx_color_type dummy{}; - return dummy; -#endif } void NvtxManager::rename_current_thread(std::string_view new_name) noexcept { -#ifdef KVIKIO_CUDA_FOUND auto tid = syscall(SYS_gettid); std::stringstream ss; ss << new_name << " (" << tid << ")"; @@ -88,7 +75,6 @@ void NvtxManager::rename_current_thread(std::string_view new_name) noexcept attribs.message.ascii = ss.str().c_str(); nvtxResourceHandle_t handle = nvtxDomainResourceCreate(nvtx3::domain::get(), &attribs); -#endif } } // namespace kvikio diff --git a/cpp/src/shim/cuda.cpp b/cpp/src/shim/cuda.cpp index ae69a495aa..0d4784db59 100644 --- a/cpp/src/shim/cuda.cpp +++ b/cpp/src/shim/cuda.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include @@ -22,7 +21,6 @@ namespace kvikio { -#ifdef KVIKIO_CUDA_FOUND cudaAPI::cudaAPI() { void* lib = load_library("libcuda.so.1"); @@ -71,9 +69,6 @@ cudaAPI::cudaAPI() } #endif } -#else -cudaAPI::cudaAPI() { KVIKIO_FAIL("KvikIO not compiled with CUDA support", std::runtime_error); } -#endif cudaAPI& cudaAPI::instance() { @@ -81,7 +76,6 @@ cudaAPI& cudaAPI::instance() return _instance; } -#ifdef KVIKIO_CUDA_FOUND bool is_cuda_available() { try { @@ -91,6 +85,5 @@ bool is_cuda_available() } return true; } -#endif } // namespace kvikio diff --git a/cpp/src/utils.cpp b/cpp/src/utils.cpp index f86b539ab4..433eb6b279 100644 --- a/cpp/src/utils.cpp +++ b/cpp/src/utils.cpp @@ -20,9 +20,7 @@ #include #include #include -#include #include -#include #include #include @@ -57,7 +55,6 @@ CUdeviceptr convert_void2deviceptr(void const* devPtr) return reinterpret_cast(devPtr); } -#ifdef KVIKIO_CUDA_FOUND bool is_host_memory(void const* ptr) { CUpointer_attribute attrs[1] = { @@ -77,7 +74,6 @@ bool is_host_memory(void const* ptr) // does it to support `cudaMemoryTypeUnregistered`. 
   return memtype == 0 || memtype == CU_MEMORYTYPE_HOST;
 }
-#endif
 
 int get_device_ordinal_from_pointer(CUdeviceptr dev_ptr)
 {

From 987683af02da8847225359786201ef1038ab4394 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Thu, 18 Sep 2025 15:01:43 -0700
Subject: [PATCH 075/103] Keep string alive until nvtxDomainResourceCreate
 (#832)

Resolves #830

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Tianyu Liu (https://github.com/kingcrimsontianyu)

URL: https://github.com/rapidsai/kvikio/pull/832
---
 cpp/src/nvtx.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/src/nvtx.cpp b/cpp/src/nvtx.cpp
index b9d7c3e146..0467e46e38 100644
--- a/cpp/src/nvtx.cpp
+++ b/cpp/src/nvtx.cpp
@@ -85,7 +85,8 @@ void NvtxManager::rename_current_thread(std::string_view new_name) noexcept
   attribs.identifierType       = NVTX_RESOURCE_TYPE_GENERIC_THREAD_NATIVE;
   attribs.identifier.ullValue  = tid;
   attribs.messageType          = NVTX_MESSAGE_TYPE_ASCII;
-  attribs.message.ascii        = ss.str().c_str();
+  auto st                      = ss.str();
+  attribs.message.ascii        = st.c_str();
 
   nvtxResourceHandle_t handle = nvtxDomainResourceCreate(nvtx3::domain::get(), &attribs);
 #endif

From d4b7773e0cc4f164b1a69df62f00ddd179226c6c Mon Sep 17 00:00:00 2001
From: Tianyu Liu
Date: Mon, 22 Sep 2025 16:43:20 -0400
Subject: [PATCH 076/103] Fix a bug in public S3 inference (#831)

`kvikio::RemoteHandle::open()` has started to support public S3 since
https://github.com/rapidsai/kvikio/pull/820. When `open()` sees an S3
URL, it first assumes a private S3 object and queries its size. If the
query fails, it falls back to treating the file as a public S3 object.

During the construction of a private S3 endpoint, the constructor scans
the environment variables for AWS credentials. Manual testing of
https://github.com/rapidsai/kvikio/pull/820 accidentally had the env
vars set at all times, which hid a bug: in the absence of the env vars,
the constructor of the private S3 endpoint throws an exception that
goes unhandled, so KvikIO never gets a chance to try the public S3
endpoint.

This PR fixes that bug.

Authors:
  - Tianyu Liu (https://github.com/kingcrimsontianyu)

Approvers:
  - Mads R. B. Kristensen (https://github.com/madsbk)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/kvikio/pull/831
---
 cpp/src/remote_handle.cpp | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp
index cee6bdb700..33b481a74a 100644
--- a/cpp/src/remote_handle.cpp
+++ b/cpp/src/remote_handle.cpp
@@ -636,25 +636,23 @@ RemoteHandle RemoteHandle::open(std::string url,
   if (remote_endpoint_type == RemoteEndpointType::AUTO) {
     // Try each allowed type in the order of allowlist
     for (auto const& type : allow_list.value()) {
-      endpoint = create_endpoint(type);
-      if (endpoint == nullptr) { continue; }
-
-      // If the credential-based S3 endpoint cannot be used to access the URL, try using S3 public
-      // endpoint instead if it is in the allowlist
-      if (endpoint->remote_endpoint_type() == RemoteEndpointType::S3) {
-        try {
+      try {
+        endpoint = create_endpoint(type);
+        if (endpoint == nullptr) { continue; }
+        if (type == RemoteEndpointType::S3) {
          // Check connectivity for the credential-based S3 endpoint, and throw an exception if
          // failed
          endpoint->get_file_size();
-        } catch (...)
{ - auto it = - std::find(allow_list->begin(), allow_list->end(), RemoteEndpointType::S3_PUBLIC); - if (it != allow_list->end()) { - // If S3 public endpoint is in the allowlist, use it and end the search - endpoint = std::make_unique(url); - } else { - continue; - } + } + } catch (...) { + // If the credential-based S3 endpoint cannot be used to access the URL, try using S3 public + // endpoint instead if it is in the allowlist + if (type == RemoteEndpointType::S3 && + std::find(allow_list->begin(), allow_list->end(), RemoteEndpointType::S3_PUBLIC) != + allow_list->end()) { + endpoint = std::make_unique(url); + } else { + throw; } } From 1b70488a0ee6ed7590ca16618e8ee5d8e6605853 Mon Sep 17 00:00:00 2001 From: Mike Sarahan Date: Tue, 23 Sep 2025 16:24:14 -0500 Subject: [PATCH 077/103] Empty commit to trigger a build (#840) This is an empty commit to trigger a build. It is used when builds get stuck with an old ABI. Rebuilding updates them to the new one. From d65e583806608a3bde785204fb99493a782bc0c4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 24 Sep 2025 16:54:01 -0700 Subject: [PATCH 078/103] Simplify async validation condition (#842) The current condition has a redundant clause. If the requested compat mode is `CompatMode::OFF`, then `_is_compat_mode_preferred` is always false. All of the fallback conditions that are checked in the constructor that could result in `_is_compat_mode_preferred` being true only hold when the user requests `CompatMode::AUTO`. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/842 --- cpp/src/compat_mode_manager.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/compat_mode_manager.cpp b/cpp/src/compat_mode_manager.cpp index 569e9e73e7..3a162ad768 100644 --- a/cpp/src/compat_mode_manager.cpp +++ b/cpp/src/compat_mode_manager.cpp @@ -94,8 +94,7 @@ CompatModeManager::CompatModeManager(std::string const& file_path, void CompatModeManager::validate_compat_mode_for_async() const { KVIKIO_NVTX_FUNC_RANGE(); - if (!_is_compat_mode_preferred && _is_compat_mode_preferred_for_async && - _compat_mode_requested == CompatMode::OFF) { + if (_is_compat_mode_preferred_for_async && _compat_mode_requested == CompatMode::OFF) { std::string err_msg; if (!is_stream_api_available()) { err_msg += "Missing the cuFile stream api."; } From 19ea7581b1ffc595cb56b6f39bc73fd99dd86744 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Thu, 25 Sep 2025 22:15:05 -0400 Subject: [PATCH 079/103] Update `RAPIDS_BRANCH`, codify changes in `update-version.sh` (#843) This PR updates `RAPIDS_BRANCH` and modifies `update-version.sh` to make future changes automatically. 
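As a hypothetical sanity check (assuming the script is still invoked
with the next full version as its only argument), a bump to 25.12.00
would now write both files:

```bash
./ci/release/update-version.sh 25.12.00
cat VERSION        # 25.12.00
cat RAPIDS_BRANCH  # branch-25.12
```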
Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/kvikio/pull/843
---
 ci/release/update-version.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 4d8f8cffcd..a315f189a9 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -32,6 +32,7 @@ function sed_runner() {
 
 # Centralized version file update
 echo "${NEXT_FULL_TAG}" > VERSION
+echo "branch-${NEXT_SHORT_TAG}" > RAPIDS_BRANCH
 
 DEPENDENCIES=(
   kvikio

From d8787fc1ae9db44aa233612aefa2b9111202ce73 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Fri, 26 Sep 2025 17:04:30 -0500
Subject: [PATCH 080/103] Add named parameters for S3 credentials to Python
 API (#846)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`S3Endpoint` takes optional parameters for the AWS region, access key
ID, etc. If these aren't set, they're looked up from the environment.
Previously, the only way to specify these from Python was via
environment variables.

This adds named parameters to `f = kvikio.RemoteFile.open_s3` so that
users can specify the credentials programmatically. The default
behavior is unchanged: environment variables are used when not
specified otherwise.

Here's a test snippet against an S3 bucket:

```python
import sys

import boto3
import kvikio
import rmm

bucket, access_key_id, secret_access_key, session_token, default_region = sys.argv[1:]

client = boto3.client(
    's3',
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_access_key,
    aws_session_token=session_token,
    region_name=default_region,
)
key = "test/date-2025-09-16"
client.put_object(Bucket=bucket, Key=key, Body=b'Hello, world!')
client.head_object(Bucket=bucket, Key=key)

buf = rmm.DeviceBuffer(size=13)

f = kvikio.RemoteFile.open_s3(bucket, key, access_key_id=access_key_id,
                              secret_access_key=secret_access_key,
                              session_token=session_token,
                              region_name=default_region)
f.read(buf)
print(buf.tobytes())
```

I've set those variables to `_`-prefixed versions. When run, that
prints

```
❯ python debug.py kvikiobench-33622 $_AWS_ACCESS_KEY_ID $_AWS_SECRET_ACCESS_KEY $_AWS_SESSION_TOKEN $_AWS_DEFAULT_REGION
b'Hello, world!'
``` Authors: - Tom Augspurger (https://github.com/TomAugspurger) Approvers: - Tianyu Liu (https://github.com/kingcrimsontianyu) URL: https://github.com/rapidsai/kvikio/pull/846 --- python/kvikio/kvikio/_lib/remote_handle.pyx | 100 +++++++++++++++++- python/kvikio/kvikio/remote_file.py | 109 +++++++++++++++++--- 2 files changed, 189 insertions(+), 20 deletions(-) diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx index 8fae78c534..123f60ee43 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -34,8 +34,21 @@ cdef extern from "" namespace "kvikio" nogil: cpp_HttpEndpoint(string url) except + cdef cppclass cpp_S3Endpoint "kvikio::S3Endpoint"(cpp_RemoteEndpoint): - cpp_S3Endpoint(string url) except + - cpp_S3Endpoint(pair[string, string] bucket_and_object_names) except + + cpp_S3Endpoint( + string url, + optional[string] aws_region, + optional[string] aws_access_key, + optional[string] aws_secret_access_key, + optional[string] aws_session_token + ) except + + cpp_S3Endpoint( + pair[string, string] bucket_and_object_names, + optional[string] aws_region, + optional[string] aws_access_key, + optional[string] aws_secret_access_key, + optional[string] aws_endpoint_url, + optional[string] aws_session_token + ) except + pair[string, string] cpp_parse_s3_url \ "kvikio::S3Endpoint::parse_s3_url"(string url) except + @@ -89,6 +102,15 @@ cdef pair[string, string] _to_string_pair(str s1, str s2): """Wrap two Python string objects in a C++ pair""" return pair[string, string](_to_string(s1), _to_string(s2)) +cdef optional[string] _to_optional_string(str s): + """Convert Python object to a C++ optional string (if None, return nullopt)""" + cdef optional[string] result + if s is None: + result = nullopt + else: + result = optional[string](_to_string(s)) + return result + # Helper function to cast an endpoint to its base class `RemoteEndpoint` cdef extern from * nogil: @@ -168,15 +190,38 @@ cdef class RemoteFile: bucket_name: str, object_name: str, nbytes: Optional[int], + aws_region_name: Optional[str] = None, + aws_access_key_id: Optional[str] = None, + aws_secret_access_key: Optional[str] = None, + aws_endpoint_url: Optional[str] = None, + session_token: Optional[str] = None, ): cdef pair[string, string] bucket_and_object_names = _to_string_pair( bucket_name, object_name ) + cdef optional[string] cpp_aws_region = _to_optional_string(aws_region_name) + cdef optional[string] cpp_aws_access_key = _to_optional_string( + aws_access_key_id + ) + cdef optional[string] cpp_aws_secret_access_key = ( + _to_optional_string(aws_secret_access_key) + ) + cdef optional[string] cpp_aws_endpoint_url = _to_optional_string( + aws_endpoint_url + ) + cdef optional[string] cpp_aws_session_token = _to_optional_string(session_token) cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint with nogil: cpp_endpoint = cast_to_remote_endpoint( - make_unique[cpp_S3Endpoint](bucket_and_object_names) + make_unique[cpp_S3Endpoint]( + bucket_and_object_names, + cpp_aws_region, + cpp_aws_access_key, + cpp_aws_secret_access_key, + cpp_aws_endpoint_url, + cpp_aws_session_token + ) ) return RemoteFile._from_endpoint( @@ -188,13 +233,33 @@ cdef class RemoteFile: def open_s3_from_http_url( url: str, nbytes: Optional[int], + aws_region_name: Optional[str] = None, + aws_access_key_id: Optional[str] = None, + aws_secret_access_key: Optional[str] = None, + session_token: Optional[str] = None, ): cdef string cpp_url = _to_string(url) + cdef 
optional[string] cpp_aws_region = _to_optional_string(aws_region_name) + cdef optional[string] cpp_aws_access_key = _to_optional_string( + aws_access_key_id + ) + cdef optional[string] cpp_aws_secret_access_key = ( + _to_optional_string(aws_secret_access_key) + ) + cdef optional[string] cpp_aws_session_token = _to_optional_string( + session_token + ) cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint with nogil: cpp_endpoint = cast_to_remote_endpoint( - make_unique[cpp_S3Endpoint](cpp_url) + make_unique[cpp_S3Endpoint]( + cpp_url, + cpp_aws_region, + cpp_aws_access_key, + cpp_aws_secret_access_key, + cpp_aws_session_token + ) ) return RemoteFile._from_endpoint( @@ -206,15 +271,40 @@ cdef class RemoteFile: def open_s3_from_s3_url( url: str, nbytes: Optional[int], + aws_region_name: Optional[str] = None, + aws_access_key_id: Optional[str] = None, + aws_secret_access_key: Optional[str] = None, + aws_endpoint_url: Optional[str] = None, + session_token: Optional[str] = None, ): cdef string cpp_url = _to_string(url) cdef pair[string, string] bucket_and_object_names + cdef optional[string] cpp_aws_region = _to_optional_string(aws_region_name) + cdef optional[string] cpp_aws_access_key = _to_optional_string( + aws_access_key_id + ) + cdef optional[string] cpp_aws_secret_access_key = ( + _to_optional_string(aws_secret_access_key) + ) + cdef optional[string] cpp_aws_endpoint_url = _to_optional_string( + aws_endpoint_url + ) + cdef optional[string] cpp_aws_session_token = _to_optional_string( + session_token + ) cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint with nogil: bucket_and_object_names = cpp_parse_s3_url(cpp_url) cpp_endpoint = cast_to_remote_endpoint( - make_unique[cpp_S3Endpoint](bucket_and_object_names) + make_unique[cpp_S3Endpoint]( + bucket_and_object_names, + cpp_aws_region, + cpp_aws_access_key, + cpp_aws_secret_access_key, + cpp_aws_endpoint_url, + cpp_aws_session_token + ) ) return RemoteFile._from_endpoint( diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index 2064320914..49c30f4bd0 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -125,16 +125,25 @@ def open_s3( bucket_name: str, object_name: str, nbytes: Optional[int] = None, + aws_region_name: Optional[str] = None, + aws_access_key_id: Optional[str] = None, + aws_secret_access_key: Optional[str] = None, + aws_endpoint_url: Optional[str] = None, + aws_session_token: Optional[str] = None, ) -> RemoteFile: """Open a AWS S3 file from a bucket name and object name. - Please make sure to set the AWS environment variables: - - `AWS_DEFAULT_REGION` - - `AWS_ACCESS_KEY_ID` - - `AWS_SECRET_ACCESS_KEY` - - `AWS_SESSION_TOKEN` (when using temporary credentials) + AWS credentials can be provided as keyword arguments or through + environment variables: - Additionally, to overwrite the AWS endpoint, set `AWS_ENDPOINT_URL`. + - ``AWS_DEFAULT_REGION`` (or region_name parameter) + - ``AWS_ACCESS_KEY_ID`` (or access_key_id parameter) + - ``AWS_SECRET_ACCESS_KEY`` (or secret_access_key parameter) + - ``AWS_SESSION_TOKEN`` (or session_token parameter, when using + temporary credentials) + + Additionally, to overwrite the AWS endpoint, set `AWS_ENDPOINT_URL` + (or endpoint_url parameter). See Parameters @@ -146,9 +155,36 @@ def open_s3( nbytes The size of the file. If None, KvikIO will ask the server for the file size. + aws_region + The AWS region, such as "us-east-1", to use. If None, the value of the + `AWS_DEFAULT_REGION` environment variable is used. 
+ aws_access_key + The AWS access key to use. If None, the value of the + `AWS_ACCESS_KEY_ID` environment variable is used. + aws_secret_access_key + The AWS secret access key to use. If None, the value of the + `AWS_SECRET_ACCESS_KEY` environment variable is used. + aws_endpoint_url + Overwrite the endpoint url (including the protocol part) by using + the scheme: "//". If None, + the value of the `AWS_ENDPOINT_URL` environment variable is used. If + this is also not set, the regular AWS url scheme is used: + "https://.s3..amazonaws.com/". + aws_session_token + The AWS session token to use. If None, the value of the + `AWS_SESSION_TOKEN` environment variable is used. """ return cls( - _get_remote_module().RemoteFile.open_s3(bucket_name, object_name, nbytes) + _get_remote_module().RemoteFile.open_s3( + bucket_name, + object_name, + nbytes, + aws_region_name, + aws_access_key_id, + aws_secret_access_key, + aws_endpoint_url, + aws_session_token, + ) ) @classmethod @@ -156,6 +192,11 @@ def open_s3_url( cls, url: str, nbytes: Optional[int] = None, + aws_region_name: Optional[str] = None, + aws_access_key_id: Optional[str] = None, + aws_secret_access_key: Optional[str] = None, + aws_endpoint_url: Optional[str] = None, + session_token: Optional[str] = None, ) -> RemoteFile: """Open a AWS S3 file from an URL. @@ -163,14 +204,17 @@ def open_s3_url( - A full http url such as "http://127.0.0.1/my/file", or - A S3 url such as "s3:///". - Please make sure to set the AWS environment variables: - - `AWS_DEFAULT_REGION` - - `AWS_ACCESS_KEY_ID` - - `AWS_SECRET_ACCESS_KEY` - - `AWS_SESSION_TOKEN` (when using temporary credentials) + AWS credentials can be provided as keyword arguments or through + environment variables: + + - ``AWS_DEFAULT_REGION`` (or region_name parameter) + - ``AWS_ACCESS_KEY_ID`` (or access_key_id parameter) + - ``AWS_SECRET_ACCESS_KEY`` (or secret_access_key parameter) + - ``AWS_SESSION_TOKEN`` (or session_token parameter, when using + temporary credentials) Additionally, if `url` is a S3 url, it is possible to overwrite the AWS endpoint - by setting `AWS_ENDPOINT_URL`. + by setting `AWS_ENDPOINT_URL` (or endpoint_url parameter). See Parameters @@ -180,14 +224,49 @@ def open_s3_url( nbytes The size of the file. If None, KvikIO will ask the server for the file size. + aws_region + The AWS region, such as "us-east-1", to use. If None, the value of the + `AWS_DEFAULT_REGION` environment variable is used. + aws_access_key + The AWS access key to use. If None, the value of the + `AWS_ACCESS_KEY_ID` environment variable is used. + aws_secret_access_key + The AWS secret access key to use. If None, the value of the + `AWS_SECRET_ACCESS_KEY` environment variable is used. + aws_endpoint_url + Overwrite the endpoint url (including the protocol part) by using + the scheme: "//". If None, + the value of the `AWS_ENDPOINT_URL` environment variable is used. If + this is also not set, the regular AWS url scheme is used: + "https://.s3..amazonaws.com/". + aws_session_token + The AWS session token to use. If None, the value of the + `AWS_SESSION_TOKEN` environment variable is used. 
""" parsed_result = urllib.parse.urlparse(url.lower()) if parsed_result.scheme in ("http", "https"): return cls( - _get_remote_module().RemoteFile.open_s3_from_http_url(url, nbytes) + _get_remote_module().RemoteFile.open_s3_from_http_url( + url, + nbytes, + aws_region_name, + aws_access_key_id, + aws_secret_access_key, + session_token, + ) ) if parsed_result.scheme == "s3": - return cls(_get_remote_module().RemoteFile.open_s3_from_s3_url(url, nbytes)) + return cls( + _get_remote_module().RemoteFile.open_s3_from_s3_url( + url, + nbytes, + aws_region_name, + aws_access_key_id, + aws_secret_access_key, + aws_endpoint_url, + session_token, + ) + ) raise ValueError(f"Unsupported protocol: {url}") @classmethod From a5661b9e168fed1a09f28c202b6f4b8122656efd Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 2 Oct 2025 16:46:21 -0400 Subject: [PATCH 081/103] Create utility class URL builder and URL encoder. Fix a tricky URL path issue (#844) ## Background ### Initial problem There is currently an unsolved problem in libcurl, which somehow is mislabeled as merged/solved in https://github.com/curl/curl/issues/13754. For AWS S3 that requires credentials, if an object key name contains `=`, libcurl will fail with an HTTP 403 response. This problem does not occur to public S3 objects. This can be reproduced using the `curl` program: ```bash #!/usr/bin/env bash # version: curl 8.15.0-DEV curl_bin= # ..........S3 private.......... region=$(aws configure get region) user_password=$(aws configure get aws_access_key_id):$(aws configure get aws_secret_access_key) # curl can handle this. The object key name does not contain = url="https://.s3..amazonaws.com/witcher/2MiB.bin" # curl cannot handle this. The object key name contains = url="https://.s3..amazonaws.com/witcher/key=value_2MiB.bin" $curl_bin -s $url \ --aws-sigv4 "aws:amz:$region:s3" \ --user "$user_password" \ -o /dev/null -w "%{http_code}\n" -v # ..........S3 public.......... # curl can handle both url="https://.s3..amazonaws.com/witcher/2MiB.bin" url="https://.s3..amazonaws.com/witcher/key=value_2MiB.bin" $curl_bin -s $url \ -o /dev/null -w "%{http_code}\n" -v ``` ### Additional problem It has been found that beyond `=` alone, other special characters such as `!*'()` in a private S3 object will also cause libcurl error. In addition, some characters such as `+` in a public S3 object will cause the same error. ## This PR This PR addresses this problem by handling special characters listed in the [AWS object key naming guidelines](https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html#object-key-guidelines), for both private and public S3 object names. The KvikIO-specific object key naming guidelines are added to the remote file documentation. Specifically, this PR introduces utility classes `UrlBuilder` (to complement the existing `UrlParser`), which builds a URL according to the user-provided components, and `UrlEncoder` which uses a compile-time, percent-encoding lookup table to encode selected characters. 
Closes https://github.com/rapidsai/kvikio/issues/823 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Tom Augspurger (https://github.com/TomAugspurger) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/844 --- cpp/include/kvikio/detail/url.hpp | 254 +++++++++++++++++++++++++++++- cpp/src/detail/url.cpp | 153 ++++++++++++++++++ cpp/src/remote_handle.cpp | 17 +- cpp/tests/test_url.cpp | 219 ++++++++++++++++++++++++++ docs/source/remote_file.rst | 24 ++- 5 files changed, 658 insertions(+), 9 deletions(-) diff --git a/cpp/include/kvikio/detail/url.hpp b/cpp/include/kvikio/detail/url.hpp index e57d2c4c94..2feb055881 100644 --- a/cpp/include/kvikio/detail/url.hpp +++ b/cpp/include/kvikio/detail/url.hpp @@ -137,7 +137,7 @@ class UrlParser { * * @return UrlComponents structure containing the parsed URL components * - * @throw std::runtime_error if the URL cannot be parsed or if component extraction fails + * @exception std::runtime_error if the URL cannot be parsed or if component extraction fails * * Example: * @code{.cpp} @@ -151,9 +151,10 @@ class UrlParser { * CURLU_URLDECODE * ); * - * // Allow non-standard schemes + * // Allow non-standard schemes, i.e. schemes not registered with Internet Assigned Numbers + * // Authority (IANA), such as AWS S3 * auto custom = UrlParser::parse( - * "myscheme://example.com", + * "s3://my-bucket/my-object.bin", * CURLU_NON_SUPPORT_SCHEME * ); * @endcode @@ -170,7 +171,7 @@ class UrlParser { * @param bitmask_component_flags Flags controlling extraction behavior * @param allowed_err_code Optional error code to treat as valid (e.g., CURLUE_NO_SCHEME) * @return The extracted component as a string, or std::nullopt if not present - * @throw std::runtime_error if extraction fails with an unexpected error + * @exception std::runtime_error if extraction fails with an unexpected error */ static std::optional extract_component( CurlUrlHandle const& handle, @@ -187,7 +188,7 @@ class UrlParser { * @param bitmask_component_flags Flags controlling extraction behavior * @param allowed_err_code Optional error code to treat as valid * @return The extracted component as a string, or std::nullopt if not present - * @throw std::runtime_error if extraction fails with an unexpected error + * @exception std::runtime_error if extraction fails with an unexpected error */ static std::optional extract_component( std::string const& url, @@ -196,4 +197,247 @@ class UrlParser { std::optional bitmask_component_flags = std::nullopt, std::optional allowed_err_code = std::nullopt); }; + +/** + * @brief URL builder utility using libcurl's URL API + * + * This class provides methods for constructing and modifying URLs by setting individual components + * (scheme, host, port, path, query, fragment). + * + * @note This class uses libcurl's URL parsing which follows RFC 3986 plus. See + * https://curl.se/docs/url-syntax.html + * + * Example: + * @code{.cpp} + * // Build from scratch + * auto url = UrlBuilder() + * .set_scheme("https") + * .set_host("witcher4.com") + * .set_path("/ciri") + * .set_query("occupation", "witcher") + * .build(); + * + * // Modify existing URL + * auto modified = UrlBuilder("https://witcher4.com/old/path/to/bestiary") + * .set_path("/new/path/to/bestiary") + * .set_port("8080") + * .build(); + * @endcode + */ +class UrlBuilder { + private: + CurlUrlHandle _handle; + + /** + * @brief Internal helper to set a URL component + * + * @param part The URL part to set + * @param value The value to set. 
Use `nullptr` to clear + * @param flags Optional flags for the operation + * @return Reference to this builder for chaining + * @exception std::runtime_error if the operation fails + */ + UrlBuilder& set_component(CURLUPart part, + char const* value, + std::optional flags = std::nullopt); + + public: + /** + * @brief Construct an empty URL builder + * @exception std::runtime_error if initialization fails + */ + explicit UrlBuilder(); + + /** + * @brief Construct a URL builder from an existing URL string + * + * @param url The URL string to start with + * @param bitmask_url_flags Optional flags for URL parsing. Common flags include: + * - CURLU_DEFAULT_SCHEME: Allows URLs without schemes + * - CURLU_NON_SUPPORT_SCHEME: Accept non-supported schemes + * - CURLU_URLENCODE: URL encode the path + * @exception std::runtime_error if the URL cannot be parsed + */ + explicit UrlBuilder(std::string const& url, + std::optional bitmask_url_flags = std::nullopt); + + /** + * @brief Construct a URL builder from parsed URL components + * + * @param components The parsed URL components to start with + * @param bitmask_url_flags Optional flags for URL handling + * @exception std::runtime_error if the components cannot be set + */ + explicit UrlBuilder(UrlParser::UrlComponents const& components, + std::optional bitmask_url_flags = std::nullopt); + + /** + * @brief Set the URL scheme (e.g., "http", "https", "ftp") + * + * @param scheme The scheme to set. Use `std::nullopt` to clear + * @return Reference to this builder for chaining + * @exception std::runtime_error if the scheme is invalid + * + * Example: + * @code{.cpp} + * builder.set_scheme("https"); + * @endcode + */ + UrlBuilder& set_scheme(std::optional const& scheme); + + /** + * @brief Set the hostname or IP address + * + * @param host The host to set. Use `std::nullopt` to clear + * @return Reference to this builder for chaining + * @exception std::runtime_error if the host is invalid + * + * Example: + * @code{.cpp} + * builder.set_host("api.example.com"); + * @endcode + */ + UrlBuilder& set_host(std::optional const& host); + + /** + * @brief Set the port number + * + * @param port The port to set as string. Use `std::nullopt` to clear + * @return Reference to this builder for chaining + * @exception std::runtime_error if the port is invalid + * + * Example: + * @code{.cpp} + * builder.set_port("8080"); + * @endcode + */ + UrlBuilder& set_port(std::optional const& port); + + /** + * @brief Set the path component + * + * @param path The path to set (should start with "/" for absolute paths). Use `std::nullopt` to + * clear + * @return Reference to this builder for chaining + * @exception std::runtime_error if the path is invalid + * + * Example: + * @code{.cpp} + * builder.set_path("/api/v1/users"); + * @endcode + */ + UrlBuilder& set_path(std::optional const& path); + + /** + * @brief Set the entire query string + * + * @param query The query string (without leading "?"). Use `std::nullopt` to clear + * @return Reference to this builder for chaining + * @exception std::runtime_error if the query is invalid + * + * Example: + * @code{.cpp} + * builder.set_query("page=1&limit=10"); + * @endcode + */ + UrlBuilder& set_query(std::optional const& query); + + /** + * @brief Set the fragment identifier + * + * @param fragment The fragment (without leading "#"). 
Use `std::nullopt` to clear + * @return Reference to this builder for chaining + * @exception std::runtime_error if the fragment is invalid + * + * Example: + * @code{.cpp} + * builder.set_fragment("section-2"); + * @endcode + */ + UrlBuilder& set_fragment(std::optional const& fragment); + + /** + * @brief Build the final URL string + * + * @param bitmask_component_flags Optional flags for URL formatting. Common flags: + * - CURLU_PUNYCODE: Convert host to punycode if needed + * - CURLU_NO_DEFAULT_PORT: Include port even if it's the default for the scheme + * @return The complete URL string + * @exception std::runtime_error if the URL cannot be built + * + * Example: + * @code{.cpp} + * std::string url = builder.build(); + * @endcode + */ + std::string build(std::optional bitmask_component_flags = std::nullopt) const; + + static std::string build_manually(UrlParser::UrlComponents const& components); +}; + +/** + * @brief Provides URL encoding functionality + * + * The AWS object naming documentation + * (https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html) lists several + * types of special characters. In practice, handling them using libcurl is complex and described + * below. + * + * - Special characters that are safe for use in key names: "!-_.*'()" KvikIO includes !*'() in + * `aws_special_chars`, because for private bucket they cause AWS authentication by libcurl to fail + * + * - Characters that might require special handling: "&$@=;/:+ ,? and 0-31, 127 ASCII + * characters". For /, KvikIO does not include it in `aws_special_chars`, because it can be legally + * used as a path separator. For the space character and ?, although KvikIO has them in + * `aws_special_chars`, users must manually percent encode them to %20 and %3F, respectively. + * Otherwise, the space character will be considered malformed by libcurl, and ? cause ambiguity + * with the query string. For the control characters, KvikIO include them all in + * `aws_special_chars`. + * + * - Characters to avoid: "\{^}%`]">[~<#| and 128-255 non-ASCII characters". KvikIO recommends + * users avoiding these characters in the URL. They are not included in `aws_special_chars`. + * + */ +class UrlEncoder { + public: + /** + * @brief Default set of special characters requiring encoding in AWS URLs + */ + static constexpr char aws_special_chars[] = { + '!', '*', '\'', '(', ')', '&', '$', '@', '=', ';', ':', '+', + ' ', ',', '?', '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\x08', + '\x09', '\x0A', '\x0B', '\x0C', '\x0D', '\x0E', '\x0F', '\x10', '\x11', '\x12', '\x13', '\x14', + '\x15', '\x16', '\x17', '\x18', '\x19', '\x1A', '\x1B', '\x1C', '\x1D', '\x1E', '\x1F', '\x7F'}; + + /** + * @brief Percent-encodes specified characters in a URL path + * + * Performs percent-encoding (RFC 3986) on a given path string, encoding only the characters + * specified in the chars_to_encode parameter. Each encoded character is replaced with its + * percent-encoded equivalent (%XX where XX is the hexadecimal representation of the character). + * + * Only ASCII characters (0-127) are supported for encoding. Non-ASCII characters in + * chars_to_encode will be encoded to an empty string. Characters not in chars_to_encode are + * passed through unchanged. 
+ * + * @param path The path string to encode + * @param chars_to_encode Set of characters that should be encoded (defaults to aws_special_chars) + * + * @return A new string with specified characters percent-encoded + * + * @code{.cpp} + * // Example usage with default AWS special characters + * std::string encoded = UrlEncoder::encode_path("/path/ with spaces"); + * // Result: "/path/%20with%20spaces" + * + * // Example with custom character set + * std::string encoded = UrlEncoder::encode_path("hello/world", "/"); + * // Result: "hello%2Fworld" + * @endcode + */ + static std::string encode_path(std::string_view path, + std::string_view chars_to_encode = std::string_view{ + aws_special_chars, sizeof(aws_special_chars)}); +}; + } // namespace kvikio::detail diff --git a/cpp/src/detail/url.cpp b/cpp/src/detail/url.cpp index 64f5b8fde9..5542a1285f 100644 --- a/cpp/src/detail/url.cpp +++ b/cpp/src/detail/url.cpp @@ -14,8 +14,11 @@ * limitations under the License. */ +#include +#include #include #include +#include #include #include @@ -139,4 +142,154 @@ UrlParser::UrlComponents UrlParser::parse(std::string const& url, return components; } + +UrlBuilder::UrlBuilder() {} + +UrlBuilder::UrlBuilder(std::string const& url, std::optional bitmask_url_flags) +{ + if (!bitmask_url_flags.has_value()) { bitmask_url_flags = 0U; } + + CHECK_CURL_URL_ERR( + curl_url_set(_handle.get(), CURLUPART_URL, url.c_str(), bitmask_url_flags.value())); +} + +UrlBuilder::UrlBuilder(UrlParser::UrlComponents const& components, + std::optional bitmask_url_flags) +{ + // Start with an empty builder + // Set each component if present + if (components.scheme.has_value()) { set_scheme(components.scheme); } + if (components.host.has_value()) { set_host(components.host); } + if (components.port.has_value()) { set_port(components.port); } + if (components.path.has_value()) { set_path(components.path); } + if (components.query.has_value()) { set_query(components.query); } + if (components.fragment.has_value()) { set_fragment(components.fragment); } +} + +UrlBuilder& UrlBuilder::set_component(CURLUPart part, + char const* value, + std::optional flags) +{ + if (!flags.has_value()) { flags = 0U; } + + CHECK_CURL_URL_ERR(curl_url_set(_handle.get(), part, value, flags.value())); + return *this; +} + +UrlBuilder& UrlBuilder::set_scheme(std::optional const& scheme) +{ + auto const* value = scheme.has_value() ? scheme.value().c_str() : nullptr; + return set_component(CURLUPART_SCHEME, value); +} + +UrlBuilder& UrlBuilder::set_host(std::optional const& host) +{ + auto const* value = host.has_value() ? host.value().c_str() : nullptr; + return set_component(CURLUPART_HOST, value); +} + +UrlBuilder& UrlBuilder::set_port(std::optional const& port) +{ + auto const* value = port.has_value() ? port.value().c_str() : nullptr; + return set_component(CURLUPART_PORT, value); +} + +UrlBuilder& UrlBuilder::set_path(std::optional const& path) +{ + auto const* value = path.has_value() ? path.value().c_str() : nullptr; + return set_component(CURLUPART_PATH, value); +} + +UrlBuilder& UrlBuilder::set_query(std::optional const& query) +{ + auto const* value = query.has_value() ? query.value().c_str() : nullptr; + return set_component(CURLUPART_QUERY, value); +} + +UrlBuilder& UrlBuilder::set_fragment(std::optional const& fragment) +{ + auto const* value = fragment.has_value() ? 
fragment.value().c_str() : nullptr; + return set_component(CURLUPART_FRAGMENT, value); +} + +std::string UrlBuilder::build(std::optional bitmask_component_flags) const +{ + if (!bitmask_component_flags.has_value()) { bitmask_component_flags = 0U; } + + char* url = nullptr; + CHECK_CURL_URL_ERR( + curl_url_get(_handle.get(), CURLUPART_URL, &url, bitmask_component_flags.value())); + + KVIKIO_EXPECT( + url != nullptr, "Failed to build URL: curl_url_get returned nullptr", std::runtime_error); + + std::string result(url); + curl_free(url); + return result; +} + +std::string UrlBuilder::build_manually(UrlParser::UrlComponents const& components) +{ + std::string url; + if (components.scheme) { url += components.scheme.value() + "://"; } + if (components.host) { url += components.host.value(); } + if (components.port) { url += ":" + components.port.value(); } + if (components.path) { url += components.path.value(); } + if (components.query) { url += "?" + components.query.value(); } + if (components.fragment) { url += "#" + components.fragment.value(); } + return url; +} + +namespace { +/** + * @brief Compile-time encoding lookup table + * + * ASCII characters will be percent-encoded. For example, = has a hexadecimal value of 3D, and the + * encoding result is %3D. Characters outside the ASCII region are encoded to NUL and map to an + * empty std::string. + */ +struct EncodingTable { + std::array table; + constexpr EncodingTable() : table{} + { + char const num_to_chars[] = "0123456789ABCDEF"; + for (uint16_t idx = 0U; idx < table.size(); ++idx) { + if (idx < 128) { + table[idx][0] = '%'; + table[idx][1] = num_to_chars[idx >> 4]; + table[idx][2] = num_to_chars[idx & 0x0F]; + table[idx][3] = '\0'; + } else { + table[idx][0] = '\0'; + } + } + } +}; +} // namespace + +std::string UrlEncoder::encode_path(std::string_view path, std::string_view chars_to_encode) +{ + constexpr EncodingTable encoding_table{}; + + std::array should_encode{}; + for (auto const c : chars_to_encode) { + std::size_t idx = static_cast(c); + should_encode[idx] = true; + } + + std::string result; + for (auto const c : path) { + std::size_t idx = static_cast(c); + if (should_encode[idx]) { + // If the character is within chars_to_encode, encode it + result += std::string{reinterpret_cast(encoding_table.table[idx])}; + } else { + // Otherwise, pass it through + result += c; + } + } + + return result; +} + } // namespace kvikio::detail diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index 33b481a74a..e2e4ea21c8 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -228,6 +228,13 @@ char const* get_remote_endpoint_type_name(RemoteEndpointType remote_endpoint_typ return "UNKNOWN"; } } + +std::string encode_special_chars_in_path(std::string const& url) +{ + auto components = detail::UrlParser::parse(url); + components.path = detail::UrlEncoder::encode_path(components.path.value()); + return detail::UrlBuilder::build_manually(components); +} } // namespace RemoteEndpoint::RemoteEndpoint(RemoteEndpointType remote_endpoint_type) @@ -275,7 +282,9 @@ void HttpEndpoint::setopt(CurlHandle& curl) { curl.setopt(CURLOPT_URL, _url.c_st void S3Endpoint::setopt(CurlHandle& curl) { - curl.setopt(CURLOPT_URL, _url.c_str()); + auto new_url = encode_special_chars_in_path(_url); + curl.setopt(CURLOPT_URL, new_url.c_str()); + curl.setopt(CURLOPT_AWS_SIGV4, _aws_sigv4.c_str()); curl.setopt(CURLOPT_USERPWD, _aws_userpwd.c_str()); if (_curl_header_list) { curl.setopt(CURLOPT_HTTPHEADER, _curl_header_list); } @@ -452,7 
+461,11 @@ S3PublicEndpoint::S3PublicEndpoint(std::string url) { } -void S3PublicEndpoint::setopt(CurlHandle& curl) { curl.setopt(CURLOPT_URL, _url.c_str()); } +void S3PublicEndpoint::setopt(CurlHandle& curl) +{ + auto new_url = encode_special_chars_in_path(_url); + curl.setopt(CURLOPT_URL, new_url.c_str()); +} std::string S3PublicEndpoint::str() const { return _url; } diff --git a/cpp/tests/test_url.cpp b/cpp/tests/test_url.cpp index ce419ed5a5..e359c5442b 100644 --- a/cpp/tests/test_url.cpp +++ b/cpp/tests/test_url.cpp @@ -18,6 +18,7 @@ #include #include +#include #include using ::testing::HasSubstr; @@ -62,3 +63,221 @@ TEST(UrlTest, parse_host) EXPECT_THROW({ kvikio::detail::UrlParser::parse(invalid_host_url); }, std::runtime_error); } } + +TEST(UrlTest, build_url) +{ + // Build a URL from scratch + { + std::string scheme{"https"}; + std::string host{"api.example.com"}; + std::string port{"8080"}; + std::string path{"/v1/users"}; + std::string query{"page=1&limit=10"}; + std::string fragment{"results"}; + std::stringstream ss; + ss << scheme << "://" << host << ":" << port << path << "?" << query << "#" << fragment; + + { + auto url = kvikio::detail::UrlBuilder() + .set_scheme("https") + .set_host("api.example.com") + .set_port("8080") + .set_path("/v1/users") + .set_query("page=1&limit=10") + .set_fragment("results") + .build(); + + EXPECT_EQ(url, ss.str()); + } + + // The components do not have to be specified in their correct order + { + auto url = kvikio::detail::UrlBuilder() + .set_fragment("results") + .set_scheme("https") + .set_path("/v1/users") + .set_host("api.example.com") + .set_query("page=1&limit=10") + .set_port("8080") + .build(); + + EXPECT_EQ(url, ss.str()); + } + } + + // Modify an existing URL + { + std::string scheme_host{"https://api.example.com"}; + std::string query{"page=1&limit=10"}; + + std::string old_path{"/old/path/file.txt"}; + std::string new_path{"/new/path/document.pdf"}; + + // Modify the path + { + std::string old_url = scheme_host + old_path + "?" + query; + std::string expected_new_url = scheme_host + new_path + "?" + query; + + auto actual_new_url = kvikio::detail::UrlBuilder(old_url).set_path(new_path).build(); + EXPECT_EQ(actual_new_url, expected_new_url); + } + + // Modify the path and add the query + std::string port{"8080"}; + std::string old_url = scheme_host + old_path; + std::string expected_new_url = scheme_host + ":" + port + new_path + "?" + query; + + auto actual_new_url = kvikio::detail::UrlBuilder(old_url) + .set_port(port) + .set_path(new_path) + .set_query(query) + .build(); + EXPECT_EQ(actual_new_url, expected_new_url); + } + + // Build from parsed components + { + std::string scheme{"https"}; + std::string host{"api.example.com"}; + std::string path{"/v1/users"}; + std::string query{"page=1&limit=10"}; + std::stringstream ss; + ss << scheme << "://" << host << path << "?" << query; + + // First parse an existing URL + auto components = kvikio::detail::UrlParser::parse(ss.str()); + + // Modify components + components.path = "/v2/api"; + components.port = "443"; + + // Build new URL from modified components + auto actual_new_url = kvikio::detail::UrlBuilder(components).build(); + + // Expected URL + ss.str(""); + ss << scheme << "://" << host << ":" << components.port.value() << components.path.value() + << "?" 
<< query; + + EXPECT_EQ(actual_new_url, ss.str()); + } + + // AWS S3-like URL + { + std::string path = "/my-bucket/&$@=;:+,.txt"; + auto url = kvikio::detail::UrlBuilder("https://s3.region.amazonaws.com").set_path(path).build(); + std::string encoded_path = kvikio::detail::UrlEncoder::encode_path(path); + + auto actual_encoded_url = kvikio::detail::UrlBuilder(url).set_path(encoded_path).build(); + std::string expected_encoded_url{ + "https://s3.region.amazonaws.com/my-bucket/%26%24%40%3D%3B%3A%2B%2C.txt"}; + + std::transform(actual_encoded_url.begin(), + actual_encoded_url.end(), + actual_encoded_url.begin(), + [](unsigned char c) { return std::tolower(c); }); + + std::transform(expected_encoded_url.begin(), + expected_encoded_url.end(), + expected_encoded_url.begin(), + [](unsigned char c) { return std::tolower(c); }); + + EXPECT_EQ(actual_encoded_url, expected_encoded_url); + } +} + +TEST(UrlTest, encoding_table) +{ + // Look up the reserved characters (RFC 3986 section 2.2) in the encoding table + { + std::string special_chars{"!#$&\'()*+,/:;=?@[]"}; + std::string expected_result{"%21%23%24%26%27%28%29%2A%2B%2C%2F%3A%3B%3D%3F%40%5B%5D"}; + // First parameter: string containing special characters + // Second parameter: a sequence of special characters to be encoded + std::string actual_result = + kvikio::detail::UrlEncoder::encode_path(special_chars, special_chars); + EXPECT_EQ(actual_result, expected_result); + } + + // Check a few samples from the encoding table. Out-of-bound characters (beyond ASCII table) are + // expected to be encoded to empty strings. + { + std::array input{0, // First ASCII char NUL + '\x3D', + 127, // Last ASCII char DEL + 128, // Out-of-bound chars + 200, + 255}; + std::array expected_results{"%00", + "%3D", + "%7F" + "", + "", + ""}; + for (std::size_t i = 0; i < input.size(); ++i) { + std::string s{static_cast(input[i])}; + std::string actual_result = kvikio::detail::UrlEncoder::encode_path(s, s); + EXPECT_EQ(actual_result, expected_results[i]); + } + } + + // Check control characters + { + std::map mapping{ + {'\x00', "%00"}, {'\x1A', "%1A"}, {'\x1F', "%1F"}, {'\x7F', "%7F"}}; + + for (auto const [question, answer] : mapping) { + // Construct a string view for the character, and specify the size explicitly to take account + // of NUL + std::string sv{&question, 1}; + std::string result = kvikio::detail::UrlEncoder::encode_path(sv, sv); + EXPECT_EQ(result, answer); + } + } + + // Check out-of-bound characters + { + unsigned char out_of_bound_chars[] = {128, 200, 255}; + std::string_view sv{reinterpret_cast(out_of_bound_chars), sizeof(out_of_bound_chars)}; + std::string result = kvikio::detail::UrlEncoder::encode_path(sv, sv); + EXPECT_EQ(result, ""); + } +} + +TEST(UrlTest, encode_url) +{ + // Path does not contain characters that require special handling, so no character is encoded + { + std::string original{"abc123/-_..bin"}; + auto encoded = kvikio::detail::UrlEncoder::encode_path(original); + EXPECT_EQ(original, encoded); + } + + // chars_to_encode is empty, so no character is encoded + { + std::string original{"abc123/!-_.*'()/&$@=;:+ ,?.bin"}; + auto encoded = kvikio::detail::UrlEncoder::encode_path(original, {}); + EXPECT_EQ(original, encoded); + } + + // Test all characters mentioned by AWS documentation that require special handling + { + std::string const& input{kvikio::detail::UrlEncoder::aws_special_chars}; + auto encoded = kvikio::detail::UrlEncoder::encode_path(input); + + // Encoding is performed, so the string is expected to be changed + 
EXPECT_NE(input, encoded); + + auto* curl = curl_easy_init(); + auto* expected = curl_easy_escape(curl, input.data(), input.size()); + EXPECT_NE(expected, nullptr); + EXPECT_EQ(encoded, std::string{expected}); + + curl_free(expected); + curl_easy_cleanup(curl); + + // aws_special_chars does not contain %, so double encoding is expected to not alter anything + auto double_encoded = kvikio::detail::UrlEncoder::encode_path(encoded); + EXPECT_EQ(encoded, double_encoded); + } +} diff --git a/docs/source/remote_file.rst b/docs/source/remote_file.rst index ed6fe45b7b..e6d038035e 100644 --- a/docs/source/remote_file.rst +++ b/docs/source/remote_file.rst @@ -1,11 +1,31 @@ Remote File =========== -KvikIO provides direct access to remote files. - +KvikIO provides direct access to remote files, including AWS S3, WebHDFS, and generic HTTP/HTTPS. Example ------- .. literalinclude:: ../../python/kvikio/examples/http_io.py :language: python + +AWS S3 object naming requirement +-------------------------------- + +KvikIO imposes the following naming requirements derived from the `AWS object naming guidelines `_ . + + - ``!``, ``*``, ``'``, ``(``, ``)``, ``&``, ``$``, ``@``, ``=``, ``;``, ``:``, ``+``, ``,``: These special characters are automatically encoded by KvikIO, and are safe for use in key names. + + - ``-``, ``_``, ``.``: These special characters are **not** automatically encoded by KvikIO, but are still safe for use in key names. + + - ``/`` is used as path separator and must not appear in the object name itself. + + - Space character must be explicitly encoded (``%20``) because it will otherwise render the URL malformed. + + - ``?`` must be explicitly encoded (``%3F``) because it will otherwise cause ambiguity with the query string. + + - Control characters ``0x00`` ~ ``0x1F`` hexadecimal (0~31 decimal) and ``0x7F`` (127) are automatically encoded by KvikIO, and are safe for use in key names. + + - Other printable special characters must be avoided, such as ``\``, ``{``, ``^``, ``}``, ``%``, `````, ``]``, ``"``, ``>``, ``[``, ``~``, ``<``, ``#``, ``|``. + + - Non-ASCII characters ``0x80`` ~ ``0xFF`` (128~255) must be avoided. From 1222bd47b68be6d949d2857a1ac91f8d02d8c94e Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Tue, 7 Oct 2025 10:37:37 -0400 Subject: [PATCH 082/103] Make parameter name more consistent for S3 utility functions (#848) This small PR adds `aws_` prefix to the parameter `session_token` to make the parameter names more consistent for S3 utility functions. Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) - Tom Augspurger (https://github.com/TomAugspurger) URL: https://github.com/rapidsai/kvikio/pull/848 --- python/kvikio/kvikio/_lib/remote_handle.pyx | 14 ++++++++------ python/kvikio/kvikio/remote_file.py | 10 +++++----- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx index 123f60ee43..545ddae731 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -194,7 +194,7 @@ cdef class RemoteFile: aws_access_key_id: Optional[str] = None, aws_secret_access_key: Optional[str] = None, aws_endpoint_url: Optional[str] = None, - session_token: Optional[str] = None, + aws_session_token: Optional[str] = None, ): cdef pair[string, string] bucket_and_object_names = _to_string_pair( bucket_name, object_name @@ -209,7 +209,9 @@ cdef class RemoteFile: cdef optional[string] cpp_aws_endpoint_url = _to_optional_string( aws_endpoint_url ) - cdef optional[string] cpp_aws_session_token = _to_optional_string(session_token) + cdef optional[string] cpp_aws_session_token = _to_optional_string( + aws_session_token + ) cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint with nogil: @@ -236,7 +238,7 @@ cdef class RemoteFile: aws_region_name: Optional[str] = None, aws_access_key_id: Optional[str] = None, aws_secret_access_key: Optional[str] = None, - session_token: Optional[str] = None, + aws_session_token: Optional[str] = None, ): cdef string cpp_url = _to_string(url) cdef optional[string] cpp_aws_region = _to_optional_string(aws_region_name) @@ -247,7 +249,7 @@ cdef class RemoteFile: _to_optional_string(aws_secret_access_key) ) cdef optional[string] cpp_aws_session_token = _to_optional_string( - session_token + aws_session_token ) cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint @@ -275,7 +277,7 @@ cdef class RemoteFile: aws_access_key_id: Optional[str] = None, aws_secret_access_key: Optional[str] = None, aws_endpoint_url: Optional[str] = None, - session_token: Optional[str] = None, + aws_session_token: Optional[str] = None, ): cdef string cpp_url = _to_string(url) cdef pair[string, string] bucket_and_object_names @@ -290,7 +292,7 @@ cdef class RemoteFile: aws_endpoint_url ) cdef optional[string] cpp_aws_session_token = _to_optional_string( - session_token + aws_session_token ) cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index 49c30f4bd0..31d9d19831 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -139,7 +139,7 @@ def open_s3( - ``AWS_DEFAULT_REGION`` (or region_name parameter) - ``AWS_ACCESS_KEY_ID`` (or access_key_id parameter) - ``AWS_SECRET_ACCESS_KEY`` (or secret_access_key parameter) - - ``AWS_SESSION_TOKEN`` (or session_token parameter, when using + - ``AWS_SESSION_TOKEN`` (or aws_session_token parameter, when using temporary credentials) Additionally, to overwrite the AWS endpoint, set `AWS_ENDPOINT_URL` @@ -196,7 +196,7 @@ def open_s3_url( aws_access_key_id: Optional[str] = None, aws_secret_access_key: Optional[str] = None, aws_endpoint_url: Optional[str] = None, - session_token: Optional[str] = None, + aws_session_token: Optional[str] = None, ) -> RemoteFile: """Open a AWS S3 file from an URL. 
@@ -210,7 +210,7 @@ def open_s3_url( - ``AWS_DEFAULT_REGION`` (or region_name parameter) - ``AWS_ACCESS_KEY_ID`` (or access_key_id parameter) - ``AWS_SECRET_ACCESS_KEY`` (or secret_access_key parameter) - - ``AWS_SESSION_TOKEN`` (or session_token parameter, when using + - ``AWS_SESSION_TOKEN`` (or aws_session_token parameter, when using temporary credentials) Additionally, if `url` is a S3 url, it is possible to overwrite the AWS endpoint @@ -252,7 +252,7 @@ def open_s3_url( aws_region_name, aws_access_key_id, aws_secret_access_key, - session_token, + aws_session_token, ) ) if parsed_result.scheme == "s3": @@ -264,7 +264,7 @@ def open_s3_url( aws_access_key_id, aws_secret_access_key, aws_endpoint_url, - session_token, + aws_session_token, ) ) raise ValueError(f"Unsupported protocol: {url}") From 9ac0c317a352315bc82d925e09a6c82684ce3695 Mon Sep 17 00:00:00 2001 From: Jake Awe Date: Wed, 8 Oct 2025 15:24:37 +0000 Subject: [PATCH 083/103] Update Changelog [skip ci] --- CHANGELOG.md | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a3aeaa2633..1e6789b685 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,59 @@ +# kvikio 25.10.00 (8 Oct 2025) + +## 🚨 Breaking Changes + +- Support access to public S3 ([#820](https://github.com/rapidsai/kvikio/pull/820)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Reduce duplication between compat manager and defaults ([#816](https://github.com/rapidsai/kvikio/pull/816)) [@vyasr](https://github.com/vyasr) +- Devendor libnvcomp from libkvikio ([#805](https://github.com/rapidsai/kvikio/pull/805)) [@bdice](https://github.com/bdice) +- Remove Python nvCOMP bindings and Zarr 2 support ([#798](https://github.com/rapidsai/kvikio/pull/798)) [@vuule](https://github.com/vuule) + +## 🐛 Bug Fixes + +- Keep string alive until nvtxDomainResourceCreate ([#832](https://github.com/rapidsai/kvikio/pull/832)) [@vyasr](https://github.com/vyasr) +- Fix a bug in public S3 inference ([#831](https://github.com/rapidsai/kvikio/pull/831)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Fix CUDA 13 handling of libcufile on aarch64 ([#827](https://github.com/rapidsai/kvikio/pull/827)) [@bdice](https://github.com/bdice) +- Skip max_device_cache_size setter when BAR1 memory isn't present on the GPUs in the system ([#814](https://github.com/rapidsai/kvikio/pull/814)) [@ahoyle-nvidia](https://github.com/ahoyle-nvidia) +- Fix an S3 parsing bug in the open function. Improve regex usage ([#810](https://github.com/rapidsai/kvikio/pull/810)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Remove remaining nvcomp references ([#801](https://github.com/rapidsai/kvikio/pull/801)) [@vyasr](https://github.com/vyasr) +- Revert "Set compiler versions in context (#755)" ([#784](https://github.com/rapidsai/kvikio/pull/784)) [@vyasr](https://github.com/vyasr) +- Relax mmap read requirement. Improve error message.
([#781](https://github.com/rapidsai/kvikio/pull/781)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) + +## 🚀 New Features + +- Support access to public S3 ([#820](https://github.com/rapidsai/kvikio/pull/820)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Upgrade to nvCOMP 5.0.0.6 ([#800](https://github.com/rapidsai/kvikio/pull/800)) [@vuule](https://github.com/vuule) +- Remove Python nvCOMP bindings and Zarr 2 support ([#798](https://github.com/rapidsai/kvikio/pull/798)) [@vuule](https://github.com/vuule) +- Support WebHDFS (2/2): Python binding ([#791](https://github.com/rapidsai/kvikio/pull/791)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Support file size query for S3 presigned URL ([#789](https://github.com/rapidsai/kvikio/pull/789)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Support WebHDFS (1/2): C++ implementation ([#788](https://github.com/rapidsai/kvikio/pull/788)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Support file-backed mapping (2/n): host/device read Python binding ([#742](https://github.com/rapidsai/kvikio/pull/742)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) + +## 🛠️ Improvements + +- Empty commit to trigger a build ([#840](https://github.com/rapidsai/kvikio/pull/840)) [@msarahan](https://github.com/msarahan) +- Configure repo for automatic release notes generation ([#825](https://github.com/rapidsai/kvikio/pull/825)) [@AyodeAwe](https://github.com/AyodeAwe) +- Use C++20 for KvikIO main library ([#819](https://github.com/rapidsai/kvikio/pull/819)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Explicitly manage TLS/SSL CA paths for remote I/O ([#817](https://github.com/rapidsai/kvikio/pull/817)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Reduce duplication between compat manager and defaults ([#816](https://github.com/rapidsai/kvikio/pull/816)) [@vyasr](https://github.com/vyasr) +- Added KVIKIO_REMOTE_VERBOSE option ([#815](https://github.com/rapidsai/kvikio/pull/815)) [@TomAugspurger](https://github.com/TomAugspurger) +- Use branch-25.10 again ([#812](https://github.com/rapidsai/kvikio/pull/812)) [@jameslamb](https://github.com/jameslamb) +- Update rapids-dependency-file-generator ([#809](https://github.com/rapidsai/kvikio/pull/809)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Add a unified remote I/O interface that infers the endpoint type from URL (2/2): Python binding ([#808](https://github.com/rapidsai/kvikio/pull/808)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Devendor libnvcomp from libkvikio ([#805](https://github.com/rapidsai/kvikio/pull/805)) [@bdice](https://github.com/bdice) +- Remove more nvcomp packaging for conda ([#804](https://github.com/rapidsai/kvikio/pull/804)) [@bdice](https://github.com/bdice) +- Build and test with CUDA 13.0.0 ([#803](https://github.com/rapidsai/kvikio/pull/803)) [@jameslamb](https://github.com/jameslamb) +- Optionally require zarr>=3.0.0 ([#802](https://github.com/rapidsai/kvikio/pull/802)) [@TomAugspurger](https://github.com/TomAugspurger) +- Use build cluster in devcontainers ([#797](https://github.com/rapidsai/kvikio/pull/797)) [@trxcllnt](https://github.com/trxcllnt) +- Improve KvikIO Python binding performance by releasing GIL wherever deemed necessary ([#796](https://github.com/rapidsai/kvikio/pull/796)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Update rapids_config to handle user defined branch name 
([#794](https://github.com/rapidsai/kvikio/pull/794)) [@robertmaynard](https://github.com/robertmaynard) +- Add a unified remote I/O interface that infers the endpoint type from URL (1/2): C++ implementation ([#793](https://github.com/rapidsai/kvikio/pull/793)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Update rapids-build-backend to 0.4.0 ([#790](https://github.com/rapidsai/kvikio/pull/790)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Fix C++20 warning in the mmap test ([#785](https://github.com/rapidsai/kvikio/pull/785)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Allow latest OS in devcontainers ([#780](https://github.com/rapidsai/kvikio/pull/780)) [@bdice](https://github.com/bdice) +- Update build infra to support new branching strategy ([#776](https://github.com/rapidsai/kvikio/pull/776)) [@robertmaynard](https://github.com/robertmaynard) +- Use GCC 14 in conda builds. ([#756](https://github.com/rapidsai/kvikio/pull/756)) [@vyasr](https://github.com/vyasr) +- Use C++20 standard ([#749](https://github.com/rapidsai/kvikio/pull/749)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) + # kvikio 25.08.00 (6 Aug 2025) ## 🚨 Breaking Changes From a1171e1519e6756e979af5b029ee9142fbe24af6 Mon Sep 17 00:00:00 2001 From: Paul Taylor <178183+trxcllnt@users.noreply.github.com> Date: Wed, 15 Oct 2025 09:07:40 -0700 Subject: [PATCH 084/103] Enable `sccache-dist` connection pool (#852) Authors: - Paul Taylor (https://github.com/trxcllnt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/kvikio/pull/852 --- .devcontainer/Dockerfile | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 4775d28f23..0088aaddd6 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -34,8 +34,7 @@ ENV HISTFILE="/home/coder/.cache/._bash_history" ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" ENV SCCACHE_REGION="us-east-2" ENV SCCACHE_BUCKET="rapids-sccache-devs" -# 2hr (1 minute longer than sccache-dist request timeout) -ENV SCCACHE_IDLE_TIMEOUT=7200 +ENV SCCACHE_IDLE_TIMEOUT=0 ### # sccache-dist configuration @@ -46,13 +45,8 @@ ENV DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST=1 ENV SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=true # Retry transient errors 4 times (for a total of 5 attempts) ENV SCCACHE_DIST_MAX_RETRIES=4 -ENV SCCACHE_DIST_CONNECT_TIMEOUT=30 -ENV SCCACHE_DIST_CONNECTION_POOL=false # 1hr 59min (to accommodate debug builds) ENV SCCACHE_DIST_REQUEST_TIMEOUT=7140 -ENV SCCACHE_DIST_KEEPALIVE_ENABLED=true -ENV SCCACHE_DIST_KEEPALIVE_INTERVAL=20 -ENV SCCACHE_DIST_KEEPALIVE_TIMEOUT=600 ENV SCCACHE_DIST_URL="https://${TARGETARCH}.linux.sccache.rapids.nvidia.com" # Build as much in parallel as possible From 21fb8a5f52a476b0f09b569653f838a85c770705 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 16 Oct 2025 14:18:05 -0400 Subject: [PATCH 085/103] Fix an out-of-bounds memory access in file open flags checking (#853) This small PR fixes an out-of-bounds memory access that happens when the file open flags consist of a single character (e.g. `"r"` or `"w"` without the `"+"` suffix). 
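To make the hazard concrete, here is a condensed, self-contained sketch of the guarded parsing (an illustration following the diff below, not the full KvikIO implementation; the function name is invented for this example). With a one-character flag string such as "r", the old code tested flags[1] unconditionally, reading past the end of the caller-provided characters; the length check is the fix.

#include <fcntl.h>

#include <stdexcept>
#include <string>

// Condensed sketch of open-flag parsing ("r", "r+", "w", "w+").
// The flags.length() > 1 guard prevents reading flags[1] when the
// string holds only a single character.
int open_flags_sketch(std::string const& flags)
{
  if (flags.empty()) { throw std::invalid_argument("empty open flags"); }
  int file_flags = 0;
  switch (flags[0]) {
    case 'r':
      file_flags = O_RDONLY;
      if (flags.length() > 1 && flags[1] == '+') { file_flags = O_RDWR; }
      break;
    case 'w':
      file_flags = O_WRONLY;
      if (flags.length() > 1 && flags[1] == '+') { file_flags = O_RDWR; }
      file_flags |= O_CREAT | O_TRUNC;
      break;
    default: throw std::invalid_argument("unsupported open flags");
  }
  return file_flags;
}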
Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/kvikio/pull/853 --- cpp/src/file_utils.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/file_utils.cpp b/cpp/src/file_utils.cpp index e567b86476..04e3d1a16c 100644 --- a/cpp/src/file_utils.cpp +++ b/cpp/src/file_utils.cpp @@ -131,11 +131,11 @@ int open_fd_parse_flags(std::string const& flags, bool o_direct) switch (flags[0]) { case 'r': file_flags = O_RDONLY; - if (flags[1] == '+') { file_flags = O_RDWR; } + if (flags.length() > 1 && flags[1] == '+') { file_flags = O_RDWR; } break; case 'w': file_flags = O_WRONLY; - if (flags[1] == '+') { file_flags = O_RDWR; } + if (flags.length() > 1 && flags[1] == '+') { file_flags = O_RDWR; } file_flags |= O_CREAT | O_TRUNC; break; case 'a': KVIKIO_FAIL("Open flag 'a' isn't supported", std::invalid_argument); From ed9b6a4f97b14cbd605d00365b119d096761256e Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 20 Oct 2025 10:09:10 -0500 Subject: [PATCH 086/103] Use main in RAPIDS_BRANCH (#854) Supports rollout of new branching strategy. https://docs.rapids.ai/notices/rsn0047/ xref: https://github.com/rapidsai/build-planning/issues/224 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/kvikio/pull/854 --- RAPIDS_BRANCH | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RAPIDS_BRANCH b/RAPIDS_BRANCH index a2b5a91fc4..ba2906d066 100644 --- a/RAPIDS_BRANCH +++ b/RAPIDS_BRANCH @@ -1 +1 @@ -branch-25.12 +main From b978f71f4e5865c5b7f87419b684fe6f87fd3464 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 20 Oct 2025 10:39:31 -0500 Subject: [PATCH 087/103] Use main branch (#855) Supports rollout of new branching strategy. https://docs.rapids.ai/notices/rsn0047/ xref: https://github.com/rapidsai/build-planning/issues/224 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/kvikio/pull/855 --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 0bac6ac934..d0352953ca 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -3,7 +3,7 @@ name: build on: push: branches: - - "branch-*" + - "main" tags: - v[0-9][0-9].[0-9][0-9].[0-9][0-9] workflow_dispatch: From e0396ee8f34b9180e2877a0cc874033598b2ccf1 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Mon, 20 Oct 2025 16:48:43 -0400 Subject: [PATCH 088/103] Use ruff and ruff-format instead of flake8, black, isort (#856) Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/856 --- .flake8 | 34 ---------- .pre-commit-config.yaml | 26 ++------ docs/source/conf.py | 16 ++--- .../medical_dicom_image_loading_example.ipynb | 41 ++++++------ .../medical_nifti_image_loading_example.ipynb | 29 +++++---- pyproject.toml | 62 +++++++++++++++++++ python/kvikio/kvikio/cufile.py | 2 +- python/kvikio/kvikio/cufile_driver.py | 6 +- python/kvikio/kvikio/defaults.py | 6 +- python/kvikio/kvikio/numpy.py | 8 +-- python/kvikio/pyproject.toml | 54 ---------------- python/kvikio/tests/test_http_io.py | 5 +- 12 files changed, 127 insertions(+), 162 deletions(-) delete mode 100644 .flake8 create mode 100644 pyproject.toml diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 01f556d94c..0000000000 --- a/.flake8 +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. - -[flake8] -filename = *.py, *.pyx, *.pxd, *.pxi -force-check = True -max-line-length = 88 -exclude = - .eggs, - *.egg, - build, - docs, - .git, - _skbuild, -ignore = - # line break before binary operator - W503, - # whitespace before : - E203 -per-file-ignores = - # Ignore black/flake8-pyi conflicts - *.pyi:E301 E302 E704 - # Rules ignored only in Cython: - # E211: whitespace before '(' (used in multi-line imports) - # E225: Missing whitespace around operators (breaks cython casting syntax like ) - # E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*) - # E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax) - # E275: Missing whitespace after keyword (Doesn't work with Cython except?) - # E402: invalid syntax (works for Python, not Cython) - # E999: invalid syntax (works for Python, not Cython) - # W503: line break before binary operator (breaks lines that start with a pointer) - # W504: line break after binary operator (breaks lines that end with a pointer) - *.pyx: E211, E225, E226, E227, E275, E402, E999, W503, W504 - *.pxd: E211, E225, E226, E227, E275, E402, E999, W503, W504 - *.pxi: E211, E225, E226, E227, E275, E402, E999, W503, W504 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 17ccb948fc..297ccc09aa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,28 +6,12 @@ repos: hooks: - id: trailing-whitespace - id: end-of-file-fixer - - repo: https://github.com/PyCQA/isort - rev: 5.12.0 + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.14.1 hooks: - - id: isort - args: ["--config-root=python/", "--resolve-all-configs"] - files: python/.* - types_or: [python, cython, pyi] - - repo: https://github.com/psf/black - rev: 23.3.0 - hooks: - - id: black - files: python/.* - args: ["--config", "python/kvikio/pyproject.toml"] - - repo: https://github.com/PyCQA/flake8 - rev: 7.1.1 - hooks: - - id: flake8 - args: ["--config=.flake8"] - files: python/.*$ - types: [file] - types_or: [python, cython] - additional_dependencies: ["flake8-force"] + - id: ruff-check + args: ["--fix"] + - id: ruff-format - repo: https://github.com/MarcoGorelli/cython-lint rev: v0.15.0 hooks: diff --git a/docs/source/conf.py b/docs/source/conf.py index 0ac87ceae7..b6ac274a47 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved. # See file LICENSE for terms. # # Configuration file for the Sphinx documentation builder. 
@@ -31,7 +31,9 @@ # The short X.Y version. version = f"{KVIKIO_VERSION.major:02}.{KVIKIO_VERSION.minor:02}" # The full version, including alpha/beta/rc tags -release = f"{KVIKIO_VERSION.major:02}.{KVIKIO_VERSION.minor:02}.{KVIKIO_VERSION.micro:02}" +release = ( + f"{KVIKIO_VERSION.major:02}.{KVIKIO_VERSION.minor:02}.{KVIKIO_VERSION.micro:02}" +) # -- General configuration --------------------------------------------------- @@ -82,11 +84,11 @@ pygments_style = None autodoc_default_options = { - 'members': True, - 'member-order': 'bysource', - 'special-members': '__init__', - 'undoc-members': True, - 'exclude-members': '__weakref__' + "members": True, + "member-order": "bysource", + "special-members": "__init__", + "undoc-members": True, + "exclude-members": "__weakref__", } # -- Options for HTML output ------------------------------------------------- diff --git a/notebooks/medical_dicom_image_loading_example.ipynb b/notebooks/medical_dicom_image_loading_example.ipynb index 3e47693ee5..1cd603e4c5 100644 --- a/notebooks/medical_dicom_image_loading_example.ipynb +++ b/notebooks/medical_dicom_image_loading_example.ipynb @@ -58,21 +58,19 @@ "metadata": {}, "outputs": [], "source": [ - "import kvikio\n", - "import kvikio.defaults\n", - "import cupy as cp\n", + "import datetime\n", + "import os\n", + "import shutil\n", "import tempfile\n", + "from timeit import default_timer as timer\n", + "\n", + "import cupy as cp\n", + "import numpy as np\n", "import pydicom\n", "from pydicom.dataset import Dataset, FileDataset\n", - "import numpy as np\n", - "import os\n", - "import datetime\n", - "import requests\n", - "import tarfile\n", - "import gzip\n", - "import shutil\n", - "import io\n", - "from timeit import default_timer as timer" + "\n", + "import kvikio\n", + "import kvikio.defaults" ] }, { @@ -110,6 +108,7 @@ " c = cp.random.rand(100, 100, 3)\n", " d = cp.mean(c)\n", "\n", + "\n", "warmup_kvikio()" ] }, @@ -172,7 +171,12 @@ } ], "source": [ - "def create_multiframe_dicom(file_path, num_slices=128, pixel_array_shape=(1024, 1024), pixel_value_range=(0, 4095)):\n", + "def create_multiframe_dicom(\n", + " file_path,\n", + " num_slices=128,\n", + " pixel_array_shape=(1024, 1024),\n", + " pixel_value_range=(0, 4095),\n", + "):\n", " # Create a new DICOM dataset\n", " file_meta = pydicom.dataset.FileMetaDataset()\n", " file_meta.MediaStorageSOPClassUID = pydicom.uid.generate_uid()\n", @@ -190,10 +194,10 @@ " ds.StudyInstanceUID = pydicom.uid.generate_uid()\n", " ds.SeriesInstanceUID = pydicom.uid.generate_uid()\n", " ds.SOPInstanceUID = file_meta.MediaStorageSOPInstanceUID\n", - " ds.StudyDate = datetime.date.today().strftime('%Y%m%d')\n", - " ds.ContentDate = datetime.date.today().strftime('%Y%m%d')\n", - " ds.StudyTime = datetime.datetime.now().strftime('%H%M%S')\n", - " ds.ContentTime = datetime.datetime.now().strftime('%H%M%S')\n", + " ds.StudyDate = datetime.date.today().strftime(\"%Y%m%d\")\n", + " ds.ContentDate = datetime.date.today().strftime(\"%Y%m%d\")\n", + " ds.StudyTime = datetime.datetime.now().strftime(\"%H%M%S\")\n", + " ds.ContentTime = datetime.datetime.now().strftime(\"%H%M%S\")\n", "\n", " # Set the pixel data with random integers\n", " pixel_array = np.random.randint(\n", @@ -234,6 +238,7 @@ " ds.save_as(file_path)\n", " print(f\"Multi-frame DICOM file created at: {file_path}\")\n", "\n", + "\n", "# Example usage\n", "example_dcm_path = os.path.join(temp_working_dir, \"example.dcm\")\n", "\n", @@ -277,7 +282,7 @@ " columns = dcm_read_data.Columns\n", " bits_allocated = 
dcm_read_data.BitsAllocated\n", " samples_per_pixel = dcm_read_data.SamplesPerPixel\n", - " number_of_frames = getattr(dcm_read_data, 'NumberOfFrames', 1)\n", + " number_of_frames = getattr(dcm_read_data, \"NumberOfFrames\", 1)\n", " pixel_representation = dcm_read_data.PixelRepresentation\n", "\n", " if bits_allocated == 8:\n", diff --git a/notebooks/medical_nifti_image_loading_example.ipynb b/notebooks/medical_nifti_image_loading_example.ipynb index bc0ec2f837..7104f7db07 100644 --- a/notebooks/medical_nifti_image_loading_example.ipynb +++ b/notebooks/medical_nifti_image_loading_example.ipynb @@ -60,19 +60,21 @@ "metadata": {}, "outputs": [], "source": [ - "import kvikio\n", - "import kvikio.defaults\n", - "import cupy as cp\n", - "import numpy as np\n", + "import gzip\n", + "import io\n", + "import os\n", + "import shutil\n", + "import tarfile\n", "import tempfile\n", + "from timeit import default_timer as timer\n", + "\n", + "import cupy as cp\n", "import nibabel as nib\n", - "import os\n", + "import numpy as np\n", "import requests\n", - "import tarfile\n", - "import gzip\n", - "import shutil\n", - "import io\n", - "from timeit import default_timer as timer" + "\n", + "import kvikio\n", + "import kvikio.defaults" ] }, { @@ -110,6 +112,7 @@ " c = cp.random.rand(100, 100, 3)\n", " d = cp.mean(c)\n", "\n", + "\n", "warmup_kvikio()" ] }, @@ -196,8 +199,10 @@ ], "source": [ "# decompress the nii.gz file\n", - "example_nifti_path = os.path.join(temp_working_dir, \"Task09_Spleen\", \"imagesTr\", \"spleen_53.nii\")\n", - "with gzip.open(example_nifti_path+\".gz\", \"rb\") as f_in:\n", + "example_nifti_path = os.path.join(\n", + " temp_working_dir, \"Task09_Spleen\", \"imagesTr\", \"spleen_53.nii\"\n", + ")\n", + "with gzip.open(example_nifti_path + \".gz\", \"rb\") as f_in:\n", " with open(example_nifti_path, \"wb\") as f_out:\n", " shutil.copyfileobj(f_in, f_out)\n", "print(\"a decompressed nifti file is saved at: \", example_nifti_path)" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..a8871b3936 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,62 @@ +[tool.ruff] +line-length = 88 +target-version = "py310" + +[tool.ruff.lint] +select = [ + "E", + "F", + "I", + "W", +] +ignore = [ + # whitespace before : + "E203", + # line-too-long (due to Copyright header) + "E501", +] +fixable = ["ALL"] + +[tool.ruff.lint.per-file-ignores] +"*.pyx" = [ + "E211", + "E225", + "E226", + "E227", + "E275", + "E402", +] +"*.pxd" = [ + "E211", + "E225", + "E226", + "E227", + "E275", + "E402", +] +"*.pxi" = [ + "E211", + "E225", + "E226", + "E227", + "E275", + "E402", +] +"notebooks/*" = [ + "F841", +] + +[tool.ruff.lint.isort] +combine-as-imports = true +order-by-type = true +known-first-party = [ + "kvikio", +] +default-section = "third-party" +section-order = [ + "future", + "standard-library", + "third-party", + "first-party", + "local-folder", +] diff --git a/python/kvikio/kvikio/cufile.py b/python/kvikio/kvikio/cufile.py index fd1021d8ca..c8ba5bfd57 100644 --- a/python/kvikio/kvikio/cufile.py +++ b/python/kvikio/kvikio/cufile.py @@ -435,7 +435,7 @@ def raw_write( def get_page_cache_info( - file: Union[os.PathLike, str, int, io.IOBase] + file: Union[os.PathLike, str, int, io.IOBase], ) -> tuple[int, int]: """Obtain the page cache residency information for a given file diff --git a/python/kvikio/kvikio/cufile_driver.py b/python/kvikio/kvikio/cufile_driver.py index 8c8804d885..c9970903d7 100644 --- a/python/kvikio/kvikio/cufile_driver.py +++ 
b/python/kvikio/kvikio/cufile_driver.py @@ -92,13 +92,11 @@ def _property_getter_and_setter( @overload -def set(config: dict[str, Any], /) -> ConfigContextManager: - ... +def set(config: dict[str, Any], /) -> ConfigContextManager: ... @overload -def set(key: str, value: Any, /) -> ConfigContextManager: - ... +def set(key: str, value: Any, /) -> ConfigContextManager: ... def set(*config) -> ConfigContextManager: diff --git a/python/kvikio/kvikio/defaults.py b/python/kvikio/kvikio/defaults.py index ee0ebf5f95..3004d74e00 100644 --- a/python/kvikio/kvikio/defaults.py +++ b/python/kvikio/kvikio/defaults.py @@ -68,13 +68,11 @@ def _property_getter_and_setter(self) -> tuple[dict[str, Any], dict[str, Any]]: @overload -def set(config: dict[str, Any], /) -> ConfigContextManager: - ... +def set(config: dict[str, Any], /) -> ConfigContextManager: ... @overload -def set(key: str, value: Any, /) -> ConfigContextManager: - ... +def set(key: str, value: Any, /) -> ConfigContextManager: ... def set(*config) -> ConfigContextManager: diff --git a/python/kvikio/kvikio/numpy.py b/python/kvikio/kvikio/numpy.py index 461a0fae65..311067bed0 100644 --- a/python/kvikio/kvikio/numpy.py +++ b/python/kvikio/kvikio/numpy.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. # See file LICENSE for terms. import io @@ -16,12 +16,10 @@ class FileLike(Protocol): """File like object that represent a OS-level file""" - def fileno(self) -> int: - ... + def fileno(self) -> int: ... @property - def name(self) -> str: - ... + def name(self) -> str: ... class LikeWrapper: diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index 78723f4a7e..4e7ae7c21d 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -56,60 +56,6 @@ zarr = [ [project.urls] Homepage = "https://github.com/rapidsai/kvikio" -[tool.black] -line-length = 88 -target-version = ["py310"] -include = '\.py?$' -exclude = ''' -/( - thirdparty | - \.eggs | - \.git | - \.hg | - \.mypy_cache | - \.tox | - \.venv | - _build | - buck-out | - build | - dist | - _skbuild -)/ -''' - -[tool.isort] -line_length = 88 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_first_party = [ - "kvikio", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", -] - [tool.mypy] ignore_missing_imports = true diff --git a/python/kvikio/tests/test_http_io.py b/python/kvikio/tests/test_http_io.py index fac0aca5cc..2d271c9833 100644 --- a/python/kvikio/tests/test_http_io.py +++ b/python/kvikio/tests/test_http_io.py @@ -203,8 +203,9 @@ def test_retry_http_503_fails(tmpdir, xp, capfd): a.tofile(tmpdir / "a") b = xp.empty_like(a) - with pytest.raises(RuntimeError) as m, kvikio.defaults.set( - "http_max_attempts", 2 + with ( + pytest.raises(RuntimeError) as m, + kvikio.defaults.set("http_max_attempts", 2), ): with kvikio.RemoteFile.open_http(f"{server.url}/a") as f: f.read(b) From f5dd4a5d66ddcd80761d3cbb718ba72799e9c149 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Tue, 21 Oct 2025 10:27:12 -0400 Subject: [PATCH 089/103] Minimize transitive includes by moving NVTX-related code to the detail namespace (#851) This PR is part of the effort to minimize 
transitive includes in KvikIO shared library. It moves the NVTX-related code from the public headers to the `detail` namespace. As a result, the files `parallel_operation.hpp` and `posix_io.hpp` have also been moved to the `detail` namespace. Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/kvikio/pull/851 --- cpp/CMakeLists.txt | 4 +-- cpp/examples/basic_no_cuda.cpp | 3 +- cpp/include/kvikio/{ => detail}/nvtx.hpp | 0 .../{ => detail}/parallel_operation.hpp | 2 +- cpp/include/kvikio/{ => detail}/posix_io.hpp | 2 +- cpp/include/kvikio/file_handle.hpp | 2 -- cpp/include/kvikio/remote_handle.hpp | 2 -- cpp/include/kvikio/threadpool_wrapper.hpp | 29 +++---------------- cpp/src/bounce_buffer.cpp | 2 +- cpp/src/buffer.cpp | 2 +- cpp/src/compat_mode.cpp | 2 +- cpp/src/compat_mode_manager.cpp | 2 +- cpp/src/defaults.cpp | 1 + cpp/src/{ => detail}/nvtx.cpp | 2 +- cpp/src/{ => detail}/posix_io.cpp | 4 +-- cpp/src/file_handle.cpp | 4 ++- cpp/src/file_utils.cpp | 2 +- cpp/src/hdfs.cpp | 2 +- cpp/src/mmap.cpp | 6 ++-- cpp/src/remote_handle.cpp | 6 ++-- cpp/src/shim/libcurl.cpp | 4 +-- cpp/src/stream.cpp | 2 +- 22 files changed, 32 insertions(+), 53 deletions(-) rename cpp/include/kvikio/{ => detail}/nvtx.hpp (100%) rename cpp/include/kvikio/{ => detail}/parallel_operation.hpp (99%) rename cpp/include/kvikio/{ => detail}/posix_io.hpp (99%) rename cpp/src/{ => detail}/nvtx.cpp (99%) rename cpp/src/{ => detail}/posix_io.cpp (97%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0a67b6e677..c54a333afd 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -151,8 +151,8 @@ set(SOURCES "src/file_handle.cpp" "src/file_utils.cpp" "src/mmap.cpp" - "src/nvtx.cpp" - "src/posix_io.cpp" + "src/detail/nvtx.cpp" + "src/detail/posix_io.cpp" "src/shim/cuda.cpp" "src/shim/cufile.cpp" "src/shim/utils.cpp" diff --git a/cpp/examples/basic_no_cuda.cpp b/cpp/examples/basic_no_cuda.cpp index 42ecb7142d..8e08d18206 100644 --- a/cpp/examples/basic_no_cuda.cpp +++ b/cpp/examples/basic_no_cuda.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,6 +16,7 @@ #include #include +#include #include #include diff --git a/cpp/include/kvikio/nvtx.hpp b/cpp/include/kvikio/detail/nvtx.hpp similarity index 100% rename from cpp/include/kvikio/nvtx.hpp rename to cpp/include/kvikio/detail/nvtx.hpp diff --git a/cpp/include/kvikio/parallel_operation.hpp b/cpp/include/kvikio/detail/parallel_operation.hpp similarity index 99% rename from cpp/include/kvikio/parallel_operation.hpp rename to cpp/include/kvikio/detail/parallel_operation.hpp index 5026240557..891e33330f 100644 --- a/cpp/include/kvikio/parallel_operation.hpp +++ b/cpp/include/kvikio/detail/parallel_operation.hpp @@ -26,8 +26,8 @@ #include #include +#include #include -#include #include namespace kvikio { diff --git a/cpp/include/kvikio/posix_io.hpp b/cpp/include/kvikio/detail/posix_io.hpp similarity index 99% rename from cpp/include/kvikio/posix_io.hpp rename to cpp/include/kvikio/detail/posix_io.hpp index a139cae718..c923f25246 100644 --- a/cpp/include/kvikio/posix_io.hpp +++ b/cpp/include/kvikio/detail/posix_io.hpp @@ -22,8 +22,8 @@ #include #include +#include #include -#include #include #include diff --git a/cpp/include/kvikio/file_handle.hpp b/cpp/include/kvikio/file_handle.hpp index 78e493c9b0..e88b6b82ab 100644 --- a/cpp/include/kvikio/file_handle.hpp +++ b/cpp/include/kvikio/file_handle.hpp @@ -28,8 +28,6 @@ #include #include #include -#include -#include #include #include #include diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index c48e84e8fb..9384178ab6 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -24,8 +24,6 @@ #include #include -#include -#include #include struct curl_slist; diff --git a/cpp/include/kvikio/threadpool_wrapper.hpp b/cpp/include/kvikio/threadpool_wrapper.hpp index 6eb2f158b2..c6f6c4becc 100644 --- a/cpp/include/kvikio/threadpool_wrapper.hpp +++ b/cpp/include/kvikio/threadpool_wrapper.hpp @@ -16,47 +16,26 @@ #pragma once -#include - #include -#include - namespace kvikio { template class thread_pool_wrapper : public pool_type { public: /** - * @brief Construct a new thread pool wrapper, and invoke a pre-defined initialization function in - * each worker thread. + * @brief Construct a new thread pool wrapper. * * @param nthreads The number of threads to use. */ - thread_pool_wrapper(unsigned int nthreads) : pool_type{nthreads, worker_thread_init_func} - { - KVIKIO_NVTX_FUNC_RANGE(); - } + thread_pool_wrapper(unsigned int nthreads) : pool_type{nthreads} {} /** - * @brief Reset the number of threads in the thread pool, and invoke a pre-defined initialization - * function in each worker thread. + * @brief Reset the number of threads in the thread pool. * * @param nthreads The number of threads to use. */ - void reset(unsigned int nthreads) - { - KVIKIO_NVTX_FUNC_RANGE(); - pool_type::reset(nthreads, worker_thread_init_func); - } - - private: - inline static std::function worker_thread_init_func{[] { - KVIKIO_NVTX_FUNC_RANGE(); - // Rename the worker thread in the thread pool to improve clarity from nsys-ui. - // Note: This NVTX feature is currently not supported by nsys-ui. 
- NvtxManager::rename_current_thread("thread pool"); - }}; + void reset(unsigned int nthreads) { pool_type::reset(nthreads); } }; using BS_thread_pool = thread_pool_wrapper; diff --git a/cpp/src/bounce_buffer.cpp b/cpp/src/bounce_buffer.cpp index 02281b2adc..fb91de26d0 100644 --- a/cpp/src/bounce_buffer.cpp +++ b/cpp/src/bounce_buffer.cpp @@ -19,8 +19,8 @@ #include #include +#include #include -#include #include namespace kvikio { diff --git a/cpp/src/buffer.cpp b/cpp/src/buffer.cpp index 744a004b1d..8f0b29ba27 100644 --- a/cpp/src/buffer.cpp +++ b/cpp/src/buffer.cpp @@ -20,8 +20,8 @@ #include #include +#include #include -#include #include #include #include diff --git a/cpp/src/compat_mode.cpp b/cpp/src/compat_mode.cpp index d8346253bf..109eb7fd2b 100644 --- a/cpp/src/compat_mode.cpp +++ b/cpp/src/compat_mode.cpp @@ -18,8 +18,8 @@ #include #include +#include #include -#include namespace kvikio { diff --git a/cpp/src/compat_mode_manager.cpp b/cpp/src/compat_mode_manager.cpp index 3a162ad768..7eef3e4502 100644 --- a/cpp/src/compat_mode_manager.cpp +++ b/cpp/src/compat_mode_manager.cpp @@ -20,9 +20,9 @@ #include #include #include +#include #include #include -#include #include namespace kvikio { diff --git a/cpp/src/defaults.cpp b/cpp/src/defaults.cpp index 24a4de0898..88d1bea729 100644 --- a/cpp/src/defaults.cpp +++ b/cpp/src/defaults.cpp @@ -25,6 +25,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/nvtx.cpp b/cpp/src/detail/nvtx.cpp similarity index 99% rename from cpp/src/nvtx.cpp rename to cpp/src/detail/nvtx.cpp index 9dd1bbf687..61e153c3b9 100644 --- a/cpp/src/nvtx.cpp +++ b/cpp/src/detail/nvtx.cpp @@ -20,7 +20,7 @@ #include -#include +#include namespace kvikio { diff --git a/cpp/src/posix_io.cpp b/cpp/src/detail/posix_io.cpp similarity index 97% rename from cpp/src/posix_io.cpp rename to cpp/src/detail/posix_io.cpp index 512576943b..0351c04965 100644 --- a/cpp/src/posix_io.cpp +++ b/cpp/src/detail/posix_io.cpp @@ -21,9 +21,9 @@ #include #include +#include +#include #include -#include -#include #include #include diff --git a/cpp/src/file_handle.cpp b/cpp/src/file_handle.cpp index 11698bf30f..1b22920449 100644 --- a/cpp/src/file_handle.cpp +++ b/cpp/src/file_handle.cpp @@ -25,10 +25,12 @@ #include #include +#include +#include +#include #include #include #include -#include namespace kvikio { diff --git a/cpp/src/file_utils.cpp b/cpp/src/file_utils.cpp index 04e3d1a16c..986dab16ff 100644 --- a/cpp/src/file_utils.cpp +++ b/cpp/src/file_utils.cpp @@ -27,10 +27,10 @@ #include #include +#include #include #include #include -#include #include namespace kvikio { diff --git a/cpp/src/hdfs.cpp b/cpp/src/hdfs.cpp index 2aa5f53278..c222b5e6ca 100644 --- a/cpp/src/hdfs.cpp +++ b/cpp/src/hdfs.cpp @@ -16,10 +16,10 @@ #include +#include #include #include #include -#include #include #include diff --git a/cpp/src/mmap.cpp b/cpp/src/mmap.cpp index ee41a55d3d..c93e88eb7d 100644 --- a/cpp/src/mmap.cpp +++ b/cpp/src/mmap.cpp @@ -24,12 +24,12 @@ #include #include +#include +#include +#include #include #include #include -#include -#include -#include #include #include "kvikio/file_utils.hpp" diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index e2e4ea21c8..2c4ab235c7 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -26,13 +26,13 @@ #include #include +#include +#include +#include #include #include #include #include -#include -#include -#include #include #include #include diff --git a/cpp/src/shim/libcurl.cpp b/cpp/src/shim/libcurl.cpp index 
a78fb33d30..170dc7afb5 100644 --- a/cpp/src/shim/libcurl.cpp +++ b/cpp/src/shim/libcurl.cpp @@ -29,10 +29,10 @@ #include #include +#include +#include #include #include -#include -#include #include #include diff --git a/cpp/src/stream.cpp b/cpp/src/stream.cpp index 4b5f454a3f..beaa571912 100644 --- a/cpp/src/stream.cpp +++ b/cpp/src/stream.cpp @@ -21,8 +21,8 @@ #include #include +#include #include -#include #include #include #include From 0e5aa097a308fabdd41cc4a5ad2063cf7b60c11c Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 21 Oct 2025 12:56:12 -0500 Subject: [PATCH 090/103] Use main shared-workflows branch (#857) Contributes to https://github.com/rapidsai/build-planning/issues/224 ## Notes for Reviewers This is safe to admin-merge because the change is a no-op... configs on those 2 branches are identical. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Nate Rock (https://github.com/rockhowse) URL: https://github.com/rapidsai/kvikio/pull/857 --- .github/workflows/build.yaml | 16 ++++++------ .github/workflows/pr.yaml | 26 +++++++++---------- .github/workflows/test.yaml | 6 ++--- .../trigger-breaking-change-alert.yaml | 2 +- 4 files changed, 25 insertions(+), 25 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index d0352953ca..9a698f588f 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -34,7 +34,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -44,7 +44,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -54,7 +54,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main with: arch: "amd64" branch: ${{ inputs.branch }} @@ -76,7 +76,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cpp: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: ${{ inputs.build_type || 'branch' }} @@ -89,7 +89,7 @@ jobs: wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -101,7 +101,7 @@ jobs: wheel-publish-cpp: needs: wheel-build-cpp secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -112,7 +112,7 @@ jobs: wheel-publish-python: needs: wheel-build-python secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 188b165222..1650da81c8 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -27,7 +27,7 @@ jobs: - wheel-python-tests - telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@main if: always() with: needs: ${{ toJSON(needs) }} @@ -43,7 +43,7 @@ jobs: repo: kvikio changed-files: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@main with: files_yaml: | test_cpp: @@ -86,20 +86,20 @@ jobs: checks: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@main with: ignored_pr_jobs: telemetry-summarize conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main with: build_type: pull-request script: ci/build_cpp.sh conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request @@ -107,7 +107,7 @@ jobs: conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -117,14 +117,14 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main with: build_type: pull-request script: ci/build_python.sh conda-python-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -132,7 +132,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -142,7 +142,7 @@ jobs: devcontainer: needs: telemetry-setup secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@main with: arch: '["amd64", "arm64"]' cuda: '["13.0"]' @@ -160,7 +160,7 @@ jobs: sccache --show-adv-stats | tee telemetry-artifacts/sccache-stats.txt; wheel-cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: pull-request @@ -170,7 +170,7 @@ jobs: wheel-python-build: needs: wheel-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: pull-request script: ci/build_wheel_python.sh @@ -179,7 +179,7 @@ jobs: wheel-python-tests: needs: [wheel-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index b8914deff2..3f8d231157 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -25,7 +25,7 @@ on: jobs: cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -43,7 +43,7 @@ jobs: sha: ${{ inputs.sha }} conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index c3cce1c34f..c471e2a151 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.12 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@main with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} From 5c0a46229b837de1045a853b2efd29e56c45176f Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Thu, 23 Oct 2025 09:28:39 -0400 Subject: [PATCH 091/103] Use SPDX for all copyright headers (#858) Issue: https://github.com/rapidsai/build-infra/issues/297 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Jake Awe (https://github.com/AyodeAwe) - Tianyu Liu (https://github.com/kingcrimsontianyu) - Robert Maynard 
(https://github.com/robertmaynard) URL: https://github.com/rapidsai/kvikio/pull/858 --- .pre-commit-config.yaml | 19 +++++++++++++------ build.sh | 3 ++- ci/build_cpp.sh | 3 ++- ci/build_docs.sh | 3 ++- ci/build_python.sh | 3 ++- ci/build_wheel.sh | 3 ++- ci/build_wheel_cpp.sh | 3 ++- ci/build_wheel_python.sh | 3 ++- ci/check_style.sh | 3 ++- ci/release/update-version.sh | 3 ++- ci/run_ctests.sh | 3 ++- ci/run_pytests.sh | 3 ++- ci/test_cpp.sh | 3 ++- ci/test_java.sh | 3 ++- ci/test_python.sh | 3 ++- ci/test_wheel.sh | 3 ++- ci/validate_wheel.sh | 3 ++- cmake/RAPIDS.cmake | 15 ++++----------- cmake/rapids_config.cmake | 15 ++++----------- conda/recipes/kvikio/recipe.yaml | 3 ++- conda/recipes/libkvikio/recipe.yaml | 3 ++- cpp/CMakeLists.txt | 15 ++++----------- cpp/benchmarks/CMakeLists.txt | 15 ++++----------- .../threadpool/threadpool_benchmark.cpp | 15 ++------------- cpp/cmake/thirdparty/get_gtest.cmake | 15 ++++----------- cpp/cmake/thirdparty/get_libcurl.cmake | 15 ++++----------- cpp/cmake/thirdparty/get_nvtx.cmake | 15 ++++----------- cpp/cmake/thirdparty/get_thread_pool.cmake | 15 ++++----------- cpp/examples/CMakeLists.txt | 15 ++++----------- cpp/examples/basic_io.cpp | 15 ++------------- cpp/examples/basic_no_cuda.cpp | 15 ++------------- cpp/examples/downstream/CMakeLists.txt | 15 ++++----------- cpp/examples/downstream/cmake/get_cpm.cmake | 5 +++++ .../downstream/cmake/get_kvikio.cmake | 15 ++++----------- .../downstream/downstream_example.cpp | 15 ++------------- cpp/include/kvikio/batch.hpp | 15 ++------------- cpp/include/kvikio/bounce_buffer.hpp | 15 ++------------- cpp/include/kvikio/buffer.hpp | 15 ++------------- cpp/include/kvikio/compat_mode.hpp | 15 ++------------- cpp/include/kvikio/compat_mode_manager.hpp | 15 ++------------- cpp/include/kvikio/cufile/config.hpp | 15 ++------------- cpp/include/kvikio/cufile/driver.hpp | 15 ++------------- cpp/include/kvikio/defaults.hpp | 15 ++------------- cpp/include/kvikio/detail/nvtx.hpp | 15 ++------------- .../kvikio/detail/parallel_operation.hpp | 15 ++------------- cpp/include/kvikio/detail/posix_io.hpp | 15 ++------------- cpp/include/kvikio/detail/remote_handle.hpp | 15 ++------------- cpp/include/kvikio/detail/tls.hpp | 15 ++------------- cpp/include/kvikio/detail/url.hpp | 15 ++------------- cpp/include/kvikio/detail/utils.hpp | 15 ++------------- cpp/include/kvikio/error.hpp | 15 ++------------- cpp/include/kvikio/file_handle.hpp | 15 ++------------- cpp/include/kvikio/file_utils.hpp | 15 ++------------- cpp/include/kvikio/hdfs.hpp | 15 ++------------- cpp/include/kvikio/http_status_codes.hpp | 15 ++------------- cpp/include/kvikio/mmap.hpp | 15 ++------------- cpp/include/kvikio/remote_handle.hpp | 15 ++------------- cpp/include/kvikio/shim/cuda.hpp | 15 ++------------- cpp/include/kvikio/shim/cufile.hpp | 15 ++------------- cpp/include/kvikio/shim/cufile_h_wrapper.hpp | 15 ++------------- cpp/include/kvikio/shim/libcurl.hpp | 15 ++------------- cpp/include/kvikio/shim/utils.hpp | 15 ++------------- cpp/include/kvikio/stream.hpp | 15 ++------------- cpp/include/kvikio/threadpool_wrapper.hpp | 15 ++------------- cpp/include/kvikio/utils.hpp | 15 ++------------- cpp/scripts/run-cmake-format.sh | 3 ++- cpp/src/batch.cpp | 15 ++------------- cpp/src/bounce_buffer.cpp | 15 ++------------- cpp/src/buffer.cpp | 15 ++------------- cpp/src/compat_mode.cpp | 15 ++------------- cpp/src/compat_mode_manager.cpp | 15 ++------------- cpp/src/cufile/config.cpp | 15 ++------------- cpp/src/cufile/driver.cpp | 15 ++------------- 
cpp/src/defaults.cpp | 15 ++------------- cpp/src/detail/nvtx.cpp | 15 ++------------- cpp/src/detail/posix_io.cpp | 15 ++------------- cpp/src/detail/remote_handle.cpp | 15 ++------------- cpp/src/detail/tls.cpp | 15 ++------------- cpp/src/detail/url.cpp | 15 ++------------- cpp/src/error.cpp | 15 ++------------- cpp/src/file_handle.cpp | 15 ++------------- cpp/src/file_utils.cpp | 15 ++------------- cpp/src/hdfs.cpp | 15 ++------------- cpp/src/http_status_codes.cpp | 15 ++------------- cpp/src/mmap.cpp | 15 ++------------- cpp/src/remote_handle.cpp | 15 ++------------- cpp/src/shim/cuda.cpp | 15 ++------------- cpp/src/shim/cufile.cpp | 15 ++------------- cpp/src/shim/libcurl.cpp | 15 ++------------- cpp/src/shim/utils.cpp | 15 ++------------- cpp/src/stream.cpp | 15 ++------------- cpp/src/utils.cpp | 15 ++------------- cpp/tests/CMakeLists.txt | 15 ++++----------- cpp/tests/test_basic_io.cpp | 15 ++------------- cpp/tests/test_defaults.cpp | 15 ++------------- cpp/tests/test_error.cpp | 15 ++------------- cpp/tests/test_hdfs.cpp | 15 ++------------- cpp/tests/test_mmap.cpp | 15 ++------------- cpp/tests/test_remote_handle.cpp | 15 ++------------- cpp/tests/test_tls.cpp | 15 ++------------- cpp/tests/test_url.cpp | 15 ++------------- cpp/tests/utils/env.cpp | 15 ++------------- cpp/tests/utils/env.hpp | 15 ++------------- cpp/tests/utils/hdfs_helper.cpp | 15 ++------------- cpp/tests/utils/hdfs_helper.hpp | 15 ++------------- cpp/tests/utils/utils.hpp | 15 ++------------- dependencies.yaml | 3 +++ docs/Makefile | 3 +++ docs/make.bat | 3 +++ docs/source/conf.py | 4 ++-- java/pom.xml | 5 ++++- .../java/ai/rapids/kvikio/cufile/CuFile.java | 15 ++------------- .../ai/rapids/kvikio/cufile/CuFileDriver.java | 15 ++------------- .../ai/rapids/kvikio/cufile/CuFileHandle.java | 15 ++------------- .../kvikio/cufile/CuFileReadHandle.java | 15 ++------------- .../kvikio/cufile/CuFileWriteHandle.java | 15 ++------------- java/src/main/native/CMakeLists.txt | 15 ++++----------- java/src/main/native/src/CuFileJni.cpp | 15 ++------------- .../kvikio/cufile/BasicReadWriteTest.java | 15 ++------------- pyproject.toml | 9 +++++++++ python/kvikio/CMakeLists.txt | 15 ++++----------- python/kvikio/examples/hello_world.py | 4 ++-- python/kvikio/examples/http_io.py | 4 ++-- python/kvikio/kvikio/__init__.py | 4 ++-- python/kvikio/kvikio/_lib/CMakeLists.txt | 15 ++++----------- python/kvikio/kvikio/_lib/__init__.pxd | 4 ++-- python/kvikio/kvikio/_lib/__init__.py | 4 ++-- python/kvikio/kvikio/_lib/arr.pxd | 4 ++-- python/kvikio/kvikio/_lib/arr.pyi | 4 ++-- python/kvikio/kvikio/_lib/arr.pyx | 4 ++-- python/kvikio/kvikio/_lib/buffer.pyx | 4 ++-- python/kvikio/kvikio/_lib/cufile_driver.pyx | 4 ++-- python/kvikio/kvikio/_lib/defaults.pyx | 4 ++-- python/kvikio/kvikio/_lib/file_handle.pyx | 4 ++-- python/kvikio/kvikio/_lib/future.pxd | 4 ++-- python/kvikio/kvikio/_lib/future.pyx | 4 ++-- python/kvikio/kvikio/_lib/mmap.pyx | 4 ++-- python/kvikio/kvikio/_lib/remote_handle.pyx | 4 ++-- python/kvikio/kvikio/_version.py | 15 ++------------- python/kvikio/kvikio/benchmarks/__init__.py | 4 ++-- python/kvikio/kvikio/benchmarks/http_io.py | 4 ++-- python/kvikio/kvikio/benchmarks/s3_io.py | 4 ++-- .../kvikio/benchmarks/single_node_io.py | 4 ++-- python/kvikio/kvikio/benchmarks/utils.py | 4 ++-- python/kvikio/kvikio/benchmarks/zarr_io.py | 4 ++-- python/kvikio/kvikio/buffer.py | 4 ++-- python/kvikio/kvikio/cufile.py | 4 ++-- python/kvikio/kvikio/cufile_driver.py | 4 ++-- python/kvikio/kvikio/defaults.py | 4 ++-- 
 python/kvikio/kvikio/mmap.py                |  4 ++--
 python/kvikio/kvikio/numcodecs.py           |  4 ++--
 python/kvikio/kvikio/numpy.py               |  4 ++--
 python/kvikio/kvikio/remote_file.py         |  4 ++--
 python/kvikio/kvikio/utils.py               |  4 ++--
 python/kvikio/kvikio/zarr/__init__.py       |  3 ++-
 python/kvikio/kvikio/zarr/_zarr_python_3.py |  4 ++--
 python/kvikio/pyproject.toml                |  4 ++--
 python/kvikio/tests/conftest.py             |  4 ++--
 python/kvikio/tests/test_async_io.py        |  4 ++--
 python/kvikio/tests/test_basic_io.py        |  4 ++--
 python/kvikio/tests/test_benchmarks.py      |  4 ++--
 python/kvikio/tests/test_cufile_driver.py   |  4 ++--
 python/kvikio/tests/test_defaults.py        |  4 ++--
 python/kvikio/tests/test_examples.py        |  4 ++--
 python/kvikio/tests/test_hdfs_io.py         |  4 ++--
 python/kvikio/tests/test_http_io.py         |  4 ++--
 python/kvikio/tests/test_mmap.py            |  4 ++--
 python/kvikio/tests/test_numpy.py           |  4 ++--
 python/kvikio/tests/test_s3_io.py           |  4 ++--
 python/kvikio/tests/test_utils.py           |  4 ++--
 python/kvikio/tests/test_version.py         |  4 ++--
 python/kvikio/tests/test_zarr_missing.py    |  4 ++--
 python/kvikio/tests/test_zarr_v3.py         |  4 ++--
 python/kvikio/tests/utils.py                |  4 ++--
 python/libkvikio/CMakeLists.txt             | 15 ++++-----------
 python/libkvikio/libkvikio/__init__.py      | 15 ++------------
 python/libkvikio/libkvikio/_version.py      | 15 ++------------
 python/libkvikio/libkvikio/load.py          | 15 ++------------
 python/libkvikio/pyproject.toml             |  4 ++--
 179 files changed, 416 insertions(+), 1399 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 297ccc09aa..f6d9e7263d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,4 +1,5 @@
-# Copyright (c) 2019-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0

 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
@@ -66,15 +67,21 @@ repos:
             ^CHANGELOG.md$
           )
   - repo: https://github.com/rapidsai/pre-commit-hooks
-    rev: v0.6.0
+    rev: v1.1.0
     hooks:
       - id: verify-copyright
+        args: [--fix, --spdx]
         files: |
           (?x)
-            [.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx)$|
-            ^[.]pre-commit-config[.]yaml$|
-            CMakeLists[.]txt$|
-            meta[.]yaml$
+            [.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx|pyi|java|bat)$|
+            ^[.]pre-commit-config[.]yaml$|
+            CMakeLists[.]txt$|
+            meta[.]yaml$|
+            pyproject[.]toml$|
+            recipe[.]yaml$|
+            dependencies[.]yaml$|
+            Makefile$|
+            pom[.]xml$
       - id: verify-alpha-spec
       - id: verify-codeowners
        args: [--fix, --project-prefix=kvikio]
diff --git a/build.sh b/build.sh
index bf4c0780f6..0b83259cf4 100755
--- a/build.sh
+++ b/build.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
-# Copyright (c) 2023-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0

 # kvikio build script
diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh
index 700ee25707..a27b160696 100755
--- a/ci/build_cpp.sh
+++ b/ci/build_cpp.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Copyright (c) 2023-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0

 set -euo pipefail
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 982e9a0d63..6813851dd6 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Copyright (c) 2023-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0

 set -euo pipefail
diff --git a/ci/build_python.sh b/ci/build_python.sh
index 1559350ed9..9744abd203 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Copyright (c) 2023-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0

 set -euo pipefail
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index c22e809ae5..d9a74f877d 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Copyright (c) 2024-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0

 set -euo pipefail
diff --git a/ci/build_wheel_cpp.sh b/ci/build_wheel_cpp.sh
index 61654c66b5..83c8280215 100755
--- a/ci/build_wheel_cpp.sh
+++ b/ci/build_wheel_cpp.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Copyright (c) 2023-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0

 set -euo pipefail
diff --git a/ci/build_wheel_python.sh b/ci/build_wheel_python.sh
index 312fedc0e7..e515139105 100755
--- a/ci/build_wheel_python.sh
+++ b/ci/build_wheel_python.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Copyright (c) 2023-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0

 set -euo pipefail
diff --git a/ci/check_style.sh b/ci/check_style.sh
index 675f251098..922e86f6e4 100755
--- a/ci/check_style.sh
+++ b/ci/check_style.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Copyright (c) 2020-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0

 set -euo pipefail
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index a315f189a9..b7adf5e033 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Copyright (c) 2022-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
 ##########################
 # KvikIO Version Updater #
 ##########################
diff --git a/ci/run_ctests.sh b/ci/run_ctests.sh
index dcb938cbdd..08c015fa1b 100755
--- a/ci/run_ctests.sh
+++ b/ci/run_ctests.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0

 set -euo pipefail
diff --git a/ci/run_pytests.sh b/ci/run_pytests.sh
index e75c351700..7cd835af5d 100755
--- a/ci/run_pytests.sh
+++ b/ci/run_pytests.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Copyright (c) 2024-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0

 set -euo pipefail
diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh
index 05fc954b4f..770f26a675 100755
--- a/ci/test_cpp.sh
+++ b/ci/test_cpp.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Copyright (c) 2022-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0

 set -euo pipefail
diff --git a/ci/test_java.sh b/ci/test_java.sh
index a715b9422d..27ef5e8905 100755
--- a/ci/test_java.sh
+++ b/ci/test_java.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Copyright (c) 2024-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0

 set -euo pipefail
diff --git a/ci/test_python.sh b/ci/test_python.sh
index 2527094f69..1ca634edfe 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Copyright (c) 2022-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0

 set -euo pipefail
diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh
index ca0fd7dd25..5513083815 100755
--- a/ci/test_wheel.sh
+++ b/ci/test_wheel.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Copyright (c) 2023-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0

 set -eou pipefail
diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh
index 40bb27feeb..5c5facf13f 100755
--- a/ci/validate_wheel.sh
+++ b/ci/validate_wheel.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Copyright (c) 2024-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0

 set -euo pipefail
diff --git a/cmake/RAPIDS.cmake b/cmake/RAPIDS.cmake
index ddef819498..5bbc8fcc3a 100644
--- a/cmake/RAPIDS.cmake
+++ b/cmake/RAPIDS.cmake
@@ -1,15 +1,8 @@
 # =============================================================================
-# Copyright (c) 2021-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
 # =============================================================================
 #
 # This is the preferred entry point for projects using rapids-cmake
diff --git a/cmake/rapids_config.cmake b/cmake/rapids_config.cmake
index b2c54a3f27..768e24874e 100644
--- a/cmake/rapids_config.cmake
+++ b/cmake/rapids_config.cmake
@@ -1,15 +1,8 @@
 # =============================================================================
-# Copyright (c) 2018-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2018-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
 # =============================================================================
 file(READ "${CMAKE_CURRENT_LIST_DIR}/../VERSION" _rapids_version)
 if(_rapids_version MATCHES [[^([0-9][0-9])\.([0-9][0-9])\.([0-9][0-9])]])
diff --git a/conda/recipes/kvikio/recipe.yaml b/conda/recipes/kvikio/recipe.yaml
index c4c52b540c..48dafb21cf 100644
--- a/conda/recipes/kvikio/recipe.yaml
+++ b/conda/recipes/kvikio/recipe.yaml
@@ -1,4 +1,5 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
 schema_version: 1

 context:
diff --git a/conda/recipes/libkvikio/recipe.yaml b/conda/recipes/libkvikio/recipe.yaml
index 3b3a0bc1bf..3c7b6d69a0 100644
--- a/conda/recipes/libkvikio/recipe.yaml
+++ b/conda/recipes/libkvikio/recipe.yaml
@@ -1,4 +1,5 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
 schema_version: 1

 context:
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index c54a333afd..17bb297f67 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -1,15 +1,8 @@
 # =============================================================================
-# Copyright (c) 2021-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
 # =============================================================================

 cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR)
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index fc857fdfdf..c6e227558e 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -1,15 +1,8 @@
 # =============================================================================
-# Copyright (c) 2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
 # =============================================================================

 #[=======================================================================[.rst:
diff --git a/cpp/benchmarks/threadpool/threadpool_benchmark.cpp b/cpp/benchmarks/threadpool/threadpool_benchmark.cpp
index 1c90579d22..4271eddb20 100644
--- a/cpp/benchmarks/threadpool/threadpool_benchmark.cpp
+++ b/cpp/benchmarks/threadpool/threadpool_benchmark.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 // This benchmark assesses the scalability of the thread pool.
diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake
index 10e6b026d9..698c61eb37 100644
--- a/cpp/cmake/thirdparty/get_gtest.cmake
+++ b/cpp/cmake/thirdparty/get_gtest.cmake
@@ -1,15 +1,8 @@
 # =============================================================================
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
 # =============================================================================

 # This function finds gtest and sets any additional necessary environment variables.
diff --git a/cpp/cmake/thirdparty/get_libcurl.cmake b/cpp/cmake/thirdparty/get_libcurl.cmake
index e25c5fff0a..98c7089554 100644
--- a/cpp/cmake/thirdparty/get_libcurl.cmake
+++ b/cpp/cmake/thirdparty/get_libcurl.cmake
@@ -1,15 +1,8 @@
 # =============================================================================
-# Copyright (c) 2024-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
 # =============================================================================

 # This function finds libcurl and sets any additional necessary environment variables.
diff --git a/cpp/cmake/thirdparty/get_nvtx.cmake b/cpp/cmake/thirdparty/get_nvtx.cmake
index ee4fc02f69..744a0e3ed1 100644
--- a/cpp/cmake/thirdparty/get_nvtx.cmake
+++ b/cpp/cmake/thirdparty/get_nvtx.cmake
@@ -1,15 +1,8 @@
 # =============================================================================
-# Copyright (c) 2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
 # =============================================================================

 # Need to call rapids_cpm_nvtx3 to get support for an installed version of nvtx3 and to support
diff --git a/cpp/cmake/thirdparty/get_thread_pool.cmake b/cpp/cmake/thirdparty/get_thread_pool.cmake
index 3faf21366a..bb5d98433c 100644
--- a/cpp/cmake/thirdparty/get_thread_pool.cmake
+++ b/cpp/cmake/thirdparty/get_thread_pool.cmake
@@ -1,15 +1,8 @@
 # =============================================================================
-# Copyright (c) 2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
 # =============================================================================

 # Need to call rapids_cpm_bs_thread_pool to get support for an installed version of thread-pool and
diff --git a/cpp/examples/CMakeLists.txt b/cpp/examples/CMakeLists.txt
index b7aa73dd2d..b401c50d4e 100644
--- a/cpp/examples/CMakeLists.txt
+++ b/cpp/examples/CMakeLists.txt
@@ -1,15 +1,8 @@
 # =============================================================================
-# Copyright (c) 2021-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
 # =============================================================================

 set(TEST_INSTALL_PATH bin/tests/libkvikio)
diff --git a/cpp/examples/basic_io.cpp b/cpp/examples/basic_io.cpp
index 9fed0cee6a..4cc050ffd9 100644
--- a/cpp/examples/basic_io.cpp
+++ b/cpp/examples/basic_io.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
diff --git a/cpp/examples/basic_no_cuda.cpp b/cpp/examples/basic_no_cuda.cpp
index 8e08d18206..a5dfac0435 100644
--- a/cpp/examples/basic_no_cuda.cpp
+++ b/cpp/examples/basic_no_cuda.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
diff --git a/cpp/examples/downstream/CMakeLists.txt b/cpp/examples/downstream/CMakeLists.txt
index a21d5699d4..d99711b7c9 100644
--- a/cpp/examples/downstream/CMakeLists.txt
+++ b/cpp/examples/downstream/CMakeLists.txt
@@ -1,15 +1,8 @@
 # =============================================================================
-# Copyright (c) 2021-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
 # =============================================================================

 cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR)
diff --git a/cpp/examples/downstream/cmake/get_cpm.cmake b/cpp/examples/downstream/cmake/get_cpm.cmake
index 2050ab4f88..ab9480bb71 100644
--- a/cpp/examples/downstream/cmake/get_cpm.cmake
+++ b/cpp/examples/downstream/cmake/get_cpm.cmake
@@ -1,3 +1,8 @@
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
+
 set(CPM_DOWNLOAD_VERSION 0.35.5)

 if(CPM_SOURCE_CACHE)
diff --git a/cpp/examples/downstream/cmake/get_kvikio.cmake b/cpp/examples/downstream/cmake/get_kvikio.cmake
index d163fcf5cf..66fbfe3a61 100644
--- a/cpp/examples/downstream/cmake/get_kvikio.cmake
+++ b/cpp/examples/downstream/cmake/get_kvikio.cmake
@@ -1,15 +1,8 @@
 # =============================================================================
-# Copyright (c) 2022-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
 # =============================================================================

 # Use CPM to fetch KvikIO, which makes `kvikio::kvikio` available for `target_link_libraries`
diff --git a/cpp/examples/downstream/downstream_example.cpp b/cpp/examples/downstream/downstream_example.cpp
index 87603908a1..f340835091 100644
--- a/cpp/examples/downstream/downstream_example.cpp
+++ b/cpp/examples/downstream/downstream_example.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
diff --git a/cpp/include/kvikio/batch.hpp b/cpp/include/kvikio/batch.hpp
index 40168ffbc3..f85c87657c 100644
--- a/cpp/include/kvikio/batch.hpp
+++ b/cpp/include/kvikio/batch.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2023-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/bounce_buffer.hpp b/cpp/include/kvikio/bounce_buffer.hpp
index 5a7623a6a4..f4e37d8f10 100644
--- a/cpp/include/kvikio/bounce_buffer.hpp
+++ b/cpp/include/kvikio/bounce_buffer.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/buffer.hpp b/cpp/include/kvikio/buffer.hpp
index 3897bb3371..00bff4c4c2 100644
--- a/cpp/include/kvikio/buffer.hpp
+++ b/cpp/include/kvikio/buffer.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/compat_mode.hpp b/cpp/include/kvikio/compat_mode.hpp
index d25f0ffaef..a2f1171dba 100644
--- a/cpp/include/kvikio/compat_mode.hpp
+++ b/cpp/include/kvikio/compat_mode.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/compat_mode_manager.hpp b/cpp/include/kvikio/compat_mode_manager.hpp
index 4d15f2ba9f..0dc55f165d 100644
--- a/cpp/include/kvikio/compat_mode_manager.hpp
+++ b/cpp/include/kvikio/compat_mode_manager.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/cufile/config.hpp b/cpp/include/kvikio/cufile/config.hpp
index fd721e5fe4..c6b878a3cd 100644
--- a/cpp/include/kvikio/cufile/config.hpp
+++ b/cpp/include/kvikio/cufile/config.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/cufile/driver.hpp b/cpp/include/kvikio/cufile/driver.hpp
index 56a6e8159b..0f23dee6e7 100644
--- a/cpp/include/kvikio/cufile/driver.hpp
+++ b/cpp/include/kvikio/cufile/driver.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/defaults.hpp b/cpp/include/kvikio/defaults.hpp
index 6576d286fe..53254dda21 100644
--- a/cpp/include/kvikio/defaults.hpp
+++ b/cpp/include/kvikio/defaults.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2022-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/detail/nvtx.hpp b/cpp/include/kvikio/detail/nvtx.hpp
index 902283f07d..ef41681c55 100644
--- a/cpp/include/kvikio/detail/nvtx.hpp
+++ b/cpp/include/kvikio/detail/nvtx.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/detail/parallel_operation.hpp b/cpp/include/kvikio/detail/parallel_operation.hpp
index 891e33330f..a4489da8e5 100644
--- a/cpp/include/kvikio/detail/parallel_operation.hpp
+++ b/cpp/include/kvikio/detail/parallel_operation.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/detail/posix_io.hpp b/cpp/include/kvikio/detail/posix_io.hpp
index c923f25246..4d26f553bb 100644
--- a/cpp/include/kvikio/detail/posix_io.hpp
+++ b/cpp/include/kvikio/detail/posix_io.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2022-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/detail/remote_handle.hpp b/cpp/include/kvikio/detail/remote_handle.hpp
index f6bd55c4c0..2e6613aeef 100644
--- a/cpp/include/kvikio/detail/remote_handle.hpp
+++ b/cpp/include/kvikio/detail/remote_handle.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/detail/tls.hpp b/cpp/include/kvikio/detail/tls.hpp
index be40eaf5e5..52d5d01202 100644
--- a/cpp/include/kvikio/detail/tls.hpp
+++ b/cpp/include/kvikio/detail/tls.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/detail/url.hpp b/cpp/include/kvikio/detail/url.hpp
index 2feb055881..2c18b8ac29 100644
--- a/cpp/include/kvikio/detail/url.hpp
+++ b/cpp/include/kvikio/detail/url.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/detail/utils.hpp b/cpp/include/kvikio/detail/utils.hpp
index b3ce990c44..32261ed676 100644
--- a/cpp/include/kvikio/detail/utils.hpp
+++ b/cpp/include/kvikio/detail/utils.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/error.hpp b/cpp/include/kvikio/error.hpp
index 29538f5b6a..604388476c 100644
--- a/cpp/include/kvikio/error.hpp
+++ b/cpp/include/kvikio/error.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/file_handle.hpp b/cpp/include/kvikio/file_handle.hpp
index e88b6b82ab..cc71b58f6d 100644
--- a/cpp/include/kvikio/file_handle.hpp
+++ b/cpp/include/kvikio/file_handle.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/file_utils.hpp b/cpp/include/kvikio/file_utils.hpp
index aa3e6fd380..e666b2b471 100644
--- a/cpp/include/kvikio/file_utils.hpp
+++ b/cpp/include/kvikio/file_utils.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/hdfs.hpp b/cpp/include/kvikio/hdfs.hpp
index 9d89d4d2e1..70c8db1222 100644
--- a/cpp/include/kvikio/hdfs.hpp
+++ b/cpp/include/kvikio/hdfs.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/http_status_codes.hpp b/cpp/include/kvikio/http_status_codes.hpp
index 98ffb52324..aa6a0c06e2 100644
--- a/cpp/include/kvikio/http_status_codes.hpp
+++ b/cpp/include/kvikio/http_status_codes.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/mmap.hpp b/cpp/include/kvikio/mmap.hpp
index b5730727f1..fe8b71cbf4 100644
--- a/cpp/include/kvikio/mmap.hpp
+++ b/cpp/include/kvikio/mmap.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp
index 9384178ab6..44cf536cc3 100644
--- a/cpp/include/kvikio/remote_handle.hpp
+++ b/cpp/include/kvikio/remote_handle.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/shim/cuda.hpp b/cpp/include/kvikio/shim/cuda.hpp
index c0196cec1a..8ec968aa92 100644
--- a/cpp/include/kvikio/shim/cuda.hpp
+++ b/cpp/include/kvikio/shim/cuda.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2022-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/shim/cufile.hpp b/cpp/include/kvikio/shim/cufile.hpp
index 987ab6aa6f..721b2f4e17 100644
--- a/cpp/include/kvikio/shim/cufile.hpp
+++ b/cpp/include/kvikio/shim/cufile.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2022-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/shim/cufile_h_wrapper.hpp b/cpp/include/kvikio/shim/cufile_h_wrapper.hpp
index 886a6e12aa..9aff7248c4 100644
--- a/cpp/include/kvikio/shim/cufile_h_wrapper.hpp
+++ b/cpp/include/kvikio/shim/cufile_h_wrapper.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2022-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/shim/libcurl.hpp b/cpp/include/kvikio/shim/libcurl.hpp
index 0a98cecb77..150ac69e6e 100644
--- a/cpp/include/kvikio/shim/libcurl.hpp
+++ b/cpp/include/kvikio/shim/libcurl.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/shim/utils.hpp b/cpp/include/kvikio/shim/utils.hpp
index 1beb67ef0d..7c6a4e6d35 100644
--- a/cpp/include/kvikio/shim/utils.hpp
+++ b/cpp/include/kvikio/shim/utils.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/stream.hpp b/cpp/include/kvikio/stream.hpp
index f48f9955f7..12bef04342 100644
--- a/cpp/include/kvikio/stream.hpp
+++ b/cpp/include/kvikio/stream.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2023-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/threadpool_wrapper.hpp b/cpp/include/kvikio/threadpool_wrapper.hpp
index c6f6c4becc..0644b8c9ca 100644
--- a/cpp/include/kvikio/threadpool_wrapper.hpp
+++ b/cpp/include/kvikio/threadpool_wrapper.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index 7b84b3bb77..b2b695b578 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #pragma once
diff --git a/cpp/scripts/run-cmake-format.sh b/cpp/scripts/run-cmake-format.sh
index 37c57b5af5..659577dd99 100755
--- a/cpp/scripts/run-cmake-format.sh
+++ b/cpp/scripts/run-cmake-format.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Copyright (c) 2021-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0

 # This script is a wrapper for cmakelang that may be used with pre-commit. The
 # wrapping is necessary because RAPIDS libraries split configuration for
diff --git a/cpp/src/batch.cpp b/cpp/src/batch.cpp
index 8ee2c73290..e1a0f6214d 100644
--- a/cpp/src/batch.cpp
+++ b/cpp/src/batch.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
diff --git a/cpp/src/bounce_buffer.cpp b/cpp/src/bounce_buffer.cpp
index fb91de26d0..b40ae82ad2 100644
--- a/cpp/src/bounce_buffer.cpp
+++ b/cpp/src/bounce_buffer.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
diff --git a/cpp/src/buffer.cpp b/cpp/src/buffer.cpp
index 8f0b29ba27..b964f63d8f 100644
--- a/cpp/src/buffer.cpp
+++ b/cpp/src/buffer.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
diff --git a/cpp/src/compat_mode.cpp b/cpp/src/compat_mode.cpp
index 109eb7fd2b..a8c7e69fac 100644
--- a/cpp/src/compat_mode.cpp
+++ b/cpp/src/compat_mode.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
diff --git a/cpp/src/compat_mode_manager.cpp b/cpp/src/compat_mode_manager.cpp
index 7eef3e4502..7d794f30f3 100644
--- a/cpp/src/compat_mode_manager.cpp
+++ b/cpp/src/compat_mode_manager.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
diff --git a/cpp/src/cufile/config.cpp b/cpp/src/cufile/config.cpp
index d1d36adc04..3c33e101df 100644
--- a/cpp/src/cufile/config.cpp
+++ b/cpp/src/cufile/config.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
diff --git a/cpp/src/cufile/driver.cpp b/cpp/src/cufile/driver.cpp
index 46da3151f7..98070afc84 100644
--- a/cpp/src/cufile/driver.cpp
+++ b/cpp/src/cufile/driver.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
diff --git a/cpp/src/defaults.cpp b/cpp/src/defaults.cpp
index 88d1bea729..265263b16e 100644
--- a/cpp/src/defaults.cpp
+++ b/cpp/src/defaults.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
diff --git a/cpp/src/detail/nvtx.cpp b/cpp/src/detail/nvtx.cpp
index 61e153c3b9..45809ebab8 100644
--- a/cpp/src/detail/nvtx.cpp
+++ b/cpp/src/detail/nvtx.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
diff --git a/cpp/src/detail/posix_io.cpp b/cpp/src/detail/posix_io.cpp
index 0351c04965..c4131f7872 100644
--- a/cpp/src/detail/posix_io.cpp
+++ b/cpp/src/detail/posix_io.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
diff --git a/cpp/src/detail/remote_handle.cpp b/cpp/src/detail/remote_handle.cpp
index b023859ef8..87d1ed5ab5 100644
--- a/cpp/src/detail/remote_handle.cpp
+++ b/cpp/src/detail/remote_handle.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
diff --git a/cpp/src/detail/tls.cpp b/cpp/src/detail/tls.cpp
index 0dccb6b612..89773db22a 100644
--- a/cpp/src/detail/tls.cpp
+++ b/cpp/src/detail/tls.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
diff --git a/cpp/src/detail/url.cpp b/cpp/src/detail/url.cpp
index 5542a1285f..3e76643082 100644
--- a/cpp/src/detail/url.cpp
+++ b/cpp/src/detail/url.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
diff --git a/cpp/src/error.cpp b/cpp/src/error.cpp
index 5d479b36e4..d0179c2c80 100644
--- a/cpp/src/error.cpp
+++ b/cpp/src/error.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
diff --git a/cpp/src/file_handle.cpp b/cpp/src/file_handle.cpp
index 1b22920449..226fcd41aa 100644
--- a/cpp/src/file_handle.cpp
+++ b/cpp/src/file_handle.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
diff --git a/cpp/src/file_utils.cpp b/cpp/src/file_utils.cpp
index 986dab16ff..33c122b652 100644
--- a/cpp/src/file_utils.cpp
+++ b/cpp/src/file_utils.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
 */

 #include
diff --git a/cpp/src/hdfs.cpp b/cpp/src/hdfs.cpp
index c222b5e6ca..173dbb64d4 100644
--- a/cpp/src/hdfs.cpp
+++ b/cpp/src/hdfs.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/src/http_status_codes.cpp b/cpp/src/http_status_codes.cpp index da4a4cc379..94d9d52659 100644 --- a/cpp/src/http_status_codes.cpp +++ b/cpp/src/http_status_codes.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/src/mmap.cpp b/cpp/src/mmap.cpp index c93e88eb7d..d88e4b9d94 100644 --- a/cpp/src/mmap.cpp +++ b/cpp/src/mmap.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index 2c4ab235c7..6e6d8a8912 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/src/shim/cuda.cpp b/cpp/src/shim/cuda.cpp index 0d4784db59..94402659c5 100644 --- a/cpp/src/shim/cuda.cpp +++ b/cpp/src/shim/cuda.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/src/shim/cufile.cpp b/cpp/src/shim/cufile.cpp index 3daa52cd62..0098a10deb 100644 --- a/cpp/src/shim/cufile.cpp +++ b/cpp/src/shim/cufile.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/src/shim/libcurl.cpp b/cpp/src/shim/libcurl.cpp index 170dc7afb5..bfee920d0d 100644 --- a/cpp/src/shim/libcurl.cpp +++ b/cpp/src/shim/libcurl.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/src/shim/utils.cpp b/cpp/src/shim/utils.cpp index e393cf77c1..232d3a413f 100644 --- a/cpp/src/shim/utils.cpp +++ b/cpp/src/shim/utils.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/src/stream.cpp b/cpp/src/stream.cpp index beaa571912..71b7f544f4 100644 --- a/cpp/src/stream.cpp +++ b/cpp/src/stream.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/src/utils.cpp b/cpp/src/utils.cpp index 433eb6b279..889b283e19 100644 --- a/cpp/src/utils.cpp +++ b/cpp/src/utils.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index afa7e8d97b..11ae47306a 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -1,15 +1,8 @@ # ============================================================================= -# Copyright (c) 2024-2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on # ============================================================================= enable_testing() diff --git a/cpp/tests/test_basic_io.cpp b/cpp/tests/test_basic_io.cpp index 141fe386c3..8aac777190 100644 --- a/cpp/tests/test_basic_io.cpp +++ b/cpp/tests/test_basic_io.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/tests/test_defaults.cpp b/cpp/tests/test_defaults.cpp index a74f38c86f..4209fafd6b 100644 --- a/cpp/tests/test_defaults.cpp +++ b/cpp/tests/test_defaults.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/tests/test_error.cpp b/cpp/tests/test_error.cpp index 27713b7830..587022389c 100644 --- a/cpp/tests/test_error.cpp +++ b/cpp/tests/test_error.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/tests/test_hdfs.cpp b/cpp/tests/test_hdfs.cpp index 354c8c4aca..d7256cc456 100644 --- a/cpp/tests/test_hdfs.cpp +++ b/cpp/tests/test_hdfs.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/tests/test_mmap.cpp b/cpp/tests/test_mmap.cpp index 1c6e3a54b3..9e355f4789 100644 --- a/cpp/tests/test_mmap.cpp +++ b/cpp/tests/test_mmap.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/tests/test_remote_handle.cpp b/cpp/tests/test_remote_handle.cpp index f1e25ea34d..41d975fd00 100644 --- a/cpp/tests/test_remote_handle.cpp +++ b/cpp/tests/test_remote_handle.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/tests/test_tls.cpp b/cpp/tests/test_tls.cpp index 4b04f10296..63f6735e4d 100644 --- a/cpp/tests/test_tls.cpp +++ b/cpp/tests/test_tls.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/tests/test_url.cpp b/cpp/tests/test_url.cpp index e359c5442b..84d5e6bc01 100644 --- a/cpp/tests/test_url.cpp +++ b/cpp/tests/test_url.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/cpp/tests/utils/env.cpp b/cpp/tests/utils/env.cpp index 9495289157..262794a48d 100644 --- a/cpp/tests/utils/env.cpp +++ b/cpp/tests/utils/env.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include "env.hpp" diff --git a/cpp/tests/utils/env.hpp b/cpp/tests/utils/env.hpp index e6e84cd9b6..3d924f8829 100644 --- a/cpp/tests/utils/env.hpp +++ b/cpp/tests/utils/env.hpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #pragma once diff --git a/cpp/tests/utils/hdfs_helper.cpp b/cpp/tests/utils/hdfs_helper.cpp index 2bcbc7fed2..e884b93eca 100644 --- a/cpp/tests/utils/hdfs_helper.cpp +++ b/cpp/tests/utils/hdfs_helper.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include "hdfs_helper.hpp" diff --git a/cpp/tests/utils/hdfs_helper.hpp b/cpp/tests/utils/hdfs_helper.hpp index 7cc5da377c..47bd4022ca 100644 --- a/cpp/tests/utils/hdfs_helper.hpp +++ b/cpp/tests/utils/hdfs_helper.hpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #pragma once diff --git a/cpp/tests/utils/utils.hpp b/cpp/tests/utils/utils.hpp index 7d733621a7..bb8ea80d82 100644 --- a/cpp/tests/utils/utils.hpp +++ b/cpp/tests/utils/utils.hpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #pragma once diff --git a/dependencies.yaml b/dependencies.yaml index 541708ecfc..0ab4199c8a 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 + # Dependency list for https://github.com/rapidsai/dependency-file-generator files: all: diff --git a/docs/Makefile b/docs/Makefile index 7c6066a619..79f529b67d 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 + # Minimal makefile for Sphinx documentation # diff --git a/docs/make.bat b/docs/make.bat index 6fcf05b4b7..0217175e0b 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -1,3 +1,6 @@ +REM SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. +REM SPDX-License-Identifier: Apache-2.0 + @ECHO OFF pushd %~dp0 diff --git a/docs/source/conf.py b/docs/source/conf.py index b6ac274a47..0b3353e945 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,5 +1,5 @@ -# Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # # Configuration file for the Sphinx documentation builder. # diff --git a/java/pom.xml b/java/pom.xml index d2c44cab63..1dc011b575 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -1,5 +1,8 @@ - + diff --git a/java/src/main/java/ai/rapids/kvikio/cufile/CuFile.java b/java/src/main/java/ai/rapids/kvikio/cufile/CuFile.java index 68c6fcbb31..fcc931a6b3 100644 --- a/java/src/main/java/ai/rapids/kvikio/cufile/CuFile.java +++ b/java/src/main/java/ai/rapids/kvikio/cufile/CuFile.java @@ -1,17 +1,6 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ package ai.rapids.kvikio.cufile; diff --git a/java/src/main/java/ai/rapids/kvikio/cufile/CuFileDriver.java b/java/src/main/java/ai/rapids/kvikio/cufile/CuFileDriver.java index 109857e137..b33b866c83 100644 --- a/java/src/main/java/ai/rapids/kvikio/cufile/CuFileDriver.java +++ b/java/src/main/java/ai/rapids/kvikio/cufile/CuFileDriver.java @@ -1,17 +1,6 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ package ai.rapids.kvikio.cufile; diff --git a/java/src/main/java/ai/rapids/kvikio/cufile/CuFileHandle.java b/java/src/main/java/ai/rapids/kvikio/cufile/CuFileHandle.java index 1df4728253..6c70a2997f 100644 --- a/java/src/main/java/ai/rapids/kvikio/cufile/CuFileHandle.java +++ b/java/src/main/java/ai/rapids/kvikio/cufile/CuFileHandle.java @@ -1,17 +1,6 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ package ai.rapids.kvikio.cufile; diff --git a/java/src/main/java/ai/rapids/kvikio/cufile/CuFileReadHandle.java b/java/src/main/java/ai/rapids/kvikio/cufile/CuFileReadHandle.java index c21399e292..8323d6bca4 100644 --- a/java/src/main/java/ai/rapids/kvikio/cufile/CuFileReadHandle.java +++ b/java/src/main/java/ai/rapids/kvikio/cufile/CuFileReadHandle.java @@ -1,17 +1,6 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ package ai.rapids.kvikio.cufile; diff --git a/java/src/main/java/ai/rapids/kvikio/cufile/CuFileWriteHandle.java b/java/src/main/java/ai/rapids/kvikio/cufile/CuFileWriteHandle.java index 36e8952df5..d7e18a5660 100644 --- a/java/src/main/java/ai/rapids/kvikio/cufile/CuFileWriteHandle.java +++ b/java/src/main/java/ai/rapids/kvikio/cufile/CuFileWriteHandle.java @@ -1,17 +1,6 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ package ai.rapids.kvikio.cufile; diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 1800db8a75..be211cc8e1 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -1,15 +1,8 @@ # -# Copyright (c) 2024-2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on # cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR) diff --git a/java/src/main/native/src/CuFileJni.cpp b/java/src/main/native/src/CuFileJni.cpp index 829fcbf5f9..47429a3713 100644 --- a/java/src/main/native/src/CuFileJni.cpp +++ b/java/src/main/native/src/CuFileJni.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/java/src/test/java/ai/rapids/kvikio/cufile/BasicReadWriteTest.java b/java/src/test/java/ai/rapids/kvikio/cufile/BasicReadWriteTest.java index 832dfcb626..9443933693 100644 --- a/java/src/test/java/ai/rapids/kvikio/cufile/BasicReadWriteTest.java +++ b/java/src/test/java/ai/rapids/kvikio/cufile/BasicReadWriteTest.java @@ -1,17 +1,6 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ package ai.rapids.kvikio.cufile; diff --git a/pyproject.toml b/pyproject.toml index a8871b3936..6f9c11cd0b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 + [tool.ruff] line-length = 88 target-version = "py310" @@ -60,3 +63,9 @@ section-order = [ "first-party", "local-folder", ] + +[tool.cython-lint] +ignore = [ + # line-too-long (due to Copyright header) + "E501", +] diff --git a/python/kvikio/CMakeLists.txt b/python/kvikio/CMakeLists.txt index 480e485d2b..1a8ebf99b6 100644 --- a/python/kvikio/CMakeLists.txt +++ b/python/kvikio/CMakeLists.txt @@ -1,15 +1,8 @@ # ============================================================================= -# Copyright (c) 2022-2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on # ============================================================================= cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR) diff --git a/python/kvikio/examples/hello_world.py b/python/kvikio/examples/hello_world.py index a5f4cd49a9..33d2191a1f 100644 --- a/python/kvikio/examples/hello_world.py +++ b/python/kvikio/examples/hello_world.py @@ -1,5 +1,5 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 import cupy diff --git a/python/kvikio/examples/http_io.py b/python/kvikio/examples/http_io.py index 26c9af1d44..79d272c531 100644 --- a/python/kvikio/examples/http_io.py +++ b/python/kvikio/examples/http_io.py @@ -1,5 +1,5 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import pathlib import tempfile diff --git a/python/kvikio/kvikio/__init__.py b/python/kvikio/kvikio/__init__.py index 124698206e..9765101216 100644 --- a/python/kvikio/kvikio/__init__.py +++ b/python/kvikio/kvikio/__init__.py @@ -1,5 +1,5 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # If libkvikio was installed as a wheel, we must request it to load the library symbols. # Otherwise, we assume that the library was installed in a system path that ld can find. diff --git a/python/kvikio/kvikio/_lib/CMakeLists.txt b/python/kvikio/kvikio/_lib/CMakeLists.txt index b46d59c960..fe640ecfd7 100644 --- a/python/kvikio/kvikio/_lib/CMakeLists.txt +++ b/python/kvikio/kvikio/_lib/CMakeLists.txt @@ -1,15 +1,8 @@ # ============================================================================= -# Copyright (c) 2022-2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on # ============================================================================= # Set the list of Cython files to build, one .so per file diff --git a/python/kvikio/kvikio/_lib/__init__.pxd b/python/kvikio/kvikio/_lib/__init__.pxd index 09549e2c5b..1dcf28d231 100644 --- a/python/kvikio/kvikio/_lib/__init__.pxd +++ b/python/kvikio/kvikio/_lib/__init__.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # cython: language_level=3 diff --git a/python/kvikio/kvikio/_lib/__init__.py b/python/kvikio/kvikio/_lib/__init__.py index 3772e2e8e3..57eded9dda 100644 --- a/python/kvikio/kvikio/_lib/__init__.py +++ b/python/kvikio/kvikio/_lib/__init__.py @@ -1,2 +1,2 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 diff --git a/python/kvikio/kvikio/_lib/arr.pxd b/python/kvikio/kvikio/_lib/arr.pxd index 47bad21a3b..a09bb260ff 100644 --- a/python/kvikio/kvikio/_lib/arr.pxd +++ b/python/kvikio/kvikio/_lib/arr.pxd @@ -1,5 +1,5 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # distutils: language = c++ # cython: language_level=3 diff --git a/python/kvikio/kvikio/_lib/arr.pyi b/python/kvikio/kvikio/_lib/arr.pyi index 0d2a8201eb..fb84317fb2 100644 --- a/python/kvikio/kvikio/_lib/arr.pyi +++ b/python/kvikio/kvikio/_lib/arr.pyi @@ -1,5 +1,5 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 from typing import Generic, Tuple, TypeVar diff --git a/python/kvikio/kvikio/_lib/arr.pyx b/python/kvikio/kvikio/_lib/arr.pyx index 19818d7cc0..c151535d18 100644 --- a/python/kvikio/kvikio/_lib/arr.pyx +++ b/python/kvikio/kvikio/_lib/arr.pyx @@ -1,5 +1,5 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # cython: language_level=3 diff --git a/python/kvikio/kvikio/_lib/buffer.pyx b/python/kvikio/kvikio/_lib/buffer.pyx index a7b638eaf4..e57903f49a 100644 --- a/python/kvikio/kvikio/_lib/buffer.pyx +++ b/python/kvikio/kvikio/_lib/buffer.pyx @@ -1,5 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # distutils: language = c++ # cython: language_level=3 diff --git a/python/kvikio/kvikio/_lib/cufile_driver.pyx b/python/kvikio/kvikio/_lib/cufile_driver.pyx index 0bc44b2066..34d3ba652f 100644 --- a/python/kvikio/kvikio/_lib/cufile_driver.pyx +++ b/python/kvikio/kvikio/_lib/cufile_driver.pyx @@ -1,5 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # distutils: language = c++ # cython: language_level=3 diff --git a/python/kvikio/kvikio/_lib/defaults.pyx b/python/kvikio/kvikio/_lib/defaults.pyx index 2fb6065680..e38ef80524 100644 --- a/python/kvikio/kvikio/_lib/defaults.pyx +++ b/python/kvikio/kvikio/_lib/defaults.pyx @@ -1,5 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # distutils: language = c++ # cython: language_level=3 diff --git a/python/kvikio/kvikio/_lib/file_handle.pyx b/python/kvikio/kvikio/_lib/file_handle.pyx index 062d0d9fb5..de17322303 100644 --- a/python/kvikio/kvikio/_lib/file_handle.pyx +++ b/python/kvikio/kvikio/_lib/file_handle.pyx @@ -1,5 +1,5 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 # distutils: language = c++ # cython: language_level=3 diff --git a/python/kvikio/kvikio/_lib/future.pxd b/python/kvikio/kvikio/_lib/future.pxd index 4d564b37a9..daabbe0bdc 100644 --- a/python/kvikio/kvikio/_lib/future.pxd +++ b/python/kvikio/kvikio/_lib/future.pxd @@ -1,5 +1,5 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # distutils: language = c++ # cython: language_level=3 diff --git a/python/kvikio/kvikio/_lib/future.pyx b/python/kvikio/kvikio/_lib/future.pyx index 94d9dd6d9c..1d86f0fc27 100644 --- a/python/kvikio/kvikio/_lib/future.pyx +++ b/python/kvikio/kvikio/_lib/future.pyx @@ -1,5 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # distutils: language = c++ # cython: language_level=3 diff --git a/python/kvikio/kvikio/_lib/mmap.pyx b/python/kvikio/kvikio/_lib/mmap.pyx index 46fc3846d0..29861f0a2c 100644 --- a/python/kvikio/kvikio/_lib/mmap.pyx +++ b/python/kvikio/kvikio/_lib/mmap.pyx @@ -1,5 +1,5 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # distutils: language = c++ # cython: language_level=3 diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx index 545ddae731..2f7031a7c3 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -1,5 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # distutils: language = c++ # cython: language_level=3 diff --git a/python/kvikio/kvikio/_version.py b/python/kvikio/kvikio/_version.py index a5171f19f4..c5a2a0cd83 100644 --- a/python/kvikio/kvikio/_version.py +++ b/python/kvikio/kvikio/_version.py @@ -1,16 +1,5 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 import importlib.resources diff --git a/python/kvikio/kvikio/benchmarks/__init__.py b/python/kvikio/kvikio/benchmarks/__init__.py index 8586c47db2..905216a919 100644 --- a/python/kvikio/kvikio/benchmarks/__init__.py +++ b/python/kvikio/kvikio/benchmarks/__init__.py @@ -1,2 +1,2 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 diff --git a/python/kvikio/kvikio/benchmarks/http_io.py b/python/kvikio/kvikio/benchmarks/http_io.py index af4e44b973..ab72446857 100644 --- a/python/kvikio/kvikio/benchmarks/http_io.py +++ b/python/kvikio/kvikio/benchmarks/http_io.py @@ -1,5 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import argparse import contextlib diff --git a/python/kvikio/kvikio/benchmarks/s3_io.py b/python/kvikio/kvikio/benchmarks/s3_io.py index 08bdfc93a0..5b585cd018 100644 --- a/python/kvikio/kvikio/benchmarks/s3_io.py +++ b/python/kvikio/kvikio/benchmarks/s3_io.py @@ -1,5 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import argparse import contextlib diff --git a/python/kvikio/kvikio/benchmarks/single_node_io.py b/python/kvikio/kvikio/benchmarks/single_node_io.py index ddc6680167..eb35f2ae7c 100644 --- a/python/kvikio/kvikio/benchmarks/single_node_io.py +++ b/python/kvikio/kvikio/benchmarks/single_node_io.py @@ -1,5 +1,5 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import argparse import contextlib diff --git a/python/kvikio/kvikio/benchmarks/utils.py b/python/kvikio/kvikio/benchmarks/utils.py index 2462b22a79..a6a2b1fb42 100644 --- a/python/kvikio/kvikio/benchmarks/utils.py +++ b/python/kvikio/kvikio/benchmarks/utils.py @@ -1,5 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/python/kvikio/kvikio/benchmarks/zarr_io.py b/python/kvikio/kvikio/benchmarks/zarr_io.py index 14c9fb38d3..81766bd9f9 100644 --- a/python/kvikio/kvikio/benchmarks/zarr_io.py +++ b/python/kvikio/kvikio/benchmarks/zarr_io.py @@ -1,5 +1,5 @@ -# Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import argparse import contextlib diff --git a/python/kvikio/kvikio/buffer.py b/python/kvikio/kvikio/buffer.py index 62bbc754b4..653d79a9d8 100644 --- a/python/kvikio/kvikio/buffer.py +++ b/python/kvikio/kvikio/buffer.py @@ -1,5 +1,5 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 from kvikio._lib import buffer # type: ignore diff --git a/python/kvikio/kvikio/cufile.py b/python/kvikio/kvikio/cufile.py index c8ba5bfd57..ec8fe4be84 100644 --- a/python/kvikio/kvikio/cufile.py +++ b/python/kvikio/kvikio/cufile.py @@ -1,5 +1,5 @@ -# Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 import io import os diff --git a/python/kvikio/kvikio/cufile_driver.py b/python/kvikio/kvikio/cufile_driver.py index c9970903d7..154e8e885b 100644 --- a/python/kvikio/kvikio/cufile_driver.py +++ b/python/kvikio/kvikio/cufile_driver.py @@ -1,5 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import atexit from typing import Any, Tuple, overload diff --git a/python/kvikio/kvikio/defaults.py b/python/kvikio/kvikio/defaults.py index 3004d74e00..d8b36de444 100644 --- a/python/kvikio/kvikio/defaults.py +++ b/python/kvikio/kvikio/defaults.py @@ -1,5 +1,5 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 from typing import Any, overload diff --git a/python/kvikio/kvikio/mmap.py b/python/kvikio/kvikio/mmap.py index 3b506ded02..3c216dd236 100644 --- a/python/kvikio/kvikio/mmap.py +++ b/python/kvikio/kvikio/mmap.py @@ -1,5 +1,5 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import os import stat diff --git a/python/kvikio/kvikio/numcodecs.py b/python/kvikio/kvikio/numcodecs.py index 40f62be1de..059ad296f9 100644 --- a/python/kvikio/kvikio/numcodecs.py +++ b/python/kvikio/kvikio/numcodecs.py @@ -1,5 +1,5 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 """ This module implements CUDA compression and transformation codecs for Numcodecs. diff --git a/python/kvikio/kvikio/numpy.py b/python/kvikio/kvikio/numpy.py index 311067bed0..c3acab2bd6 100644 --- a/python/kvikio/kvikio/numpy.py +++ b/python/kvikio/kvikio/numpy.py @@ -1,5 +1,5 @@ -# Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import io import os diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index 31d9d19831..1faf010c58 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -1,5 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/python/kvikio/kvikio/utils.py b/python/kvikio/kvikio/utils.py index 72dabaf064..4f58b36c18 100644 --- a/python/kvikio/kvikio/utils.py +++ b/python/kvikio/kvikio/utils.py @@ -1,5 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 import functools import multiprocessing diff --git a/python/kvikio/kvikio/zarr/__init__.py b/python/kvikio/kvikio/zarr/__init__.py index ca6cfde7e9..a3ae659b2e 100644 --- a/python/kvikio/kvikio/zarr/__init__.py +++ b/python/kvikio/kvikio/zarr/__init__.py @@ -1,4 +1,5 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 try: from ._zarr_python_3 import GDSStore diff --git a/python/kvikio/kvikio/zarr/_zarr_python_3.py b/python/kvikio/kvikio/zarr/_zarr_python_3.py index b12ff93125..5f22d73ae0 100644 --- a/python/kvikio/kvikio/zarr/_zarr_python_3.py +++ b/python/kvikio/kvikio/zarr/_zarr_python_3.py @@ -1,5 +1,5 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import asyncio import functools diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index 4e7ae7c21d..39fbdf5df4 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -1,5 +1,5 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 [build-system] build-backend = "rapids_build_backend.build" diff --git a/python/kvikio/tests/conftest.py b/python/kvikio/tests/conftest.py index 06aef52ecc..eb6eda9fda 100644 --- a/python/kvikio/tests/conftest.py +++ b/python/kvikio/tests/conftest.py @@ -1,5 +1,5 @@ -# Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import contextlib import multiprocessing as mp diff --git a/python/kvikio/tests/test_async_io.py b/python/kvikio/tests/test_async_io.py index 2de4aef9c9..388a2019f6 100644 --- a/python/kvikio/tests/test_async_io.py +++ b/python/kvikio/tests/test_async_io.py @@ -1,5 +1,5 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import os diff --git a/python/kvikio/tests/test_basic_io.py b/python/kvikio/tests/test_basic_io.py index 7263a77154..5be7b5e982 100644 --- a/python/kvikio/tests/test_basic_io.py +++ b/python/kvikio/tests/test_basic_io.py @@ -1,5 +1,5 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import io import os diff --git a/python/kvikio/tests/test_benchmarks.py b/python/kvikio/tests/test_benchmarks.py index 584f719fc8..465aed7bd9 100644 --- a/python/kvikio/tests/test_benchmarks.py +++ b/python/kvikio/tests/test_benchmarks.py @@ -1,5 +1,5 @@ -# Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 import os import os.path diff --git a/python/kvikio/tests/test_cufile_driver.py b/python/kvikio/tests/test_cufile_driver.py index ed715b859a..78f9f54dae 100644 --- a/python/kvikio/tests/test_cufile_driver.py +++ b/python/kvikio/tests/test_cufile_driver.py @@ -1,5 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import subprocess diff --git a/python/kvikio/tests/test_defaults.py b/python/kvikio/tests/test_defaults.py index 57a6ffea7e..c0fac1eaec 100644 --- a/python/kvikio/tests/test_defaults.py +++ b/python/kvikio/tests/test_defaults.py @@ -1,5 +1,5 @@ -# Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import pytest diff --git a/python/kvikio/tests/test_examples.py b/python/kvikio/tests/test_examples.py index 1d08525a23..3e280b2d67 100644 --- a/python/kvikio/tests/test_examples.py +++ b/python/kvikio/tests/test_examples.py @@ -1,5 +1,5 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import os from importlib import import_module diff --git a/python/kvikio/tests/test_hdfs_io.py b/python/kvikio/tests/test_hdfs_io.py index eab4d0a5fb..709bb04992 100644 --- a/python/kvikio/tests/test_hdfs_io.py +++ b/python/kvikio/tests/test_hdfs_io.py @@ -1,5 +1,5 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/python/kvikio/tests/test_http_io.py b/python/kvikio/tests/test_http_io.py index 2d271c9833..f152c316c5 100644 --- a/python/kvikio/tests/test_http_io.py +++ b/python/kvikio/tests/test_http_io.py @@ -1,5 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import http diff --git a/python/kvikio/tests/test_mmap.py b/python/kvikio/tests/test_mmap.py index 6ad7468690..4790f06db4 100644 --- a/python/kvikio/tests/test_mmap.py +++ b/python/kvikio/tests/test_mmap.py @@ -1,5 +1,5 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import contextlib import os diff --git a/python/kvikio/tests/test_numpy.py b/python/kvikio/tests/test_numpy.py index 607081e649..14ffc1c2e2 100644 --- a/python/kvikio/tests/test_numpy.py +++ b/python/kvikio/tests/test_numpy.py @@ -1,5 +1,5 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 import pytest diff --git a/python/kvikio/tests/test_s3_io.py b/python/kvikio/tests/test_s3_io.py index 510940d284..d8610c73bc 100644 --- a/python/kvikio/tests/test_s3_io.py +++ b/python/kvikio/tests/test_s3_io.py @@ -1,5 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import multiprocessing as mp import time diff --git a/python/kvikio/tests/test_utils.py b/python/kvikio/tests/test_utils.py index cd34c50d0e..34279826b7 100644 --- a/python/kvikio/tests/test_utils.py +++ b/python/kvikio/tests/test_utils.py @@ -1,5 +1,5 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import pytest diff --git a/python/kvikio/tests/test_version.py b/python/kvikio/tests/test_version.py index 840d6624c9..0f8249856f 100644 --- a/python/kvikio/tests/test_version.py +++ b/python/kvikio/tests/test_version.py @@ -1,5 +1,5 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import kvikio diff --git a/python/kvikio/tests/test_zarr_missing.py b/python/kvikio/tests/test_zarr_missing.py index ef25b41dab..eee25b0e2e 100644 --- a/python/kvikio/tests/test_zarr_missing.py +++ b/python/kvikio/tests/test_zarr_missing.py @@ -1,5 +1,5 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import sys diff --git a/python/kvikio/tests/test_zarr_v3.py b/python/kvikio/tests/test_zarr_v3.py index 59331d5513..4a430b5dbd 100644 --- a/python/kvikio/tests/test_zarr_v3.py +++ b/python/kvikio/tests/test_zarr_v3.py @@ -1,5 +1,5 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import pathlib diff --git a/python/kvikio/tests/utils.py b/python/kvikio/tests/utils.py index 58d801efcf..1b1368a421 100644 --- a/python/kvikio/tests/utils.py +++ b/python/kvikio/tests/utils.py @@ -1,5 +1,5 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 import socket diff --git a/python/libkvikio/CMakeLists.txt b/python/libkvikio/CMakeLists.txt index fe0019bf58..318d1edb49 100644 --- a/python/libkvikio/CMakeLists.txt +++ b/python/libkvikio/CMakeLists.txt @@ -1,15 +1,8 @@ # ============================================================================= -# Copyright (c) 2024-2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on # ============================================================================= cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR) diff --git a/python/libkvikio/libkvikio/__init__.py b/python/libkvikio/libkvikio/__init__.py index a221295d4c..8051956848 100644 --- a/python/libkvikio/libkvikio/__init__.py +++ b/python/libkvikio/libkvikio/__init__.py @@ -1,16 +1,5 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 from libkvikio._version import __git_commit__, __version__ from libkvikio.load import load_library diff --git a/python/libkvikio/libkvikio/_version.py b/python/libkvikio/libkvikio/_version.py index 8b67326806..dec2546e41 100644 --- a/python/libkvikio/libkvikio/_version.py +++ b/python/libkvikio/libkvikio/_version.py @@ -1,16 +1,5 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 import importlib.resources diff --git a/python/libkvikio/libkvikio/load.py b/python/libkvikio/libkvikio/load.py index 2fd25e642c..beb7273da2 100644 --- a/python/libkvikio/libkvikio/load.py +++ b/python/libkvikio/libkvikio/load.py @@ -1,16 +1,5 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 # import ctypes diff --git a/python/libkvikio/pyproject.toml b/python/libkvikio/pyproject.toml index 3239d8c651..7861779811 100644 --- a/python/libkvikio/pyproject.toml +++ b/python/libkvikio/pyproject.toml @@ -1,5 +1,5 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 [build-system] build-backend = "rapids_build_backend.build" From ca47bd9a183e42598818bf39901df051d2f99502 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Tue, 28 Oct 2025 18:38:53 -0400 Subject: [PATCH 092/103] Implement simple username-based authentication for WebHDFS (#859) With this PR, KvikIO will support username-based authentication for WebHDFS via environment variable `KVIKIO_WEBHDFS_USERNAME`. Note: `libcudf` uses KvikIO's utility function `open(url)` to infer endpoint type, where currently the access credentials can only be specified via environment variables instead of programmatically as function parameters. We will address this limitation in the future. This PR is breaking in that: - It moves S3 endpoint's utility function `unwrap_or_default` to the detailed namespace, considering that this utility function is supposed to be an implementation detail. - It adds `username` parameter to one of the two WebHDFS endpoint constructors for completeness (the other constructor has already had `username` as its parameter). Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/859 --- cpp/CMakeLists.txt | 1 + cpp/include/kvikio/detail/env.hpp | 27 +++++++++++++ cpp/include/kvikio/hdfs.hpp | 16 +++++++- cpp/include/kvikio/remote_handle.hpp | 17 -------- cpp/src/detail/env.cpp | 23 +++++++++++ cpp/src/hdfs.cpp | 45 ++++++++++++--------- cpp/src/remote_handle.cpp | 59 +++++++++++----------------- cpp/tests/test_hdfs.cpp | 9 +++-- 8 files changed, 120 insertions(+), 77 deletions(-) create mode 100644 cpp/include/kvikio/detail/env.hpp create mode 100644 cpp/src/detail/env.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 17bb297f67..a681fc6135 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -144,6 +144,7 @@ set(SOURCES "src/file_handle.cpp" "src/file_utils.cpp" "src/mmap.cpp" + "src/detail/env.cpp" "src/detail/nvtx.cpp" "src/detail/posix_io.cpp" "src/shim/cuda.cpp" diff --git a/cpp/include/kvikio/detail/env.hpp b/cpp/include/kvikio/detail/env.hpp new file mode 100644 index 0000000000..6832eb9ad1 --- /dev/null +++ b/cpp/include/kvikio/detail/env.hpp @@ -0,0 +1,27 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +namespace kvikio::detail { +/** + * @brief Unwrap an optional parameter, obtaining a fallback from the environment. + * + * Resolution order: + * - If `value` has a value, return it + * - If environment variable `env_var` is set, return its value (even if empty) + * - Return std::nullopt if err_msg is std::nullopt; throw an exception otherwise + * + * @param value The value to unwrap. + * @param env_var The name of the environment variable to check if `value` isn't set. + * @param err_msg Optional error message that controls whether to throw an exception if neither + * source provides a value. 
+ * @return The resolved value, or std::nullopt if neither source provides a value. + */ +std::optional unwrap_or_env(std::optional value, + std::string const& env_var, + std::optional const& err_msg = std::nullopt); +} // namespace kvikio::detail diff --git a/cpp/include/kvikio/hdfs.hpp b/cpp/include/kvikio/hdfs.hpp index 70c8db1222..8b6aeb12a3 100644 --- a/cpp/include/kvikio/hdfs.hpp +++ b/cpp/include/kvikio/hdfs.hpp @@ -27,8 +27,15 @@ class WebHdfsEndpoint : public RemoteEndpoint { * @brief Create an WebHDFS endpoint from a url. * * @param url The WebHDFS HTTP/HTTPS url to the remote file. + * @param username Optional user name. + * + * @note The optional username for authentication is determined in the following descending + * priority order: + * - Function parameter `username` + * - Query string in URL (?user.name=xxx) + * - Environment variable `KVIKIO_WEBHDFS_USERNAME` */ - explicit WebHdfsEndpoint(std::string url); + explicit WebHdfsEndpoint(std::string url, std::optional username = std::nullopt); /** * @brief Create an WebHDFS endpoint from the host, port, file path and optionally username. @@ -36,7 +43,12 @@ class WebHdfsEndpoint : public RemoteEndpoint { * @param host Host * @param port Port * @param remote_file_path Remote file path - * @param username User name + * @param username Optional user name. + * + * @note The optional username for authentication is determined in the following descending + * priority order: + * - Function parameter `username` + * - Environment variable `KVIKIO_WEBHDFS_USERNAME` */ explicit WebHdfsEndpoint(std::string host, std::string port, diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 44cf536cc3..416e374291 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -143,23 +143,6 @@ class S3Endpoint : public RemoteEndpoint { std::string _aws_userpwd; curl_slist* _curl_header_list{}; - /** - * @brief Unwrap an optional parameter, obtaining a default from the environment. - * - * If not nullopt, the optional's value is returned. Otherwise, the environment - * variable `env_var` is used. If that also doesn't have a value: - * - if `err_msg` is empty, the empty string is returned. - * - if `err_msg` is not empty, `std::invalid_argument(`err_msg`)` is thrown. - * - * @param value The value to unwrap. - * @param env_var The name of the environment variable to check if `value` isn't set. - * @param err_msg The error message to throw on error or the empty string. - * @return The parsed AWS argument or the empty string. - */ - static std::string unwrap_or_default(std::optional aws_arg, - std::string const& env_var, - std::string const& err_msg = ""); - public: /** * @brief Get url from a AWS S3 bucket and object name. diff --git a/cpp/src/detail/env.cpp b/cpp/src/detail/env.cpp new file mode 100644 index 0000000000..5d31644501 --- /dev/null +++ b/cpp/src/detail/env.cpp @@ -0,0 +1,23 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include + +namespace kvikio::detail { +std::optional unwrap_or_env(std::optional value, + std::string const& env_var, + std::optional const& err_msg) +{ + KVIKIO_NVTX_FUNC_RANGE(); + if (value.has_value()) { return value; } + char const* env = std::getenv(env_var.c_str()); + if (env != nullptr) { return std::string(env); } + if (!err_msg.has_value()) { return std::nullopt; } + KVIKIO_FAIL(*err_msg, std::invalid_argument); + return std::nullopt; +} +} // namespace kvikio::detail diff --git a/cpp/src/hdfs.cpp b/cpp/src/hdfs.cpp index 173dbb64d4..938ab396c8 100644 --- a/cpp/src/hdfs.cpp +++ b/cpp/src/hdfs.cpp @@ -5,6 +5,7 @@ #include +#include #include #include #include @@ -14,11 +15,12 @@ namespace kvikio { -WebHdfsEndpoint::WebHdfsEndpoint(std::string url) : RemoteEndpoint{RemoteEndpointType::WEBHDFS} +WebHdfsEndpoint::WebHdfsEndpoint(std::string url, std::optional username) + : RemoteEndpoint{RemoteEndpointType::WEBHDFS}, _username(std::move(username)) { - // todo: Use libcurl URL API for more secure and idiomatic parsing. - // Split the URL into two parts: one without query and one with. - std::regex static const pattern{R"(^([^?]+)\?([^#]*))"}; + // Extract two parts from the URL: components before (not including) the query and the query + // itself + std::regex static const url_pattern{R"(^([^?]+)\?([^#]*))"}; // Regex meaning: // ^: From the start of the line // [^?]+: Matches non-question-mark characters one or more times. The question mark ushers in the @@ -26,26 +28,32 @@ WebHdfsEndpoint::WebHdfsEndpoint(std::string url) : RemoteEndpoint{RemoteEndpoin // \?: Matches the question mark, which needs to be escaped. // [^#]*: Matches the non-pound characters zero or more times. The pound sign ushers in the URL // fragment component. It is very likely that this part does not exist. - std::smatch match_results; - bool found = std::regex_search(url, match_results, pattern); - // If the match is not found, the URL contains no query. - if (!found) { + std::smatch url_match_results; + bool found_query = std::regex_search(url, url_match_results, url_pattern); + if (!found_query) { _url = url; - return; - } + } else { + // URL components before (not including) the query + _url = url_match_results[1].str(); - _url = match_results[1].str(); - auto query = match_results[2].str(); + auto query = url_match_results[2].str(); - { // Extract user name if provided. In WebHDFS, user name is specified as the key=value pair in // the query - std::regex static const pattern{R"(user.name=([^&]+))"}; + std::regex static const username_pattern{R"(user.name=([^&]+))"}; // Regex meaning: // [^&]+: Matches the non-ampersand character one or more times. The ampersand delimits // different parameters. 
- std::smatch match_results; - if (std::regex_search(query, match_results, pattern)) { _username = match_results[1].str(); } + std::smatch username_match_results; + bool found_username = std::regex_search(query, username_match_results, username_pattern); + if (found_username) { _username = username_match_results[1].str(); } + } + + // If the username is not specified by function parameter `username` or by the query string, check + // the environment variable + if (!_username.has_value()) { + auto const* env_val = std::getenv("KVIKIO_WEBHDFS_USERNAME"); + if (env_val != nullptr) { _username = env_val; } } } @@ -53,11 +61,12 @@ WebHdfsEndpoint::WebHdfsEndpoint(std::string host, std::string port, std::string file_path, std::optional username) - : RemoteEndpoint{RemoteEndpointType::WEBHDFS}, _username{std::move(username)} + : RemoteEndpoint{RemoteEndpointType::WEBHDFS} { std::stringstream ss; ss << "http://" << host << ":" << port << "/webhdfs/v1" << file_path; - _url = ss.str(); + _url = ss.str(); + _username = detail::unwrap_or_env(std::move(username), "KVIKIO_WEBHDFS_USERNAME"); } std::string WebHdfsEndpoint::str() const { return _url; } diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index 6e6d8a8912..f3f0ef996e 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -279,40 +280,25 @@ void S3Endpoint::setopt(CurlHandle& curl) if (_curl_header_list) { curl.setopt(CURLOPT_HTTPHEADER, _curl_header_list); } } -std::string S3Endpoint::unwrap_or_default(std::optional aws_arg, - std::string const& env_var, - std::string const& err_msg) -{ - KVIKIO_NVTX_FUNC_RANGE(); - if (aws_arg.has_value()) { return std::move(*aws_arg); } - - char const* env = std::getenv(env_var.c_str()); - if (env == nullptr) { - if (err_msg.empty()) { return std::string(); } - KVIKIO_FAIL(err_msg, std::invalid_argument); - } - return std::string(env); -} - std::string S3Endpoint::url_from_bucket_and_object(std::string bucket_name, std::string object_name, std::optional aws_region, std::optional aws_endpoint_url) { KVIKIO_NVTX_FUNC_RANGE(); - auto const endpoint_url = unwrap_or_default(std::move(aws_endpoint_url), "AWS_ENDPOINT_URL"); + auto const endpoint_url = detail::unwrap_or_env(std::move(aws_endpoint_url), "AWS_ENDPOINT_URL"); std::stringstream ss; - if (endpoint_url.empty()) { + if (!endpoint_url.has_value()) { auto const region = - unwrap_or_default(std::move(aws_region), - "AWS_DEFAULT_REGION", - "S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set."); + detail::unwrap_or_env(std::move(aws_region), + "AWS_DEFAULT_REGION", + "S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set."); // "s3" is a non-standard URI scheme used by AWS CLI and AWS SDK, and cannot be identified by // libcurl. A valid HTTP/HTTPS URL needs to be constructed for use in libcurl. Here the AWS // virtual host style is used. - ss << "https://" << bucket_name << ".s3." << region << ".amazonaws.com/" << object_name; + ss << "https://" << bucket_name << ".s3." 
<< region.value() << ".amazonaws.com/" << object_name; } else { - ss << endpoint_url << "/" << bucket_name << "/" << object_name; + ss << endpoint_url.value() << "/" << bucket_name << "/" << object_name; } return ss.str(); } @@ -343,16 +329,16 @@ S3Endpoint::S3Endpoint(std::string url, std::invalid_argument); auto const region = - unwrap_or_default(std::move(aws_region), - "AWS_DEFAULT_REGION", - "S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set."); + detail::unwrap_or_env(std::move(aws_region), + "AWS_DEFAULT_REGION", + "S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set."); auto const access_key = - unwrap_or_default(std::move(aws_access_key), - "AWS_ACCESS_KEY_ID", - "S3: must provide `aws_access_key` if AWS_ACCESS_KEY_ID isn't set."); + detail::unwrap_or_env(std::move(aws_access_key), + "AWS_ACCESS_KEY_ID", + "S3: must provide `aws_access_key` if AWS_ACCESS_KEY_ID isn't set."); - auto const secret_access_key = unwrap_or_default( + auto const secret_access_key = detail::unwrap_or_env( std::move(aws_secret_access_key), "AWS_SECRET_ACCESS_KEY", "S3: must provide `aws_secret_access_key` if AWS_SECRET_ACCESS_KEY isn't set."); @@ -360,7 +346,7 @@ S3Endpoint::S3Endpoint(std::string url, // Create the CURLOPT_AWS_SIGV4 option { std::stringstream ss; - ss << "aws:amz:" << region << ":s3"; + ss << "aws:amz:" << region.value() << ":s3"; _aws_sigv4 = ss.str(); } // Create the CURLOPT_USERPWD option @@ -369,21 +355,21 @@ S3Endpoint::S3Endpoint(std::string url, // { std::stringstream ss; - ss << access_key << ":" << secret_access_key; + ss << access_key.value() << ":" << secret_access_key.value(); _aws_userpwd = ss.str(); } // Access key IDs beginning with ASIA are temporary credentials that are created using AWS STS // operations. They need a session token to work. - if (access_key.compare(0, 4, std::string("ASIA")) == 0) { + if (access_key->compare(0, 4, std::string("ASIA")) == 0) { // Create a Custom Curl header for the session token. // The _curl_header_list created by curl_slist_append must be manually freed // (see https://curl.se/libcurl/c/CURLOPT_HTTPHEADER.html) auto session_token = - unwrap_or_default(std::move(aws_session_token), - "AWS_SESSION_TOKEN", - "When using temporary credentials, AWS_SESSION_TOKEN must be set."); + detail::unwrap_or_env(std::move(aws_session_token), + "AWS_SESSION_TOKEN", + "When using temporary credentials, AWS_SESSION_TOKEN must be set."); std::stringstream ss; - ss << "x-amz-security-token: " << session_token; + ss << "x-amz-security-token: " << session_token.value(); _curl_header_list = curl_slist_append(NULL, ss.str().c_str()); KVIKIO_EXPECT(_curl_header_list != nullptr, "Failed to create curl header for AWS token", @@ -589,6 +575,7 @@ RemoteHandle RemoteHandle::open(std::string url, std::optional> allow_list, std::optional nbytes) { + KVIKIO_NVTX_FUNC_RANGE(); if (!allow_list.has_value()) { allow_list = {RemoteEndpointType::S3, RemoteEndpointType::S3_PUBLIC, diff --git a/cpp/tests/test_hdfs.cpp b/cpp/tests/test_hdfs.cpp index d7256cc456..c0c49cffb2 100644 --- a/cpp/tests/test_hdfs.cpp +++ b/cpp/tests/test_hdfs.cpp @@ -17,13 +17,14 @@ // This test makes the following assumptions: // - This KvikIO unit test is run on the "name node" of a WebHDFS server. // - Port 9870 (default for WebHDFS) is being used to listen to the requests. -// - The environment variable `KVIKIO_USER` is specified prior to the test. It contains a valid user +// - The environment variable `KVIKIO_WEBHDFS_USERNAME` is specified prior to the test. 
It contains +// a valid user // name that has been granted access to the HDFS. // - The user has the proper permission to create a file under the `/tmp` directory on the HDFS. // - If the unit test is run within a Docker. The following arguments are passed to the `docker run` // command: // - `--network host` -// - `--env KVIKIO_USER=` +// - `--env KVIKIO_WEBHDFS_USERNAME=` // // If any of these assumptions is not satisfied, this unit test is expected to be skipped // gracefully. @@ -58,11 +59,11 @@ class WebHdfsTest : public testing::Test { config.remote_file_path = "/tmp/kvikio-test-webhdfs.bin"; - auto res = std::getenv("KVIKIO_USER"); + auto res = std::getenv("KVIKIO_WEBHDFS_USERNAME"); if (res) { config._username = res; } else { - GTEST_SKIP() << "Environment variable KVIKIO_USER is not set for this test."; + GTEST_SKIP() << "Environment variable KVIKIO_WEBHDFS_USERNAME is not set for this test."; } webhdfs_helper = From 923d0569c5307fcf0f4bd451e857084a379183cb Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 29 Oct 2025 15:19:49 -0500 Subject: [PATCH 093/103] Use `RAPIDS_BRANCH` in cmake-format invocations that need rapids-cmake configs (#862) This uses `RAPIDS_BRANCH` in style checks where we reference rapids-cmake configs for `cmake-format`. xref: https://github.com/rapidsai/build-planning/issues/224 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Gil Forsyth (https://github.com/gforsyth) URL: https://github.com/rapidsai/kvikio/pull/862 --- ci/check_style.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/check_style.sh b/ci/check_style.sh index 922e86f6e4..cbd17858d2 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -15,10 +15,10 @@ rapids-dependency-file-generator \ rapids-mamba-retry env create --yes -f env.yaml -n checks conda activate checks -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" +RAPIDS_BRANCH="$(cat "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../RAPIDS_BRANCH)" -FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION_MAJOR_MINOR}/cmake-format-rapids-cmake.json -export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json +FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/${RAPIDS_BRANCH}/cmake-format-rapids-cmake.json +export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-format-rapids-cmake.json mkdir -p "$(dirname ${RAPIDS_CMAKE_FORMAT_FILE})" wget -O ${RAPIDS_CMAKE_FORMAT_FILE} "${FORMAT_FILE_URL}" From 0ed4c8ad2ce76e1635c9583e0c2c8d4aa25d30f0 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Tue, 4 Nov 2025 14:41:50 -0500 Subject: [PATCH 094/103] Restore isort for Cython files (#864) Ruff does not yet support Cython, so restore isort only for Cython. 
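As a rough sketch, the restored hook is equivalent to running isort by hand with the flags from the hook definition below (the `.pyx` path here is only an example):

```sh
isort --config-root=python/ --resolve-all-configs python/kvikio/kvikio/_lib/file_handle.pyx
```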
Issue: https://github.com/rapidsai/build-planning/issues/130 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - https://github.com/jakirkham URL: https://github.com/rapidsai/kvikio/pull/864 --- .pre-commit-config.yaml | 7 +++++++ python/kvikio/pyproject.toml | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f6d9e7263d..2065300064 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,6 +7,13 @@ repos: hooks: - id: trailing-whitespace - id: end-of-file-fixer + - repo: https://github.com/PyCQA/isort + rev: 5.12.0 + hooks: + - id: isort + args: ["--config-root=python/", "--resolve-all-configs"] + files: python/.* + types: [cython] - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.14.1 hooks: diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index 39fbdf5df4..2a2d4c416b 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -56,6 +56,39 @@ zarr = [ [project.urls] Homepage = "https://github.com/rapidsai/kvikio" +[tool.isort] +line_length = 88 +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +combine_as_imports = true +order_by_type = true +known_first_party = [ + "kvikio", +] +default_section = "THIRDPARTY" +sections = [ + "FUTURE", + "STDLIB", + "THIRDPARTY", + "FIRSTPARTY", + "LOCALFOLDER", +] +skip = [ + "thirdparty", + ".eggs", + ".git", + ".hg", + ".mypy_cache", + ".tox", + ".venv", + "_build", + "buck-out", + "build", + "dist", + "__init__.py", +] + [tool.mypy] ignore_missing_imports = true From 6908b9a25129f517947fe67f83fd9560195be142 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 5 Nov 2025 11:20:48 -0600 Subject: [PATCH 095/103] Use current system architecture in conda environment creation command (#867) This fixes a conda environment creation command to support both `x86_64` and `aarch64` systems. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/kvikio/pull/867 --- docs/source/install.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/install.rst b/docs/source/install.rst index 91cc39af1f..844477e421 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -61,7 +61,7 @@ In order to setup a development environment, we recommend Conda: .. code-block:: # CUDA 13 - mamba env create --name kvikio-dev --file conda/environments/all_cuda-130_arch-x86_64.yaml + mamba env create --name kvikio-dev --file conda/environments/all_cuda-130_arch-$(arch).yaml The Python library depends on the C++ library, thus we build and install both: From 61b05c14f51137ed8a5091a1b9226283f85e7737 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 6 Nov 2025 11:00:01 -0800 Subject: [PATCH 096/103] Disable benchmark builds by default, enable in CI (#866) This PR disables building benchmarks by default, consistent with other RAPIDS projects such as cuDF and RAFT. It also updates the CI build script to ensure that benchmark builds are still tested in CI. This change helps address the issue in cuDF where KvikIO benchmarks are built unnecessarily. 
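For local builds that still want benchmarks, a sketch of the opt-in using the option and flag introduced in this diff (build directory path is illustrative):

```sh
# via the build script
./build.sh libkvikio benchmarks
# or directly through CMake
cmake -S cpp -B cpp/build -DKvikIO_BUILD_BENCHMARKS=ON
```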
Authors: - Yunsong Wang (https://github.com/PointKernel) - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/866 --- build.sh | 10 ++++++++-- conda/recipes/libkvikio/recipe.yaml | 2 +- cpp/CMakeLists.txt | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/build.sh b/build.sh index 0b83259cf4..785753b2d4 100755 --- a/build.sh +++ b/build.sh @@ -19,11 +19,12 @@ ARGS=$* # script, and that this script resides in the repo dir! REPODIR=$(cd "$(dirname "$0")"; pwd) -VALIDARGS="clean libkvikio kvikio -v -g -n --pydevelop -h" -HELP="$0 [clean] [libkvikio] [kvikio] [-v] [-g] [-n] [--cmake-args=\"\"] [-h] +VALIDARGS="clean libkvikio kvikio benchmarks -v -g -n --pydevelop -h" +HELP="$0 [clean] [libkvikio] [kvikio] [benchmarks] [-v] [-g] [-n] [--cmake-args=\"\"] [-h] clean - remove all existing build artifacts and configuration (start over) libkvikio - build and install the libkvikio C++ code kvikio - build and install the kvikio Python package (requires libkvikio) + benchmarks - build benchmarks -v - verbose build mode -g - build for debug -n - no install step @@ -40,6 +41,7 @@ BUILD_DIRS="${LIBKVIKIO_BUILD_DIR} ${KVIKIO_BUILD_DIR}" VERBOSE_FLAG="" BUILD_TYPE=Release INSTALL_TARGET=install +BUILD_BENCHMARKS=OFF RAN_CMAKE=0 PYTHON_ARGS_FOR_INSTALL=("-v" "--no-build-isolation" "--no-deps" "--config-settings" "rapidsai.disable-cuda=true") @@ -88,6 +90,7 @@ function ensureCMakeRan { cmake -B "${LIBKVIKIO_BUILD_DIR}" -S . \ -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX}" \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DKvikIO_BUILD_BENCHMARKS=${BUILD_BENCHMARKS} \ "${EXTRA_CMAKE_ARGS[@]}" RAN_CMAKE=1 fi @@ -125,6 +128,9 @@ fi if hasArg -n; then INSTALL_TARGET="" fi +if hasArg benchmarks; then + BUILD_BENCHMARKS=ON +fi if hasArg --pydevelop; then PYTHON_ARGS_FOR_INSTALL+=("-e") fi diff --git a/conda/recipes/libkvikio/recipe.yaml b/conda/recipes/libkvikio/recipe.yaml index 3c7b6d69a0..6ae2c9659a 100644 --- a/conda/recipes/libkvikio/recipe.yaml +++ b/conda/recipes/libkvikio/recipe.yaml @@ -37,7 +37,7 @@ cache: export CXXFLAGS=$(echo $CXXFLAGS | sed -E 's@\-fdebug\-prefix\-map[^ ]*@@g') set +x - ./build.sh -v -n libkvikio + ./build.sh -v -n libkvikio benchmarks secrets: - AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a681fc6135..71cbc258de 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -37,7 +37,7 @@ rapids_cmake_build_type(Release) # * build options ---------------------------------------------------------------------------------- option(BUILD_SHARED_LIBS "Build KvikIO shared library" ON) -option(KvikIO_BUILD_BENCHMARKS "Configure CMake to build benchmarks" ON) +option(KvikIO_BUILD_BENCHMARKS "Configure CMake to build benchmarks" OFF) option(KvikIO_BUILD_EXAMPLES "Configure CMake to build examples" ON) option(KvikIO_BUILD_TESTS "Configure CMake to build tests" ON) option(KvikIO_REMOTE_SUPPORT "Configure CMake to build with remote IO support" ON) From 16ee391a934ed1db01f5a0f801d8ddf1b7eda8b8 Mon Sep 17 00:00:00 2001 From: Nate Rock Date: Fri, 7 Nov 2025 12:50:05 -0600 Subject: [PATCH 097/103] refactored update-version.sh to handle new branching strategy (#868) This PR supports handling the new main branch strategy outlined below: * [RSN 47 - Changes to RAPIDS branching strategy in 25.12](https://docs.rapids.ai/notices/rsn0047/) The 
`update-version.sh` script now supports two modes, controlled via `CLI` params or `ENV` vars:

CLI arguments: `--run-context=main|release`
ENV var: `RAPIDS_RUN_CONTEXT=main|release`

xref: https://github.com/rapidsai/build-planning/issues/224

Authors:
  - Nate Rock (https://github.com/rockhowse)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/kvikio/pull/868
---
 ci/release/update-version.sh                 | 80 +++++++++++++++++--
 .../downstream/cmake/get_kvikio.cmake        |  2 +-
 2 files changed, 74 insertions(+), 8 deletions(-)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index b7adf5e033..0bc6d899ec 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -6,10 +6,60 @@
 ##########################
 ## Usage
-# bash update-version.sh <new_version>
+# NOTE: This script must be run from the repository root, not from the ci/release/ directory
+# Primary interface: bash ci/release/update-version.sh <new_version> [--run-context=main|release]
+# Fallback interface: [RAPIDS_RUN_CONTEXT=main|release] bash ci/release/update-version.sh <new_version>
+# CLI arguments take precedence over environment variables
+# Defaults to main when no run-context is specified
+
+# Parse command line arguments
+CLI_RUN_CONTEXT=""
+VERSION_ARG=""
+
+for arg in "$@"; do
+  case $arg in
+    --run-context=*)
+      CLI_RUN_CONTEXT="${arg#*=}"
+      shift
+      ;;
+    *)
+      if [[ -z "$VERSION_ARG" ]]; then
+        VERSION_ARG="$arg"
+      fi
+      ;;
+  esac
+done
 
 # Format is YY.MM.PP - no leading 'v' or trailing 'a'
-NEXT_FULL_TAG=$1
+NEXT_FULL_TAG="$VERSION_ARG"
+
+# Determine RUN_CONTEXT with CLI precedence over environment variable, defaulting to main
+if [[ -n "$CLI_RUN_CONTEXT" ]]; then
+  RUN_CONTEXT="$CLI_RUN_CONTEXT"
+  echo "Using run-context from CLI: $RUN_CONTEXT"
+elif [[ -n "${RAPIDS_RUN_CONTEXT}" ]]; then
+  RUN_CONTEXT="$RAPIDS_RUN_CONTEXT"
+  echo "Using run-context from environment: $RUN_CONTEXT"
+else
+  RUN_CONTEXT="main"
+  echo "No run-context provided, defaulting to: $RUN_CONTEXT"
+fi
+
+# Validate RUN_CONTEXT value
+if [[ "${RUN_CONTEXT}" != "main" && "${RUN_CONTEXT}" != "release" ]]; then
+  echo "Error: Invalid run-context value '${RUN_CONTEXT}'"
+  echo "Valid values: main, release"
+  exit 1
+fi
+
+# Validate version argument
+if [[ -z "$NEXT_FULL_TAG" ]]; then
+  echo "Error: Version argument is required"
+  echo "Usage: $0 <new_version> [--run-context=<main|release>]"
+  echo " or: [RAPIDS_RUN_CONTEXT=<main|release>] $0 <new_version>"
+  echo "Note: Defaults to main when run-context is not specified"
+  exit 1
+fi
 
 # Get current version
 CURRENT_TAG=$(git tag --merged HEAD | grep -xE '^v.*' | sort --version-sort | tail -n 1 | tr -d 'v')
@@ -24,7 +74,14 @@ NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR}
 NEXT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_SHORT_TAG}'))")
 PATCH_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_PATCH}'))")
 
-echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG"
+# Set branch references based on RUN_CONTEXT
+if [[ "${RUN_CONTEXT}" == "main" ]]; then
+  RAPIDS_BRANCH_NAME="main"
+  echo "Preparing development branch update $CURRENT_TAG => $NEXT_FULL_TAG (targeting main branch)"
+elif [[ "${RUN_CONTEXT}" == "release" ]]; then
+  RAPIDS_BRANCH_NAME="release/${NEXT_SHORT_TAG}"
+  echo "Preparing release branch update $CURRENT_TAG => $NEXT_FULL_TAG (targeting release/${NEXT_SHORT_TAG} branch)"
+fi
 
 # Inplace sed replace; workaround for Linux and Mac
 function sed_runner() {
@@ -33,7 +90,7 @@ function sed_runner() {
 
 # Centralized version file update
 echo "${NEXT_FULL_TAG}" > VERSION
-echo "branch-${NEXT_SHORT_TAG}" > RAPIDS_BRANCH
+echo "${RAPIDS_BRANCH_NAME}" > RAPIDS_BRANCH
 
 DEPENDENCIES=(
   kvikio
@@ -52,8 +109,8 @@ done
 
 # CI files
 for FILE in .github/workflows/*.yaml; do
-  sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
-  sed_runner "s/:[0-9]*\\.[0-9]*-/:${NEXT_SHORT_TAG}-/g" "${FILE}"
+  sed_runner "/shared-workflows/ s|@.*|@${RAPIDS_BRANCH_NAME}|g" "${FILE}"
+  sed_runner "s|:[0-9]*\\.[0-9]*-|:${NEXT_SHORT_TAG}-|g" "${FILE}"
done
 
 # .devcontainer files
@@ -63,7 +120,16 @@ find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r
   sed_runner "s@rapids-\${localWorkspaceFolderBasename}-[0-9.]*@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "${filename}"
 done
 
-# The example of a downstream project
+# Update downstream example GIT_TAG based on context
+if [[ "${RUN_CONTEXT}" == "main" ]]; then
+  # In main context, convert any release/X.Y references to main
+  sed_runner "s|GIT_TAG release/[^[:space:]]*|GIT_TAG main|g" "cpp/examples/downstream/cmake/get_kvikio.cmake"
+elif [[ "${RUN_CONTEXT}" == "release" ]]; then
+  # In release context, convert main to release/X.Y
+  sed_runner "s|GIT_TAG main|GIT_TAG release/${NEXT_SHORT_TAG}|g" "cpp/examples/downstream/cmake/get_kvikio.cmake"
+fi
+
+# The example of a downstream project - update version number
 sed_runner "s/find_and_configure_kvikio(.*)/find_and_configure_kvikio(\"${NEXT_SHORT_TAG}\")/g" "cpp/examples/downstream/cmake/get_kvikio.cmake"
 
 # Java files
diff --git a/cpp/examples/downstream/cmake/get_kvikio.cmake b/cpp/examples/downstream/cmake/get_kvikio.cmake
index 66fbfe3a61..7a257e7f0b 100644
--- a/cpp/examples/downstream/cmake/get_kvikio.cmake
+++ b/cpp/examples/downstream/cmake/get_kvikio.cmake
@@ -13,7 +13,7 @@ function(find_and_configure_kvikio MIN_VERSION)
     VERSION ${MIN_VERSION}
     GIT_REPOSITORY https://github.com/rapidsai/kvikio.git
-    GIT_TAG branch-${MIN_VERSION}
+    GIT_TAG main
     GIT_SHALLOW TRUE
 
     SOURCE_SUBDIR

From 99a065d6d87e34e5a9d71f40ecf30ee43dc0e8ed Mon Sep 17 00:00:00 2001
From: jakirkham
Date: Mon, 10 Nov 2025 16:51:49 -0800
Subject: [PATCH 098/103] Pin Cython pre-3.2.0 and PyTest pre-9 (#869)

Recently Cython 3.2.0 was released and we have seen a few subtle issues building with it. While we work out these issues, this pins to Cython 3.1, which we know to be working for us.

Similarly PyTest 9 was recently released, but we have run into some issues with it as well. So pin to PyTest 8 while we work through PyTest 9 issues.
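The effective bounds, written here as pip-style specifiers (the conda pins in this diff carry the same constraints, and the diff additionally bounds `zarr<3.2.0a0`):

```
cython>=3.0.0,<3.2.0a0
pytest<9.0.0a0
```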
Authors: - https://github.com/jakirkham Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/869 --- conda/environments/all_cuda-129_arch-aarch64.yaml | 6 +++--- conda/environments/all_cuda-129_arch-x86_64.yaml | 6 +++--- conda/environments/all_cuda-130_arch-aarch64.yaml | 6 +++--- conda/environments/all_cuda-130_arch-x86_64.yaml | 6 +++--- conda/recipes/kvikio/recipe.yaml | 2 +- dependencies.yaml | 10 +++++----- python/kvikio/pyproject.toml | 8 ++++---- 7 files changed, 22 insertions(+), 22 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index d2c1dd9041..d5ba9548a9 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -13,7 +13,7 @@ dependencies: - cuda-version=12.9 - cupy>=13.6.0 - cxx-compiler -- cython>=3.0.0 +- cython>=3.0.0,<3.2.0a0 - doxygen=1.9.1 - gcc_linux-aarch64=14.* - libcufile-dev @@ -25,10 +25,10 @@ dependencies: - numpydoc - packaging - pre-commit -- pytest - pytest-asyncio - pytest-cov - pytest-timeout +- pytest<9.0.0a0 - python>=3.10,<3.14 - rangehttpserver - rapids-build-backend>=0.4.0,<0.5.0.dev0 @@ -38,5 +38,5 @@ dependencies: - sphinx-click - sphinx_rtd_theme - sysroot_linux-aarch64=2.28 -- zarr>=3.0.0,<4.0.0 +- zarr>=3.0.0,<3.2.0a0,<4.0.0 name: all_cuda-129_arch-aarch64 diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 9b3292a72c..b1a2bb38d5 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -13,7 +13,7 @@ dependencies: - cuda-version=12.9 - cupy>=13.6.0 - cxx-compiler -- cython>=3.0.0 +- cython>=3.0.0,<3.2.0a0 - doxygen=1.9.1 - gcc_linux-64=14.* - libcufile-dev @@ -25,10 +25,10 @@ dependencies: - numpydoc - packaging - pre-commit -- pytest - pytest-asyncio - pytest-cov - pytest-timeout +- pytest<9.0.0a0 - python>=3.10,<3.14 - rangehttpserver - rapids-build-backend>=0.4.0,<0.5.0.dev0 @@ -38,5 +38,5 @@ dependencies: - sphinx-click - sphinx_rtd_theme - sysroot_linux-64=2.28 -- zarr>=3.0.0,<4.0.0 +- zarr>=3.0.0,<3.2.0a0,<4.0.0 name: all_cuda-129_arch-x86_64 diff --git a/conda/environments/all_cuda-130_arch-aarch64.yaml b/conda/environments/all_cuda-130_arch-aarch64.yaml index d1db3c2e9a..551b24281f 100644 --- a/conda/environments/all_cuda-130_arch-aarch64.yaml +++ b/conda/environments/all_cuda-130_arch-aarch64.yaml @@ -13,7 +13,7 @@ dependencies: - cuda-version=13.0 - cupy>=13.6.0 - cxx-compiler -- cython>=3.0.0 +- cython>=3.0.0,<3.2.0a0 - doxygen=1.9.1 - gcc_linux-aarch64=14.* - libcufile-dev @@ -25,10 +25,10 @@ dependencies: - numpydoc - packaging - pre-commit -- pytest - pytest-asyncio - pytest-cov - pytest-timeout +- pytest<9.0.0a0 - python>=3.10,<3.14 - rangehttpserver - rapids-build-backend>=0.4.0,<0.5.0.dev0 @@ -38,5 +38,5 @@ dependencies: - sphinx-click - sphinx_rtd_theme - sysroot_linux-aarch64=2.28 -- zarr>=3.0.0,<4.0.0 +- zarr>=3.0.0,<3.2.0a0,<4.0.0 name: all_cuda-130_arch-aarch64 diff --git a/conda/environments/all_cuda-130_arch-x86_64.yaml b/conda/environments/all_cuda-130_arch-x86_64.yaml index f37bf67268..419d29ed31 100644 --- a/conda/environments/all_cuda-130_arch-x86_64.yaml +++ b/conda/environments/all_cuda-130_arch-x86_64.yaml @@ -13,7 +13,7 @@ dependencies: - cuda-version=13.0 - cupy>=13.6.0 - cxx-compiler -- cython>=3.0.0 +- cython>=3.0.0,<3.2.0a0 - doxygen=1.9.1 - gcc_linux-64=14.* - libcufile-dev @@ -25,10 +25,10 @@ 
dependencies: - numpydoc - packaging - pre-commit -- pytest - pytest-asyncio - pytest-cov - pytest-timeout +- pytest<9.0.0a0 - python>=3.10,<3.14 - rangehttpserver - rapids-build-backend>=0.4.0,<0.5.0.dev0 @@ -38,5 +38,5 @@ dependencies: - sphinx-click - sphinx_rtd_theme - sysroot_linux-64=2.28 -- zarr>=3.0.0,<4.0.0 +- zarr>=3.0.0,<3.2.0a0,<4.0.0 name: all_cuda-130_arch-x86_64 diff --git a/conda/recipes/kvikio/recipe.yaml b/conda/recipes/kvikio/recipe.yaml index 48dafb21cf..798c7d7dce 100644 --- a/conda/recipes/kvikio/recipe.yaml +++ b/conda/recipes/kvikio/recipe.yaml @@ -64,7 +64,7 @@ requirements: - ${{ stdlib("c") }} host: - cuda-version =${{ cuda_version }} - - cython >=3.0.0 + - cython >=3.0.0,<3.2.0a0 - libcurl ${{ libcurl_version }} - libkvikio =${{ version }} - pip diff --git a/dependencies.yaml b/dependencies.yaml index 0ab4199c8a..bed9980785 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -170,7 +170,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - cython>=3.0.0 + - cython>=3.0.0,<3.2.0a0 checks: common: - output_types: [conda, requirements] @@ -259,7 +259,7 @@ dependencies: - output_types: [conda, requirements] packages: - numpydoc - - zarr>=3.0.0,<4.0.0 + - zarr>=3.0.0,<3.2.0a0,<4.0.0 - sphinx - sphinx-click - sphinx_rtd_theme @@ -310,7 +310,7 @@ dependencies: common: - output_types: [requirements, pyproject] packages: - - "zarr>=3.0.0,<4.0.0; python_version >= '3.11'" + - "zarr>=3.0.0,<3.2.0a0,<4.0.0; python_version >= '3.11'" test_libkvikio: common: - output_types: conda @@ -333,7 +333,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - rapids-dask-dependency==25.12.*,>=0.0.0a0 - - pytest + - pytest<9.0.0a0 - pytest-asyncio - pytest-cov - pytest-timeout @@ -352,7 +352,7 @@ dependencies: - matrix: py: "3.1[123]" packages: - - zarr>=3.0.0,<4.0.0 + - zarr>=3.0.0,<3.2.0a0,<4.0.0 - matrix: packages: - output_types: [conda, requirements, pyproject] diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index 2a2d4c416b..2551b8641f 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -41,16 +41,16 @@ test = [ "boto3>=1.21.21", "cuda-python>=13.0.1,<14.0a0", "moto[server]>=4.0.8", - "pytest", "pytest-asyncio", "pytest-cov", "pytest-timeout", + "pytest<9.0.0a0", "rangehttpserver", "rapids-dask-dependency==25.12.*,>=0.0.0a0", - "zarr>=3.0.0,<4.0.0; python_version >= '3.11'", + "zarr>=3.0.0,<3.2.0a0,<4.0.0; python_version >= '3.11'", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. zarr = [ - "zarr>=3.0.0,<4.0.0; python_version >= '3.11'", + "zarr>=3.0.0,<3.2.0a0,<4.0.0; python_version >= '3.11'", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] @@ -98,7 +98,7 @@ dependencies-file = "../../dependencies.yaml" matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" requires = [ "cmake>=3.30.4", - "cython>=3.0.0", + "cython>=3.0.0,<3.2.0a0", "libkvikio==25.12.*,>=0.0.0a0", "ninja", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
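For reference, the regeneration step these generated-list comments point to is roughly:

```sh
# from the repository root; rewrites the conda envs, requirements, and pyproject lists
pip install rapids-dependency-file-generator
rapids-dependency-file-generator
```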
From 1604348b0184e85933eac3633782d20878de116f Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 14 Nov 2025 15:05:31 -0500 Subject: [PATCH 099/103] Add opportunistic Direct I/O support to POSIX backend (#863) This PR significantly improves POSIX I/O write performance as well as cold-page-cache read by opportunistically using Direct I/O. The speedup for sequential write is approximately 3~4x. The opportunistic POSIX Direct I/O feature can be controlled in two ways: - Environment variables: - `"KVIKIO_AUTO_DIRECT_IO_READ"`: defaults to `false`. - `"KVIKIO_AUTO_DIRECT_IO_WRITE"`: defaults to `true`. - C++/Python API - `defaults::set_auto_direct_io_read(bool flag)`/`kvikio.defaults.set("posix_auto_direct_io_read", flag)` - `defaults::set_auto_direct_io_write(bool flag)`/`kvikio.defaults.set("posix_auto_direct_io_write", flag)` In addition, this PR refactors the bounce buffer class. To improve clarity, relevant classes and variables have been renamed and a lot of comments added. The bounce buffer class is now templated by allocator to accommodate different use cases: - `PageAlignedBounceBufferPool`: used for Direct I/O to/from unaligned host buffer. Does not require CUDA context. - `CudaPinnedBounceBufferPool`: used for buffered I/O to/from device buffer. Requires CUDA context. This is the original implementation on the main branch. - `CudaPageAlignedPinnedBounceBufferPool`: used for Direct I/O to/from device buffer. Requires CUDA context. ## Performance results See https://github.com/rapidsai/kvikio/pull/863#issuecomment-3479016816 ## Goal - Addresses most part of https://github.com/rapidsai/kvikio/issues/761 - Addresses the reported write performance issue in cudf ## Non-goal - This PR does not add opportunistic Direct I/O as file handle's function parameters. This will be revisited in a future PR. - This PR does not address one of the objectives in #520, which is to unify the implementation of the bounce buffer in POSIX IO and in Remote IO. This will be revisited in a future PR. Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/kvikio/pull/863 --- cpp/include/kvikio/bounce_buffer.hpp | 216 ++++++++++++++++--- cpp/include/kvikio/defaults.hpp | 38 ++++ cpp/include/kvikio/detail/posix_io.hpp | 245 ++++++++++++++++------ cpp/include/kvikio/detail/utils.hpp | 26 ++- cpp/include/kvikio/file_handle.hpp | 10 + cpp/include/kvikio/shim/cuda.hpp | 2 + cpp/src/bounce_buffer.cpp | 140 +++++++++---- cpp/src/compat_mode_manager.cpp | 15 +- cpp/src/defaults.cpp | 14 +- cpp/src/detail/posix_io.cpp | 30 ++- cpp/src/file_handle.cpp | 17 +- cpp/src/mmap.cpp | 9 +- cpp/src/remote_handle.cpp | 13 +- cpp/src/shim/cuda.cpp | 2 + cpp/src/utils.cpp | 30 ++- cpp/tests/CMakeLists.txt | 2 +- cpp/tests/test_basic_io.cpp | 138 ++++++++++++ cpp/tests/utils/utils.hpp | 62 ++++++ docs/source/runtime_settings.rst | 65 +++++- python/kvikio/kvikio/_lib/buffer.pyx | 13 +- python/kvikio/kvikio/_lib/defaults.pyx | 32 +++ python/kvikio/kvikio/_lib/file_handle.pyx | 8 + python/kvikio/kvikio/cufile.py | 15 ++ python/kvikio/kvikio/defaults.py | 4 + 24 files changed, 969 insertions(+), 177 deletions(-) diff --git a/cpp/include/kvikio/bounce_buffer.hpp b/cpp/include/kvikio/bounce_buffer.hpp index f4e37d8f10..8b7b45c2e9 100644 --- a/cpp/include/kvikio/bounce_buffer.hpp +++ b/cpp/include/kvikio/bounce_buffer.hpp @@ -11,48 +11,153 @@ namespace kvikio { /** - * @brief Singleton class to retain host memory allocations + * @brief Allocator for page-aligned host memory * - * Call `AllocRetain::get` to get an allocation that will be retained when it + * Uses std::aligned_alloc to allocate host memory aligned to page boundaries (typically 4096 + * bytes). This allocator is suitable for Direct I/O operations that require page-aligned buffers + * but do not need CUDA context (i.e., host-to-host transfers only). + */ +class PageAlignedAllocator { + public: + /** + * @brief Allocate page-aligned host memory + * + * @param size Requested size in bytes (will be rounded up to page boundary) + * @return Pointer to allocated memory + */ + void* allocate(std::size_t size); + + /** + * @brief Deallocate memory previously allocated by this allocator + * + * @param buffer Pointer to memory to deallocate + * @param size Size of the allocation (unused, for interface consistency) + */ + void deallocate(void* buffer, std::size_t size); +}; + +/** + * @brief Allocator for CUDA pinned host memory + * + * Uses cudaMemHostAlloc to allocate pinned (page-locked) host memory that can be efficiently + * transferred to/from GPU device memory. The allocation is only guaranteed to be aligned to "at + * least 256 bytes". It is NOT guaranteed to be page aligned. + * + * @note Do NOT use with Direct I/O - lacks page alignment guarantee + */ +class CudaPinnedAllocator { + public: + /** + * @brief Allocate CUDA pinned host memory + * + * @param size Requested size in bytes + * @return Pointer to allocated pinned memory + */ + void* allocate(std::size_t size); + + /** + * @brief Deallocate memory previously allocated by this allocator + * + * @param buffer Pointer to memory to deallocate + * @param size Size of the allocation (unused, for interface consistency) + */ + void deallocate(void* buffer, std::size_t size); +}; + +/** + * @brief Allocator for page-aligned AND CUDA-registered pinned host memory + * + * Combines the benefits of both page alignment (for Direct I/O) and CUDA registration + * (for efficient host-device transfers). 
Uses std::aligned_alloc followed by + * cudaMemHostRegister to achieve both properties. + * + * @note This is the required allocator for Direct I/O with device memory. Requires a valid CUDA + * context when allocating. + */ +class CudaPageAlignedPinnedAllocator { + public: + /** + * @brief Allocate page-aligned CUDA-registered pinned host memory + * + * @param size Requested size in bytes (will be rounded up to page boundary) + * @return Pointer to allocated memory + */ + void* allocate(std::size_t size); + + /** + * @brief Deallocate memory previously allocated by this allocator + * + * @param buffer Pointer to memory to deallocate + * @param size Size of the allocation (unused, for interface consistency) + */ + void deallocate(void* buffer, std::size_t size); +}; + +/** + * @brief Thread-safe singleton pool for reusable bounce buffers + * + * Manages a pool of host memory buffers used for staging data during I/O operations. + * Buffers are retained and reused across calls to minimize allocation overhead. + * The pool uses a LIFO (stack) allocation strategy optimized for cache locality. + * + * All buffers in the pool have the same size, controlled by `defaults::bounce_buffer_size()`. If + * the buffer size changes, all cached buffers are cleared and reallocated at the new size. + * + * Call `BounceBufferPool::get` to get an allocation that will be retained when it * goes out of scope (RAII). The size of all retained allocations are the same. + * + * @tparam Allocator The allocator policy that determines buffer properties: + * - PageAlignedAllocator: For host-only Direct I/O + * - CudaPinnedAllocator: For device I/O without Direct I/O + * - CudaPageAlignedPinnedAllocator: For device I/O with Direct I/O + * + * @note The destructor intentionally leaks allocations to avoid CUDA cleanup issues when static + * destructors run after CUDA context destruction */ -class AllocRetain { +template +class BounceBufferPool { private: std::mutex _mutex{}; - // Stack of free allocations - std::stack _free_allocs{}; - // The size of each allocation in `_free_allocs` - std::size_t _size{defaults::bounce_buffer_size()}; + // Stack of free allocations (LIFO for cache locality) + std::stack _free_buffers{}; + // The size of each allocation in `_free_buffers` + std::size_t _buffer_size{defaults::bounce_buffer_size()}; + Allocator _allocator{}; public: /** - * @brief An host memory allocation + * @brief RAII wrapper for a host bounce buffer allocation + * + * Automatically returns the buffer to the pool when destroyed (RAII pattern). Provides access to + * the underlying memory and its size. 
+ * + * @note Non-copyable and non-movable to ensure single ownership */ - class Alloc { + class Buffer { private: - AllocRetain* _manager; - void* _alloc; + BounceBufferPool* _pool; + void* _buffer; std::size_t const _size; public: - Alloc(AllocRetain* manager, void* alloc, std::size_t size); - Alloc(Alloc const&) = delete; - Alloc& operator=(Alloc const&) = delete; - Alloc(Alloc&& o) = delete; - Alloc& operator=(Alloc&& o) = delete; - ~Alloc() noexcept; + Buffer(BounceBufferPool* pool, void* buffer, std::size_t size); + Buffer(Buffer const&) = delete; + Buffer& operator=(Buffer const&) = delete; + Buffer(Buffer&& o) = delete; + Buffer& operator=(Buffer&& o) = delete; + ~Buffer() noexcept; void* get() noexcept; void* get(std::ptrdiff_t offset) noexcept; std::size_t size() noexcept; }; - AllocRetain() = default; + BounceBufferPool() = default; // Notice, we do not clear the allocations at destruction thus the allocations leaks - // at exit. We do this because `AllocRetain::instance()` stores the allocations in a + // at exit. We do this because `BounceBufferPool::instance()` stores the allocations in a // static stack that are destructed below main, which is not allowed in CUDA: // - ~AllocRetain() noexcept = default; + ~BounceBufferPool() noexcept = default; private: /** @@ -67,28 +172,81 @@ class AllocRetain { /** * @brief Ensure the sizes of the retained allocations match `defaults::bounce_buffer_size()` * + * If the configured bounce buffer size has changed, clears all cached buffers so new allocations + * will use the updated size. + * * NB: `_mutex` must be taken prior to calling this function. */ - void _ensure_alloc_size(); + void _ensure_buffer_size(); public: - [[nodiscard]] Alloc get(); + /** + * @brief Acquire a bounce buffer from the pool + * + * Returns a cached buffer if available, otherwise allocates a new one. The returned Buffer object + * will automatically return the buffer to the pool when it goes out of scope. + * + * @return RAII Buffer object wrapping the allocated memory + * @exception CudaError if allocation fails (e.g., invalid CUDA context for pinned allocators) + */ + [[nodiscard]] Buffer get(); - void put(void* alloc, std::size_t size); + /** + * @brief Return a buffer to the pool for reuse + * + * Typically called automatically by Buffer's destructor. Only adds the buffer to the pool if its + * size matches the current pool buffer size; otherwise the buffer is deallocated immediately. + * + * @param buffer Pointer to memory to return + * @param size Size of the buffer in bytes + */ + void put(void* buffer, std::size_t size); /** - * @brief Free all retained allocations + * @brief Free all retained allocations in the pool + * + * Clears the pool and deallocates all cached buffers. Useful for reclaiming memory when bounce + * buffers are no longer needed. * * @return The number of bytes cleared */ std::size_t clear(); - KVIKIO_EXPORT static AllocRetain& instance(); + /** + * @brief Get the singleton instance of the pool + * + * Each template instantiation (different Allocator) has its own singleton instance. 
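As background for the per-allocator singleton described above: a function-local `static` inside a class template is constructed once per template instantiation, so each allocator policy gets its own pool. A minimal standalone sketch of that mechanism (illustrative names only, not KvikIO code):

```cpp
#include <iostream>

// One singleton per template argument: Pool<A>::instance() and
// Pool<B>::instance() refer to two distinct objects with distinct storage.
template <typename Allocator>
class Pool {
 public:
  static Pool& instance()
  {
    static Pool p;  // constructed on first use; initialization is thread-safe since C++11
    return p;
  }

 private:
  Pool() = default;
};

struct AllocA {};
struct AllocB {};

int main()
{
  void const* a = &Pool<AllocA>::instance();
  void const* b = &Pool<AllocB>::instance();
  std::cout << std::boolalpha << (a != b) << '\n';  // prints: true
}
```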
+ * + * @return Reference to the singleton pool instance + */ + KVIKIO_EXPORT static BounceBufferPool& instance(); - AllocRetain(AllocRetain const&) = delete; - AllocRetain& operator=(AllocRetain const&) = delete; - AllocRetain(AllocRetain&& o) = delete; - AllocRetain& operator=(AllocRetain&& o) = delete; + BounceBufferPool(BounceBufferPool const&) = delete; + BounceBufferPool& operator=(BounceBufferPool const&) = delete; + BounceBufferPool(BounceBufferPool&& o) = delete; + BounceBufferPool& operator=(BounceBufferPool&& o) = delete; }; +/** + * @brief Bounce buffer pool using page-aligned host memory + * + * Use for: Host-only Direct I/O operations (no CUDA context involvement) + */ +using PageAlignedBounceBufferPool = BounceBufferPool; + +/** + * @brief Bounce buffer pool using CUDA pinned memory + * + * Use for: Device I/O operations without Direct I/O + * Note: Not page-aligned - cannot be used with Direct I/O + */ +using CudaPinnedBounceBufferPool = BounceBufferPool; + +/** + * @brief Bounce buffer pool using page-aligned CUDA-registered pinned memory + * + * Use for: Device I/O operations with Direct I/O enabled + * Provides both page alignment (for Direct I/O) and CUDA registration (for efficient transfers) + */ +using CudaPageAlignedPinnedBounceBufferPool = BounceBufferPool; } // namespace kvikio diff --git a/cpp/include/kvikio/defaults.hpp b/cpp/include/kvikio/defaults.hpp index 53254dda21..722986c362 100644 --- a/cpp/include/kvikio/defaults.hpp +++ b/cpp/include/kvikio/defaults.hpp @@ -119,6 +119,8 @@ class defaults { std::size_t _http_max_attempts; long _http_timeout; std::vector _http_status_codes; + bool _auto_direct_io_read; + bool _auto_direct_io_write; static unsigned int get_num_threads_from_env(); @@ -356,6 +358,42 @@ class defaults { * @param status_codes The HTTP status codes to retry. */ static void set_http_status_codes(std::vector status_codes); + + /** + * @brief Check if Direct I/O is enabled for POSIX reads + * + * Returns true if KvikIO should attempt to use Direct I/O (O_DIRECT) for POSIX read operations. + * + * @return Boolean answer + */ + static bool auto_direct_io_read(); + + /** + * @brief Enable or disable Direct I/O for POSIX reads + * + * Controls whether KvikIO should attempt to use Direct I/O (O_DIRECT) for POSIX read operations. + * + * @param flag true to enable opportunistic Direct I/O reads, false to disable + */ + static void set_auto_direct_io_read(bool flag); + + /** + * @brief Check if Direct I/O is enabled for POSIX writes + * + * Returns true if KvikIO should attempt to use Direct I/O (O_DIRECT) for POSIX write operations. + * + * @return Boolean answer + */ + static bool auto_direct_io_write(); + + /** + * @brief Enable or disable Direct I/O for POSIX writes + * + * Controls whether KvikIO should attempt to use Direct I/O (O_DIRECT) for POSIX write operations. 
+ * + * @param flag true to enable opportunistic Direct I/O writes, false to disable + */ + static void set_auto_direct_io_write(bool flag); }; } // namespace kvikio diff --git a/cpp/include/kvikio/detail/posix_io.hpp b/cpp/include/kvikio/detail/posix_io.hpp index 4d26f553bb..2997337906 100644 --- a/cpp/include/kvikio/detail/posix_io.hpp +++ b/cpp/include/kvikio/detail/posix_io.hpp @@ -9,9 +9,11 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -66,91 +68,197 @@ class StreamsByThread { }; /** - * @brief Read or write host memory to or from disk using POSIX + * @brief Read or write host memory to or from disk using POSIX with opportunistic Direct I/O * - * @tparam Operation Whether the operation is a read or a write. - * @tparam PartialIOStatus Whether all requested data are processed or not. If `FULL`, all of - * `count` bytes are read or written. - * @param fd File descriptor - * @param buf Buffer to write - * @param count Number of bytes to write - * @param offset File offset - * @return The number of bytes read or written (always gather than zero) + * This function attempts to use Direct I/O (O_DIRECT) when alignment requirements are satisfied, + * and automatically falls back to buffered I/O when they cannot be met. Direct I/O requires: + * - File offset aligned to page boundary + * - Buffer address aligned to page boundary + * - Transfer size as a multiple of page size + * + * The implementation handles partial alignment by breaking the I/O into segments: + * - Unaligned prefix (if offset not page-aligned): uses buffered I/O to reach page boundary + * - Aligned middle section: uses Direct I/O with bounce buffer if needed + * - Unaligned suffix (if remaining bytes < page size): uses buffered I/O + * + * @tparam Operation Whether the operation is a read or a write + * @tparam PartialIOStatus If PartialIO::YES, returns after first successful I/O. 
If PartialIO::NO, + * loops until all `count` bytes are processed + * @tparam BounceBufferPoolType Pool type for acquiring page-aligned bounce buffers when the user + * buffer is not page-aligned (defaults to PageAlignedBounceBufferPool) + * @param fd_direct_off File descriptor opened without O_DIRECT (always valid) + * @param buf Buffer to read into or write from + * @param count Number of bytes to transfer + * @param offset File offset in bytes + * @param fd_direct_on File descriptor opened with O_DIRECT, or -1 to disable Direct I/O attempts + * @return Number of bytes read or written (always greater than zero) */ -template -ssize_t posix_host_io(int fd, void const* buf, size_t count, off_t offset) +template +ssize_t posix_host_io( + int fd_direct_off, void const* buf, size_t count, off_t offset, int fd_direct_on = -1) { - off_t cur_offset = offset; - size_t byte_remaining = count; - char* buffer = const_cast(static_cast(buf)); - while (byte_remaining > 0) { - ssize_t nbytes = 0; + auto pread_or_write = [](int fd, void* buf, size_t count, off_t offset) -> ssize_t { + ssize_t nbytes{}; if constexpr (Operation == IOOperationType::READ) { - nbytes = ::pread(fd, buffer, byte_remaining, cur_offset); + nbytes = ::pread(fd, buf, count, offset); } else { - nbytes = ::pwrite(fd, buffer, byte_remaining, cur_offset); + nbytes = ::pwrite(fd, buf, count, offset); + } + return nbytes; + }; + + off_t cur_offset = offset; + size_t bytes_remaining = count; + char* buffer = const_cast(static_cast(buf)); + auto const page_size = get_page_size(); + + // Process all bytes in a loop (unless PartialIO::YES returns early) + while (bytes_remaining > 0) { + ssize_t nbytes_processed{}; + + if (fd_direct_on == -1) { + // Direct I/O disabled: use buffered I/O for entire transfer + nbytes_processed = pread_or_write(fd_direct_off, buffer, bytes_remaining, cur_offset); + } else { + // Direct I/O enabled: attempt to use it when alignment allows + auto const is_cur_offset_aligned = detail::is_aligned(cur_offset, page_size); + + if (!is_cur_offset_aligned) { + // Handle unaligned prefix: use buffered I/O to reach next page boundary + // This ensures subsequent iterations will have page-aligned offsets + auto const aligned_cur_offset = detail::align_up(cur_offset, page_size); + auto const bytes_requested = std::min(aligned_cur_offset - cur_offset, bytes_remaining); + nbytes_processed = pread_or_write(fd_direct_off, buffer, bytes_requested, cur_offset); + } else { + if (bytes_remaining < page_size) { + // Handle unaligned suffix: remaining bytes are less than a page, use buffered I/O + nbytes_processed = pread_or_write(fd_direct_off, buffer, bytes_remaining, cur_offset); + } else { + // Offset is page-aligned. 
Now make transfer size page-aligned too by rounding down + auto aligned_bytes_remaining = detail::align_down(bytes_remaining, page_size); + auto const is_buf_aligned = detail::is_aligned(buffer, page_size); + auto bytes_requested = aligned_bytes_remaining; + + if (!is_buf_aligned) { + // Buffer not page-aligned: use bounce buffer for Direct I/O + auto bounce_buffer = BounceBufferPoolType::instance().get(); + auto* aligned_buf = bounce_buffer.get(); + // Limit transfer size to bounce buffer capacity + bytes_requested = std::min(bytes_requested, bounce_buffer.size()); + + if constexpr (Operation == IOOperationType::WRITE) { + // Copy user data to aligned bounce buffer before Direct I/O write + std::memcpy(aligned_buf, buffer, bytes_requested); + } + + // Perform Direct I/O using the bounce buffer + nbytes_processed = + pread_or_write(fd_direct_on, aligned_buf, bytes_requested, cur_offset); + + if constexpr (Operation == IOOperationType::READ) { + // Copy data from bounce buffer to user buffer after Direct I/O read + std::memcpy(buffer, aligned_buf, nbytes_processed); + } + } else { + // Buffer is page-aligned: perform Direct I/O directly with user buffer + nbytes_processed = pread_or_write(fd_direct_on, buffer, bytes_requested, cur_offset); + } + } + } } - if (nbytes == -1) { + + // Error handling + if (nbytes_processed == -1) { std::string const name = (Operation == IOOperationType::READ) ? "pread" : "pwrite"; KVIKIO_EXPECT(errno != EBADF, "POSIX error: Operation not permitted"); KVIKIO_FAIL("POSIX error on " + name + ": " + strerror(errno)); } if constexpr (Operation == IOOperationType::READ) { - KVIKIO_EXPECT(nbytes != 0, "POSIX error on pread: EOF"); + KVIKIO_EXPECT(nbytes_processed != 0, "POSIX error on pread: EOF"); } - if constexpr (PartialIOStatus == PartialIO::YES) { return nbytes; } - buffer += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - cur_offset += nbytes; - byte_remaining -= nbytes; + + // Return early if partial I/O is allowed + if constexpr (PartialIOStatus == PartialIO::YES) { return nbytes_processed; } + + // Advance to next segment + buffer += nbytes_processed; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) + cur_offset += nbytes_processed; + bytes_remaining -= nbytes_processed; } + return convert_size2ssize(count); } /** - * @brief Read or write device memory to or from disk using POSIX + * @brief Read or write device memory to or from disk using POSIX with opportunistic Direct I/O * - * @tparam Operation Whether the operation is a read or a write. - * @param fd File descriptor - * @param devPtr_base Device pointer to read or write to. - * @param size Number of bytes to read or write. - * @param file_offset Byte offset to the start of the file. - * @param devPtr_offset Byte offset to the start of the device pointer. - * @return Number of bytes read or written. + * This function transfers data between GPU device memory and files by staging through a host bounce + * buffer. Since without GDS Direct I/O cannot be performed directly with device memory, the + * operation is split into stages: + * - For reads: File --> Host bounce buffer (with Direct I/O if aligned) --> Device memory + * - For writes: Device memory --> Host bounce buffer --> File (with Direct I/O if aligned) + * + * The underlying file I/O uses `posix_host_io` which opportunistically attempts Direct I/O when + * alignment requirements are satisfied. 
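To make the prefix/middle/suffix segmentation above concrete, here is a standalone sketch of the offset arithmetic, assuming a 4096-byte page. It re-implements `align_up`/`align_down` locally for illustration and ignores the buffer-address dimension, which the real code handles via bounce buffers. For offset 5000 and 20000 bytes, it yields a 3192-byte buffered prefix, a 16384-byte Direct I/O middle, and a 424-byte buffered suffix.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>

// Local stand-ins for kvikio::detail::align_up / align_down (alignment must be a power of 2).
constexpr std::size_t align_up(std::size_t v, std::size_t a) { return (v + a - 1) & ~(a - 1); }
constexpr std::size_t align_down(std::size_t v, std::size_t a) { return v & ~(a - 1); }

int main()
{
  std::size_t const page = 4096;
  std::size_t offset    = 5000;   // not page-aligned
  std::size_t remaining = 20000;

  // 1) Unaligned prefix: buffered I/O up to the next page boundary.
  std::size_t const prefix = std::min(align_up(offset, page) - offset, remaining);
  offset += prefix; remaining -= prefix;

  // 2) Aligned middle: Direct I/O on whole pages only.
  std::size_t const middle = align_down(remaining, page);
  offset += middle; remaining -= middle;

  // 3) Unaligned suffix: buffered I/O for the sub-page tail.
  std::printf("prefix=%zu middle=%zu suffix=%zu\n", prefix, middle, remaining);
}
```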
+ * + * @tparam Operation Whether the operation is a read or a write + * @tparam BounceBufferPoolType Pool type for acquiring CUDA-registered bounce buffers (defaults to + * CudaPinnedBounceBufferPool) + * @param fd_direct_off File descriptor opened without O_DIRECT (always valid) + * @param devPtr_base Base device pointer for the transfer + * @param size Total number of bytes to transfer + * @param file_offset Byte offset from the start of the file + * @param devPtr_offset Byte offset from devPtr_base (allows working with sub-regions) + * @param fd_direct_on File descriptor opened with O_DIRECT, or -1 to disable Direct I/O attempts + * @return Total number of bytes read or written */ -template -std::size_t posix_device_io(int fd, +template +std::size_t posix_device_io(int fd_direct_off, void const* devPtr_base, std::size_t size, std::size_t file_offset, - std::size_t devPtr_offset) + std::size_t devPtr_offset, + int fd_direct_on = -1) { - auto alloc = AllocRetain::instance().get(); + // Direct I/O requires page-aligned bounce buffers. CudaPinnedBounceBufferPool uses + // cudaMemHostAlloc which does not guarantee page alignment. + if (std::is_same_v) { + KVIKIO_EXPECT( + fd_direct_on == -1, + "Direct I/O requires page-aligned bounce buffers. CudaPinnedBounceBufferPool does not " + "guarantee page alignment. Use CudaPageAlignedPinnedBounceBufferPool instead."); + } + + auto bounce_buffer = BounceBufferPoolType::instance().get(); CUdeviceptr devPtr = convert_void2deviceptr(devPtr_base) + devPtr_offset; off_t cur_file_offset = convert_size2off(file_offset); - off_t byte_remaining = convert_size2off(size); - off_t const chunk_size2 = convert_size2off(alloc.size()); + off_t bytes_remaining = convert_size2off(size); + off_t const chunk_size2 = convert_size2off(bounce_buffer.size()); // Get a stream for the current CUDA context and thread CUstream stream = StreamsByThread::get(); - while (byte_remaining > 0) { - off_t const nbytes_requested = std::min(chunk_size2, byte_remaining); + while (bytes_remaining > 0) { + off_t const nbytes_requested = std::min(chunk_size2, bytes_remaining); ssize_t nbytes_got = nbytes_requested; if constexpr (Operation == IOOperationType::READ) { nbytes_got = posix_host_io( - fd, alloc.get(), nbytes_requested, cur_file_offset); - CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync(devPtr, alloc.get(), nbytes_got, stream)); + fd_direct_off, bounce_buffer.get(), nbytes_requested, cur_file_offset, fd_direct_on); + CUDA_DRIVER_TRY( + cudaAPI::instance().MemcpyHtoDAsync(devPtr, bounce_buffer.get(), nbytes_got, stream)); CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream)); } else { // Is a write operation CUDA_DRIVER_TRY( - cudaAPI::instance().MemcpyDtoHAsync(alloc.get(), devPtr, nbytes_requested, stream)); + cudaAPI::instance().MemcpyDtoHAsync(bounce_buffer.get(), devPtr, nbytes_requested, stream)); CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream)); posix_host_io( - fd, alloc.get(), nbytes_requested, cur_file_offset); + fd_direct_off, bounce_buffer.get(), nbytes_requested, cur_file_offset, fd_direct_on); } cur_file_offset += nbytes_got; devPtr += nbytes_got; - byte_remaining -= nbytes_got; + bytes_remaining -= nbytes_got; } return size; } @@ -163,18 +271,24 @@ std::size_t posix_device_io(int fd, * * @tparam PartialIOStatus Whether all requested data are processed or not. If `FULL`, all of * `count` bytes are read. - * @param fd File descriptor + * @param fd_direct_off File descriptor without Direct I/O. 
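The chunked staging loop in `posix_device_io` above follows a common pattern. A simplified, CUDA-free sketch of the write path, where `std::memcpy` stands in for `MemcpyDtoHAsync` and a callback stands in for `posix_host_io` (all names here are illustrative, not KvikIO's API):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <vector>

// Stage a large "device" source through a fixed-size bounce buffer, one chunk at a time.
std::size_t staged_write(char const* dev_like_src,
                         std::size_t size,
                         std::size_t file_offset,
                         std::vector<char>& bounce,
                         std::size_t (*host_write)(void const*, std::size_t, std::size_t))
{
  std::size_t remaining  = size;
  std::size_t cur_offset = file_offset;
  while (remaining > 0) {
    std::size_t const n = std::min(bounce.size(), remaining);
    std::memcpy(bounce.data(), dev_like_src, n);  // device -> bounce (DtoH copy in the real code)
    host_write(bounce.data(), n, cur_offset);     // bounce -> file (posix_host_io in the real code)
    dev_like_src += n;
    cur_offset += n;
    remaining -= n;
  }
  return size;
}

int main()
{
  std::vector<char> bounce(8);  // deliberately tiny to force chunking
  std::vector<char> src(20, 'x');
  staged_write(src.data(), src.size(), 0, bounce,
               [](void const*, std::size_t n, std::size_t) { return n; });
}
```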
* @param buf Base address of buffer in host memory. * @param size Size in bytes to read. * @param file_offset Offset in the file to read from. + * @param fd_direct_on Optional file descriptor with Direct I/O. * @return Size of bytes that were successfully read. */ template -std::size_t posix_host_read(int fd, void* buf, std::size_t size, std::size_t file_offset) +std::size_t posix_host_read( + int fd_direct_off, void* buf, std::size_t size, std::size_t file_offset, int fd_direct_on = -1) { KVIKIO_NVTX_FUNC_RANGE(size); + + auto cur_fd_direct_on{-1}; + if (fd_direct_on != -1 && defaults::auto_direct_io_read()) { cur_fd_direct_on = fd_direct_on; } + return detail::posix_host_io( - fd, buf, size, convert_size2off(file_offset)); + fd_direct_off, buf, size, convert_size2off(file_offset), cur_fd_direct_on); } /** @@ -183,20 +297,29 @@ std::size_t posix_host_read(int fd, void* buf, std::size_t size, std::size_t fil * If `size` or `file_offset` isn't aligned with `page_size` then * `fd` cannot have been opened with the `O_DIRECT` flag. * - * @tparam ioDataCompletionLevel Whether all requested data are processed or not. If `FULL`, all of - * `count` bytes are written. - * @param fd File descriptor + * @tparam ioDataCompletionLevel Whether all requested data are processed or not. If `FULL`, all + * of `count` bytes are written. + * @param fd_direct_off File descriptor without Direct I/O. * @param buf Base address of buffer in host memory. * @param size Size in bytes to write. * @param file_offset Offset in the file to write to. + * @param fd_direct_on Optional file descriptor with Direct I/O. * @return Size of bytes that were successfully read. */ template -std::size_t posix_host_write(int fd, void const* buf, std::size_t size, std::size_t file_offset) +std::size_t posix_host_write(int fd_direct_off, + void const* buf, + std::size_t size, + std::size_t file_offset, + int fd_direct_on = -1) { KVIKIO_NVTX_FUNC_RANGE(size); + + auto cur_fd_direct_on{-1}; + if (fd_direct_on != -1 && defaults::auto_direct_io_write()) { cur_fd_direct_on = fd_direct_on; } + return detail::posix_host_io( - fd, buf, size, convert_size2off(file_offset)); + fd_direct_off, buf, size, convert_size2off(file_offset), cur_fd_direct_on); } /** @@ -205,18 +328,20 @@ std::size_t posix_host_write(int fd, void const* buf, std::size_t size, std::siz * If `size` or `file_offset` isn't aligned with `page_size` then * `fd` cannot have been opened with the `O_DIRECT` flag. * - * @param fd File descriptor + * @param fd_direct_off File descriptor without Direct I/O. * @param devPtr_base Base address of buffer in device memory. * @param size Size in bytes to read. * @param file_offset Offset in the file to read from. * @param devPtr_offset Offset relative to the `devPtr_base` pointer to read into. + * @param fd_direct_on Optional file descriptor with Direct I/O. * @return Size of bytes that were successfully read. */ -std::size_t posix_device_read(int fd, +std::size_t posix_device_read(int fd_direct_off, void const* devPtr_base, std::size_t size, std::size_t file_offset, - std::size_t devPtr_offset); + std::size_t devPtr_offset, + int fd_direct_on = -1); /** * @brief Write device memory to disk using POSIX @@ -224,17 +349,19 @@ std::size_t posix_device_read(int fd, * If `size` or `file_offset` isn't aligned with `page_size` then * `fd` cannot have been opened with the `O_DIRECT` flag. * - * @param fd File descriptor + * @param fd_direct_off File descriptor without Direct I/O. * @param devPtr_base Base address of buffer in device memory. 
* @param size Size in bytes to write. * @param file_offset Offset in the file to write to. * @param devPtr_offset Offset relative to the `devPtr_base` pointer to write into. + * @param fd_direct_on Optional file descriptor with Direct I/O. * @return Size of bytes that were successfully written. */ -std::size_t posix_device_write(int fd, +std::size_t posix_device_write(int fd_direct_off, void const* devPtr_base, std::size_t size, std::size_t file_offset, - std::size_t devPtr_offset); + std::size_t devPtr_offset, + int fd_direct_on = -1); } // namespace kvikio::detail diff --git a/cpp/include/kvikio/detail/utils.hpp b/cpp/include/kvikio/detail/utils.hpp index 32261ed676..ae81c03dc6 100644 --- a/cpp/include/kvikio/detail/utils.hpp +++ b/cpp/include/kvikio/detail/utils.hpp @@ -15,7 +15,7 @@ namespace kvikio::detail { * @param alignment Must be a power of 2 * @return Rounded result */ -[[nodiscard]] std::size_t align_up(std::size_t value, std::size_t alignment) noexcept; +[[nodiscard]] std::size_t align_up(std::size_t value, std::size_t alignment); /** * @brief Align the address `addr` up to the boundary of `alignment` @@ -24,7 +24,7 @@ namespace kvikio::detail { * @param alignment Must be a power of 2 * @return Aligned address */ -[[nodiscard]] void* align_up(void* addr, std::size_t alignment) noexcept; +[[nodiscard]] void* align_up(void* addr, std::size_t alignment); /** * @brief Round down `value` to multiples of `alignment` @@ -33,7 +33,7 @@ namespace kvikio::detail { * @param alignment Must be a power of 2 * @return Rounded result */ -[[nodiscard]] std::size_t align_down(std::size_t value, std::size_t alignment) noexcept; +[[nodiscard]] std::size_t align_down(std::size_t value, std::size_t alignment); /** * @brief Align the address `addr` down to the boundary of `alignment` @@ -42,6 +42,24 @@ namespace kvikio::detail { * @param alignment Must be a power of 2 * @return Aligned address */ -[[nodiscard]] void* align_down(void* addr, std::size_t alignment) noexcept; +[[nodiscard]] void* align_down(void* addr, std::size_t alignment); + +/** + * @brief Whether `value` is a multiple of `alignment` + * + * @param value Value to be checked + * @param alignment Must be a power of 2 + * @return Boolean answer + */ +bool is_aligned(std::size_t value, std::size_t alignment); + +/** + * @brief Whether the address `addr` is a multiple of `alignment` + * + * @param addr Address to be checked + * @param alignment Must be a power of 2 + * @return Boolean answer + */ +bool is_aligned(void* addr, std::size_t alignment); } // namespace kvikio::detail diff --git a/cpp/include/kvikio/file_handle.hpp b/cpp/include/kvikio/file_handle.hpp index cc71b58f6d..e74b8e3e20 100644 --- a/cpp/include/kvikio/file_handle.hpp +++ b/cpp/include/kvikio/file_handle.hpp @@ -430,6 +430,16 @@ class FileHandle { * @return The associated compatibility mode manager. */ const CompatModeManager& get_compat_mode_manager() const noexcept; + + /** + * @brief Whether Direct I/O is supported on this file handle. This is determined by two factors: + * - Direct I/O support from the operating system and the file system + * - KvikIO global setting `auto_direct_io_read` and `auto_direct_io_write`. If both values are + * false, Direct I/O will not be supported on this file handle. + * + * @return Boolean answer. 
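A usage sketch of this query together with the global toggles introduced by this patch. The file path is hypothetical; `FileHandle`, `pwrite`, `is_direct_io_supported`, and the `defaults` setters are the APIs added or changed above:

```cpp
#include <iostream>
#include <vector>

#include <kvikio/defaults.hpp>
#include <kvikio/file_handle.hpp>

int main()
{
  // Opportunistic Direct I/O for writes is on by default; enable it for reads too.
  kvikio::defaults::set_auto_direct_io_read(true);

  kvikio::FileHandle f("/tmp/kvikio_direct_io_demo.bin", "w");  // hypothetical path
  std::cout << std::boolalpha
            << "Direct I/O usable on this handle: " << f.is_direct_io_supported() << '\n';

  std::vector<char> buf(1 << 20, 42);
  auto fut = f.pwrite(buf.data(), buf.size());  // may mix buffered and Direct I/O segments
  std::cout << "bytes written: " << fut.get() << '\n';
}
```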
+ */ + bool is_direct_io_supported() const noexcept; }; } // namespace kvikio diff --git a/cpp/include/kvikio/shim/cuda.hpp b/cpp/include/kvikio/shim/cuda.hpp index 8ec968aa92..22c8276817 100644 --- a/cpp/include/kvikio/shim/cuda.hpp +++ b/cpp/include/kvikio/shim/cuda.hpp @@ -82,6 +82,8 @@ class cudaAPI { decltype(cuInit)* Init{nullptr}; decltype(cuMemHostAlloc)* MemHostAlloc{nullptr}; decltype(cuMemFreeHost)* MemFreeHost{nullptr}; + decltype(cuMemHostRegister)* MemHostRegister{nullptr}; + decltype(cuMemHostUnregister)* MemHostUnregister{nullptr}; decltype(cuMemcpyHtoDAsync)* MemcpyHtoDAsync{nullptr}; decltype(cuMemcpyDtoHAsync)* MemcpyDtoHAsync{nullptr}; diff --git a/cpp/src/bounce_buffer.cpp b/cpp/src/bounce_buffer.cpp index b40ae82ad2..d2f2c92b9c 100644 --- a/cpp/src/bounce_buffer.cpp +++ b/cpp/src/bounce_buffer.cpp @@ -9,106 +9,168 @@ #include #include #include +#include #include #include namespace kvikio { -AllocRetain::Alloc::Alloc(AllocRetain* manager, void* alloc, std::size_t size) - : _manager(manager), _alloc{alloc}, _size{size} +void* PageAlignedAllocator::allocate(std::size_t size) +{ + void* buffer{}; + auto const page_size = get_page_size(); + auto const aligned_size = detail::align_up(size, page_size); + buffer = std::aligned_alloc(page_size, aligned_size); + return buffer; +} + +void PageAlignedAllocator::deallocate(void* buffer, std::size_t /*size*/) { std::free(buffer); } + +void* CudaPinnedAllocator::allocate(std::size_t size) +{ + void* buffer{}; + + // If no available allocation, allocate and register a new one + // Allocate page-locked host memory + // Under unified addressing, host memory allocated this way is automatically portable and + // mapped. + CUDA_DRIVER_TRY(cudaAPI::instance().MemHostAlloc(&buffer, size, CU_MEMHOSTALLOC_PORTABLE)); + + return buffer; +} +void CudaPinnedAllocator::deallocate(void* buffer, std::size_t /*size*/) +{ + CUDA_DRIVER_TRY(cudaAPI::instance().MemFreeHost(buffer)); +} + +void* CudaPageAlignedPinnedAllocator::allocate(std::size_t size) +{ + void* buffer{}; + auto const page_size = get_page_size(); + auto const aligned_size = detail::align_up(size, page_size); + buffer = std::aligned_alloc(page_size, aligned_size); + KVIKIO_EXPECT(buffer != nullptr, "Aligned allocation failed"); + CUDA_DRIVER_TRY( + cudaAPI::instance().MemHostRegister(buffer, aligned_size, CU_MEMHOSTALLOC_PORTABLE)); + return buffer; +} + +void CudaPageAlignedPinnedAllocator::deallocate(void* buffer, std::size_t /*size*/) +{ + CUDA_DRIVER_TRY(cudaAPI::instance().MemHostUnregister(buffer)); + std::free(buffer); +} + +template +BounceBufferPool::Buffer::Buffer(BounceBufferPool* pool, + void* buffer, + std::size_t size) + : _pool(pool), _buffer{buffer}, _size{size} { - KVIKIO_NVTX_FUNC_RANGE(); } -AllocRetain::Alloc::~Alloc() noexcept +template +BounceBufferPool::Buffer::~Buffer() noexcept { KVIKIO_NVTX_FUNC_RANGE(); - _manager->put(_alloc, _size); + _pool->put(_buffer, _size); } -void* AllocRetain::Alloc::get() noexcept +template +void* BounceBufferPool::Buffer::get() noexcept { KVIKIO_NVTX_FUNC_RANGE(); - return _alloc; + return _buffer; } -void* AllocRetain::Alloc::get(std::ptrdiff_t offset) noexcept +template +void* BounceBufferPool::Buffer::get(std::ptrdiff_t offset) noexcept { KVIKIO_NVTX_FUNC_RANGE(); - return static_cast(_alloc) + offset; + return static_cast(_buffer) + offset; } -std::size_t AllocRetain::Alloc::size() noexcept { return _size; } +template +std::size_t BounceBufferPool::Buffer::size() noexcept +{ + return _size; +} -std::size_t AllocRetain::_clear() 
+template +std::size_t BounceBufferPool::_clear() { KVIKIO_NVTX_FUNC_RANGE(); - std::size_t ret = _free_allocs.size() * _size; - while (!_free_allocs.empty()) { - CUDA_DRIVER_TRY(cudaAPI::instance().MemFreeHost(_free_allocs.top())); - _free_allocs.pop(); + std::size_t ret = _free_buffers.size() * _buffer_size; + while (!_free_buffers.empty()) { + _allocator.deallocate(_free_buffers.top(), _buffer_size); + _free_buffers.pop(); } return ret; } -void AllocRetain::_ensure_alloc_size() +template +void BounceBufferPool::_ensure_buffer_size() { KVIKIO_NVTX_FUNC_RANGE(); auto const bounce_buffer_size = defaults::bounce_buffer_size(); - if (_size != bounce_buffer_size) { + if (_buffer_size != bounce_buffer_size) { _clear(); - _size = bounce_buffer_size; + _buffer_size = bounce_buffer_size; } } -AllocRetain::Alloc AllocRetain::get() +template +BounceBufferPool::Buffer BounceBufferPool::get() { KVIKIO_NVTX_FUNC_RANGE(); std::lock_guard const lock(_mutex); - _ensure_alloc_size(); + _ensure_buffer_size(); // Check if we have an allocation available - if (!_free_allocs.empty()) { - void* ret = _free_allocs.top(); - _free_allocs.pop(); - return Alloc(this, ret, _size); + if (!_free_buffers.empty()) { + void* ret = _free_buffers.top(); + _free_buffers.pop(); + return Buffer(this, ret, _buffer_size); } - // If no available allocation, allocate and register a new one - void* alloc{}; - // Allocate page-locked host memory - // Under unified addressing, host memory allocated this way is automatically portable and mapped. - CUDA_DRIVER_TRY(cudaAPI::instance().MemHostAlloc(&alloc, _size, CU_MEMHOSTALLOC_PORTABLE)); - return Alloc(this, alloc, _size); + auto* buffer = _allocator.allocate(_buffer_size); + return Buffer(this, buffer, _buffer_size); } -void AllocRetain::put(void* alloc, std::size_t size) +template +void BounceBufferPool::put(void* buffer, std::size_t size) { KVIKIO_NVTX_FUNC_RANGE(); std::lock_guard const lock(_mutex); - _ensure_alloc_size(); + _ensure_buffer_size(); - // If the size of `alloc` matches the sizes of the retained allocations, + // If the size of `buffer` matches the sizes of the retained allocations, // it is added to the set of free allocation otherwise it is freed. 
- if (size == _size) { - _free_allocs.push(alloc); + if (size == _buffer_size) { + _free_buffers.push(buffer); } else { - CUDA_DRIVER_TRY(cudaAPI::instance().MemFreeHost(alloc)); + _allocator.deallocate(buffer, size); } } -std::size_t AllocRetain::clear() +template +std::size_t BounceBufferPool::clear() { KVIKIO_NVTX_FUNC_RANGE(); std::lock_guard const lock(_mutex); return _clear(); } -AllocRetain& AllocRetain::instance() +template +BounceBufferPool& BounceBufferPool::instance() { KVIKIO_NVTX_FUNC_RANGE(); - static AllocRetain _instance; + static BounceBufferPool _instance; return _instance; } +// Explicit instantiations +template class BounceBufferPool; +template class BounceBufferPool; +template class BounceBufferPool; } // namespace kvikio diff --git a/cpp/src/compat_mode_manager.cpp b/cpp/src/compat_mode_manager.cpp index 7d794f30f3..eb601af375 100644 --- a/cpp/src/compat_mode_manager.cpp +++ b/cpp/src/compat_mode_manager.cpp @@ -46,14 +46,21 @@ CompatModeManager::CompatModeManager(std::string const& file_path, file_handle->_file_direct_off.open(file_path, flags, false, mode); _is_compat_mode_preferred = defaults::is_compat_mode_preferred(compat_mode_requested_v); - // Nothing to do in compatibility mode - if (_is_compat_mode_preferred) { return; } + if (_is_compat_mode_preferred && !defaults::auto_direct_io_read() && + !defaults::auto_direct_io_write()) { + return; + } + // Try to open the file with the O_DIRECT flag. try { file_handle->_file_direct_on.open(file_path, flags, true, mode); } catch (...) { - // Try to open the file with the O_DIRECT flag. Fall back to compatibility mode, if it fails. - if (compat_mode_requested_v == CompatMode::AUTO) { + // Handle different compatibility mode requests when Direct I/O is not supported + if (compat_mode_requested_v == CompatMode::ON) { + // _file_direct_on.fd() == -1, so Direct I/O will never be used on this file handle + return; + } else if (compat_mode_requested_v == CompatMode::AUTO) { + // Fall back to compatibility mode _is_compat_mode_preferred = true; } else { // CompatMode::OFF throw; diff --git a/cpp/src/defaults.cpp b/cpp/src/defaults.cpp index 265263b16e..f827ef6cf5 100644 --- a/cpp/src/defaults.cpp +++ b/cpp/src/defaults.cpp @@ -5,7 +5,6 @@ #include #include -#include #include #include #include @@ -136,6 +135,12 @@ defaults::defaults() _http_status_codes = getenv_or("KVIKIO_HTTP_STATUS_CODES", std::vector{429, 500, 502, 503, 504}); } + + // Determine the default value of `auto_direct_io_read` and `auto_direct_io_write` + { + _auto_direct_io_read = getenv_or("KVIKIO_AUTO_DIRECT_IO_READ", false); + _auto_direct_io_write = getenv_or("KVIKIO_AUTO_DIRECT_IO_WRITE", true); + } } defaults* defaults::instance() @@ -226,4 +231,11 @@ void defaults::set_http_timeout(long timeout_seconds) instance()->_http_timeout = timeout_seconds; } +bool defaults::auto_direct_io_read() { return instance()->_auto_direct_io_read; } + +void defaults::set_auto_direct_io_read(bool flag) { instance()->_auto_direct_io_read = flag; } + +bool defaults::auto_direct_io_write() { return instance()->_auto_direct_io_write; } + +void defaults::set_auto_direct_io_write(bool flag) { instance()->_auto_direct_io_write = flag; } } // namespace kvikio diff --git a/cpp/src/detail/posix_io.cpp b/cpp/src/detail/posix_io.cpp index c4131f7872..1ddafa4d3e 100644 --- a/cpp/src/detail/posix_io.cpp +++ b/cpp/src/detail/posix_io.cpp @@ -46,26 +46,40 @@ CUstream StreamsByThread::get() return get(ctx, std::this_thread::get_id()); } -std::size_t posix_device_read(int fd, +std::size_t 
posix_device_read(int fd_direct_off, void const* devPtr_base, std::size_t size, std::size_t file_offset, - std::size_t devPtr_offset) + std::size_t devPtr_offset, + int fd_direct_on) { KVIKIO_NVTX_FUNC_RANGE(size); - return detail::posix_device_io( - fd, devPtr_base, size, file_offset, devPtr_offset); + // If Direct I/O is supported and requested + if (fd_direct_on != -1 && defaults::auto_direct_io_read()) { + return detail::posix_device_io( + fd_direct_off, devPtr_base, size, file_offset, devPtr_offset, fd_direct_on); + } else { + return detail::posix_device_io( + fd_direct_off, devPtr_base, size, file_offset, devPtr_offset); + } } -std::size_t posix_device_write(int fd, +std::size_t posix_device_write(int fd_direct_off, void const* devPtr_base, std::size_t size, std::size_t file_offset, - std::size_t devPtr_offset) + std::size_t devPtr_offset, + int fd_direct_on) { KVIKIO_NVTX_FUNC_RANGE(size); - return detail::posix_device_io( - fd, devPtr_base, size, file_offset, devPtr_offset); + // If Direct I/O is supported and requested + if (fd_direct_on != -1 && defaults::auto_direct_io_write()) { + return detail::posix_device_io( + fd_direct_off, devPtr_base, size, file_offset, devPtr_offset, fd_direct_on); + } else { + return detail::posix_device_io( + fd_direct_off, devPtr_base, size, file_offset, devPtr_offset); + } } } // namespace kvikio::detail diff --git a/cpp/src/file_handle.cpp b/cpp/src/file_handle.cpp index 226fcd41aa..30f1cf335a 100644 --- a/cpp/src/file_handle.cpp +++ b/cpp/src/file_handle.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -106,7 +105,7 @@ std::size_t FileHandle::read(void* devPtr_base, KVIKIO_NVTX_FUNC_RANGE(size); if (get_compat_mode_manager().is_compat_mode_preferred()) { return detail::posix_device_read( - _file_direct_off.fd(), devPtr_base, size, file_offset, devPtr_offset); + _file_direct_off.fd(), devPtr_base, size, file_offset, devPtr_offset, _file_direct_on.fd()); } if (sync_default_stream) { CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(nullptr)); } @@ -130,7 +129,7 @@ std::size_t FileHandle::write(void const* devPtr_base, if (get_compat_mode_manager().is_compat_mode_preferred()) { return detail::posix_device_write( - _file_direct_off.fd(), devPtr_base, size, file_offset, devPtr_offset); + _file_direct_off.fd(), devPtr_base, size, file_offset, devPtr_offset, _file_direct_on.fd()); } if (sync_default_stream) { CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(nullptr)); } @@ -160,7 +159,7 @@ std::future FileHandle::pread(void* buf, std::size_t hostPtr_offset) -> std::size_t { char* buf = static_cast(hostPtr_base) + hostPtr_offset; return detail::posix_host_read( - _file_direct_off.fd(), buf, size, file_offset); + _file_direct_off.fd(), buf, size, file_offset, _file_direct_on.fd()); }; return parallel_io(op, buf, size, file_offset, task_size, 0, call_idx, nvtx_color); @@ -171,7 +170,8 @@ std::future FileHandle::pread(void* buf, // Shortcut that circumvent the threadpool and use the POSIX backend directly. if (size < gds_threshold) { PushAndPopContext c(ctx); - auto bytes_read = detail::posix_device_read(_file_direct_off.fd(), buf, size, file_offset, 0); + auto bytes_read = detail::posix_device_read( + _file_direct_off.fd(), buf, size, file_offset, 0, _file_direct_on.fd()); // Maintain API consistency while making this trivial case synchronous. // The result in the future is immediately available after the call. 
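The `CompatModeManager` above keeps two descriptors per file: one opened normally and one opened with `O_DIRECT`, falling back gracefully when the file system rejects the flag. A minimal Linux-specific POSIX sketch of that probe (not KvikIO's exact code, and the path is hypothetical):

```cpp
#include <fcntl.h>
#include <unistd.h>

#include <cstdio>

// Returns an O_DIRECT descriptor, or -1 to signal "buffered I/O only".
// tmpfs, for example, typically rejects O_DIRECT with EINVAL.
// O_DIRECT is a GNU extension; g++ defines _GNU_SOURCE by default.
int open_direct_or_fallback(char const* path)
{
  int fd = ::open(path, O_RDONLY | O_DIRECT);
  if (fd == -1) { std::perror("O_DIRECT open failed; falling back to buffered I/O"); }
  return fd;
}

int main()
{
  int fd = open_direct_or_fallback("/tmp/somefile.bin");  // hypothetical path
  if (fd != -1) { ::close(fd); }
}
```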
return make_ready_future(bytes_read); @@ -212,7 +212,7 @@ std::future FileHandle::pwrite(void const* buf, std::size_t hostPtr_offset) -> std::size_t { char const* buf = static_cast(hostPtr_base) + hostPtr_offset; return detail::posix_host_write( - _file_direct_off.fd(), buf, size, file_offset); + _file_direct_off.fd(), buf, size, file_offset, _file_direct_on.fd()); }; return parallel_io(op, buf, size, file_offset, task_size, 0, call_idx, nvtx_color); @@ -223,7 +223,8 @@ std::future FileHandle::pwrite(void const* buf, // Shortcut that circumvent the threadpool and use the POSIX backend directly. if (size < gds_threshold) { PushAndPopContext c(ctx); - auto bytes_write = detail::posix_device_write(_file_direct_off.fd(), buf, size, file_offset, 0); + auto bytes_write = detail::posix_device_write( + _file_direct_off.fd(), buf, size, file_offset, 0, _file_direct_on.fd()); // Maintain API consistency while making this trivial case synchronous. // The result in the future is immediately available after the call. return make_ready_future(bytes_write); @@ -323,4 +324,6 @@ const CompatModeManager& FileHandle::get_compat_mode_manager() const noexcept return _compat_mode_manager; } +bool FileHandle::is_direct_io_supported() const noexcept { return _file_direct_on.fd() != -1; } + } // namespace kvikio diff --git a/cpp/src/mmap.cpp b/cpp/src/mmap.cpp index d88e4b9d94..a720fa8929 100644 --- a/cpp/src/mmap.cpp +++ b/cpp/src/mmap.cpp @@ -13,14 +13,15 @@ #include #include +#include #include #include #include #include #include +#include #include #include -#include "kvikio/file_utils.hpp" namespace kvikio { @@ -230,9 +231,9 @@ void read_impl(void* dst_buf, src_devptr = convert_void2deviceptr(src); h2d_batch_cpy_sync(dst_devptr, src_devptr, size, stream); } else { - auto alloc = AllocRetain::instance().get(); - std::memcpy(alloc.get(), src, size); - src_devptr = convert_void2deviceptr(alloc.get()); + auto bounce_buffer = CudaPinnedBounceBufferPool::instance().get(); + std::memcpy(bounce_buffer.get(), src, size); + src_devptr = convert_void2deviceptr(bounce_buffer.get()); h2d_batch_cpy_sync(dst_devptr, src_devptr, size, stream); } } diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index f3f0ef996e..6004515b76 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -37,11 +38,11 @@ namespace { * @note Is not thread-safe. */ class BounceBufferH2D { - CUstream _stream; // The CUDA stream to use. - CUdeviceptr _dev; // The output device buffer. - AllocRetain::Alloc _host_buffer; // The host buffer to bounce data on. - std::ptrdiff_t _dev_offset{0}; // Number of bytes written to `_dev`. - std::ptrdiff_t _host_offset{0}; // Number of bytes written to `_host` (resets on flush). + CUstream _stream; // The CUDA stream to use. + CUdeviceptr _dev; // The output device buffer. + CudaPinnedBounceBufferPool::Buffer _host_buffer; // The host buffer to bounce data on. + std::ptrdiff_t _dev_offset{0}; // Number of bytes written to `_dev`. + std::ptrdiff_t _host_offset{0}; // Number of bytes written to `_host` (resets on flush). 
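`BounceBufferH2D` above accumulates incoming host chunks and flushes them to the device once the bounce buffer fills. A CUDA-free sketch of that accumulate-and-flush pattern, where a `std::vector` stands in for the device allocation and `std::memcpy` for `MemcpyHtoDAsync` (names are illustrative):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <vector>

class StagingWriter {
  std::vector<char> _host;     // bounce buffer
  std::vector<char>& _device;  // stand-in for the CUDA device buffer
  std::size_t _host_off{0};    // bytes staged but not yet flushed (resets on flush)
  std::size_t _dev_off{0};     // bytes already flushed to the device

 public:
  StagingWriter(std::vector<char>& device, std::size_t bounce_size)
    : _host(bounce_size), _device(device)
  {
  }

  void write(char const* data, std::size_t size)
  {
    while (size > 0) {
      std::size_t const n = std::min(_host.size() - _host_off, size);
      std::memcpy(_host.data() + _host_off, data, n);
      _host_off += n;
      data += n;
      size -= n;
      if (_host_off == _host.size()) { flush(); }  // bounce buffer full: push to device
    }
  }

  void flush()  // in the real code: MemcpyHtoDAsync + StreamSynchronize
  {
    std::memcpy(_device.data() + _dev_off, _host.data(), _host_off);
    _dev_off += _host_off;
    _host_off = 0;
  }
};

int main()
{
  std::vector<char> device(64);
  StagingWriter w(device, /*bounce_size=*/16);
  std::vector<char> chunk(40, 'x');
  w.write(chunk.data(), chunk.size());
  w.flush();  // flush the partially filled tail
}
```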
public: /** @@ -53,7 +54,7 @@ class BounceBufferH2D { BounceBufferH2D(CUstream stream, void* device_buffer) : _stream{stream}, _dev{convert_void2deviceptr(device_buffer)}, - _host_buffer{AllocRetain::instance().get()} + _host_buffer{CudaPinnedBounceBufferPool::instance().get()} { KVIKIO_NVTX_FUNC_RANGE(); } diff --git a/cpp/src/shim/cuda.cpp b/cpp/src/shim/cuda.cpp index 94402659c5..693dd1bd2a 100644 --- a/cpp/src/shim/cuda.cpp +++ b/cpp/src/shim/cuda.cpp @@ -19,6 +19,8 @@ cudaAPI::cudaAPI() // the name of the symbol through cude.h. get_symbol(MemHostAlloc, lib, KVIKIO_STRINGIFY(cuMemHostAlloc)); get_symbol(MemFreeHost, lib, KVIKIO_STRINGIFY(cuMemFreeHost)); + get_symbol(MemHostRegister, lib, KVIKIO_STRINGIFY(cuMemHostRegister)); + get_symbol(MemHostUnregister, lib, KVIKIO_STRINGIFY(cuMemHostUnregister)); get_symbol(MemcpyHtoDAsync, lib, KVIKIO_STRINGIFY(cuMemcpyHtoDAsync)); get_symbol(MemcpyDtoHAsync, lib, KVIKIO_STRINGIFY(cuMemcpyDtoHAsync)); get_symbol(PointerGetAttribute, lib, KVIKIO_STRINGIFY(cuPointerGetAttribute)); diff --git a/cpp/src/utils.cpp b/cpp/src/utils.cpp index 889b283e19..076cec5968 100644 --- a/cpp/src/utils.cpp +++ b/cpp/src/utils.cpp @@ -169,27 +169,49 @@ std::tuple get_alloc_info(void const* devPtr, C namespace detail { -std::size_t align_up(std::size_t value, std::size_t alignment) noexcept +std::size_t align_up(std::size_t value, std::size_t alignment) { + KVIKIO_EXPECT((alignment > 0) && ((alignment & (alignment - 1)) == 0), + "Alignment must be a power of 2"); return (value + alignment - 1) & ~(alignment - 1); } -void* align_up(void* addr, std::size_t alignment) noexcept +void* align_up(void* addr, std::size_t alignment) { + KVIKIO_EXPECT((alignment > 0) && ((alignment & (alignment - 1)) == 0), + "Alignment must be a power of 2"); auto res = (reinterpret_cast(addr) + alignment - 1) & ~(alignment - 1); return reinterpret_cast(res); } -std::size_t align_down(std::size_t value, std::size_t alignment) noexcept +std::size_t align_down(std::size_t value, std::size_t alignment) { + KVIKIO_EXPECT((alignment > 0) && ((alignment & (alignment - 1)) == 0), + "Alignment must be a power of 2"); return value & ~(alignment - 1); } -void* align_down(void* addr, std::size_t alignment) noexcept +void* align_down(void* addr, std::size_t alignment) { + KVIKIO_EXPECT((alignment > 0) && ((alignment & (alignment - 1)) == 0), + "Alignment must be a power of 2"); auto res = reinterpret_cast(addr) & ~(alignment - 1); return reinterpret_cast(res); } +bool is_aligned(std::size_t value, std::size_t alignment) +{ + KVIKIO_EXPECT((alignment > 0) && ((alignment & (alignment - 1)) == 0), + "Alignment must be a power of 2"); + return (value & (alignment - 1)) == 0; +} + +bool is_aligned(void* addr, std::size_t alignment) +{ + KVIKIO_EXPECT((alignment > 0) && ((alignment & (alignment - 1)) == 0), + "Alignment must be a power of 2"); + return (reinterpret_cast(addr) & (alignment - 1)) == 0; +} + } // namespace detail } // namespace kvikio diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 11ae47306a..2effd6a559 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -61,7 +61,7 @@ function(kvikio_add_test) ) endfunction() -kvikio_add_test(NAME BASIC_IO_TEST SOURCES test_basic_io.cpp) +kvikio_add_test(NAME BASIC_IO_TEST SOURCES test_basic_io.cpp utils/env.cpp) kvikio_add_test(NAME DEFAULTS_TEST SOURCES test_defaults.cpp utils/env.cpp) diff --git a/cpp/tests/test_basic_io.cpp b/cpp/tests/test_basic_io.cpp index 8aac777190..aeda7051d0 100644 --- 
a/cpp/tests/test_basic_io.cpp +++ b/cpp/tests/test_basic_io.cpp @@ -3,8 +3,19 @@ * SPDX-License-Identifier: Apache-2.0 */ +#include +#include +#include +#include + #include +#include +#include #include +#include +#include + +#include "utils/env.hpp" #include "utils/utils.hpp" using namespace kvikio::test; @@ -86,3 +97,130 @@ TEST_F(BasicIOTest, write_read_async) CUDA_DRIVER_TRY(kvikio::cudaAPI::instance().StreamDestroy(stream)); } + +class DirectIOTest : public testing::Test { + public: + using value_type = std::int64_t; + + protected: + void SetUp() override + { + TempDir tmp_dir{false}; + _filepath = tmp_dir.path() / "test"; + + // Skip the fixture if Direct I/O is not supported + try { + [[maybe_unused]] auto fd = + kvikio::open_fd(_filepath.c_str(), "w", true /* o_direct */, kvikio::FileHandle::m644); + } catch (...) { + GTEST_SKIP() << "Direct I/O is not supported for the test file: " << _filepath; + } + + // Create a sequence of numbers as a ground truth + _num_elements = 1ULL * 1024ULL * 1024ULL + 1234ULL; + _total_bytes = _num_elements * sizeof(value_type); + _ground_truth.resize(_num_elements); + std::iota(_ground_truth.begin(), _ground_truth.end(), 0); + } + + void TearDown() override {} + + std::filesystem::path _filepath; + std::size_t _num_elements{}; + std::vector _ground_truth; + std::size_t _total_bytes{}; + + public: + static std::size_t constexpr page_size{4096}; + using AlignedAllocator = kvikio::test::CustomHostAllocator; + using UnalignedAllocator = kvikio::test::CustomHostAllocator; +}; + +TEST_F(DirectIOTest, pwrite) +{ + // Create host buffers (page-aligned and unaligned) and device buffer for testing + std::vector aligned_host_buf(_num_elements); + std::vector unaligned_host_buf(_num_elements); + DevBuffer dev_buf(_num_elements); + + std::array buffers{aligned_host_buf.data(), unaligned_host_buf.data(), dev_buf.ptr}; + std::array auto_direct_io_statuses{true, false}; + + for (const auto& flag : auto_direct_io_statuses) { + std::string flag_str = flag ? 
"true" : "false"; + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_AUTO_DIRECT_IO_WRITE", flag_str}}; + for (const auto buf : buffers) { + // Fill up the buffer and write data to file (using KvikIO's pwrite) + { + if (kvikio::is_host_memory(buf)) { + std::memcpy(buf, _ground_truth.data(), _total_bytes); + } else { + KVIKIO_CHECK_CUDA( + cudaMemcpy(buf, _ground_truth.data(), _total_bytes, cudaMemcpyKind::cudaMemcpyDefault)); + } + + kvikio::FileHandle f(_filepath, "w"); + auto fut = f.pwrite(buf, _total_bytes); + auto num_bytes_written = fut.get(); + EXPECT_EQ(num_bytes_written, _total_bytes); + } + + // Read data from file (using Linux syscall) and check correctness + { + auto fd = open(_filepath.c_str(), O_RDONLY); + SYSCALL_CHECK(fd, "File cannot be opened"); + + std::vector result(_ground_truth.size(), 0); + SYSCALL_CHECK(read(fd, result.data(), _total_bytes)); + EXPECT_EQ(result, _ground_truth); + + SYSCALL_CHECK(close(fd)); + } + } + } +} + +TEST_F(DirectIOTest, pread) +{ + // Write ground truth data to file (using Linux syscall) + { + auto fd = open(_filepath.c_str(), O_WRONLY | O_CREAT | O_TRUNC, kvikio::FileHandle::m644); + SYSCALL_CHECK(fd, "File cannot be opened"); + SYSCALL_CHECK(write(fd, _ground_truth.data(), _total_bytes)); + SYSCALL_CHECK(close(fd)); + } + + // Create host buffers (page-aligned and unaligned) and device buffer for testing + std::vector aligned_host_buf(_num_elements); + std::vector unaligned_host_buf(_num_elements); + DevBuffer dev_buf(_num_elements); + + std::array buffers{aligned_host_buf.data(), unaligned_host_buf.data(), dev_buf.ptr}; + std::array auto_direct_io_statuses{true, false}; + + for (const auto& flag : auto_direct_io_statuses) { + std::string flag_str = flag ? "true" : "false"; + kvikio::test::EnvVarContext env_var_ctx{{"KVIKIO_AUTO_DIRECT_IO_READ", flag_str}}; + for (const auto buf : buffers) { + // Read data from file (using KvikIO's pread) and check correctness + { + kvikio::FileHandle f(_filepath, "r"); + auto fut = f.pread(buf, _total_bytes); + auto num_bytes_read = fut.get(); + EXPECT_EQ(num_bytes_read, _total_bytes); + + if (kvikio::is_host_memory(buf)) { + auto* buf_helper = reinterpret_cast(buf); + for (std::size_t i = 0; i < _num_elements; ++i) { + EXPECT_EQ(buf_helper[i], _ground_truth[i]); + } + } else { + std::vector result(_num_elements); + KVIKIO_CHECK_CUDA( + cudaMemcpy(result.data(), buf, _total_bytes, cudaMemcpyKind::cudaMemcpyDefault)); + EXPECT_EQ(result, _ground_truth); + } + } + } + } +} diff --git a/cpp/tests/utils/utils.hpp b/cpp/tests/utils/utils.hpp index bb8ea80d82..92e6296e13 100644 --- a/cpp/tests/utils/utils.hpp +++ b/cpp/tests/utils/utils.hpp @@ -178,4 +178,66 @@ inline void expect_equal(DevBuffer const& a, DevBuffer const& b) } } +/** + * @brief Custom allocator with alignment and element offset support, suitable for use with standard + * containers like std::vector. + * + * @tparam T The type of elements to allocate + * @tparam ali Alignment requirement in bytes (must be a power of 2) + * @tparam element_offset Number of elements to offset the returned pointer (default: 0) + * + * Example usage: + * @code + * // Allocator with 4096-byte alignment, no offset + * std::vector> vec; + * + * // Allocator with 64-byte alignment and 10-element offset (i.e. 
80-byte offset) + * std::vector> offset_vec; + * @endcode + */ +template +struct CustomHostAllocator { + using value_type = T; + CustomHostAllocator() = default; + + template + constexpr CustomHostAllocator(const CustomHostAllocator&) noexcept + { + } + + template + struct rebind { + using other = CustomHostAllocator; + }; + + [[nodiscard]] T* allocate(std::size_t num_elements) + { + if (num_elements > std::numeric_limits::max() / sizeof(T)) { + throw std::bad_array_new_length(); + } + + auto total_bytes = (num_elements + element_offset) * sizeof(T); + total_bytes = (total_bytes + ali - 1) & ~(ali - 1); + + if (auto* ptr = static_cast(std::aligned_alloc(ali, total_bytes))) { + auto dst_ptr = reinterpret_cast(ptr) + element_offset * sizeof(T); + return reinterpret_cast(dst_ptr); + } + + throw std::bad_alloc(); + } + + void deallocate(T* ptr, [[maybe_unused]] std::size_t n) noexcept + { + auto src_ptr = reinterpret_cast(ptr) - element_offset * sizeof(T); + std::free(src_ptr); + } + + template + bool operator==(const CustomHostAllocator&) const noexcept + { + return ali == ali_u && element_offset == element_offset_u; + } +}; + } // namespace kvikio::test diff --git a/docs/source/runtime_settings.rst b/docs/source/runtime_settings.rst index e707031720..c96b2804a8 100644 --- a/docs/source/runtime_settings.rst +++ b/docs/source/runtime_settings.rst @@ -17,42 +17,42 @@ Under ``AUTO``, KvikIO falls back to the compatibility mode: * when running in Windows Subsystem for Linux (WSL). * when ``/run/udev`` isn't readable, which typically happens when running inside a docker image not launched with ``--volume /run/udev:/run/udev:ro``. -This setting can also be programmatically accessed using :py:func:`kvikio.defaults.compat_mode` (getter) and :py:func:`kvikio.defaults.set` (setter). +This setting can also be programmatically accessed using :py:func:`kvikio.defaults.get` (getter) and :py:func:`kvikio.defaults.set` (setter). Thread Pool ``KVIKIO_NTHREADS`` ------------------------------- KvikIO can use multiple threads for IO automatically. Set the environment variable ``KVIKIO_NTHREADS`` to the number of threads in the thread pool. If not set, the default value is 1. -This setting can also be accessed using :py:func:`kvikio.defaults.num_threads` (getter) and :py:func:`kvikio.defaults.set` (setter). +This setting can also be accessed using :py:func:`kvikio.defaults.get` (getter) and :py:func:`kvikio.defaults.set` (setter). Task Size ``KVIKIO_TASK_SIZE`` ------------------------------ KvikIO splits parallel IO operations into multiple tasks. Set the environment variable ``KVIKIO_TASK_SIZE`` to the maximum task size (in bytes). If not set, the default value is 4194304 (4 MiB). -This setting can also be accessed using :py:func:`kvikio.defaults.task_size` (getter) and :py:func:`kvikio.defaults.set` (setter). +This setting can also be accessed using :py:func:`kvikio.defaults.get` (getter) and :py:func:`kvikio.defaults.set` (setter). GDS Threshold ``KVIKIO_GDS_THRESHOLD`` -------------------------------------- -In order to improve performance of small IO, ``.pread()`` and ``.pwrite()`` implement a shortcut that circumvent the threadpool and use the POSIX backend directly. Set the environment variable ``KVIKIO_GDS_THRESHOLD`` to the minimum size (in bytes) to use GDS. If not set, the default value is 1048576 (1 MiB). +In order to improve performance of small IO, ``.pread()`` and ``.pwrite()`` implement a shortcut that circumvent the threadpool and use the POSIX backend directly. 
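The Direct I/O tests above exercise both page-aligned and deliberately misaligned host buffers via `CustomHostAllocator`. A small usage sketch, assuming the test header defined above is on the include path:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

#include "utils/utils.hpp"  // CustomHostAllocator, from the test utilities above

int main()
{
  using T = std::int64_t;

  // data() is page-aligned: the underlying block comes from std::aligned_alloc(4096, ...).
  std::vector<T, kvikio::test::CustomHostAllocator<T, 4096>> aligned(1024);

  // Same page-aligned block, but data() is shifted by one element (8 bytes),
  // so the pointer handed to pread/pwrite is guaranteed NOT page-aligned.
  std::vector<T, kvikio::test::CustomHostAllocator<T, 4096, 1>> unaligned(1024);

  std::printf("aligned %% 4096   = %zu\n",
              static_cast<std::size_t>(reinterpret_cast<std::uintptr_t>(aligned.data()) % 4096));
  std::printf("unaligned %% 4096 = %zu\n",
              static_cast<std::size_t>(reinterpret_cast<std::uintptr_t>(unaligned.data()) % 4096));
}
```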
Set the environment variable ``KVIKIO_GDS_THRESHOLD`` to the minimum size (in bytes) to use GDS. If not set, the default value is 16384 (16 KiB). -This setting can also be accessed using :py:func:`kvikio.defaults.gds_threshold` (getter) and :py:func:`kvikio.defaults.set` (setter). +This setting can also be accessed using :py:func:`kvikio.defaults.get` (getter) and :py:func:`kvikio.defaults.set` (setter). Size of the Bounce Buffer ``KVIKIO_BOUNCE_BUFFER_SIZE`` ------------------------------------------------------- KvikIO might have to use intermediate host buffers (one per thread) when copying between files and device memory. Set the environment variable ``KVIKIO_BOUNCE_BUFFER_SIZE`` to the size (in bytes) of these "bounce" buffers. If not set, the default value is 16777216 (16 MiB). -This setting can also be accessed using :py:func:`kvikio.defaults.bounce_buffer_size` (getter) and :py:func:`kvikio.defaults.set` (setter). +This setting can also be accessed using :py:func:`kvikio.defaults.get` (getter) and :py:func:`kvikio.defaults.set` (setter). HTTP Retries ``KVIKIO_HTTP_STATUS_CODES``, ``KVIKIO_HTTP_MAX_ATTEMPTS`` ------------------------------------------------------------------------ The behavior when a remote I/O read returns an error can be controlled through the `KVIKIO_HTTP_STATUS_CODES`, `KVIKIO_HTTP_MAX_ATTEMPTS`, and `KVIKIO_HTTP_TIMEOUT` environment variables. -KvikIO will retry a request should any of the HTTP status code in ``KVIKIO_HTTP_STATUS_CODES`` is received. The default values are ``429, 500, 502, 503, 504``. This setting can also be accessed using :py:func:`kvikio.defaults.http_status_codes` (getter) and :py:func:`kvikio.defaults.set` (setter). +KvikIO will retry a request should any of the HTTP status code in ``KVIKIO_HTTP_STATUS_CODES`` is received. The default values are ``429, 500, 502, 503, 504``. This setting can also be accessed using :py:func:`kvikio.defaults.get` (getter) and :py:func:`kvikio.defaults.set` (setter). -The maximum number of attempts to make before throwing an exception is controlled by ``KVIKIO_HTTP_MAX_ATTEMPTS``. The default value is 3. This setting can also be accessed using :py:func:`kvikio.defaults.http_max_attempts` (getter) and :py:func:`kvikio.defaults.set` (setter). +The maximum number of attempts to make before throwing an exception is controlled by ``KVIKIO_HTTP_MAX_ATTEMPTS``. The default value is 3. This setting can also be accessed using :py:func:`kvikio.defaults.get` (getter) and :py:func:`kvikio.defaults.set` (setter). -The maximum duration of each HTTP request is controlled by ``KVIKIO_HTTP_TIMEOUT``. The default value is 60, which is the duration in seconds to allow. This setting can also be accessed using :py:func:`kvikio.defaults.http_timoeout` (getter) and :py:func:`kvikio.defaults.set` (setter). +The maximum duration of each HTTP request is controlled by ``KVIKIO_HTTP_TIMEOUT``. The default value is 60, which is the duration in seconds to allow. This setting can also be accessed using :py:func:`kvikio.defaults.get` (getter) and :py:func:`kvikio.defaults.set` (setter). HTTP Verbose ``KVIKIO_REMOTE_VERBOSE`` -------------------------------------- @@ -74,3 +74,50 @@ The Certificate Authority (CA) paths required for TLS/SSL verification in ``libc * ``SSL_CERT_DIR`` (also used in OpenSSL): Specifies the CA certificate directory. When neither is specified, KvikIO searches several standard system locations for the CA file and directory, and if the search fails falls back to the libcurl compile-time defaults. 
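The boolean environment variables documented in the next section accept ``true``/``on``/``yes``/``1`` and ``false``/``off``/``no``/``0`` (case-insensitive). A sketch of tolerant parsing consistent with those documented semantics; this is not KvikIO's actual ``getenv_or`` implementation, and ``getenv_or_bool`` is an illustrative name:

```cpp
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <string>

bool getenv_or_bool(char const* name, bool default_value)
{
  char const* raw = std::getenv(name);
  if (raw == nullptr) { return default_value; }
  std::string v{raw};
  std::transform(v.begin(), v.end(), v.begin(), [](unsigned char c) { return std::tolower(c); });
  if (v == "true" || v == "on" || v == "yes" || v == "1") { return true; }
  if (v == "false" || v == "off" || v == "no" || v == "0") { return false; }
  return default_value;  // the real implementation may instead reject malformed input
}

int main()
{
  bool const direct_write = getenv_or_bool("KVIKIO_AUTO_DIRECT_IO_WRITE", /*default_value=*/true);
  return direct_write ? 0 : 1;
}
```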
+
+Opportunistic POSIX Direct I/O operations ``KVIKIO_AUTO_DIRECT_IO_READ``, ``KVIKIO_AUTO_DIRECT_IO_WRITE``
+---------------------------------------------------------------------------------------------------------
+
+Overview
+^^^^^^^^
+
+By default, POSIX I/O operations perform buffered I/O using the OS page cache. However, Direct I/O (bypassing the page cache) can significantly improve performance in certain scenarios, such as writes and cold page-cache reads.
+
+Traditional Direct I/O has strict requirements: the buffer address must be page-aligned, the file offset must be page-aligned, and the transfer size must be a multiple of the page size (typically 4096 bytes). :py:class:`kvikio.CuFile` offers opportunistic Direct I/O, which removes these restrictions by handling the alignment automatically. Specifically, KvikIO can split a POSIX I/O operation into unaligned and aligned segments, applying buffered I/O to the former and Direct I/O to the latter.
+
+Configuration
+^^^^^^^^^^^^^
+
+Set the environment variables ``KVIKIO_AUTO_DIRECT_IO_READ`` / ``KVIKIO_AUTO_DIRECT_IO_WRITE`` to ``true``, ``on``, ``yes``, or ``1`` (case-insensitive) to enable opportunistic Direct I/O.
+
+.. code-block:: bash
+
+    export KVIKIO_AUTO_DIRECT_IO_READ=1
+    export KVIKIO_AUTO_DIRECT_IO_WRITE=1
+
+Set them to ``false``, ``off``, ``no``, or ``0`` to disable this feature and use buffered I/O.
+
+.. code-block:: bash
+
+    export KVIKIO_AUTO_DIRECT_IO_READ=0
+    export KVIKIO_AUTO_DIRECT_IO_WRITE=0
+
+If not set, the defaults are buffered I/O for POSIX reads (``KVIKIO_AUTO_DIRECT_IO_READ=0``) and Direct I/O for POSIX writes (``KVIKIO_AUTO_DIRECT_IO_WRITE=1``).
+
+Programmatic Access
+^^^^^^^^^^^^^^^^^^^
+
+These settings can be queried (:py:func:`kvikio.defaults.get`) and modified (:py:func:`kvikio.defaults.set`) at runtime using the property names ``auto_direct_io_read`` and ``auto_direct_io_write``.
+
+Example:
+
+.. code-block:: python
+
+    import kvikio.defaults
+
+    # Check current settings
+    print(kvikio.defaults.get("auto_direct_io_read"))
+    print(kvikio.defaults.get("auto_direct_io_write"))
+
+    # Enable Direct I/O for reads, and disable it for writes
+    kvikio.defaults.set({"auto_direct_io_read": True, "auto_direct_io_write": False})
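The segment split described in the Overview above can be pictured with a short, self-contained sketch. This is not KvikIO's implementation, only an illustration under assumed values (a 4096-byte page size and an arbitrary unaligned request); it shows how a request decomposes into a buffered head, a Direct-I/O-eligible middle, and a buffered tail.

// Illustration only: how an unaligned (offset, size) request could be split
// into buffered and Direct I/O segments. The page size and request values are
// assumptions; KvikIO's real splitting logic lives in its POSIX backend.
#include <algorithm>
#include <cstddef>
#include <cstdio>

struct Segment {
  std::size_t offset;
  std::size_t size;
};

int main()
{
  constexpr std::size_t page   = 4096;
  std::size_t const req_offset = 1000;   // not page-aligned
  std::size_t const req_size   = 20000;  // not a multiple of the page size
  std::size_t const req_end    = req_offset + req_size;

  std::size_t const aligned_begin = (req_offset + page - 1) / page * page;  // round up
  std::size_t const aligned_end   = req_end / page * page;                  // round down

  // The head and tail use buffered I/O; the middle starts on a page boundary
  // and has a page-multiple size, so it is eligible for Direct I/O.
  Segment const head{req_offset, std::min(aligned_begin, req_end) - req_offset};
  Segment const middle{aligned_begin,
                       aligned_end > aligned_begin ? aligned_end - aligned_begin : 0};
  Segment const tail{aligned_end, req_end > aligned_end && aligned_end >= aligned_begin
                                    ? req_end - aligned_end
                                    : 0};

  std::printf("buffered head : offset=%zu size=%zu\n", head.offset, head.size);
  std::printf("direct middle : offset=%zu size=%zu\n", middle.offset, middle.size);
  std::printf("buffered tail : offset=%zu size=%zu\n", tail.offset, tail.size);
  return 0;
}

Note that Direct I/O additionally requires a suitably aligned buffer address; the sketch ignores that dimension, which, per the Overview, KvikIO likewise handles internally.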
diff --git a/python/kvikio/kvikio/_lib/buffer.pyx b/python/kvikio/kvikio/_lib/buffer.pyx
index e57903f49a..c86ee7f021 100644
--- a/python/kvikio/kvikio/_lib/buffer.pyx
+++ b/python/kvikio/kvikio/_lib/buffer.pyx
@@ -30,11 +30,20 @@ def memory_deregister(buf) -> None:
 
 
 cdef extern from "" nogil:
-    size_t cpp_alloc_retain_clear "kvikio::AllocRetain::instance().clear"() except +
+    size_t cpp_page_aligned_bounce_buffer_pool_clear \
+        "kvikio::PageAlignedBounceBufferPool::instance().clear"() except +
+
+    size_t cpp_cuda_pinned_bounce_buffer_pool_clear \
+        "kvikio::CudaPinnedBounceBufferPool::instance().clear"() except +
+
+    size_t cpp_cuda_page_aligned_pinned_bounce_buffer_pool_clear \
+        "kvikio::CudaPageAlignedPinnedBounceBufferPool::instance().clear"() except +
 
 
 def bounce_buffer_free() -> int:
     cdef size_t result
     with nogil:
-        result = cpp_alloc_retain_clear()
+        result = cpp_page_aligned_bounce_buffer_pool_clear() + \
+            cpp_cuda_pinned_bounce_buffer_pool_clear() + \
+            cpp_cuda_page_aligned_pinned_bounce_buffer_pool_clear()
     return result
diff --git a/python/kvikio/kvikio/_lib/defaults.pyx b/python/kvikio/kvikio/_lib/defaults.pyx
index e38ef80524..e5bfbca713 100644
--- a/python/kvikio/kvikio/_lib/defaults.pyx
+++ b/python/kvikio/kvikio/_lib/defaults.pyx
@@ -40,6 +40,12 @@ cdef extern from "" namespace "kvikio" nogil:
     long cpp_http_timeout "kvikio::defaults::http_timeout"() except +
     void cpp_set_http_timeout\
         "kvikio::defaults::set_http_timeout"(long timeout_seconds) except +
+    bool cpp_auto_direct_io_read "kvikio::defaults::auto_direct_io_read"() except +
+    void cpp_set_auto_direct_io_read \
+        "kvikio::defaults::set_auto_direct_io_read"(bool flag) except +
+    bool cpp_auto_direct_io_write "kvikio::defaults::auto_direct_io_write"() except +
+    void cpp_set_auto_direct_io_write \
+        "kvikio::defaults::set_auto_direct_io_write"(bool flag) except +
 
 
 def is_compat_mode_preferred() -> bool:
@@ -147,3 +153,29 @@ def http_status_codes() -> list[int]:
 def set_http_status_codes(status_codes: list[int]) -> None:
     # Cannot use nogil here because we need the GIL for list conversion
     cpp_set_http_status_codes(status_codes)
+
+
+def auto_direct_io_read() -> bool:
+    cdef bool result
+    with nogil:
+        result = cpp_auto_direct_io_read()
+    return result
+
+
+def set_auto_direct_io_read(flag: bool) -> None:
+    cdef bool cpp_flag = flag
+    with nogil:
+        cpp_set_auto_direct_io_read(cpp_flag)
+
+
+def auto_direct_io_write() -> bool:
+    cdef bool result
+    with nogil:
+        result = cpp_auto_direct_io_write()
+    return result
+
+
+def set_auto_direct_io_write(flag: bool) -> None:
+    cdef bool cpp_flag = flag
+    with nogil:
+        cpp_set_auto_direct_io_write(cpp_flag)
diff --git a/python/kvikio/kvikio/_lib/file_handle.pyx b/python/kvikio/kvikio/_lib/file_handle.pyx
index de17322303..b2a11fbdc1 100644
--- a/python/kvikio/kvikio/_lib/file_handle.pyx
+++ b/python/kvikio/kvikio/_lib/file_handle.pyx
@@ -87,6 +87,7 @@ cdef extern from "" namespace "kvikio" nogil:
             size_t devPtr_offset,
             CUstream stream
         ) except +
+        bool is_direct_io_supported()
 
 
 cdef class CuFile:
@@ -216,6 +217,13 @@ cdef class CuFile:
         )
         return _wrap_stream_future(fut)
 
+    def is_direct_io_supported(self) -> bool:
+        cdef bool result
+        with nogil:
+            result = self._handle.is_direct_io_supported()
+        return result
+
+
 cdef extern from "" nogil:
     pair[size_t, size_t] cpp_get_page_cache_info_str \
         "kvikio::get_page_cache_info"(string file_path) except +
diff --git a/python/kvikio/kvikio/cufile.py b/python/kvikio/kvikio/cufile.py
index ec8fe4be84..0aa16695c9 100644
--- a/python/kvikio/kvikio/cufile.py
+++ b/python/kvikio/kvikio/cufile.py
@@ -433,6 +433,21 @@ def raw_write(
         """
         return self._handle.write(buf, size, file_offset, dev_offset)
 
+    def is_direct_io_supported(self) -> bool:
+        """Whether Direct I/O is supported on this file handle.
+
+        This is determined by two factors:
+        - Direct I/O support from the operating system and the file system
+        - The KvikIO global settings `auto_direct_io_read` and `auto_direct_io_write`.
+          If both values are false, Direct I/O will not be supported on this file handle.
+
+        Returns
+        -------
+        bool
+            Whether Direct I/O is supported
+        """
+        return self._handle.is_direct_io_supported()
+
 
 def get_page_cache_info(
     file: Union[os.PathLike, str, int, io.IOBase],
diff --git a/python/kvikio/kvikio/defaults.py b/python/kvikio/kvikio/defaults.py
index d8b36de444..14d355db15 100644
--- a/python/kvikio/kvikio/defaults.py
+++ b/python/kvikio/kvikio/defaults.py
@@ -56,6 +56,8 @@ def _property_getter_and_setter(self) -> tuple[dict[str, Any], dict[str, Any]]:
             "http_max_attempts",
             "http_status_codes",
             "http_timeout",
+            "auto_direct_io_read",
+            "auto_direct_io_write",
         ]
 
         property_getters = {}
@@ -123,6 +125,8 @@ def set(*config) -> ConfigContextManager:
         - ``"http_max_attempts"``
         - ``"http_status_codes"``
         - ``"http_timeout"``
+        - ``"auto_direct_io_read"``
+        - ``"auto_direct_io_write"``
 
     Returns
     -------

From e53a7ca59c63f362e565da36e9cce56b5f7e7ed2 Mon Sep 17 00:00:00 2001
From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com>
Date: Fri, 14 Nov 2025 15:45:44 -0600
Subject: [PATCH 100/103] Merge pull request #871 from rapidsai/version-update-26.02

Update to 26.02
---
 .../cuda12.9-conda/devcontainer.json          |  6 ++---
 .devcontainer/cuda12.9-pip/devcontainer.json  |  6 ++---
 .../cuda13.0-conda/devcontainer.json          |  6 ++---
 .devcontainer/cuda13.0-pip/devcontainer.json  |  6 ++---
 .github/workflows/build.yaml                  |  2 +-
 .github/workflows/pr.yaml                     |  4 ++--
 .github/workflows/test.yaml                   |  2 +-
 VERSION                                       |  2 +-
 .../all_cuda-129_arch-aarch64.yaml            |  2 +-
 .../all_cuda-129_arch-x86_64.yaml             |  2 +-
 .../all_cuda-130_arch-aarch64.yaml            |  2 +-
 .../all_cuda-130_arch-x86_64.yaml             |  2 +-
 .../downstream/cmake/get_kvikio.cmake         |  2 +-
 dependencies.yaml                             | 22 +++++++++----------
 java/pom.xml                                  |  2 +-
 python/kvikio/pyproject.toml                  |  6 ++---
 16 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/.devcontainer/cuda12.9-conda/devcontainer.json b/.devcontainer/cuda12.9-conda/devcontainer.json
index b220017a03..c5effd0db8 100644
--- a/.devcontainer/cuda12.9-conda/devcontainer.json
+++ b/.devcontainer/cuda12.9-conda/devcontainer.json
@@ -5,19 +5,19 @@
     "args": {
       "CUDA": "12.9",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:25.12-cpp-mambaforge"
+      "BASE": "rapidsai/devcontainers:26.02-cpp-mambaforge"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.12-cuda12.9-conda",
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.02-cuda12.9-conda",
     "--ulimit",
     "nofile=500000"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.12": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:26.2": {}
   },
"overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.9-pip/devcontainer.json b/.devcontainer/cuda12.9-pip/devcontainer.json index e15fee7294..49702d7f5d 100644 --- a/.devcontainer/cuda12.9-pip/devcontainer.json +++ b/.devcontainer/cuda12.9-pip/devcontainer.json @@ -5,19 +5,19 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.12-cpp-cuda12.9" + "BASE": "rapidsai/devcontainers:26.02-cpp-cuda12.9" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.12-cuda12.9-pip", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.02-cuda12.9-pip", "--ulimit", "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.12": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:26.2": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda13.0-conda/devcontainer.json b/.devcontainer/cuda13.0-conda/devcontainer.json index 4788a99058..b1dec3d161 100644 --- a/.devcontainer/cuda13.0-conda/devcontainer.json +++ b/.devcontainer/cuda13.0-conda/devcontainer.json @@ -5,19 +5,19 @@ "args": { "CUDA": "13.0", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.12-cpp-mambaforge" + "BASE": "rapidsai/devcontainers:26.02-cpp-mambaforge" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.12-cuda13.0-conda", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.02-cuda13.0-conda", "--ulimit", "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.12": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:26.2": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda13.0-pip/devcontainer.json b/.devcontainer/cuda13.0-pip/devcontainer.json index ff862c5ecf..df77a93038 100644 --- a/.devcontainer/cuda13.0-pip/devcontainer.json +++ b/.devcontainer/cuda13.0-pip/devcontainer.json @@ -5,19 +5,19 @@ "args": { "CUDA": "13.0", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.12-cpp-cuda13.0" + "BASE": "rapidsai/devcontainers:26.02-cpp-cuda13.0" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.12-cuda13.0-pip", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.02-cuda13.0-pip", "--ulimit", "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.12": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:26.2": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 9a698f588f..c28761233e 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -69,7 +69,7 @@ jobs: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci-conda:25.12-latest" + container_image: "rapidsai/ci-conda:26.02-latest" date: ${{ inputs.date }} node_type: "gpu-l4-latest-1" script: "ci/build_docs.sh" diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 
1650da81c8..329f47c0cb 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -112,7 +112,7 @@ jobs: build_type: pull-request node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:25.12-latest" + container_image: "rapidsai/ci-conda:26.02-latest" script: "ci/test_java.sh" conda-python-build: needs: conda-cpp-build @@ -137,7 +137,7 @@ jobs: build_type: pull-request node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:25.12-latest" + container_image: "rapidsai/ci-conda:26.02-latest" script: "ci/build_docs.sh" devcontainer: needs: telemetry-setup diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 3f8d231157..324a641b6b 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -51,5 +51,5 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:25.12-latest" + container_image: "rapidsai/ci-conda:26.02-latest" script: "ci/test_java.sh" diff --git a/VERSION b/VERSION index 7924af6192..5c33046aca 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -25.12.00 +26.02.00 diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index d5ba9548a9..067837ea36 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -32,7 +32,7 @@ dependencies: - python>=3.10,<3.14 - rangehttpserver - rapids-build-backend>=0.4.0,<0.5.0.dev0 -- rapids-dask-dependency==25.12.*,>=0.0.0a0 +- rapids-dask-dependency==26.2.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index b1a2bb38d5..9270b47f6e 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -32,7 +32,7 @@ dependencies: - python>=3.10,<3.14 - rangehttpserver - rapids-build-backend>=0.4.0,<0.5.0.dev0 -- rapids-dask-dependency==25.12.*,>=0.0.0a0 +- rapids-dask-dependency==26.2.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click diff --git a/conda/environments/all_cuda-130_arch-aarch64.yaml b/conda/environments/all_cuda-130_arch-aarch64.yaml index 551b24281f..e5c40f372a 100644 --- a/conda/environments/all_cuda-130_arch-aarch64.yaml +++ b/conda/environments/all_cuda-130_arch-aarch64.yaml @@ -32,7 +32,7 @@ dependencies: - python>=3.10,<3.14 - rangehttpserver - rapids-build-backend>=0.4.0,<0.5.0.dev0 -- rapids-dask-dependency==25.12.*,>=0.0.0a0 +- rapids-dask-dependency==26.2.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click diff --git a/conda/environments/all_cuda-130_arch-x86_64.yaml b/conda/environments/all_cuda-130_arch-x86_64.yaml index 419d29ed31..3c38fd8b2b 100644 --- a/conda/environments/all_cuda-130_arch-x86_64.yaml +++ b/conda/environments/all_cuda-130_arch-x86_64.yaml @@ -32,7 +32,7 @@ dependencies: - python>=3.10,<3.14 - rangehttpserver - rapids-build-backend>=0.4.0,<0.5.0.dev0 -- rapids-dask-dependency==25.12.*,>=0.0.0a0 +- rapids-dask-dependency==26.2.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click diff --git a/cpp/examples/downstream/cmake/get_kvikio.cmake b/cpp/examples/downstream/cmake/get_kvikio.cmake index 7a257e7f0b..fa61ef3b2d 100644 --- a/cpp/examples/downstream/cmake/get_kvikio.cmake +++ b/cpp/examples/downstream/cmake/get_kvikio.cmake @@ -23,4 +23,4 @@ function(find_and_configure_kvikio MIN_VERSION) endfunction() 
-find_and_configure_kvikio("25.12") +find_and_configure_kvikio("26.02") diff --git a/dependencies.yaml b/dependencies.yaml index bed9980785..3805b35326 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -151,7 +151,7 @@ dependencies: common: - output_types: conda packages: &libkvikio_packages - - libkvikio==25.12.*,>=0.0.0a0 + - libkvikio==26.2.*,>=0.0.0a0 specific: - output_types: [requirements, pyproject] matrices: @@ -159,12 +159,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libkvikio-cu12==25.12.*,>=0.0.0a0 + - libkvikio-cu12==26.2.*,>=0.0.0a0 - matrix: cuda: "13.*" cuda_suffixed: "true" packages: - - libkvikio-cu13==25.12.*,>=0.0.0a0 + - libkvikio-cu13==26.2.*,>=0.0.0a0 - {matrix: null, packages: *libkvikio_packages} build-py-wrapper: common: @@ -233,7 +233,7 @@ dependencies: common: - output_types: conda packages: - - &libkvikio_unsuffixed libkvikio==25.12.*,>=0.0.0a0 + - &libkvikio_unsuffixed libkvikio==26.2.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -247,12 +247,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libkvikio-cu12==25.12.*,>=0.0.0a0 + - libkvikio-cu12==26.2.*,>=0.0.0a0 - matrix: cuda: "13.*" cuda_suffixed: "true" packages: - - libkvikio-cu13==25.12.*,>=0.0.0a0 + - libkvikio-cu13==26.2.*,>=0.0.0a0 - {matrix: null, packages: [*libkvikio_unsuffixed]} docs: common: @@ -315,14 +315,14 @@ dependencies: common: - output_types: conda packages: - - libkvikio==25.12.*,>=0.0.0a0 - - libkvikio-tests==25.12.*,>=0.0.0a0 + - libkvikio==26.2.*,>=0.0.0a0 + - libkvikio-tests==26.2.*,>=0.0.0a0 test_kvikio: common: - output_types: conda packages: - - libkvikio==25.12.*,>=0.0.0a0 - - kvikio==25.12.*,>=0.0.0a0 + - libkvikio==26.2.*,>=0.0.0a0 + - kvikio==26.2.*,>=0.0.0a0 test_cpp: common: - output_types: conda @@ -332,7 +332,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - rapids-dask-dependency==25.12.*,>=0.0.0a0 + - rapids-dask-dependency==26.2.*,>=0.0.0a0 - pytest<9.0.0a0 - pytest-asyncio - pytest-cov diff --git a/java/pom.xml b/java/pom.xml index 1dc011b575..be956c8cb4 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -10,7 +10,7 @@ SPDX-License-Identifier: Apache-2.0 ai.rapids.kvikio cufile - 25.12.0-SNAPSHOT + 26.02.0-SNAPSHOT cuFile diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index 2551b8641f..619dcbfa90 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -20,7 +20,7 @@ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "cupy-cuda13x>=13.6.0", - "libkvikio==25.12.*,>=0.0.0a0", + "libkvikio==26.2.*,>=0.0.0a0", "numpy>=1.23,<3.0a0", "packaging", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -46,7 +46,7 @@ test = [ "pytest-timeout", "pytest<9.0.0a0", "rangehttpserver", - "rapids-dask-dependency==25.12.*,>=0.0.0a0", + "rapids-dask-dependency==26.2.*,>=0.0.0a0", "zarr>=3.0.0,<3.2.0a0,<4.0.0; python_version >= '3.11'", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
zarr = [ @@ -99,7 +99,7 @@ matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" requires = [ "cmake>=3.30.4", "cython>=3.0.0,<3.2.0a0", - "libkvikio==25.12.*,>=0.0.0a0", + "libkvikio==26.2.*,>=0.0.0a0", "ninja", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From c9fb696290199c65baae3e87b2fd6bf02be66d77 Mon Sep 17 00:00:00 2001 From: Jake Awe Date: Fri, 14 Nov 2025 13:46:26 -0800 Subject: [PATCH 101/103] Update for release branch release/25.12 --- .github/workflows/build.yaml | 16 ++++++------ .github/workflows/pr.yaml | 26 +++++++++---------- .github/workflows/test.yaml | 6 ++--- .../trigger-breaking-change-alert.yaml | 2 +- RAPIDS_BRANCH | 2 +- .../downstream/cmake/get_kvikio.cmake | 2 +- 6 files changed, 27 insertions(+), 27 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 9a698f588f..b060c2569e 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -34,7 +34,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@release/25.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -44,7 +44,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@release/25.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -54,7 +54,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@release/25.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/25.12 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -76,7 +76,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cpp: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/25.12 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: ${{ inputs.build_type || 'branch' }} @@ -89,7 +89,7 @@ jobs: wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/25.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -101,7 +101,7 @@ jobs: wheel-publish-cpp: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/25.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -112,7 +112,7 @@ jobs: wheel-publish-python: needs: wheel-build-python secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/25.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 1650da81c8..1e802d80b1 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -27,7 +27,7 @@ jobs: - wheel-python-tests - telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@release/25.12 if: always() with: needs: ${{ toJSON(needs) }} @@ -43,7 +43,7 @@ jobs: repo: kvikio changed-files: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@release/25.12 with: files_yaml: | test_cpp: @@ -86,20 +86,20 @@ jobs: checks: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@release/25.12 with: ignored_pr_jobs: telemetry-summarize conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@release/25.12 with: build_type: pull-request script: ci/build_cpp.sh conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@release/25.12 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request @@ -107,7 +107,7 @@ jobs: conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/25.12 with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -117,14 +117,14 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@release/25.12 with: build_type: pull-request script: ci/build_python.sh conda-python-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@release/25.12 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -132,7 +132,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/25.12 with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -142,7 +142,7 @@ jobs: devcontainer: needs: telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@release/25.12 with: arch: '["amd64", "arm64"]' cuda: '["13.0"]' @@ -160,7 +160,7 @@ jobs: sccache --show-adv-stats | tee telemetry-artifacts/sccache-stats.txt; wheel-cpp-build: secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/25.12 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: pull-request @@ -170,7 +170,7 @@ jobs: wheel-python-build: needs: wheel-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/25.12 with: build_type: pull-request script: ci/build_wheel_python.sh @@ -179,7 +179,7 @@ jobs: wheel-python-tests: needs: [wheel-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/25.12 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 3f8d231157..44d7267018 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -25,7 +25,7 @@ on: jobs: cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@release/25.12 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@release/25.12 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -43,7 +43,7 @@ jobs: sha: ${{ inputs.sha }} conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/25.12 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index c471e2a151..0b885544da 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@main + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@release/25.12 with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/RAPIDS_BRANCH b/RAPIDS_BRANCH index ba2906d066..26b84372d3 100644 --- a/RAPIDS_BRANCH +++ b/RAPIDS_BRANCH @@ -1 +1 @@ -main +release/25.12 diff --git a/cpp/examples/downstream/cmake/get_kvikio.cmake b/cpp/examples/downstream/cmake/get_kvikio.cmake index 7a257e7f0b..ed7c1278a4 100644 --- a/cpp/examples/downstream/cmake/get_kvikio.cmake +++ b/cpp/examples/downstream/cmake/get_kvikio.cmake @@ -13,7 +13,7 @@ function(find_and_configure_kvikio MIN_VERSION) VERSION ${MIN_VERSION} GIT_REPOSITORY https://github.com/rapidsai/kvikio.git - GIT_TAG main + GIT_TAG release/25.12 GIT_SHALLOW TRUE SOURCE_SUBDIR From 13acceaf319ed0b84d7ace64004351b0d8ec5fa8 Mon Sep 17 00:00:00 2001 From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com> Date: Fri, 14 Nov 2025 15:47:16 -0600 
Subject: [PATCH 102/103] Revert "Forward-merge release/25.12 into main" (#873) Reverts rapidsai/kvikio#872 --- .github/workflows/build.yaml | 16 ++++++------ .github/workflows/pr.yaml | 26 +++++++++---------- .github/workflows/test.yaml | 6 ++--- .../trigger-breaking-change-alert.yaml | 2 +- RAPIDS_BRANCH | 2 +- .../downstream/cmake/get_kvikio.cmake | 2 +- 6 files changed, 27 insertions(+), 27 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index b19d4b4984..c28761233e 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -34,7 +34,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -44,7 +44,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -54,7 +54,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main with: arch: "amd64" branch: ${{ inputs.branch }} @@ -76,7 +76,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cpp: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: ${{ inputs.build_type || 'branch' }} @@ -89,7 +89,7 @@ jobs: wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -101,7 +101,7 @@ jobs: wheel-publish-cpp: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -112,7 +112,7 @@ jobs: wheel-publish-python: needs: wheel-build-python secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 4226fce388..329f47c0cb 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -27,7 +27,7 @@ jobs: - wheel-python-tests - telemetry-setup secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@main if: always() with: needs: ${{ toJSON(needs) }} @@ -43,7 +43,7 @@ jobs: repo: kvikio changed-files: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@main with: files_yaml: | test_cpp: @@ -86,20 +86,20 @@ jobs: checks: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@main with: ignored_pr_jobs: telemetry-summarize conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main with: build_type: pull-request script: ci/build_cpp.sh conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request @@ -107,7 +107,7 @@ jobs: conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -117,14 +117,14 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main with: build_type: pull-request script: ci/build_python.sh conda-python-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -132,7 +132,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -142,7 +142,7 @@ jobs: devcontainer: needs: telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@main with: arch: '["amd64", "arm64"]' cuda: '["13.0"]' @@ -160,7 +160,7 @@ jobs: sccache --show-adv-stats | tee telemetry-artifacts/sccache-stats.txt; wheel-cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: pull-request @@ -170,7 +170,7 @@ jobs: wheel-python-build: needs: wheel-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/25.12 + uses: 
rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: pull-request script: ci/build_wheel_python.sh @@ -179,7 +179,7 @@ jobs: wheel-python-tests: needs: [wheel-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 3b6e56fcff..324a641b6b 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -25,7 +25,7 @@ on: jobs: cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -43,7 +43,7 @@ jobs: sha: ${{ inputs.sha }} conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 0b885544da..c471e2a151 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@release/25.12 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@main with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/RAPIDS_BRANCH b/RAPIDS_BRANCH index 26b84372d3..ba2906d066 100644 --- a/RAPIDS_BRANCH +++ b/RAPIDS_BRANCH @@ -1 +1 @@ -release/25.12 +main diff --git a/cpp/examples/downstream/cmake/get_kvikio.cmake b/cpp/examples/downstream/cmake/get_kvikio.cmake index 70dfaee901..fa61ef3b2d 100644 --- a/cpp/examples/downstream/cmake/get_kvikio.cmake +++ b/cpp/examples/downstream/cmake/get_kvikio.cmake @@ -13,7 +13,7 @@ function(find_and_configure_kvikio MIN_VERSION) VERSION ${MIN_VERSION} GIT_REPOSITORY https://github.com/rapidsai/kvikio.git - GIT_TAG release/25.12 + GIT_TAG main GIT_SHALLOW TRUE SOURCE_SUBDIR From 143bc118a43ea24fd787224c644f54b8eebc4ee0 Mon Sep 17 00:00:00 2001 From: Paul Taylor <178183+trxcllnt@users.noreply.github.com> Date: Thu, 20 Nov 2025 08:59:27 -0800 Subject: [PATCH 103/103] Use `sccache-dist` build cluster for conda and wheel builds (#865) RAPIDS has deployed an autoscaling cloud build cluster that can be used to accelerate building large RAPIDS projects. This PR updates the conda and wheel builds to use the build cluster. This contributes to https://github.com/rapidsai/build-planning/issues/228. 
Authors: - Paul Taylor (https://github.com/trxcllnt) Approvers: - Nate Rock (https://github.com/rockhowse) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/865 --- .devcontainer/Dockerfile | 1 + .github/workflows/build.yaml | 7 +++++++ .github/workflows/pr.yaml | 10 ++++++++++ .github/workflows/test.yaml | 2 ++ ci/build_cpp.sh | 4 ++-- ci/build_python.sh | 4 ++-- ci/build_wheel.sh | 6 +++++- cmake/rapids_config.cmake | 3 +++ conda/recipes/kvikio/recipe.yaml | 25 ++++++++++++++++++++----- conda/recipes/libkvikio/recipe.yaml | 27 ++++++++++++++++++++------- 10 files changed, 72 insertions(+), 17 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 0088aaddd6..2253e6271d 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -34,6 +34,7 @@ ENV HISTFILE="/home/coder/.cache/._bash_history" ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" ENV SCCACHE_REGION="us-east-2" ENV SCCACHE_BUCKET="rapids-sccache-devs" +ENV SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE=true ENV SCCACHE_IDLE_TIMEOUT=0 ### diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index b060c2569e..fbfcd334e1 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -38,9 +38,11 @@ jobs: with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} + node_type: cpu8 date: ${{ inputs.date }} script: ci/build_cpp.sh sha: ${{ inputs.sha }} + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN python-build: needs: [cpp-build] secrets: inherit @@ -51,6 +53,7 @@ jobs: date: ${{ inputs.date }} script: ci/build_python.sh sha: ${{ inputs.sha }} + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN upload-conda: needs: [cpp-build, python-build] secrets: inherit @@ -83,9 +86,11 @@ jobs: branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} date: ${{ inputs.date }} + node_type: cpu8 script: ci/build_wheel_cpp.sh package-name: libkvikio package-type: cpp + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN wheel-build-python: needs: wheel-build-cpp secrets: inherit @@ -95,8 +100,10 @@ jobs: branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} date: ${{ inputs.date }} + node_type: cpu8 script: ci/build_wheel_python.sh package-name: kvikio + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN package-type: python wheel-publish-cpp: needs: wheel-build-cpp diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 1e802d80b1..6ff45af754 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -95,7 +95,9 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@release/25.12 with: build_type: pull-request + node_type: cpu8 script: ci/build_cpp.sh + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit @@ -104,6 +106,7 @@ jobs: with: build_type: pull-request script: ci/test_cpp.sh + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN conda-java-tests: needs: conda-cpp-build secrets: inherit @@ -121,6 +124,7 @@ jobs: with: build_type: pull-request script: ci/build_python.sh + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN conda-python-tests: needs: [conda-python-build, changed-files] secrets: inherit @@ -129,6 +133,7 @@ jobs: with: build_type: pull-request script: ci/test_python.sh + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN docs-build: needs: 
conda-python-build secrets: inherit @@ -164,18 +169,22 @@ jobs: with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: pull-request + node_type: cpu8 script: ci/build_wheel_cpp.sh package-name: libkvikio package-type: cpp + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN wheel-python-build: needs: wheel-cpp-build secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/25.12 with: build_type: pull-request + node_type: cpu8 script: ci/build_wheel_python.sh package-name: kvikio package-type: python + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN wheel-python-tests: needs: [wheel-python-build, changed-files] secrets: inherit @@ -184,6 +193,7 @@ jobs: with: build_type: pull-request script: ci/test_wheel.sh + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN telemetry-summarize: # This job must use a self-hosted runner to record telemetry traces. runs-on: linux-amd64-cpu4 diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 44d7267018..f24be6991c 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -32,6 +32,7 @@ jobs: date: ${{ inputs.date }} script: ci/test_cpp.sh sha: ${{ inputs.sha }} + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN python-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@release/25.12 @@ -41,6 +42,7 @@ jobs: date: ${{ inputs.date }} script: ci/test_python.sh sha: ${{ inputs.sha }} + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN conda-java-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/25.12 diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index a27b160696..a787fb7510 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -5,7 +5,6 @@ set -euo pipefail source rapids-configure-sccache - source rapids-date-string export CMAKE_GENERATOR=Ninja @@ -14,7 +13,7 @@ rapids-print-env rapids-logger "Begin cpp build" -sccache --zero-stats +sccache --stop-server 2>/dev/null || true RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) export RAPIDS_PACKAGE_VERSION @@ -66,6 +65,7 @@ rattler-build build --recipe conda/recipes/libkvikio \ "${RATTLER_CHANNELS[@]}" sccache --show-adv-stats +sccache --stop-server >/dev/null 2>&1 || true # remove build_cache directory to avoid uploading the entire source tree # tracked in https://github.com/prefix-dev/rattler-build/issues/1424 diff --git a/ci/build_python.sh b/ci/build_python.sh index 9744abd203..8b4bb2b948 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -5,7 +5,6 @@ set -euo pipefail source rapids-configure-sccache - source rapids-date-string export CMAKE_GENERATOR=Ninja @@ -26,7 +25,7 @@ rapids-logger "Prepending channel ${CPP_CHANNEL} to RATTLER_CHANNELS" RATTLER_CHANNELS=("--channel" "${CPP_CHANNEL}" "${RATTLER_CHANNELS[@]}") -sccache --zero-stats +sccache --stop-server 2>/dev/null || true rapids-logger "Building kvikio" @@ -38,6 +37,7 @@ rattler-build build --recipe conda/recipes/kvikio \ "${RATTLER_CHANNELS[@]}" sccache --show-adv-stats +sccache --stop-server >/dev/null 2>&1 || true # remove build_cache directory to avoid uploading the entire source tree # tracked in https://github.com/prefix-dev/rattler-build/issues/1424 diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index d9a74f877d..04a3055972 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -11,11 +11,14 @@ source 
rapids-configure-sccache source rapids-date-string source rapids-init-pip +export SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX="${package_name}/${RAPIDS_CONDA_ARCH}/cuda${RAPIDS_CUDA_VERSION%%.*}/wheel/preprocessor-cache" +export SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE=true + rapids-generate-version > ./VERSION cd "${package_dir}" -sccache --zero-stats +sccache --stop-server 2>/dev/null || true rapids-logger "Building '${package_name}' wheel" rapids-pip-retry wheel \ @@ -26,3 +29,4 @@ rapids-pip-retry wheel \ . sccache --show-adv-stats +sccache --stop-server >/dev/null 2>&1 || true diff --git a/cmake/rapids_config.cmake b/cmake/rapids_config.cmake index 768e24874e..9cda1f26e4 100644 --- a/cmake/rapids_config.cmake +++ b/cmake/rapids_config.cmake @@ -35,3 +35,6 @@ if(NOT rapids-cmake-branch) set(rapids-cmake-branch "${_rapids_branch}") endif() include("${CMAKE_CURRENT_LIST_DIR}/RAPIDS.cmake") + +# Don't use sccache-dist for CMake's compiler tests +set(ENV{SCCACHE_NO_DIST_COMPILE} "1") diff --git a/conda/recipes/kvikio/recipe.yaml b/conda/recipes/kvikio/recipe.yaml index 798c7d7dce..cc31c8ea80 100644 --- a/conda/recipes/kvikio/recipe.yaml +++ b/conda/recipes/kvikio/recipe.yaml @@ -41,17 +41,32 @@ build: - AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY - AWS_SESSION_TOKEN + - SCCACHE_DIST_AUTH_TOKEN env: CMAKE_C_COMPILER_LAUNCHER: ${{ env.get("CMAKE_C_COMPILER_LAUNCHER") }} CMAKE_CUDA_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CUDA_COMPILER_LAUNCHER") }} CMAKE_CXX_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CXX_COMPILER_LAUNCHER") }} CMAKE_GENERATOR: ${{ env.get("CMAKE_GENERATOR") }} - SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET") }} - SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT") }} - SCCACHE_REGION: ${{ env.get("SCCACHE_REGION") }} - SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL") }} - SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS") }} + NVCC_APPEND_FLAGS: ${{ env.get("NVCC_APPEND_FLAGS", default="") }} + PARALLEL_LEVEL: ${{ env.get("PARALLEL_LEVEL", default="8") }} + RAPIDS_ARTIFACTS_DIR: ${{ env.get("RAPIDS_ARTIFACTS_DIR", default="") }} + SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET", default="") }} + SCCACHE_DIST_AUTH_TYPE: ${{ env.get("SCCACHE_DIST_AUTH_TYPE", default="token") }} + SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE: ${{ env.get("SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE", default="false") }} + SCCACHE_DIST_MAX_RETRIES: ${{ env.get("SCCACHE_DIST_MAX_RETRIES", default="inf") }} + SCCACHE_DIST_REQUEST_TIMEOUT: ${{ env.get("SCCACHE_DIST_REQUEST_TIMEOUT", default="7140") }} + SCCACHE_DIST_SCHEDULER_URL: ${{ env.get("SCCACHE_DIST_SCHEDULER_URL", default="") }} + SCCACHE_ERROR_LOG: ${{ env.get("SCCACHE_ERROR_LOG", default="/tmp/sccache.log") }} + SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT", default="0") }} + SCCACHE_NO_CACHE: ${{ env.get("SCCACHE_NO_CACHE", default="") }} + SCCACHE_RECACHE: ${{ env.get("SCCACHE_RECACHE", default="") }} + SCCACHE_REGION: ${{ env.get("SCCACHE_REGION", default="") }} SCCACHE_S3_KEY_PREFIX: kvikio/${{ env.get("RAPIDS_CONDA_ARCH") }}/cuda${{ cuda_major }} + SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS", default="false") }} + SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX: kvikio/${{ env.get("RAPIDS_CONDA_ARCH") }}/cuda${{ cuda_major }}/conda/preprocessor-cache + SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE: ${{ env.get("SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE", default="true") }} + SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL", default="true") }} + SCCACHE_SERVER_LOG: ${{ env.get("SCCACHE_SERVER_LOG", 
default="sccache=debug") }} requirements: build: diff --git a/conda/recipes/libkvikio/recipe.yaml b/conda/recipes/libkvikio/recipe.yaml index 6ae2c9659a..30942b7215 100644 --- a/conda/recipes/libkvikio/recipe.yaml +++ b/conda/recipes/libkvikio/recipe.yaml @@ -42,19 +42,32 @@ cache: - AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY - AWS_SESSION_TOKEN + - SCCACHE_DIST_AUTH_TOKEN env: CMAKE_C_COMPILER_LAUNCHER: ${{ env.get("CMAKE_C_COMPILER_LAUNCHER") }} CMAKE_CUDA_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CUDA_COMPILER_LAUNCHER") }} CMAKE_CXX_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CXX_COMPILER_LAUNCHER") }} CMAKE_GENERATOR: ${{ env.get("CMAKE_GENERATOR") }} - PARALLEL_LEVEL: ${{ env.get("PARALLEL_LEVEL") }} - RAPIDS_ARTIFACTS_DIR: ${{ env.get("RAPIDS_ARTIFACTS_DIR") }} - SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET") }} - SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT") }} - SCCACHE_REGION: ${{ env.get("SCCACHE_REGION") }} - SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL") }} - SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS") }} + NVCC_APPEND_FLAGS: ${{ env.get("NVCC_APPEND_FLAGS", default="") }} + PARALLEL_LEVEL: ${{ env.get("PARALLEL_LEVEL", default="8") }} + RAPIDS_ARTIFACTS_DIR: ${{ env.get("RAPIDS_ARTIFACTS_DIR", default="") }} + SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET", default="") }} + SCCACHE_DIST_AUTH_TYPE: ${{ env.get("SCCACHE_DIST_AUTH_TYPE", default="token") }} + SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE: ${{ env.get("SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE", default="false") }} + SCCACHE_DIST_MAX_RETRIES: ${{ env.get("SCCACHE_DIST_MAX_RETRIES", default="inf") }} + SCCACHE_DIST_REQUEST_TIMEOUT: ${{ env.get("SCCACHE_DIST_REQUEST_TIMEOUT", default="7140") }} + SCCACHE_DIST_SCHEDULER_URL: ${{ env.get("SCCACHE_DIST_SCHEDULER_URL", default="") }} + SCCACHE_ERROR_LOG: ${{ env.get("SCCACHE_ERROR_LOG", default="/tmp/sccache.log") }} + SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT", default="0") }} + SCCACHE_NO_CACHE: ${{ env.get("SCCACHE_NO_CACHE", default="") }} + SCCACHE_RECACHE: ${{ env.get("SCCACHE_RECACHE", default="") }} + SCCACHE_REGION: ${{ env.get("SCCACHE_REGION", default="") }} SCCACHE_S3_KEY_PREFIX: libkvikio/${{ env.get("RAPIDS_CONDA_ARCH") }}/cuda${{ cuda_key_string }} + SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS", default="false") }} + SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX: libkvikio/${{ env.get("RAPIDS_CONDA_ARCH") }}/cuda${{ cuda_major }}/conda/preprocessor-cache + SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE: ${{ env.get("SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE", default="true") }} + SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL", default="true") }} + SCCACHE_SERVER_LOG: ${{ env.get("SCCACHE_SERVER_LOG", default="sccache=debug") }} requirements: build: - ${{ compiler("c") }}