ai-dynamo
diff --git a/‎meson.build‎
Lines changed: 1 addition & 0 deletions b/‎meson.build‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎test/gtest/device_api/cuda_ptr.cuh‎
Lines changed: 66 additions & 0 deletions b/‎test/gtest/device_api/cuda_ptr.cuh‎
Lines changed: 66 additions & 0 deletions
diff --git a/‎test/gtest/device_api/utils.cu‎ renamed to ‎test/gtest/device_api/device_test_base.cu‎
Lines changed: 76 additions & 81 deletions b/‎test/gtest/device_api/utils.cu‎ renamed to ‎test/gtest/device_api/device_test_base.cu‎
Lines changed: 76 additions & 81 deletions
@@ -100,6 +100,7 @@ if cuda_dep.found()
     nvcc_flags_link += ['-gencode=arch=compute_80,code=sm_80']
     nvcc_flags_link += ['-gencode=arch=compute_90,code=sm_90']
     add_project_link_arguments(nvcc_flags_link, language: 'cuda')
+    add_project_arguments('-dopt=on', language: 'cuda')
 else
     warning('CUDA not found. UCX backend will be built without CUDA support, and some plugins will be disabled.')
 endif
 
@@ -0,0 +1,66 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _CUDA_PTR_CUH
+#define _CUDA_PTR_CUH
+
+#include <cuda_runtime.h>
+
+/**
+ * Owning guard for one zero-initialised device object of type T.
+ *
+ * The device address is written into a caller-provided slot (`*ptr`), so the
+ * caller keeps using its own raw pointer variable while this guard controls
+ * the lifetime of the allocation. The slot itself is not owned and must
+ * outlive the guard; it is reset to nullptr whenever the allocation is freed.
+ *
+ * If `ptr` is null, or if allocation or the zero-fill fails, no device memory
+ * is held, the slot (when present) is nullptr, and get() returns nullptr.
+ * Copying is disabled; moving transfers responsibility for the slot.
+ */
+template<typename T> class CudaPtr {
+public:
+    /**
+     * Allocates sizeof(T) bytes of device memory, zero-fills them and stores
+     * the address in `*ptr`.
+     *
+     * @param ptr Slot receiving the device address; any previous value in the
+     *            slot is overwritten.
+     */
+    explicit CudaPtr(T **ptr) : ptr_(ptr) {
+        if (!ptr_) {
+            return;
+        }
+
+        // Make sure the slot never holds an indeterminate address if the
+        // allocation below fails.
+        *ptr_ = nullptr;
+        if (cudaMalloc(reinterpret_cast<void **>(ptr_), sizeof(T)) != cudaSuccess) {
+            *ptr_ = nullptr;
+            return;
+        }
+
+        // A buffer that could not be zeroed is not handed out.
+        if (cudaMemset(*ptr_, 0, sizeof(T)) != cudaSuccess) {
+            release();
+        }
+    }
+
+    ~CudaPtr() {
+        release();
+    }
+
+    CudaPtr(const CudaPtr &) = delete;
+    CudaPtr &
+    operator=(const CudaPtr &) = delete;
+
+    CudaPtr(CudaPtr &&other) noexcept : ptr_(other.ptr_) {
+        other.ptr_ = nullptr;
+    }
+
+    CudaPtr &
+    operator=(CudaPtr &&other) noexcept {
+        if (this != &other) {
+            // Free the current allocation and null its slot, exactly as the
+            // destructor does, so the caller's variable does not dangle.
+            release();
+            ptr_ = other.ptr_;
+            other.ptr_ = nullptr;
+        }
+        return *this;
+    }
+
+    /** Returns the device address held in the slot, or nullptr if none. */
+    T *
+    get() const {
+        return ptr_ ? *ptr_ : nullptr;
+    }
+
+private:
+    /** Frees the device allocation (if any) and resets the slot to nullptr. */
+    void
+    release() noexcept {
+        if (ptr_ && *ptr_) {
+            cudaFree(*ptr_);
+            *ptr_ = nullptr;
+        }
+    }
+
+    T **ptr_;
+};
+
+#endif // _CUDA_PTR_CUH
@@ -1,108 +1,68 @@
 /*
  * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
-#include "utils.cuh"
+#include "device_test_base.cuh"
 
-namespace gtest {
-namespace gpu {
-
-const char *GetGpuXferLevelStr(nixl_gpu_level_t level) {
-    switch (level) {
-    case nixl_gpu_level_t::WARP:
-        return "WARP";
-    case nixl_gpu_level_t::BLOCK:
-        return "BLOCK";
-    case nixl_gpu_level_t::THREAD:
-        return "THREAD";
-    default:
-        return "UNKNOWN";
-    }
-}
-
-void initTiming(unsigned long long **start_time_ptr, unsigned long long **end_time_ptr) {
-    cudaMalloc(start_time_ptr, sizeof(unsigned long long));
-    cudaMalloc(end_time_ptr, sizeof(unsigned long long));
-    cudaMemset(*start_time_ptr, 0, sizeof(unsigned long long));
-    cudaMemset(*end_time_ptr, 0, sizeof(unsigned long long));
-}
-
-void getTiming(unsigned long long *start_time_ptr,
-               unsigned long long *end_time_ptr,
-               unsigned long long &start_time_cpu,
-               unsigned long long &end_time_cpu) {
-    cudaMemcpy(&start_time_cpu, start_time_ptr, sizeof(unsigned long long), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&end_time_cpu, end_time_ptr, sizeof(unsigned long long), cudaMemcpyDeviceToHost);
-}
-
-void logResults(size_t size,
-                size_t count,
-                size_t num_iters,
-                unsigned long long start_time_cpu,
-                unsigned long long end_time_cpu) {
-    auto total_time = NS_TO_SEC(end_time_cpu - start_time_cpu);
-    double total_size = size * count * num_iters;
-    auto bandwidth = total_size / total_time / (1024 * 1024);
-    printf("Device API Results: %zux%zux%zu=%.0f bytes in %f seconds (%.2f MB/s)\n",
-           size, count, num_iters, total_size, total_time, bandwidth);
-}
-
-} // namespace gpu
-} // namespace gtest
-
-nixlAgentConfig DeviceApiTestBase::getConfig() {
-    return nixlAgentConfig(true,
-                          false,
-                          0,
-                          nixl_thread_sync_t::NIXL_THREAD_SYNC_RW,
-                          0,
-                          100000);
+// Returns the nixlAgentConfig passed to every nixlAgent created in SetUp().
+// The positional arguments are fixed values for this test fixture.
+// NOTE(review): their individual meaning is defined by the nixlAgentConfig
+// constructor, which is not visible in this file - confirm before changing.
+nixlAgentConfig
+DeviceApiTestBase::getConfig() {
+    return nixlAgentConfig(true, false, 0, nixl_thread_sync_t::NIXL_THREAD_SYNC_RW, 0, 100000);
 }
 
-nixl_b_params_t DeviceApiTestBase::getBackendParams() {
+// Returns the backend parameters handed to createBackend() in SetUp():
+// a single entry, "num_workers" = "2".
+nixl_b_params_t
+DeviceApiTestBase::getBackendParams() {
     nixl_b_params_t params;
     params["num_workers"] = "2";
     return params;
 }
 
-void DeviceApiTestBase::SetUp() {
+// Test fixture setup: selects CUDA device 0, then creates two agents, each
+// with a "UCX" backend, and records every backend handle in backend_handles.
+// Fails the test if the device cannot be selected or a backend cannot be
+// created.
+void
+DeviceApiTestBase::SetUp() {
     if (cudaSetDevice(0) != cudaSuccess) {
         FAIL() << "Failed to set CUDA device 0";
     }
 
     for (size_t i = 0; i < 2; i++) {
         agents.emplace_back(std::make_unique<nixlAgent>(getAgentName(i), getConfig()));
         nixlBackendH *backend_handle = nullptr;
-        nixl_status_t status = agents.back()->createBackend("UCX", getBackendParams(), backend_handle);
+        nixl_status_t status =
+            agents.back()->createBackend("UCX", getBackendParams(), backend_handle);
         ASSERT_EQ(status, NIXL_SUCCESS);
         EXPECT_NE(backend_handle, nullptr);
         backend_handles.push_back(backend_handle);
     }
 }
 
-void DeviceApiTestBase::TearDown() {
+// Test fixture teardown: destroys every agent created in SetUp().
+void
+DeviceApiTestBase::TearDown() {
     agents.clear();
-    backend_handles.clear();
-}
-
-template<typename Desc>
-nixlDescList<Desc> DeviceApiTestBase::makeDescList(const std::vector<MemBuffer> &buffers, nixl_mem_t mem_type) {
-    nixlDescList<Desc> desc_list(mem_type);
-    for (const auto &buffer : buffers) {
-        desc_list.addDesc(Desc(buffer, buffer.getSize(), uint64_t(DEV_ID)));
-    }
-    return desc_list;
 }
 
-void DeviceApiTestBase::registerMem(nixlAgent &agent, const std::vector<MemBuffer> &buffers, nixl_mem_t mem_type) {
+// Registers `buffers` with `agent`: builds a nixlBlobDesc list for the given
+// memory type via makeDescList() and passes it to agent.registerMem().
+// NOTE(review): whatever registerMem() returns is not inspected here -
+// confirm that a registration failure is surfaced elsewhere.
+void
+DeviceApiTestBase::registerMem(nixlAgent &agent,
+                               const std::vector<MemBuffer> &buffers,
+                               nixl_mem_t mem_type) {
     auto reg_list = makeDescList<nixlBlobDesc>(buffers, mem_type);
     agent.registerMem(reg_list);
 }
 
-void DeviceApiTestBase::completeWireup(size_t from_agent, size_t to_agent) {
+void
+DeviceApiTestBase::completeWireup(size_t from_agent, size_t to_agent) {
     nixl_notifs_t notifs;
-    nixl_status_t status = getAgent(from_agent).genNotif(getAgentName(to_agent), NOTIF_MSG);
+    nixl_status_t status = getAgent(from_agent).genNotif(getAgentName(to_agent), notifMsg);
     ASSERT_EQ(status, NIXL_SUCCESS) << "Failed to complete wireup";
 
     do {
@@ -112,7 +72,8 @@ void DeviceApiTestBase::completeWireup(size_t from_agent, size_t to_agent) {
     } while (notifs.size() == 0);
 }
 
-void DeviceApiTestBase::exchangeMD(size_t from_agent, size_t to_agent) {
+void
+DeviceApiTestBase::exchangeMD(size_t from_agent, size_t to_agent) {
     for (size_t i = 0; i < agents.size(); i++) {
         nixl_blob_t md;
         nixl_status_t status = agents[i]->getLocalMD(md);
@@ -130,7 +91,8 @@ void DeviceApiTestBase::exchangeMD(size_t from_agent, size_t to_agent) {
     completeWireup(from_agent, to_agent);
 }
 
-void DeviceApiTestBase::invalidateMD() {
+void
+DeviceApiTestBase::invalidateMD() {
     for (size_t i = 0; i < agents.size(); i++) {
         for (size_t j = 0; j < agents.size(); j++) {
             if (i == j) continue;
@@ -140,24 +102,57 @@ void DeviceApiTestBase::invalidateMD() {
     }
 }
 
-void DeviceApiTestBase::createRegisteredMem(nixlAgent &agent,
-                                           size_t size,
-                                           size_t count,
-                                           nixl_mem_t mem_type,
-                                           std::vector<MemBuffer> &out) {
+// Appends `count` MemBuffer objects constructed from (`size`, `mem_type`) to
+// `out`, then registers the whole of `out` with `agent` - including any
+// buffers that were already in `out` before the call.
+void
+DeviceApiTestBase::createRegisteredMem(nixlAgent &agent,
+                                       size_t size,
+                                       size_t count,
+                                       nixl_mem_t mem_type,
+                                       std::vector<MemBuffer> &out) {
     while (count-- != 0) {
         out.emplace_back(size, mem_type);
     }
 
     registerMem(agent, out, mem_type);
 }
 
-nixlAgent &DeviceApiTestBase::getAgent(size_t idx) {
+// Returns a reference to the agent stored at position `idx` in `agents`.
+// The index is not range-checked.
+nixlAgent &
+DeviceApiTestBase::getAgent(size_t idx) {
     return *agents[idx];
 }
 
-std::string DeviceApiTestBase::getAgentName(size_t idx) {
+// Returns the name used for the agent with index `idx`: "agent_" followed by
+// the decimal index.
+std::string
+DeviceApiTestBase::getAgentName(size_t idx) {
     return absl::StrFormat("agent_%d", idx);
 }
 
-template nixlDescList<nixlBasicDesc> DeviceApiTestBase::makeDescList<nixlBasicDesc>(const std::vector<MemBuffer> &buffers, nixl_mem_t mem_type);
+/**
+ * Allocates two device-side `unsigned long long` slots and zero-fills them.
+ *
+ * Both output pointers are reset to nullptr before any allocation, so after a
+ * failure each one is either nullptr or a valid device allocation that the
+ * caller can still free.
+ *
+ * Every CUDA call is checked with a fatal gtest assertion. Note that a fatal
+ * assertion only returns from this function; the calling test should check
+ * HasFatalFailure() (or the output pointers) before launching kernels.
+ *
+ * @param start_time_ptr Receives the device address of the start slot.
+ * @param end_time_ptr   Receives the device address of the end slot.
+ */
+void
+DeviceApiTestBase::initTiming(unsigned long long **start_time_ptr,
+                              unsigned long long **end_time_ptr) {
+    constexpr size_t slot_size = sizeof(unsigned long long);
+
+    ASSERT_NE(start_time_ptr, nullptr);
+    ASSERT_NE(end_time_ptr, nullptr);
+
+    // Never leave an indeterminate address behind if an allocation fails.
+    *start_time_ptr = nullptr;
+    *end_time_ptr = nullptr;
+
+    ASSERT_EQ(cudaMalloc(start_time_ptr, slot_size), cudaSuccess)
+        << "Failed to allocate device start-time slot";
+    ASSERT_EQ(cudaMalloc(end_time_ptr, slot_size), cudaSuccess)
+        << "Failed to allocate device end-time slot";
+    ASSERT_EQ(cudaMemset(*start_time_ptr, 0, slot_size), cudaSuccess)
+        << "Failed to zero device start-time slot";
+    ASSERT_EQ(cudaMemset(*end_time_ptr, 0, slot_size), cudaSuccess)
+        << "Failed to zero device end-time slot";
+}
+
+/**
+ * Copies the two device-side timing slots back to the host.
+ *
+ * Each copy is checked with a fatal gtest assertion. If a copy fails, the
+ * corresponding host value is left unmodified and the test is marked as
+ * failed instead of silently computing a timing from stale data.
+ *
+ * @param start_time_ptr Device address of the start slot.
+ * @param end_time_ptr   Device address of the end slot.
+ * @param start_time_cpu Receives the value read from the start slot.
+ * @param end_time_cpu   Receives the value read from the end slot.
+ */
+void
+DeviceApiTestBase::getTiming(unsigned long long *start_time_ptr,
+                             unsigned long long *end_time_ptr,
+                             unsigned long long &start_time_cpu,
+                             unsigned long long &end_time_cpu) {
+    constexpr size_t slot_size = sizeof(unsigned long long);
+
+    ASSERT_EQ(cudaMemcpy(&start_time_cpu, start_time_ptr, slot_size, cudaMemcpyDeviceToHost),
+              cudaSuccess)
+        << "Failed to copy start time from device";
+    ASSERT_EQ(cudaMemcpy(&end_time_cpu, end_time_ptr, slot_size, cudaMemcpyDeviceToHost),
+              cudaSuccess)
+        << "Failed to copy end time from device";
+}
+
+// Maps a nixl_gpu_level_t value to its display name ("WARP", "BLOCK" or
+// "THREAD"); any other value yields "UNKNOWN". The returned pointer refers to
+// a string literal.
+const char *
+DeviceApiTestBase::GetGpuXferLevelStr(nixl_gpu_level_t level) {
+    if (level == nixl_gpu_level_t::WARP) {
+        return "WARP";
+    }
+    if (level == nixl_gpu_level_t::BLOCK) {
+        return "BLOCK";
+    }
+    if (level == nixl_gpu_level_t::THREAD) {
+        return "THREAD";
+    }
+    return "UNKNOWN";
+}