
Commit e162030

DEVICE/API: Add tests
Signed-off-by: Michal Shalev <[email protected]>
1 parent e144e49 commit e162030

18 files changed, +1749 -1079 lines changed
Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef NIXL_DEVICE_API_TEST_CUDA_ARRAY_H
#define NIXL_DEVICE_API_TEST_CUDA_ARRAY_H

#include <cuda_runtime.h>

#include <cstddef>
#include <stdexcept>
#include <string>
#include <vector>

// RAII wrapper around a device allocation; allocation and copy helpers throw on CUDA errors.
template<typename T>
class CudaArray {
public:
    explicit CudaArray(size_t count) : count_(count) {
        const cudaError_t err = cudaMalloc(&ptr_, count * sizeof(T));
        if (err != cudaSuccess) {
            throw std::runtime_error(std::string("CudaArray: cudaMalloc failed: ") +
                                     cudaGetErrorString(err));
        }
    }

    ~CudaArray() {
        if (ptr_ != nullptr) {
            cudaFree(ptr_);
        }
    }

    CudaArray(const CudaArray&) = delete;
    CudaArray& operator=(const CudaArray&) = delete;

    void copyFromHost(const T *host_data, size_t count) {
        if (count > count_) {
            throw std::out_of_range("CudaArray: copy count exceeds array size");
        }
        const cudaError_t err =
            cudaMemcpy(ptr_, host_data, count * sizeof(T), cudaMemcpyHostToDevice);
        if (err != cudaSuccess) {
            throw std::runtime_error(
                std::string("CudaArray: cudaMemcpy from host failed: ") +
                cudaGetErrorString(err));
        }
    }

    void copyFromHost(const std::vector<T> &host_vector) {
        copyFromHost(host_vector.data(), host_vector.size());
    }

    void copyToHost(T *host_data, size_t count) const {
        if (count > count_) {
            throw std::out_of_range("CudaArray: copy count exceeds array size");
        }
        const cudaError_t err =
            cudaMemcpy(host_data, ptr_, count * sizeof(T), cudaMemcpyDeviceToHost);
        if (err != cudaSuccess) {
            throw std::runtime_error(
                std::string("CudaArray: cudaMemcpy to host failed: ") +
                cudaGetErrorString(err));
        }
    }

    [[nodiscard]] T *get() const noexcept { return ptr_; }
    [[nodiscard]] size_t size() const noexcept { return count_; }

private:
    T *ptr_;
    size_t count_;
};

#endif // NIXL_DEVICE_API_TEST_CUDA_ARRAY_H
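A minimal host-side usage sketch for the CudaArray helper above, assuming a CUDA-capable device; the include name "cuda_array.h", the buffer size, and the fill value are illustrative assumptions:

// Illustrative only: header name and values are assumptions.
#include "cuda_array.h"

#include <cstdint>
#include <vector>

int main() {
    std::vector<uint64_t> host(16, 42);              // host staging buffer
    CudaArray<uint64_t> device_buf(host.size());     // cudaMalloc of 16 * sizeof(uint64_t)
    device_buf.copyFromHost(host);                   // H2D copy, throws on CUDA error

    std::vector<uint64_t> readback(host.size());
    device_buf.copyToHost(readback.data(), readback.size());  // D2H copy
    return readback == host ? 0 : 1;                 // device memory freed in ~CudaArray
}

Allocation and copy failures surface as exceptions, so a test aborts instead of reading through a bad device pointer.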
Lines changed: 229 additions & 0 deletions
@@ -0,0 +1,229 @@
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "device_kernels.cuh"
#include "device_utils.cuh"

namespace {

constexpr size_t maxThreadsPerBlock = 256;

// Number of threads that cooperate on a single request at the given level.
template<nixl_gpu_level_t level>
__device__ constexpr size_t threadsPerRequest() {
    if constexpr (level == nixl_gpu_level_t::THREAD) {
        return 1;
    } else if constexpr (level == nixl_gpu_level_t::WARP) {
        return 32;
    } else {
        return maxThreadsPerBlock;
    }
}

// Number of per-request status handles a block needs in shared memory.
template<nixl_gpu_level_t level>
__device__ constexpr size_t sharedRequestCount() {
    return maxThreadsPerBlock / threadsPerRequest<level>();
}

// Posts the operation selected by params and, when withNoDelay is set and a
// status handle is provided, polls the request until it completes.
template<nixl_gpu_level_t level>
__device__ nixl_status_t
doOperation(const NixlDeviceKernelParams &params,
            nixlGpuXferStatusH *req_ptr) {
    nixl_status_t status;

    switch (params.operation) {
    case NixlDeviceOperation::SINGLE_WRITE:
        status = nixlGpuPostSingleWriteXferReq<level>(
            params.reqHandle,
            params.singleWrite.index,
            params.singleWrite.localOffset,
            params.singleWrite.remoteOffset,
            params.singleWrite.size,
            params.singleWrite.channelId,
            params.withNoDelay,
            req_ptr);
        break;

    case NixlDeviceOperation::PARTIAL_WRITE:
        status = nixlGpuPostPartialWriteXferReq<level>(
            params.reqHandle,
            params.partialWrite.count,
            params.partialWrite.descIndices,
            params.partialWrite.sizes,
            params.partialWrite.localOffsets,
            params.partialWrite.remoteOffsets,
            params.partialWrite.signalDescIndex,
            params.partialWrite.signalInc,
            params.partialWrite.signalOffset,
            params.partialWrite.channelId,
            params.withNoDelay,
            req_ptr);
        break;

    case NixlDeviceOperation::FULL_WRITE:
        status = nixlGpuPostWriteXferReq<level>(
            params.reqHandle,
            params.fullWrite.signalInc,
            params.fullWrite.channelId,
            params.withNoDelay,
            req_ptr);
        break;

    case NixlDeviceOperation::SIGNAL_POST:
        status = nixlGpuPostSignalXferReq<level>(
            params.reqHandle,
            params.signalPost.signalDescIndex,
            params.signalPost.signalInc,
            params.signalPost.signalOffset,
            params.signalPost.channelId,
            params.withNoDelay,
            req_ptr);
        break;

    case NixlDeviceOperation::SIGNAL_READ: {
        if (params.signalRead.signalAddr == nullptr) {
            return NIXL_ERR_INVALID_PARAM;
        }

        uint64_t value;
        do {
            value = nixlGpuReadSignal<level>(params.signalRead.signalAddr);
        } while (value != params.signalRead.expectedValue);

        if (params.signalRead.resultPtr != nullptr) {
            *params.signalRead.resultPtr = value;
        }
        return NIXL_SUCCESS;
    }

    case NixlDeviceOperation::SIGNAL_WRITE:
        if (params.signalWrite.signalAddr == nullptr) {
            return NIXL_ERR_INVALID_PARAM;
        }
        nixlGpuWriteSignal<level>(params.signalWrite.signalAddr,
                                  params.signalWrite.value);
        return NIXL_SUCCESS;

    default:
        return NIXL_ERR_INVALID_PARAM;
    }

    if (status != NIXL_IN_PROG) {
        return (status == NIXL_SUCCESS) ? NIXL_SUCCESS : NIXL_ERR_BACKEND;
    }

    if (!params.withNoDelay || (req_ptr == nullptr)) {
        return NIXL_SUCCESS;
    }

    do {
        status = nixlGpuGetXferStatus<level>(*req_ptr);
    } while (status == NIXL_IN_PROG);

    return status;
}

// Body of the test kernel: validates the launch parameters, then repeats the
// configured operation numIters times per cooperating thread group.
template<nixl_gpu_level_t level>
__device__ void
kernelJob(const NixlDeviceKernelParams &params,
          NixlDeviceKernelResult *result_ptr) {
    if (result_ptr == nullptr) {
        return;
    }

    nixl_status_t &status = result_ptr->status;

    if (blockDim.x > maxThreadsPerBlock) {
        status = NIXL_ERR_INVALID_PARAM;
        return;
    }

    if (params.numIters == 0) {
        status = NIXL_ERR_INVALID_PARAM;
        return;
    }

    __shared__ nixlGpuXferStatusH shared_reqs[sharedRequestCount<level>()];
    nixlGpuXferStatusH *req_ptr = nullptr;
    if (params.withRequest) {
        const size_t req_index = threadIdx.x / threadsPerRequest<level>();
        req_ptr = &shared_reqs[req_index];
    }

    for (size_t i = 0; i < params.numIters - 1; i++) {
        status = doOperation<level>(params, req_ptr);
        if (status != NIXL_SUCCESS) {
            return;
        }
    }

    // Last iteration forces completion to ensure all operations are finished
    NixlDeviceKernelParams params_force_completion = params;
    params_force_completion.withNoDelay = true;
    nixlGpuXferStatusH *status_ptr = nullptr;
    if (params.withRequest) {
        const size_t req_index = threadIdx.x / threadsPerRequest<level>();
        status_ptr = &shared_reqs[req_index];
    }
    status = doOperation<level>(params_force_completion, status_ptr);
}

template<nixl_gpu_level_t level>
__global__ void
nixlTestKernel(const NixlDeviceKernelParams params,
               NixlDeviceKernelResult *result_ptr) {
    kernelJob<level>(params, result_ptr);
    __threadfence_system();
}

} // namespace

// Host entry point: initializes the device-side result, launches the kernel at
// the requested level, and copies the reported status back to the host.
NixlDeviceKernelResult
launchNixlDeviceKernel(const NixlDeviceKernelParams &params) {
    CudaArray<NixlDeviceKernelResult> result(1);
    NixlDeviceKernelResult init_result{NIXL_ERR_INVALID_PARAM};
    result.copyFromHost(&init_result, 1);

    switch (params.level) {
    case nixl_gpu_level_t::THREAD:
        nixlTestKernel<nixl_gpu_level_t::THREAD>
            <<<params.numBlocks, params.numThreads>>>(params, result.get());
        break;
    case nixl_gpu_level_t::WARP:
        nixlTestKernel<nixl_gpu_level_t::WARP>
            <<<params.numBlocks, params.numThreads>>>(params, result.get());
        break;
    case nixl_gpu_level_t::BLOCK:
        nixlTestKernel<nixl_gpu_level_t::BLOCK>
            <<<params.numBlocks, params.numThreads>>>(params, result.get());
        break;
    default: {
        NixlDeviceKernelResult error_result{NIXL_ERR_INVALID_PARAM};
        return error_result;
    }
    }

    const nixl_status_t sync_status = checkCudaErrors();
    if (sync_status != NIXL_SUCCESS) {
        NixlDeviceKernelResult error_result{sync_status};
        return error_result;
    }

    NixlDeviceKernelResult host_result;
    result.copyToHost(&host_result, 1);
    return host_result;
}
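For context, a sketch of how a host-side test might drive launchNixlDeviceKernel. The parameter field names mirror their use in this diff; the helper name postFullWriteFromDevice, the handle type nixlGpuXferReqH, value-initialization of NixlDeviceKernelParams, and the concrete values are assumptions:

// Hypothetical helper: posts a full-write transfer from the device at WARP level.
static NixlDeviceKernelResult
postFullWriteFromDevice(nixlGpuXferReqH req_handle) {
    NixlDeviceKernelParams params{};                 // assumes value-initialization is valid
    params.level = nixl_gpu_level_t::WARP;           // 32 consecutive threads share one request
    params.numBlocks = 1;
    params.numThreads = 64;                          // kernel rejects blockDim.x > 256
    params.numIters = 4;                             // 3 plain posts + 1 forced-completion pass
    params.withRequest = true;                       // keep status handles in shared memory
    params.withNoDelay = false;
    params.operation = NixlDeviceOperation::FULL_WRITE;
    params.fullWrite.signalInc = 1;                  // illustrative signal increment
    params.fullWrite.channelId = 0;
    params.reqHandle = req_handle;                   // prepared elsewhere by the test fixture
    return launchNixlDeviceKernel(params);
}

At WARP level every 32 consecutive threads share one entry of shared_reqs, and the final iteration always sets withNoDelay, so the returned status reflects a completed transfer rather than one still in flight.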
