
Commit 2c11a83

[Offload] Add olCalculateOptimalOccupancy (#142950)
This is equivalent to `cuOccupancyMaxPotentialBlockSize`. It is currently only implemented for CUDA; the AMDGPU and host plugins return OL_ERRC_UNSUPPORTED. --------- Co-authored-by: Callum Fare <[email protected]>
1 parent 2c4f0e7 commit 2c11a83

11 files changed: 136 additions, 1 deletion

offload/liboffload/API/Kernel.td

Lines changed: 19 additions & 1 deletion
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file contains Offload API definitions related to launching kernels
+// This file contains Offload API definitions related to kernels
 //
 //===----------------------------------------------------------------------===//
 
@@ -42,3 +42,21 @@ def : Function {
     Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
   ];
 }
+
+def : Function {
+  let name = "olCalculateOptimalOccupancy";
+  let desc = "Given dynamic memory size, query the device for a workgroup size that will result in optimal occupancy.";
+  let details = [
+    "For most devices, this will be the largest workgroup size that will result in all work items fitting on the device at once.",
+  ];
+  let params = [
+    Param<"ol_device_handle_t", "Device", "device intended to run the kernel", PARAM_IN>,
+    Param<"ol_symbol_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
+    Param<"size_t", "SharedMemory", "dynamic shared memory required per work item in bytes", PARAM_IN>,
+    Param<"size_t*", "GroupSize", "optimal block size", PARAM_OUT>
+  ];
+  let returns = [
+    Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
+    Return<"OL_ERRC_UNSUPPORTED", ["The backend cannot provide this information"]>,
+  ];
+}
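
The tablegen entry above defines the public entry point and its four parameters. As a rough usage sketch only (not code from this commit): the helper name queryOptimalGroupSize is made up, the Device and Kernel handles are assumed to come from the usual liboffload device and symbol lookups, and the ol_result_t/null-on-success convention is inferred from the test fixture added later in this change.

// Usage sketch (hypothetical helper, not part of this commit). Assumes the
// generated liboffload API header declaring olCalculateOptimalOccupancy and
// the handle types has been included.
#include <cstddef>

static size_t queryOptimalGroupSize(ol_device_handle_t Device,
                                    ol_symbol_handle_t Kernel,
                                    size_t SharedMemPerWorkItem) {
  size_t GroupSize = 0;
  ol_result_t Res = olCalculateOptimalOccupancy(Device, Kernel,
                                                SharedMemPerWorkItem,
                                                &GroupSize);
  if (Res) // Error, e.g. OL_ERRC_UNSUPPORTED on AMDGPU/host; let caller fall back.
    return 0;
  // Success (null result, per the fixture convention): GroupSize now holds the
  // block size to use, e.g. for a subsequent olLaunchKernel call.
  return GroupSize;
}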

offload/liboffload/src/OffloadImpl.cpp

Lines changed: 18 additions & 0 deletions
@@ -781,6 +781,24 @@ Error olDestroyProgram_impl(ol_program_handle_t Program) {
   return olDestroy(Program);
 }
 
+Error olCalculateOptimalOccupancy_impl(ol_device_handle_t Device,
+                                       ol_symbol_handle_t Kernel,
+                                       size_t DynamicMemSize,
+                                       size_t *GroupSize) {
+  if (Kernel->Kind != OL_SYMBOL_KIND_KERNEL)
+    return createOffloadError(ErrorCode::SYMBOL_KIND,
+                              "provided symbol is not a kernel");
+  auto *KernelImpl = std::get<GenericKernelTy *>(Kernel->PluginImpl);
+
+  auto Res = KernelImpl->maxGroupSize(*Device->Device, DynamicMemSize);
+  if (auto Err = Res.takeError())
+    return Err;
+
+  *GroupSize = *Res;
+
+  return Error::success();
+}
+
 Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
                           ol_symbol_handle_t Kernel, const void *ArgumentsData,
                           size_t ArgumentsSize,

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 10 additions & 0 deletions
@@ -570,6 +570,16 @@ struct AMDGPUKernelTy : public GenericKernelTy {
                    KernelLaunchParamsTy LaunchParams,
                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
 
+  /// Return maximum block size for maximum occupancy
+  ///
+  /// TODO: This needs to be implemented for amdgpu
+  Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+                                  uint64_t DynamicMemSize) const override {
+    return Plugin::error(
+        ErrorCode::UNSUPPORTED,
+        "occupancy calculations for AMDGPU are not yet implemented");
+  }
+
   /// Print more elaborate kernel launch info for AMDGPU
   Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
                                KernelArgsTy &KernelArgs, uint32_t NumThreads[3],

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 3 additions & 0 deletions
@@ -388,6 +388,9 @@ struct GenericKernelTy {
                            KernelLaunchParamsTy LaunchParams,
                            AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
 
+  virtual Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+                                          uint64_t DynamicMemSize) const = 0;
+
   /// Get the kernel name.
   const char *getName() const { return Name.c_str(); }

offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp

Lines changed: 1 addition & 0 deletions
@@ -72,6 +72,7 @@ DLWRAP(cuDevicePrimaryCtxGetState, 3)
 DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
 DLWRAP(cuDevicePrimaryCtxRetain, 2)
 DLWRAP(cuModuleLoadDataEx, 5)
+DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
 
 DLWRAP(cuDeviceCanAccessPeer, 3)
 DLWRAP(cuCtxEnablePeerAccess, 2)

offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h

Lines changed: 3 additions & 0 deletions
@@ -290,6 +290,7 @@ static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
 static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
 
 typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
+typedef size_t (*CUoccupancyB2DSize)(int);
 
 CUresult cuCtxGetDevice(CUdevice *);
 CUresult cuDeviceGet(CUdevice *, int);
@@ -372,5 +373,7 @@ CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
 CUresult cuMemGetAllocationGranularity(size_t *granularity,
                                        const CUmemAllocationProp *prop,
                                        CUmemAllocationGranularity_flags option);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+                                          CUoccupancyB2DSize, size_t, int);
 
 #endif

offload/plugins-nextgen/cuda/src/rtl.cpp

Lines changed: 14 additions & 0 deletions
@@ -157,6 +157,20 @@ struct CUDAKernelTy : public GenericKernelTy {
                    KernelLaunchParamsTy LaunchParams,
                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
 
+  /// Return maximum block size for maximum occupancy
+  Expected<uint64_t> maxGroupSize(GenericDeviceTy &,
+                                  uint64_t DynamicMemSize) const override {
+    int minGridSize;
+    int maxBlockSize;
+    auto Res = cuOccupancyMaxPotentialBlockSize(
+        &minGridSize, &maxBlockSize, Func, NULL, DynamicMemSize, INT_MAX);
+    if (auto Err = Plugin::check(
+            Res, "error in cuOccupancyMaxPotentialBlockSize: %s")) {
+      return Err;
+    }
+    return maxBlockSize;
+  }
+
 private:
   /// The CUDA kernel function to execute.
   CUfunction Func;
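
The CUDA override above passes NULL for the CUoccupancyB2DSize callback, so the driver treats DynamicMemSize as a fixed dynamic shared memory requirement for every candidate block size. For comparison only, not code from this change, the same driver call can instead take a callback when dynamic shared memory scales with the block size; the 16-bytes-per-thread model below is an arbitrary assumption for illustration.

// Standalone sketch of the underlying CUDA driver API, not part of this diff.
#include <climits>
#include <cuda.h>

// Hypothetical model: each thread consumes 16 bytes of dynamic shared memory.
static size_t blockSizeToDynSharedMem(int BlockSize) {
  return static_cast<size_t>(BlockSize) * 16;
}

// Ask the driver for the block size that maximizes occupancy; with a non-null
// callback, the per-block dynamic shared memory is taken from the callback
// rather than from the fixed dynamicSMemSize argument.
static CUresult queryMaxOccupancyBlockSize(CUfunction Func, int *MinGridSize,
                                           int *BlockSize) {
  return cuOccupancyMaxPotentialBlockSize(MinGridSize, BlockSize, Func,
                                          blockSizeToDynSharedMem,
                                          /*dynamicSMemSize=*/0,
                                          /*blockSizeLimit=*/INT_MAX);
}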

offload/plugins-nextgen/host/src/rtl.cpp

Lines changed: 8 additions & 0 deletions
@@ -114,6 +114,14 @@ struct GenELF64KernelTy : public GenericKernelTy {
     return Plugin::success();
   }
 
+  /// Return maximum block size for maximum occupancy
+  Expected<uint64_t> maxGroupSize(GenericDeviceTy &Device,
+                                  uint64_t DynamicMemSize) const override {
+    return Plugin::error(
+        ErrorCode::UNSUPPORTED,
+        "occupancy calculations are not implemented for the host device");
+  }
+
 private:
   /// The kernel function to execute.
   void (*Func)(void);

offload/unittests/OffloadAPI/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@ add_offload_unittest("init"
 target_compile_definitions("init.unittests" PRIVATE DISABLE_WRAPPER)
 
 add_offload_unittest("kernel"
+  kernel/olCalculateOptimalOccupancy.cpp
   kernel/olLaunchKernel.cpp)
 
 add_offload_unittest("memory"

offload/unittests/OffloadAPI/common/Fixtures.hpp

Lines changed: 14 additions & 0 deletions
@@ -26,6 +26,20 @@
   } while (0)
 #endif
 
+#ifndef ASSERT_SUCCESS_OR_UNSUPPORTED
+#define ASSERT_SUCCESS_OR_UNSUPPORTED(ACTUAL)                                  \
+  do {                                                                         \
+    ol_result_t Res = ACTUAL;                                                  \
+    if (Res && Res->Code == OL_ERRC_UNSUPPORTED) {                             \
+      GTEST_SKIP() << #ACTUAL " returned unsupported; skipping test";          \
+      return;                                                                  \
+    } else if (Res && Res->Code != OL_ERRC_SUCCESS) {                          \
+      GTEST_FAIL() << #ACTUAL " returned " << Res->Code << ": "                \
+                   << Res->Details;                                            \
+    }                                                                          \
+  } while (0)
+#endif
+
 // TODO: rework this so the EXPECTED/ACTUAL results are readable
 #ifndef ASSERT_ERROR
 #define ASSERT_ERROR(EXPECTED, ACTUAL) \
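
The new kernel/olCalculateOptimalOccupancy.cpp test itself is not shown in this diff, but the macro above indicates how it is meant to behave: pass on CUDA and skip on plugins that report OL_ERRC_UNSUPPORTED. A hypothetical test body in that spirit might look like the following; the fixture name and its Device and Kernel members are assumptions, not the actual test added by this commit.

// Hypothetical illustration only; the real unit test may differ. Assumes a
// parameterized gtest fixture exposing valid Device and Kernel handles.
TEST_P(olCalculateOptimalOccupancyTest, Success) {
  size_t GroupSize = 0;
  ASSERT_SUCCESS_OR_UNSUPPORTED(
      olCalculateOptimalOccupancy(Device, Kernel, 0, &GroupSize));
  ASSERT_GT(GroupSize, 0u);
}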
