Skip to content

Commit bdf6bda

Browse files
committed
gpu: intel: ocl: allow unlimited allocations
1 parent 3d80d73 commit bdf6bda

File tree

8 files changed

+49
-73
lines changed

8 files changed

+49
-73
lines changed

src/gpu/intel/compute/dispatch_reusable.hpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -534,6 +534,7 @@ class reusable_dispatch_t {
534534
dim_t max_buffer_size = 0;
535535
uint64_t max_buffer_size_bytes = 0;
536536
compile_params.num_buffers = buffers.size();
537+
const size_t num_work_items = nd_range.global_range().nelems();
537538
for (size_t buf_idx = 0; buf_idx < buffers.size(); buf_idx++) {
538539
const named_buffer_t &buffer = buffers[buf_idx];
539540
// Copy buffer name into params
@@ -553,10 +554,10 @@ class reusable_dispatch_t {
553554
compile_params.buffer_types[buf_idx] = buffer.data_type;
554555

555556
// Check buffer sizes to see if we can use int32_t offsets
556-
// or do we need to use stateless addressing model
557+
// and do we need to use stateless addressing model
557558
max_buffer_size = std::max(max_buffer_size, buffer.nelems(true));
558-
max_buffer_size_bytes = std::max(
559-
max_buffer_size_bytes, buffer.size(0, true, true));
559+
max_buffer_size_bytes = std::max(max_buffer_size_bytes,
560+
buffer.size(0, true, true) * num_work_items);
560561
}
561562

562563
compile_params.use_int32_offset = max_buffer_size <= INT32_MAX;

src/gpu/intel/ocl/usm_utils.cpp

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,17 @@ void *malloc_host(impl::engine_t *engine, size_t size) {
5757
size_t, cl_uint, cl_int *);
5858

5959
if (size == 0) return nullptr;
60+
bool large_buffer = size
61+
> utils::downcast<const xpu::ocl::engine_impl_t *>(engine->impl())
62+
->max_allocation_size();
63+
static cl_bitfield properties[]
64+
= {CL_MEM_FLAGS_INTEL, CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL, 0};
6065

6166
static xpu::ocl::ext_func_t<clHostMemAllocINTEL_func_t> ext_func(
6267
"clHostMemAllocINTEL");
6368
cl_int err;
64-
void *p = ext_func(engine, get_ocl_context(engine), nullptr, size, 0, &err);
69+
void *p = ext_func(engine, get_ocl_context(engine),
70+
large_buffer ? properties : nullptr, size, 0, &err);
6571
assert(utils::one_of(err, CL_SUCCESS, CL_OUT_OF_RESOURCES,
6672
CL_OUT_OF_HOST_MEMORY, CL_INVALID_BUFFER_SIZE));
6773
return p;
@@ -72,12 +78,17 @@ void *malloc_device(impl::engine_t *engine, size_t size) {
7278
cl_ulong *, size_t, cl_uint, cl_int *);
7379

7480
if (size == 0) return nullptr;
81+
bool large_buffer = size
82+
> utils::downcast<const xpu::ocl::engine_impl_t *>(engine->impl())
83+
->max_allocation_size();
84+
static cl_bitfield properties[]
85+
= {CL_MEM_FLAGS_INTEL, CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL, 0};
7586

7687
static xpu::ocl::ext_func_t<clDeviceMemAllocINTEL_func_t> ext_func(
7788
"clDeviceMemAllocINTEL");
7889
cl_int err;
7990
void *p = ext_func(engine, get_ocl_context(engine), get_ocl_device(engine),
80-
nullptr, size, 0, &err);
91+
large_buffer ? properties : nullptr, size, 0, &err);
8192
assert(utils::one_of(err, CL_SUCCESS, CL_OUT_OF_RESOURCES,
8293
CL_OUT_OF_HOST_MEMORY, CL_INVALID_BUFFER_SIZE));
8394
return p;
@@ -88,12 +99,17 @@ void *malloc_shared(impl::engine_t *engine, size_t size) {
8899
cl_ulong *, size_t, cl_uint, cl_int *);
89100

90101
if (size == 0) return nullptr;
102+
bool large_buffer = size
103+
> utils::downcast<const xpu::ocl::engine_impl_t *>(engine->impl())
104+
->max_allocation_size();
105+
static cl_bitfield properties[]
106+
= {CL_MEM_FLAGS_INTEL, CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL, 0};
91107

92108
static xpu::ocl::ext_func_t<clSharedMemAllocINTEL_func_t> ext_func(
93109
"clSharedMemAllocINTEL");
94110
cl_int err;
95111
void *p = ext_func(engine, get_ocl_context(engine), get_ocl_device(engine),
96-
nullptr, size, 0, &err);
112+
large_buffer ? properties : nullptr, size, 0, &err);
97113
assert(utils::one_of(err, CL_SUCCESS, CL_OUT_OF_RESOURCES,
98114
CL_OUT_OF_HOST_MEMORY, CL_INVALID_BUFFER_SIZE));
99115
return p;

src/xpu/ocl/buffer_memory_storage.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,18 @@ namespace xpu {
3030
namespace ocl {
3131

3232
status_t buffer_memory_storage_t::init_allocate(size_t size) {
33-
auto context
34-
= utils::downcast<const xpu::ocl::engine_impl_t *>(engine()->impl())
35-
->context();
33+
auto engine_impl = utils::downcast<const xpu::ocl::engine_impl_t *>(
34+
engine()->impl());
35+
36+
cl_mem_flags flags = CL_MEM_READ_WRITE;
37+
#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
38+
if (size > engine_impl->max_allocation_size())
39+
flags |= CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL;
40+
#endif
41+
3642
cl_int err;
3743
mem_object_ = clCreateBuffer_wrapper(
38-
context, CL_MEM_READ_WRITE, size, nullptr, &err);
44+
engine_impl->context(), flags, size, nullptr, &err);
3945
OCL_CHECK(err);
4046
return status::success;
4147
}

src/xpu/ocl/engine_impl.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,10 @@ class engine_impl_t : public impl::engine_impl_t {
100100
runtime_version_.build = 0;
101101
}
102102

103+
err = clGetDeviceInfo(device_, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
104+
sizeof(max_allocation_size_), &max_allocation_size_, nullptr);
105+
OCL_CHECK(err);
106+
103107
return status::success;
104108
}
105109

@@ -132,6 +136,7 @@ class engine_impl_t : public impl::engine_impl_t {
132136
}
133137

134138
int get_buffer_alignment() const override { return 128; }
139+
uint64_t max_allocation_size() const { return max_allocation_size_; }
135140

136141
private:
137142
std::string name_;
@@ -140,6 +145,7 @@ class engine_impl_t : public impl::engine_impl_t {
140145
xpu::ocl::wrapper_t<cl_device_id> device_;
141146
xpu::ocl::wrapper_t<cl_context> context_;
142147
cl_platform_id platform_ = nullptr;
148+
uint64_t max_allocation_size_ = 0;
143149
bool is_user_context_;
144150
};
145151

src/xpu/ocl/utils.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@
2828

2929
#include "xpu/utils.hpp"
3030

31+
#define CL_MEM_FLAGS_INTEL 0x10001
32+
#define CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL (1 << 23)
33+
3134
namespace dnnl {
3235
namespace impl {
3336
namespace xpu {

tests/benchdnn/dnnl_common.cpp

Lines changed: 5 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1028,9 +1028,9 @@ size_t get_cpu_ram_size() {
10281028
}
10291029
#endif
10301030

1031-
int get_gpu_ram_sizes(size_t &ram_size, size_t &max_alloc_size) {
1031+
int get_gpu_ram_size(size_t &ram_size) {
10321032
if (!is_gpu()) return OK;
1033-
if (ram_size > 0 && max_alloc_size > 0) return OK;
1033+
if (ram_size > 0) return OK;
10341034

10351035
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
10361036
auto eng = dnnl::engine(get_test_engine(), true);
@@ -1042,26 +1042,16 @@ int get_gpu_ram_sizes(size_t &ram_size, size_t &max_alloc_size) {
10421042
sizeof(cl_ulong), &ram_sz, nullptr);
10431043
if (status != CL_SUCCESS) return FAIL;
10441044

1045-
cl_ulong max_alloc_sz = 0;
1046-
status = clGetDeviceInfo(ocl_device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
1047-
sizeof(cl_ulong), &max_alloc_sz, nullptr);
1048-
if (status != CL_SUCCESS) return FAIL;
1049-
10501045
ram_size = (size_t)ram_sz;
1051-
max_alloc_size = (size_t)max_alloc_sz;
10521046
return OK;
10531047
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_DPCPP
10541048
auto eng = dnnl::engine(get_test_engine(), true);
10551049
auto sycl_dev = dnnl::sycl_interop::get_device(eng);
10561050
ram_size = (size_t)sycl_dev
10571051
.get_info<::sycl::info::device::global_mem_size>();
1058-
max_alloc_size
1059-
= (size_t)sycl_dev
1060-
.get_info<::sycl::info::device::max_mem_alloc_size>();
10611052
return OK;
10621053
#endif
10631054
ram_size = 0;
1064-
max_alloc_size = 0;
10651055
return OK;
10661056
}
10671057

@@ -1132,8 +1122,7 @@ int check_total_size(res_t *res, dnnl_primitive_t prim_ref) {
11321122

11331123
static size_t cpu_device_capacity = get_cpu_ram_size();
11341124
static size_t gpu_device_capacity = 0;
1135-
static size_t gpu_max_alloc_capacity = 0;
1136-
SAFE(get_gpu_ram_sizes(gpu_device_capacity, gpu_max_alloc_capacity), WARN);
1125+
SAFE(get_gpu_ram_size(gpu_device_capacity), WARN);
11371126

11381127
const size_t device_max_capacity
11391128
= is_cpu() ? cpu_device_capacity : gpu_device_capacity;
@@ -1171,32 +1160,13 @@ int check_total_size(res_t *res, dnnl_primitive_t prim_ref) {
11711160
res->reason = skip_reason::not_enough_ram;
11721161
}
11731162

1174-
const bool all_allocation_fit_limit = std::all_of(
1175-
check_mem_size_args.sizes.cbegin(),
1176-
check_mem_size_args.sizes.cend(), [&](size_t s) {
1177-
const bool fit = s < gpu_max_alloc_capacity;
1178-
if (!fit) {
1179-
BENCHDNN_PRINT(1,
1180-
"[CHECK_MEM][%s]: Allocation of size %s "
1181-
"doesn't fit allocation limit of %s.\n",
1182-
dir_c_str(), smart_bytes(s).c_str(),
1183-
smart_bytes(gpu_max_alloc_capacity).c_str());
1184-
}
1185-
return fit;
1186-
});
1187-
if (!all_allocation_fit_limit) {
1188-
res->state = SKIPPED;
1189-
res->reason = skip_reason::not_enough_ram;
1190-
}
1191-
11921163
BENCHDNN_PRINT((!fits_device_ram ? 1 : 6),
11931164
"[CHECK_MEM][%s]: Requested: %s; benchdnn_device_limit: %s; "
1194-
"device_RAM_capacity: %s; gpu_max_alloc: %s;\n",
1165+
"device_RAM_capacity: %s;\n",
11951166
dir_c_str(),
11961167
smart_bytes(check_mem_size_args.total_size_device).c_str(),
11971168
smart_bytes(benchdnn_device_limit).c_str(),
1198-
smart_bytes(gpu_device_capacity).c_str(),
1199-
smart_bytes(gpu_max_alloc_capacity).c_str());
1169+
smart_bytes(gpu_device_capacity).c_str());
12001170
}
12011171

12021172
// Note: in theory, `total_size_ref` itself can be smaller for a `prim_ref`

tests/benchdnn/dnnl_common.hpp

Lines changed: 1 addition & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,7 @@ struct cpu_cache_args_t {
231231
};
232232

233233
size_t get_cpu_ram_size();
234-
int get_gpu_ram_sizes(size_t &ram_size, size_t &max_alloc_size);
234+
int get_gpu_ram_size(size_t &ram_size);
235235
int get_cpu_cache_size(cpu_cache_args_t &cache_args);
236236
int get_gpu_cache_size(size_t &cache_size);
237237

@@ -425,31 +425,6 @@ int create_primitive(benchdnn_dnnl_wrapper_t<dnnl_primitive_t> &primw,
425425
/* need_skip = */ !is_graph_ref),
426426
WARN);
427427

428-
// The library scratchpad is allocated at create_primitive stage. The memory
429-
// check is moved after the creation stage. It's necessary to check the
430-
// library scratchpad size against gpu_max_alloc, otherwise, out_of_memory
431-
// would be issued by the library.
432-
if (res->mem_size_args.scratchpad_size > 0 && is_gpu()
433-
&& query_scratchpad_mode(query_attr(pdw))
434-
== dnnl_scratchpad_mode_library) {
435-
static size_t gpu_device_capacity = 0;
436-
static size_t gpu_max_alloc_capacity = 0;
437-
SAFE(get_gpu_ram_sizes(gpu_device_capacity, gpu_max_alloc_capacity),
438-
WARN);
439-
const bool fit
440-
= res->mem_size_args.scratchpad_size < gpu_max_alloc_capacity;
441-
if (!fit) {
442-
BENCHDNN_PRINT(1,
443-
"[CHECK_MEM]: Size of the scratchpad %s "
444-
"doesn't fit the allocation limit of %s.\n",
445-
smart_bytes(res->mem_size_args.scratchpad_size).c_str(),
446-
smart_bytes(gpu_max_alloc_capacity).c_str());
447-
res->state = SKIPPED;
448-
res->reason = skip_reason::not_enough_ram;
449-
return OK;
450-
}
451-
}
452-
453428
TIME_C_PRIM(DNN_SAFE(dnnl_primitive_create(&prim, pdw), WARN));
454429
primw.reset(prim);
455430

tests/benchdnn/graph/graph_memory.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,7 @@ size_t get_benchdnn_cpu_limit() {
3434
size_t get_benchdnn_device_limit() {
3535
if (is_cpu()) return 0;
3636
static size_t gpu_device_capacity = 0;
37-
static size_t gpu_max_alloc_capacity = 0;
38-
SAFE(get_gpu_ram_sizes(gpu_device_capacity, gpu_max_alloc_capacity), WARN);
37+
SAFE(get_gpu_ram_size(gpu_device_capacity), WARN);
3938

4039
const double benchdnn_device_limit = capacity_factor * gpu_device_capacity;
4140
assert(benchdnn_device_limit > 0);

0 commit comments

Comments
 (0)