Skip to content

Commit 79caa93

Browse files
committed
gpu: intel: ocl: allow unlimited allocations
1 parent 976bf2d commit 79caa93

File tree

10 files changed

+82
-92
lines changed

10 files changed

+82
-92
lines changed

src/gpu/intel/compute/device_info.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ struct device_info_t {
216216
static int max_slm_size_per_tg(gpu_arch_t gpu_arch);
217217
static int max_slm_size_per_tg(
218218
gpu_arch_t gpu_arch, int tg_size, bool large_grf_mode = false);
219+
size_t memory_size() const { return memory_size_; }
219220
size_t l3_cache_size() const { return l3_cache_size_; }
220221
size_t icache_size() const;
221222
size_t max_kernel_param_size() const { return max_kernel_param_size_; }
@@ -288,6 +289,7 @@ struct device_info_t {
288289
int32_t max_subgroup_size_ = 16;
289290
int max_exec_size_ = 0;
290291
size_t max_wg_size_ = 0;
292+
size_t memory_size_ = 0;
291293
size_t l3_cache_size_ = 0;
292294
size_t max_kernel_param_size_ = 1024;
293295
uint32_t device_address_bits_ = 64;

src/gpu/intel/compute/dispatch_reusable.hpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -534,6 +534,7 @@ class reusable_dispatch_t {
534534
dim_t max_buffer_size = 0;
535535
uint64_t max_buffer_size_bytes = 0;
536536
compile_params.num_buffers = buffers.size();
537+
const size_t num_work_items = nd_range.global_range().nelems();
537538
for (size_t buf_idx = 0; buf_idx < buffers.size(); buf_idx++) {
538539
const named_buffer_t &buffer = buffers[buf_idx];
539540
// Copy buffer name into params
@@ -553,10 +554,10 @@ class reusable_dispatch_t {
553554
compile_params.buffer_types[buf_idx] = buffer.data_type;
554555

555556
// Check buffer sizes to see if we can use int32_t offsets
556-
// or do we need to use stateless addressing model
557+
// and do we need to use stateless addressing model
557558
max_buffer_size = std::max(max_buffer_size, buffer.nelems(true));
558-
max_buffer_size_bytes = std::max(
559-
max_buffer_size_bytes, buffer.size(0, true, true));
559+
max_buffer_size_bytes = std::max(max_buffer_size_bytes,
560+
buffer.size(0, true, true) * num_work_items);
560561
}
561562

562563
compile_params.use_int32_offset = max_buffer_size <= INT32_MAX;

src/gpu/intel/ocl/device_info.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,12 @@ status_t device_info_t::init_attributes(impl::engine_t *engine) {
134134
OCL_CHECK(err);
135135
max_wg_size_ = max_wg_size;
136136

137+
cl_ulong mem_size;
138+
err = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(mem_size),
139+
&mem_size, nullptr);
140+
OCL_CHECK(err);
141+
memory_size_ = mem_size;
142+
137143
cl_ulong mem_cache_size;
138144
err = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,
139145
sizeof(mem_cache_size), &mem_cache_size, nullptr);

src/gpu/intel/ocl/usm_utils.cpp

Lines changed: 33 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,8 @@
1414
* limitations under the License.
1515
*******************************************************************************/
1616

17-
#include "xpu/ocl/engine_impl.hpp"
18-
#include "xpu/ocl/stream_impl.hpp"
19-
2017
#include "gpu/intel/ocl/usm_utils.hpp"
18+
#include "gpu/intel/ocl/engine.hpp"
2119

2220
namespace dnnl {
2321
namespace impl {
@@ -42,6 +40,27 @@ cl_command_queue get_ocl_queue(impl::stream_t *stream) {
4240
return utils::downcast<xpu::ocl::stream_impl_t *>(stream->impl())->queue();
4341
}
4442

43+
const compute::device_info_t *get_device_info(impl::engine_t *engine) {
44+
return utils::downcast<const ocl::engine_t *>(engine)->device_info();
45+
}
46+
47+
template <typename F>
48+
void *usm_malloc_common(impl::engine_t *engine, size_t size, F ext_func) {
49+
auto device_info = get_device_info(engine);
50+
51+
if (size == 0 || size > device_info->memory_size()) return nullptr;
52+
bool large_buffer = size > device_info->max_allocation_size();
53+
cl_bitfield large_buffer_flag[]
54+
= {CL_MEM_FLAGS_INTEL, CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL, 0};
55+
56+
cl_int err;
57+
void *p = ext_func(engine, get_ocl_context(engine), get_ocl_device(engine),
58+
large_buffer ? large_buffer_flag : nullptr, size, 0, &err);
59+
assert(utils::one_of(err, CL_SUCCESS, CL_OUT_OF_RESOURCES,
60+
CL_OUT_OF_HOST_MEMORY, CL_INVALID_BUFFER_SIZE));
61+
return p;
62+
}
63+
4564
} // namespace
4665

4766
bool is_usm_supported(impl::engine_t *engine) {
@@ -55,13 +74,18 @@ bool is_usm_supported(impl::engine_t *engine) {
5574
void *malloc_host(impl::engine_t *engine, size_t size) {
5675
using clHostMemAllocINTEL_func_t = void *(*)(cl_context, const cl_ulong *,
5776
size_t, cl_uint, cl_int *);
58-
59-
if (size == 0) return nullptr;
60-
6177
static xpu::ocl::ext_func_t<clHostMemAllocINTEL_func_t> ext_func(
6278
"clHostMemAllocINTEL");
79+
auto device_info = get_device_info(engine);
80+
81+
if (size == 0 || size > device_info->memory_size()) return nullptr;
82+
bool large_buffer = size > device_info->max_allocation_size();
83+
cl_bitfield large_buffer_flag[]
84+
= {CL_MEM_FLAGS_INTEL, CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL, 0};
85+
6386
cl_int err;
64-
void *p = ext_func(engine, get_ocl_context(engine), nullptr, size, 0, &err);
87+
void *p = ext_func(engine, get_ocl_context(engine),
88+
large_buffer ? large_buffer_flag : nullptr, size, 0, &err);
6589
assert(utils::one_of(err, CL_SUCCESS, CL_OUT_OF_RESOURCES,
6690
CL_OUT_OF_HOST_MEMORY, CL_INVALID_BUFFER_SIZE));
6791
return p;
@@ -70,38 +94,21 @@ void *malloc_host(impl::engine_t *engine, size_t size) {
7094
void *malloc_device(impl::engine_t *engine, size_t size) {
7195
using clDeviceMemAllocINTEL_func_t = void *(*)(cl_context, cl_device_id,
7296
cl_ulong *, size_t, cl_uint, cl_int *);
73-
74-
if (size == 0) return nullptr;
75-
7697
static xpu::ocl::ext_func_t<clDeviceMemAllocINTEL_func_t> ext_func(
7798
"clDeviceMemAllocINTEL");
78-
cl_int err;
79-
void *p = ext_func(engine, get_ocl_context(engine), get_ocl_device(engine),
80-
nullptr, size, 0, &err);
81-
assert(utils::one_of(err, CL_SUCCESS, CL_OUT_OF_RESOURCES,
82-
CL_OUT_OF_HOST_MEMORY, CL_INVALID_BUFFER_SIZE));
83-
return p;
99+
return usm_malloc_common(engine, size, ext_func);
84100
}
85101

86102
void *malloc_shared(impl::engine_t *engine, size_t size) {
87103
using clSharedMemAllocINTEL_func_t = void *(*)(cl_context, cl_device_id,
88104
cl_ulong *, size_t, cl_uint, cl_int *);
89-
90-
if (size == 0) return nullptr;
91-
92105
static xpu::ocl::ext_func_t<clSharedMemAllocINTEL_func_t> ext_func(
93106
"clSharedMemAllocINTEL");
94-
cl_int err;
95-
void *p = ext_func(engine, get_ocl_context(engine), get_ocl_device(engine),
96-
nullptr, size, 0, &err);
97-
assert(utils::one_of(err, CL_SUCCESS, CL_OUT_OF_RESOURCES,
98-
CL_OUT_OF_HOST_MEMORY, CL_INVALID_BUFFER_SIZE));
99-
return p;
107+
return usm_malloc_common(engine, size, ext_func);
100108
}
101109

102110
void free(impl::engine_t *engine, void *ptr) {
103111
using clMemFreeINTEL_func_t = cl_int (*)(cl_context, void *);
104-
105112
if (!ptr) return;
106113
static xpu::ocl::ext_func_t<clMemFreeINTEL_func_t> ext_func(
107114
"clMemFreeINTEL");

src/gpu/intel/sycl/device_info.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ status_t device_info_t::init_attributes(impl::engine_t *engine) {
138138
eu_count_ = device.get_info<::sycl::info::device::max_compute_units>();
139139
}
140140
max_wg_size_ = device.get_info<::sycl::info::device::max_work_group_size>();
141+
memory_size_ = device.get_info<::sycl::info::device::global_mem_size>();
141142
l3_cache_size_
142143
= device.get_info<::sycl::info::device::global_mem_cache_size>();
143144
mayiuse_system_memory_allocators_

src/xpu/ocl/buffer_memory_storage.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,34 @@
2424
#include "xpu/ocl/stream_impl.hpp"
2525
#include "xpu/ocl/usm_utils.hpp"
2626

27+
#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
28+
#include "gpu/intel/ocl/engine.hpp"
29+
#endif
30+
2731
namespace dnnl {
2832
namespace impl {
2933
namespace xpu {
3034
namespace ocl {
3135

36+
#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
37+
status_t buffer_memory_storage_t::init_allocate(size_t size) {
38+
auto ocl_engine
39+
= utils::downcast<const gpu::intel::ocl::engine_t *>(engine());
40+
41+
cl_mem_flags flags = CL_MEM_READ_WRITE;
42+
if (size > ocl_engine->device_info()->max_allocation_size()) {
43+
if (size > ocl_engine->device_info()->memory_size())
44+
return status::invalid_arguments;
45+
flags |= CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL;
46+
}
47+
48+
cl_int err;
49+
mem_object_ = clCreateBuffer_wrapper(
50+
ocl_engine->context(), flags, size, nullptr, &err);
51+
OCL_CHECK(err);
52+
return status::success;
53+
}
54+
#else
3255
status_t buffer_memory_storage_t::init_allocate(size_t size) {
3356
auto context
3457
= utils::downcast<const xpu::ocl::engine_impl_t *>(engine()->impl())
@@ -39,6 +62,7 @@ status_t buffer_memory_storage_t::init_allocate(size_t size) {
3962
OCL_CHECK(err);
4063
return status::success;
4164
}
65+
#endif
4266

4367
namespace {
4468
status_t get_map_queue(cl_command_queue &queue, impl::engine_t *engine,

src/xpu/ocl/utils.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@
2828

2929
#include "xpu/utils.hpp"
3030

31+
#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
32+
#define CL_MEM_FLAGS_INTEL 0x10001
33+
#define CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL (1 << 23)
34+
#endif
35+
3136
namespace dnnl {
3237
namespace impl {
3338
namespace xpu {

tests/benchdnn/dnnl_common.cpp

Lines changed: 5 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1028,9 +1028,9 @@ size_t get_cpu_ram_size() {
10281028
}
10291029
#endif
10301030

1031-
int get_gpu_ram_sizes(size_t &ram_size, size_t &max_alloc_size) {
1031+
int get_gpu_ram_size(size_t &ram_size) {
10321032
if (!is_gpu()) return OK;
1033-
if (ram_size > 0 && max_alloc_size > 0) return OK;
1033+
if (ram_size > 0) return OK;
10341034

10351035
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
10361036
auto eng = dnnl::engine(get_test_engine(), true);
@@ -1042,26 +1042,16 @@ int get_gpu_ram_sizes(size_t &ram_size, size_t &max_alloc_size) {
10421042
sizeof(cl_ulong), &ram_sz, nullptr);
10431043
if (status != CL_SUCCESS) return FAIL;
10441044

1045-
cl_ulong max_alloc_sz = 0;
1046-
status = clGetDeviceInfo(ocl_device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
1047-
sizeof(cl_ulong), &max_alloc_sz, nullptr);
1048-
if (status != CL_SUCCESS) return FAIL;
1049-
10501045
ram_size = (size_t)ram_sz;
1051-
max_alloc_size = (size_t)max_alloc_sz;
10521046
return OK;
10531047
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_DPCPP
10541048
auto eng = dnnl::engine(get_test_engine(), true);
10551049
auto sycl_dev = dnnl::sycl_interop::get_device(eng);
10561050
ram_size = (size_t)sycl_dev
10571051
.get_info<::sycl::info::device::global_mem_size>();
1058-
max_alloc_size
1059-
= (size_t)sycl_dev
1060-
.get_info<::sycl::info::device::max_mem_alloc_size>();
10611052
return OK;
10621053
#endif
10631054
ram_size = 0;
1064-
max_alloc_size = 0;
10651055
return OK;
10661056
}
10671057

@@ -1132,8 +1122,7 @@ int check_total_size(res_t *res, dnnl_primitive_t prim_ref) {
11321122

11331123
static size_t cpu_device_capacity = get_cpu_ram_size();
11341124
static size_t gpu_device_capacity = 0;
1135-
static size_t gpu_max_alloc_capacity = 0;
1136-
SAFE(get_gpu_ram_sizes(gpu_device_capacity, gpu_max_alloc_capacity), WARN);
1125+
SAFE(get_gpu_ram_size(gpu_device_capacity), WARN);
11371126

11381127
const size_t device_max_capacity
11391128
= is_cpu() ? cpu_device_capacity : gpu_device_capacity;
@@ -1171,32 +1160,13 @@ int check_total_size(res_t *res, dnnl_primitive_t prim_ref) {
11711160
res->reason = skip_reason::not_enough_ram;
11721161
}
11731162

1174-
const bool all_allocation_fit_limit = std::all_of(
1175-
check_mem_size_args.sizes.cbegin(),
1176-
check_mem_size_args.sizes.cend(), [&](size_t s) {
1177-
const bool fit = s < gpu_max_alloc_capacity;
1178-
if (!fit) {
1179-
BENCHDNN_PRINT(1,
1180-
"[CHECK_MEM][%s]: Allocation of size %s "
1181-
"doesn't fit allocation limit of %s.\n",
1182-
dir_c_str(), smart_bytes(s).c_str(),
1183-
smart_bytes(gpu_max_alloc_capacity).c_str());
1184-
}
1185-
return fit;
1186-
});
1187-
if (!all_allocation_fit_limit) {
1188-
res->state = SKIPPED;
1189-
res->reason = skip_reason::not_enough_ram;
1190-
}
1191-
11921163
BENCHDNN_PRINT((!fits_device_ram ? 1 : 6),
11931164
"[CHECK_MEM][%s]: Requested: %s; benchdnn_device_limit: %s; "
1194-
"device_RAM_capacity: %s; gpu_max_alloc: %s;\n",
1165+
"device_RAM_capacity: %s;\n",
11951166
dir_c_str(),
11961167
smart_bytes(check_mem_size_args.total_size_device).c_str(),
11971168
smart_bytes(benchdnn_device_limit).c_str(),
1198-
smart_bytes(gpu_device_capacity).c_str(),
1199-
smart_bytes(gpu_max_alloc_capacity).c_str());
1169+
smart_bytes(gpu_device_capacity).c_str());
12001170
}
12011171

12021172
// Note: in theory, `total_size_ref` itself can be smaller for a `prim_ref`

tests/benchdnn/dnnl_common.hpp

Lines changed: 1 addition & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,7 @@ struct cpu_cache_args_t {
231231
};
232232

233233
size_t get_cpu_ram_size();
234-
int get_gpu_ram_sizes(size_t &ram_size, size_t &max_alloc_size);
234+
int get_gpu_ram_size(size_t &ram_size);
235235
int get_cpu_cache_size(cpu_cache_args_t &cache_args);
236236
int get_gpu_cache_size(size_t &cache_size);
237237

@@ -425,31 +425,6 @@ int create_primitive(benchdnn_dnnl_wrapper_t<dnnl_primitive_t> &primw,
425425
/* need_skip = */ !is_graph_ref),
426426
WARN);
427427

428-
// The library scratchpad is allocated at create_primitive stage. The memory
429-
// check is moved after the creation stage. It's necessary to check the
430-
// library scratchpad size against gpu_max_alloc, otherwise, out_of_memory
431-
// would be issued by the library.
432-
if (res->mem_size_args.scratchpad_size > 0 && is_gpu()
433-
&& query_scratchpad_mode(query_attr(pdw))
434-
== dnnl_scratchpad_mode_library) {
435-
static size_t gpu_device_capacity = 0;
436-
static size_t gpu_max_alloc_capacity = 0;
437-
SAFE(get_gpu_ram_sizes(gpu_device_capacity, gpu_max_alloc_capacity),
438-
WARN);
439-
const bool fit
440-
= res->mem_size_args.scratchpad_size < gpu_max_alloc_capacity;
441-
if (!fit) {
442-
BENCHDNN_PRINT(1,
443-
"[CHECK_MEM]: Size of the scratchpad %s "
444-
"doesn't fit the allocation limit of %s.\n",
445-
smart_bytes(res->mem_size_args.scratchpad_size).c_str(),
446-
smart_bytes(gpu_max_alloc_capacity).c_str());
447-
res->state = SKIPPED;
448-
res->reason = skip_reason::not_enough_ram;
449-
return OK;
450-
}
451-
}
452-
453428
TIME_C_PRIM(DNN_SAFE(dnnl_primitive_create(&prim, pdw), WARN));
454429
primw.reset(prim);
455430

tests/benchdnn/graph/graph_memory.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,7 @@ size_t get_benchdnn_cpu_limit() {
3434
size_t get_benchdnn_device_limit() {
3535
if (is_cpu()) return 0;
3636
static size_t gpu_device_capacity = 0;
37-
static size_t gpu_max_alloc_capacity = 0;
38-
SAFE(get_gpu_ram_sizes(gpu_device_capacity, gpu_max_alloc_capacity), WARN);
37+
SAFE(get_gpu_ram_size(gpu_device_capacity), WARN);
3938

4039
const double benchdnn_device_limit = capacity_factor * gpu_device_capacity;
4140
assert(benchdnn_device_limit > 0);

0 commit comments

Comments
 (0)