Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/gpu/intel/compute/device_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ struct device_info_t {
static int max_slm_size_per_tg(gpu_arch_t gpu_arch);
static int max_slm_size_per_tg(
gpu_arch_t gpu_arch, int tg_size, bool large_grf_mode = false);
size_t memory_size() const { return memory_size_; }
size_t l3_cache_size() const { return l3_cache_size_; }
size_t icache_size() const;
size_t max_kernel_param_size() const { return max_kernel_param_size_; }
Expand Down Expand Up @@ -288,6 +289,7 @@ struct device_info_t {
int32_t max_subgroup_size_ = 16;
int max_exec_size_ = 0;
size_t max_wg_size_ = 0;
size_t memory_size_ = 0;
size_t l3_cache_size_ = 0;
size_t max_kernel_param_size_ = 1024;
uint32_t device_address_bits_ = 64;
Expand Down
7 changes: 4 additions & 3 deletions src/gpu/intel/compute/dispatch_reusable.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,7 @@ class reusable_dispatch_t {
dim_t max_buffer_size = 0;
uint64_t max_buffer_size_bytes = 0;
compile_params.num_buffers = buffers.size();
const size_t num_work_items = nd_range.global_range().nelems();
for (size_t buf_idx = 0; buf_idx < buffers.size(); buf_idx++) {
const named_buffer_t &buffer = buffers[buf_idx];
// Copy buffer name into params
Expand All @@ -553,10 +554,10 @@ class reusable_dispatch_t {
compile_params.buffer_types[buf_idx] = buffer.data_type;

// Check buffer sizes to see if we can use int32_t offsets
// or do we need to use stateless addressing model
// and do we need to use stateless addressing model
max_buffer_size = std::max(max_buffer_size, buffer.nelems(true));
max_buffer_size_bytes = std::max(
max_buffer_size_bytes, buffer.size(0, true, true));
max_buffer_size_bytes = std::max(max_buffer_size_bytes,
buffer.size(0, true, true) * num_work_items);
}

compile_params.use_int32_offset = max_buffer_size <= INT32_MAX;
Expand Down
6 changes: 6 additions & 0 deletions src/gpu/intel/ocl/device_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,12 @@ status_t device_info_t::init_attributes(impl::engine_t *engine) {
OCL_CHECK(err);
max_wg_size_ = max_wg_size;

cl_ulong mem_size;
err = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(mem_size),
&mem_size, nullptr);
OCL_CHECK(err);
memory_size_ = mem_size;

cl_ulong mem_cache_size;
err = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,
sizeof(mem_cache_size), &mem_cache_size, nullptr);
Expand Down
60 changes: 34 additions & 26 deletions src/gpu/intel/ocl/usm_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,8 @@
* limitations under the License.
*******************************************************************************/

#include "xpu/ocl/engine_impl.hpp"
#include "xpu/ocl/stream_impl.hpp"

#include "gpu/intel/ocl/usm_utils.hpp"
#include "gpu/intel/ocl/engine.hpp"

namespace dnnl {
namespace impl {
Expand All @@ -42,6 +40,28 @@ cl_command_queue get_ocl_queue(impl::stream_t *stream) {
return utils::downcast<xpu::ocl::stream_impl_t *>(stream->impl())->queue();
}

// Returns the compute::device_info_t owned by an OpenCL engine.
// NOTE(review): assumes `engine` is an Intel OCL engine_t — the downcast is
// unchecked in release builds; confirm all callers guarantee this.
const compute::device_info_t *get_device_info(impl::engine_t *engine) {
return utils::downcast<const ocl::engine_t *>(engine)->device_info();
}

// Shared helper for the USM device/shared allocation paths.
//
// Validates the requested size against the device's reported global memory
// capacity, then invokes the driver extension allocator `ext_func`, opting
// into the Intel "unrestricted size" buffer property when the request
// exceeds the regular per-allocation limit.
//
// `ext_func` is expected to match the clDeviceMemAllocINTEL /
// clSharedMemAllocINTEL signature (context, device, properties, size,
// alignment, err) — presumably why malloc_host, whose extension takes no
// device argument, does not use this helper.
//
// Returns nullptr for size == 0 or a request larger than total device
// memory; otherwise returns whatever the extension call produced (which may
// itself be nullptr on allocation failure).
template <typename F>
void *usm_malloc_common(
impl::engine_t *engine, size_t size, const F &ext_func) {
auto device_info = get_device_info(engine);

// Reject zero-size and obviously unsatisfiable requests up front.
if (size == 0 || size > device_info->memory_size()) return nullptr;
// Allocations above the regular cap require the unrestricted-size
// property; the property list is zero-terminated per the extension spec.
bool large_buffer = size > device_info->max_allocation_size();
cl_bitfield large_buffer_flag[]
= {CL_MEM_FLAGS_INTEL, CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL, 0};

cl_int err;
void *p = ext_func(engine, get_ocl_context(engine), get_ocl_device(engine),
large_buffer ? large_buffer_flag : nullptr, size, 0, &err);
// Debug-only sanity check: any error code outside this set indicates
// misuse of the extension rather than an expected out-of-memory result.
assert(utils::one_of(err, CL_SUCCESS, CL_OUT_OF_RESOURCES,
CL_OUT_OF_HOST_MEMORY, CL_INVALID_BUFFER_SIZE));
return p;
}

} // namespace

bool is_usm_supported(impl::engine_t *engine) {
Expand All @@ -55,13 +75,18 @@ bool is_usm_supported(impl::engine_t *engine) {
void *malloc_host(impl::engine_t *engine, size_t size) {
using clHostMemAllocINTEL_func_t = void *(*)(cl_context, const cl_ulong *,
size_t, cl_uint, cl_int *);

if (size == 0) return nullptr;

static xpu::ocl::ext_func_t<clHostMemAllocINTEL_func_t> ext_func(
"clHostMemAllocINTEL");
auto device_info = get_device_info(engine);

if (size == 0 || size > device_info->memory_size()) return nullptr;
bool large_buffer = size > device_info->max_allocation_size();
cl_bitfield large_buffer_flag[]
= {CL_MEM_FLAGS_INTEL, CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL, 0};

cl_int err;
void *p = ext_func(engine, get_ocl_context(engine), nullptr, size, 0, &err);
void *p = ext_func(engine, get_ocl_context(engine),
large_buffer ? large_buffer_flag : nullptr, size, 0, &err);
assert(utils::one_of(err, CL_SUCCESS, CL_OUT_OF_RESOURCES,
CL_OUT_OF_HOST_MEMORY, CL_INVALID_BUFFER_SIZE));
return p;
Expand All @@ -70,38 +95,21 @@ void *malloc_host(impl::engine_t *engine, size_t size) {
void *malloc_device(impl::engine_t *engine, size_t size) {
using clDeviceMemAllocINTEL_func_t = void *(*)(cl_context, cl_device_id,
cl_ulong *, size_t, cl_uint, cl_int *);

if (size == 0) return nullptr;

static xpu::ocl::ext_func_t<clDeviceMemAllocINTEL_func_t> ext_func(
"clDeviceMemAllocINTEL");
cl_int err;
void *p = ext_func(engine, get_ocl_context(engine), get_ocl_device(engine),
nullptr, size, 0, &err);
assert(utils::one_of(err, CL_SUCCESS, CL_OUT_OF_RESOURCES,
CL_OUT_OF_HOST_MEMORY, CL_INVALID_BUFFER_SIZE));
return p;
return usm_malloc_common(engine, size, ext_func);
}

void *malloc_shared(impl::engine_t *engine, size_t size) {
using clSharedMemAllocINTEL_func_t = void *(*)(cl_context, cl_device_id,
cl_ulong *, size_t, cl_uint, cl_int *);

if (size == 0) return nullptr;

static xpu::ocl::ext_func_t<clSharedMemAllocINTEL_func_t> ext_func(
"clSharedMemAllocINTEL");
cl_int err;
void *p = ext_func(engine, get_ocl_context(engine), get_ocl_device(engine),
nullptr, size, 0, &err);
assert(utils::one_of(err, CL_SUCCESS, CL_OUT_OF_RESOURCES,
CL_OUT_OF_HOST_MEMORY, CL_INVALID_BUFFER_SIZE));
return p;
return usm_malloc_common(engine, size, ext_func);
}

void free(impl::engine_t *engine, void *ptr) {
using clMemFreeINTEL_func_t = cl_int (*)(cl_context, void *);

if (!ptr) return;
static xpu::ocl::ext_func_t<clMemFreeINTEL_func_t> ext_func(
"clMemFreeINTEL");
Expand Down
1 change: 1 addition & 0 deletions src/gpu/intel/sycl/device_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ status_t device_info_t::init_attributes(impl::engine_t *engine) {
eu_count_ = device.get_info<::sycl::info::device::max_compute_units>();
}
max_wg_size_ = device.get_info<::sycl::info::device::max_work_group_size>();
memory_size_ = device.get_info<::sycl::info::device::global_mem_size>();
l3_cache_size_
= device.get_info<::sycl::info::device::global_mem_cache_size>();
mayiuse_system_memory_allocators_
Expand Down
24 changes: 24 additions & 0 deletions src/xpu/ocl/buffer_memory_storage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,34 @@
#include "xpu/ocl/stream_impl.hpp"
#include "xpu/ocl/usm_utils.hpp"

#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
#include "gpu/intel/ocl/engine.hpp"
#endif

namespace dnnl {
namespace impl {
namespace xpu {
namespace ocl {

#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
// Intel-vendor path: allocates the backing cl_mem buffer for this storage.
//
// Buffers larger than the device's regular per-allocation limit are allowed
// by adding CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL, but only up to the total
// device memory size — larger requests are rejected as invalid arguments
// rather than being forwarded to the driver.
status_t buffer_memory_storage_t::init_allocate(size_t size) {
auto ocl_engine
= utils::downcast<const gpu::intel::ocl::engine_t *>(engine());

cl_mem_flags flags = CL_MEM_READ_WRITE;
if (size > ocl_engine->device_info()->max_allocation_size()) {
// Guard against requests beyond physical device memory; the
// unrestricted-size flag alone would not stop them.
if (size > ocl_engine->device_info()->memory_size())
return status::invalid_arguments;
flags |= CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL;
}

cl_int err;
mem_object_ = clCreateBuffer_wrapper(
ocl_engine->context(), flags, size, nullptr, &err);
OCL_CHECK(err);
return status::success;
}
#else
status_t buffer_memory_storage_t::init_allocate(size_t size) {
auto context
= utils::downcast<const xpu::ocl::engine_impl_t *>(engine()->impl())
Expand All @@ -39,6 +62,7 @@ status_t buffer_memory_storage_t::init_allocate(size_t size) {
OCL_CHECK(err);
return status::success;
}
#endif

namespace {
status_t get_map_queue(cl_command_queue &queue, impl::engine_t *engine,
Expand Down
5 changes: 5 additions & 0 deletions src/xpu/ocl/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@

#include "xpu/utils.hpp"

#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
#define CL_MEM_FLAGS_INTEL 0x10001
#define CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL (1 << 23)
#endif

namespace dnnl {
namespace impl {
namespace xpu {
Expand Down
40 changes: 5 additions & 35 deletions tests/benchdnn/dnnl_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1028,9 +1028,9 @@ size_t get_cpu_ram_size() {
}
#endif

int get_gpu_ram_sizes(size_t &ram_size, size_t &max_alloc_size) {
int get_gpu_ram_size(size_t &ram_size) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you have a chance to verify that the change works with all 4 supported memory kinds, in both correctness mode and fast performance mode, where a different approach is used for memory object management?

As a part of this question also: should this call be updated?

Copy link
Contributor Author

@spalicki spalicki Nov 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, it seems that CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL is truly unrestricted, allowing allocation of any size, even beyond the GPU VRAM. So, I am going to have to add some additional guard.
Never mind — it looks like a driver bug.

According to the documentation, the flag only applies to clCreateBuffer, clCreateBufferWithProperties, clCreateBufferWithPropertiesINTEL, clSVMAlloc, clSharedMemAllocINTEL, clDeviceMemAllocINTEL and clHostMemAllocINTEL.

if (!is_gpu()) return OK;
if (ram_size > 0 && max_alloc_size > 0) return OK;
if (ram_size > 0) return OK;

#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
auto eng = dnnl::engine(get_test_engine(), true);
Expand All @@ -1042,26 +1042,16 @@ int get_gpu_ram_sizes(size_t &ram_size, size_t &max_alloc_size) {
sizeof(cl_ulong), &ram_sz, nullptr);
if (status != CL_SUCCESS) return FAIL;

cl_ulong max_alloc_sz = 0;
status = clGetDeviceInfo(ocl_device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
sizeof(cl_ulong), &max_alloc_sz, nullptr);
if (status != CL_SUCCESS) return FAIL;

ram_size = (size_t)ram_sz;
max_alloc_size = (size_t)max_alloc_sz;
return OK;
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_DPCPP
auto eng = dnnl::engine(get_test_engine(), true);
auto sycl_dev = dnnl::sycl_interop::get_device(eng);
ram_size = (size_t)sycl_dev
.get_info<::sycl::info::device::global_mem_size>();
max_alloc_size
= (size_t)sycl_dev
.get_info<::sycl::info::device::max_mem_alloc_size>();
return OK;
#endif
ram_size = 0;
max_alloc_size = 0;
return OK;
}

Expand Down Expand Up @@ -1132,8 +1122,7 @@ int check_total_size(res_t *res, dnnl_primitive_t prim_ref) {

static size_t cpu_device_capacity = get_cpu_ram_size();
static size_t gpu_device_capacity = 0;
static size_t gpu_max_alloc_capacity = 0;
SAFE(get_gpu_ram_sizes(gpu_device_capacity, gpu_max_alloc_capacity), WARN);
SAFE(get_gpu_ram_size(gpu_device_capacity), WARN);

const size_t device_max_capacity
= is_cpu() ? cpu_device_capacity : gpu_device_capacity;
Expand Down Expand Up @@ -1171,32 +1160,13 @@ int check_total_size(res_t *res, dnnl_primitive_t prim_ref) {
res->reason = skip_reason::not_enough_ram;
}

const bool all_allocation_fit_limit = std::all_of(
check_mem_size_args.sizes.cbegin(),
check_mem_size_args.sizes.cend(), [&](size_t s) {
const bool fit = s < gpu_max_alloc_capacity;
if (!fit) {
BENCHDNN_PRINT(1,
"[CHECK_MEM][%s]: Allocation of size %s "
"doesn't fit allocation limit of %s.\n",
dir_c_str(), smart_bytes(s).c_str(),
smart_bytes(gpu_max_alloc_capacity).c_str());
}
return fit;
});
if (!all_allocation_fit_limit) {
res->state = SKIPPED;
res->reason = skip_reason::not_enough_ram;
}

BENCHDNN_PRINT((!fits_device_ram ? 1 : 6),
"[CHECK_MEM][%s]: Requested: %s; benchdnn_device_limit: %s; "
"device_RAM_capacity: %s; gpu_max_alloc: %s;\n",
"device_RAM_capacity: %s;\n",
dir_c_str(),
smart_bytes(check_mem_size_args.total_size_device).c_str(),
smart_bytes(benchdnn_device_limit).c_str(),
smart_bytes(gpu_device_capacity).c_str(),
smart_bytes(gpu_max_alloc_capacity).c_str());
smart_bytes(gpu_device_capacity).c_str());
}

// Note: in theory, `total_size_ref` itself can be smaller for a `prim_ref`
Expand Down
27 changes: 1 addition & 26 deletions tests/benchdnn/dnnl_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ struct cpu_cache_args_t {
};

size_t get_cpu_ram_size();
int get_gpu_ram_sizes(size_t &ram_size, size_t &max_alloc_size);
int get_gpu_ram_size(size_t &ram_size);
int get_cpu_cache_size(cpu_cache_args_t &cache_args);
int get_gpu_cache_size(size_t &cache_size);

Expand Down Expand Up @@ -425,31 +425,6 @@ int create_primitive(benchdnn_dnnl_wrapper_t<dnnl_primitive_t> &primw,
/* need_skip = */ !is_graph_ref),
WARN);

// The library scratchpad is allocated at create_primitive stage. The memory
// check is moved after the creation stage. It's necessary to check the
// library scratchpad size against gpu_max_alloc, otherwise, out_of_memory
// would be issued by the library.
if (res->mem_size_args.scratchpad_size > 0 && is_gpu()
&& query_scratchpad_mode(query_attr(pdw))
== dnnl_scratchpad_mode_library) {
static size_t gpu_device_capacity = 0;
static size_t gpu_max_alloc_capacity = 0;
SAFE(get_gpu_ram_sizes(gpu_device_capacity, gpu_max_alloc_capacity),
WARN);
const bool fit
= res->mem_size_args.scratchpad_size < gpu_max_alloc_capacity;
if (!fit) {
BENCHDNN_PRINT(1,
"[CHECK_MEM]: Size of the scratchpad %s "
"doesn't fit the allocation limit of %s.\n",
smart_bytes(res->mem_size_args.scratchpad_size).c_str(),
smart_bytes(gpu_max_alloc_capacity).c_str());
res->state = SKIPPED;
res->reason = skip_reason::not_enough_ram;
return OK;
}
}

TIME_C_PRIM(DNN_SAFE(dnnl_primitive_create(&prim, pdw), WARN));
primw.reset(prim);

Expand Down
3 changes: 1 addition & 2 deletions tests/benchdnn/graph/graph_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,7 @@ size_t get_benchdnn_cpu_limit() {
size_t get_benchdnn_device_limit() {
if (is_cpu()) return 0;
static size_t gpu_device_capacity = 0;
static size_t gpu_max_alloc_capacity = 0;
SAFE(get_gpu_ram_sizes(gpu_device_capacity, gpu_max_alloc_capacity), WARN);
SAFE(get_gpu_ram_size(gpu_device_capacity), WARN);

const double benchdnn_device_limit = capacity_factor * gpu_device_capacity;
assert(benchdnn_device_limit > 0);
Expand Down
Loading