gpu: intel: ocl: allow unlimited allocations

spalicki · spalicki · commit 79caa933371d · 2025-11-24T09:32:29.000-08:00
diff --git a/src/gpu/intel/compute/device_info.hpp b/src/gpu/intel/compute/device_info.hpp
@@ -216,6 +216,7 @@ struct device_info_t {
     static int max_slm_size_per_tg(gpu_arch_t gpu_arch);
     static int max_slm_size_per_tg(
             gpu_arch_t gpu_arch, int tg_size, bool large_grf_mode = false);
+    size_t memory_size() const { return memory_size_; }
     size_t l3_cache_size() const { return l3_cache_size_; }
     size_t icache_size() const;
     size_t max_kernel_param_size() const { return max_kernel_param_size_; }
@@ -288,6 +289,7 @@ struct device_info_t {
     int32_t max_subgroup_size_ = 16;
     int max_exec_size_ = 0;
     size_t max_wg_size_ = 0;
+    size_t memory_size_ = 0;
     size_t l3_cache_size_ = 0;
     size_t max_kernel_param_size_ = 1024;
     uint32_t device_address_bits_ = 64;
diff --git a/src/gpu/intel/compute/dispatch_reusable.hpp b/src/gpu/intel/compute/dispatch_reusable.hpp
@@ -534,6 +534,7 @@ class reusable_dispatch_t {
         dim_t max_buffer_size = 0;
         uint64_t max_buffer_size_bytes = 0;
         compile_params.num_buffers = buffers.size();
+        const size_t num_work_items = nd_range.global_range().nelems();
         for (size_t buf_idx = 0; buf_idx < buffers.size(); buf_idx++) {
             const named_buffer_t &buffer = buffers[buf_idx];
             // Copy buffer name into params
@@ -553,10 +554,10 @@ class reusable_dispatch_t {
             compile_params.buffer_types[buf_idx] = buffer.data_type;
 
             // Check buffer sizes to see if we can use int32_t offsets
-            // or do we need to use stateless addressing model
+            // and do we need to use stateless addressing model
             max_buffer_size = std::max(max_buffer_size, buffer.nelems(true));
-            max_buffer_size_bytes = std::max(
-                    max_buffer_size_bytes, buffer.size(0, true, true));
+            max_buffer_size_bytes = std::max(max_buffer_size_bytes,
+                    buffer.size(0, true, true) * num_work_items);
         }
 
         compile_params.use_int32_offset = max_buffer_size <= INT32_MAX;
diff --git a/src/gpu/intel/ocl/device_info.cpp b/src/gpu/intel/ocl/device_info.cpp
@@ -134,6 +134,12 @@ status_t device_info_t::init_attributes(impl::engine_t *engine) {
     OCL_CHECK(err);
     max_wg_size_ = max_wg_size;
 
+    cl_ulong mem_size;
+    err = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(mem_size),
+            &mem_size, nullptr);
+    OCL_CHECK(err);
+    memory_size_ = mem_size;
+
     cl_ulong mem_cache_size;
     err = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,
             sizeof(mem_cache_size), &mem_cache_size, nullptr);
diff --git a/src/gpu/intel/ocl/usm_utils.cpp b/src/gpu/intel/ocl/usm_utils.cpp
@@ -14,10 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "xpu/ocl/engine_impl.hpp"
-#include "xpu/ocl/stream_impl.hpp"
-
 #include "gpu/intel/ocl/usm_utils.hpp"
+#include "gpu/intel/ocl/engine.hpp"
 
 namespace dnnl {
 namespace impl {
@@ -42,6 +40,27 @@ cl_command_queue get_ocl_queue(impl::stream_t *stream) {
     return utils::downcast<xpu::ocl::stream_impl_t *>(stream->impl())->queue();
 }
 
+const compute::device_info_t *get_device_info(impl::engine_t *engine) {
+    return utils::downcast<const ocl::engine_t *>(engine)->device_info();
+}
+
+template <typename F>
+void *usm_malloc_common(impl::engine_t *engine, size_t size, F ext_func) {
+    auto device_info = get_device_info(engine);
+
+    if (size == 0 || size > device_info->memory_size()) return nullptr;
+    bool large_buffer = size > device_info->max_allocation_size();
+    cl_bitfield large_buffer_flag[]
+            = {CL_MEM_FLAGS_INTEL, CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL, 0};
+
+    cl_int err;
+    void *p = ext_func(engine, get_ocl_context(engine), get_ocl_device(engine),
+            large_buffer ? large_buffer_flag : nullptr, size, 0, &err);
+    assert(utils::one_of(err, CL_SUCCESS, CL_OUT_OF_RESOURCES,
+            CL_OUT_OF_HOST_MEMORY, CL_INVALID_BUFFER_SIZE));
+    return p;
+}
+
 } // namespace
 
 bool is_usm_supported(impl::engine_t *engine) {
@@ -55,13 +74,18 @@ bool is_usm_supported(impl::engine_t *engine) {
 void *malloc_host(impl::engine_t *engine, size_t size) {
     using clHostMemAllocINTEL_func_t = void *(*)(cl_context, const cl_ulong *,
             size_t, cl_uint, cl_int *);
-
-    if (size == 0) return nullptr;
-
     static xpu::ocl::ext_func_t<clHostMemAllocINTEL_func_t> ext_func(
             "clHostMemAllocINTEL");
+    auto device_info = get_device_info(engine);
+
+    if (size == 0 || size > device_info->memory_size()) return nullptr;
+    bool large_buffer = size > device_info->max_allocation_size();
+    cl_bitfield large_buffer_flag[]
+            = {CL_MEM_FLAGS_INTEL, CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL, 0};
+
     cl_int err;
-    void *p = ext_func(engine, get_ocl_context(engine), nullptr, size, 0, &err);
+    void *p = ext_func(engine, get_ocl_context(engine),
+            large_buffer ? large_buffer_flag : nullptr, size, 0, &err);
     assert(utils::one_of(err, CL_SUCCESS, CL_OUT_OF_RESOURCES,
             CL_OUT_OF_HOST_MEMORY, CL_INVALID_BUFFER_SIZE));
     return p;
@@ -70,38 +94,21 @@ void *malloc_host(impl::engine_t *engine, size_t size) {
 void *malloc_device(impl::engine_t *engine, size_t size) {
     using clDeviceMemAllocINTEL_func_t = void *(*)(cl_context, cl_device_id,
             cl_ulong *, size_t, cl_uint, cl_int *);
-
-    if (size == 0) return nullptr;
-
     static xpu::ocl::ext_func_t<clDeviceMemAllocINTEL_func_t> ext_func(
             "clDeviceMemAllocINTEL");
-    cl_int err;
-    void *p = ext_func(engine, get_ocl_context(engine), get_ocl_device(engine),
-            nullptr, size, 0, &err);
-    assert(utils::one_of(err, CL_SUCCESS, CL_OUT_OF_RESOURCES,
-            CL_OUT_OF_HOST_MEMORY, CL_INVALID_BUFFER_SIZE));
-    return p;
+    return usm_malloc_common(engine, size, ext_func);
 }
 
 void *malloc_shared(impl::engine_t *engine, size_t size) {
     using clSharedMemAllocINTEL_func_t = void *(*)(cl_context, cl_device_id,
             cl_ulong *, size_t, cl_uint, cl_int *);
-
-    if (size == 0) return nullptr;
-
     static xpu::ocl::ext_func_t<clSharedMemAllocINTEL_func_t> ext_func(
             "clSharedMemAllocINTEL");
-    cl_int err;
-    void *p = ext_func(engine, get_ocl_context(engine), get_ocl_device(engine),
-            nullptr, size, 0, &err);
-    assert(utils::one_of(err, CL_SUCCESS, CL_OUT_OF_RESOURCES,
-            CL_OUT_OF_HOST_MEMORY, CL_INVALID_BUFFER_SIZE));
-    return p;
+    return usm_malloc_common(engine, size, ext_func);
 }
 
 void free(impl::engine_t *engine, void *ptr) {
     using clMemFreeINTEL_func_t = cl_int (*)(cl_context, void *);
-
     if (!ptr) return;
     static xpu::ocl::ext_func_t<clMemFreeINTEL_func_t> ext_func(
             "clMemFreeINTEL");
diff --git a/src/gpu/intel/sycl/device_info.cpp b/src/gpu/intel/sycl/device_info.cpp
@@ -138,6 +138,7 @@ status_t device_info_t::init_attributes(impl::engine_t *engine) {
         eu_count_ = device.get_info<::sycl::info::device::max_compute_units>();
     }
     max_wg_size_ = device.get_info<::sycl::info::device::max_work_group_size>();
+    memory_size_ = device.get_info<::sycl::info::device::global_mem_size>();
     l3_cache_size_
             = device.get_info<::sycl::info::device::global_mem_cache_size>();
     mayiuse_system_memory_allocators_
diff --git a/src/xpu/ocl/buffer_memory_storage.cpp b/src/xpu/ocl/buffer_memory_storage.cpp
@@ -24,11 +24,34 @@
 #include "xpu/ocl/stream_impl.hpp"
 #include "xpu/ocl/usm_utils.hpp"
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
+#include "gpu/intel/ocl/engine.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace xpu {
 namespace ocl {
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
+status_t buffer_memory_storage_t::init_allocate(size_t size) {
+    auto ocl_engine
+            = utils::downcast<const gpu::intel::ocl::engine_t *>(engine());
+
+    cl_mem_flags flags = CL_MEM_READ_WRITE;
+    if (size > ocl_engine->device_info()->max_allocation_size()) {
+        if (size > ocl_engine->device_info()->memory_size())
+            return status::invalid_arguments;
+        flags |= CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL;
+    }
+
+    cl_int err;
+    mem_object_ = clCreateBuffer_wrapper(
+            ocl_engine->context(), flags, size, nullptr, &err);
+    OCL_CHECK(err);
+    return status::success;
+}
+#else
 status_t buffer_memory_storage_t::init_allocate(size_t size) {
     auto context
             = utils::downcast<const xpu::ocl::engine_impl_t *>(engine()->impl())
@@ -39,6 +62,7 @@ status_t buffer_memory_storage_t::init_allocate(size_t size) {
     OCL_CHECK(err);
     return status::success;
 }
+#endif
 
 namespace {
 status_t get_map_queue(cl_command_queue &queue, impl::engine_t *engine,
diff --git a/src/xpu/ocl/utils.hpp b/src/xpu/ocl/utils.hpp
@@ -28,6 +28,11 @@
 
 #include "xpu/utils.hpp"
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
+#define CL_MEM_FLAGS_INTEL 0x10001
+#define CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL (1 << 23)
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace xpu {
diff --git a/tests/benchdnn/dnnl_common.cpp b/tests/benchdnn/dnnl_common.cpp
@@ -1028,9 +1028,9 @@ size_t get_cpu_ram_size() {
 }
 #endif
 
-int get_gpu_ram_sizes(size_t &ram_size, size_t &max_alloc_size) {
+int get_gpu_ram_size(size_t &ram_size) {
     if (!is_gpu()) return OK;
-    if (ram_size > 0 && max_alloc_size > 0) return OK;
+    if (ram_size > 0) return OK;
 
 #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
     auto eng = dnnl::engine(get_test_engine(), true);
@@ -1042,26 +1042,16 @@ int get_gpu_ram_sizes(size_t &ram_size, size_t &max_alloc_size) {
             sizeof(cl_ulong), &ram_sz, nullptr);
     if (status != CL_SUCCESS) return FAIL;
 
-    cl_ulong max_alloc_sz = 0;
-    status = clGetDeviceInfo(ocl_device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
-            sizeof(cl_ulong), &max_alloc_sz, nullptr);
-    if (status != CL_SUCCESS) return FAIL;
-
     ram_size = (size_t)ram_sz;
-    max_alloc_size = (size_t)max_alloc_sz;
     return OK;
 #elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_DPCPP
     auto eng = dnnl::engine(get_test_engine(), true);
     auto sycl_dev = dnnl::sycl_interop::get_device(eng);
     ram_size = (size_t)sycl_dev
                        .get_info<::sycl::info::device::global_mem_size>();
-    max_alloc_size
-            = (size_t)sycl_dev
-                      .get_info<::sycl::info::device::max_mem_alloc_size>();
     return OK;
 #endif
     ram_size = 0;
-    max_alloc_size = 0;
     return OK;
 }
 
@@ -1132,8 +1122,7 @@ int check_total_size(res_t *res, dnnl_primitive_t prim_ref) {
 
     static size_t cpu_device_capacity = get_cpu_ram_size();
     static size_t gpu_device_capacity = 0;
-    static size_t gpu_max_alloc_capacity = 0;
-    SAFE(get_gpu_ram_sizes(gpu_device_capacity, gpu_max_alloc_capacity), WARN);
+    SAFE(get_gpu_ram_size(gpu_device_capacity), WARN);
 
     const size_t device_max_capacity
             = is_cpu() ? cpu_device_capacity : gpu_device_capacity;
@@ -1171,32 +1160,13 @@ int check_total_size(res_t *res, dnnl_primitive_t prim_ref) {
             res->reason = skip_reason::not_enough_ram;
         }
 
-        const bool all_allocation_fit_limit = std::all_of(
-                check_mem_size_args.sizes.cbegin(),
-                check_mem_size_args.sizes.cend(), [&](size_t s) {
-                    const bool fit = s < gpu_max_alloc_capacity;
-                    if (!fit) {
-                        BENCHDNN_PRINT(1,
-                                "[CHECK_MEM][%s]: Allocation of size %s "
-                                "doesn't fit allocation limit of %s.\n",
-                                dir_c_str(), smart_bytes(s).c_str(),
-                                smart_bytes(gpu_max_alloc_capacity).c_str());
-                    }
-                    return fit;
-                });
-        if (!all_allocation_fit_limit) {
-            res->state = SKIPPED;
-            res->reason = skip_reason::not_enough_ram;
-        }
-
         BENCHDNN_PRINT((!fits_device_ram ? 1 : 6),
                 "[CHECK_MEM][%s]: Requested: %s; benchdnn_device_limit: %s; "
-                "device_RAM_capacity: %s; gpu_max_alloc: %s;\n",
+                "device_RAM_capacity: %s;\n",
                 dir_c_str(),
                 smart_bytes(check_mem_size_args.total_size_device).c_str(),
                 smart_bytes(benchdnn_device_limit).c_str(),
-                smart_bytes(gpu_device_capacity).c_str(),
-                smart_bytes(gpu_max_alloc_capacity).c_str());
+                smart_bytes(gpu_device_capacity).c_str());
     }
 
     // Note: in theory, `total_size_ref` itself can be smaller for a `prim_ref`
diff --git a/tests/benchdnn/dnnl_common.hpp b/tests/benchdnn/dnnl_common.hpp
@@ -231,7 +231,7 @@ struct cpu_cache_args_t {
 };
 
 size_t get_cpu_ram_size();
-int get_gpu_ram_sizes(size_t &ram_size, size_t &max_alloc_size);
+int get_gpu_ram_size(size_t &ram_size);
 int get_cpu_cache_size(cpu_cache_args_t &cache_args);
 int get_gpu_cache_size(size_t &cache_size);
 
@@ -425,31 +425,6 @@ int create_primitive(benchdnn_dnnl_wrapper_t<dnnl_primitive_t> &primw,
                  /* need_skip = */ !is_graph_ref),
             WARN);
 
-    // The library scratchpad is allocated at create_primitive stage. The memory
-    // check is moved after the creation stage. It's necessary to check the
-    // library scratchpad size against gpu_max_alloc, otherwise, out_of_memory
-    // would be issued by the library.
-    if (res->mem_size_args.scratchpad_size > 0 && is_gpu()
-            && query_scratchpad_mode(query_attr(pdw))
-                    == dnnl_scratchpad_mode_library) {
-        static size_t gpu_device_capacity = 0;
-        static size_t gpu_max_alloc_capacity = 0;
-        SAFE(get_gpu_ram_sizes(gpu_device_capacity, gpu_max_alloc_capacity),
-                WARN);
-        const bool fit
-                = res->mem_size_args.scratchpad_size < gpu_max_alloc_capacity;
-        if (!fit) {
-            BENCHDNN_PRINT(1,
-                    "[CHECK_MEM]: Size of the scratchpad %s "
-                    "doesn't fit the allocation limit of %s.\n",
-                    smart_bytes(res->mem_size_args.scratchpad_size).c_str(),
-                    smart_bytes(gpu_max_alloc_capacity).c_str());
-            res->state = SKIPPED;
-            res->reason = skip_reason::not_enough_ram;
-            return OK;
-        }
-    }
-
     TIME_C_PRIM(DNN_SAFE(dnnl_primitive_create(&prim, pdw), WARN));
     primw.reset(prim);
 
diff --git a/tests/benchdnn/graph/graph_memory.cpp b/tests/benchdnn/graph/graph_memory.cpp
@@ -34,8 +34,7 @@ size_t get_benchdnn_cpu_limit() {
 size_t get_benchdnn_device_limit() {
     if (is_cpu()) return 0;
     static size_t gpu_device_capacity = 0;
-    static size_t gpu_max_alloc_capacity = 0;
-    SAFE(get_gpu_ram_sizes(gpu_device_capacity, gpu_max_alloc_capacity), WARN);
+    SAFE(get_gpu_ram_size(gpu_device_capacity), WARN);
 
     const double benchdnn_device_limit = capacity_factor * gpu_device_capacity;
     assert(benchdnn_device_limit > 0);

Original file line number	Diff line number	Diff line change
`@@ -138,6 +138,7 @@ status_t device_info_t::init_attributes(impl::engine_t *engine) {`
`138`	`138`	`eu_count_ = device.get_info<::sycl::info::device::max_compute_units>();`
`139`	`139`	`}`
`140`	`140`	`max_wg_size_ = device.get_info<::sycl::info::device::max_work_group_size>();`
	`141`	`+ memory_size_ = device.get_info<::sycl::info::device::global_mem_size>();`
`141`	`142`	`l3_cache_size_`
`142`	`143`	`= device.get_info<::sycl::info::device::global_mem_cache_size>();`
`143`	`144`	`mayiuse_system_memory_allocators_`