diff --git a/platforms/artic/intrinsics_thorin.impala b/platforms/artic/intrinsics_thorin.impala index 93d28e60..fa627426 100644 --- a/platforms/artic/intrinsics_thorin.impala +++ b/platforms/artic/intrinsics_thorin.impala @@ -12,11 +12,12 @@ #[import(cc = "thorin")] fn cmpxchg_weak[T](_addr: &mut T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); // only for integer data types #[import(cc = "thorin")] fn fence(_order: u32, _scope: &[u8]) -> (); #[import(cc = "thorin")] fn pe_info[T](_src: &[u8], _val: T) -> (); -#[import(cc = "thorin")] fn cuda(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); -#[import(cc = "thorin")] fn nvvm(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); -#[import(cc = "thorin")] fn opencl(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); -#[import(cc = "thorin")] fn amdgpu_hsa(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); -#[import(cc = "thorin")] fn amdgpu_pal(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> (); +#[import(cc = "thorin", name = "cuda")] fn cuda_with_lmem(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _lmem: i32, _body: fn() -> ()) -> (); // _lmem (all *_with_lmem launchers): NOTE(review) presumably the dynamic local memory size for the launch - confirm against the thorin backend +#[import(cc = "thorin", name = "nvvm")] fn nvvm_with_lmem(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _lmem: i32, _body: fn() -> ()) -> (); +#[import(cc = "thorin", name = "opencl")] fn opencl_with_lmem(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _lmem: i32, _body: fn() -> ()) -> (); +#[import(cc = "thorin", name = "amdgpu_hsa")] fn amdgpu_hsa_with_lmem(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _lmem: i32, _body: fn() -> ()) -> (); +#[import(cc = "thorin", name = "amdgpu_pal")] fn amdgpu_pal_with_lmem(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _lmem: i32, _body: fn() -> ()) -> (); +#[import(cc = "thorin")] fn local_memory_base()
-> &mut addrspace(3)[u8]; #[import(cc = "thorin")] fn reserve_shared[T](_size: i32) -> &mut addrspace(3)[T]; #[import(cc = "thorin")] fn hls(_dev: i32, _body: fn() -> ()) -> (); #[import(cc = "thorin", name = "pipeline")] fn thorin_pipeline(_initiation_interval: i32, _lower: i32, _upper: i32, _body: fn(i32) -> ()) -> (); // only for HLS/OpenCL backend @@ -36,6 +37,12 @@ #[import(cc = "thorin", name = "cmpxchg_weak")] fn cmpxchg_weak_p1[T](_addr: &mut addrspace(1)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); #[import(cc = "thorin", name = "cmpxchg_weak")] fn cmpxchg_weak_p3[T](_addr: &mut addrspace(3)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); +fn @cuda(dev: i32, grid: (i32, i32, i32), block: (i32, i32, i32), body: fn() -> ()) = cuda_with_lmem(dev, grid, block, 0, body); +fn @nvvm(dev: i32, grid: (i32, i32, i32), block: (i32, i32, i32), body: fn() -> ()) = nvvm_with_lmem(dev, grid, block, 0, body); +fn @opencl(dev: i32, grid: (i32, i32, i32), block: (i32, i32, i32), body: fn() -> ()) = opencl_with_lmem(dev, grid, block, 0, body); +fn @amdgpu_hsa(dev: i32, grid: (i32, i32, i32), block: (i32, i32, i32), body: fn() -> ()) = amdgpu_hsa_with_lmem(dev, grid, block, 0, body); +fn @amdgpu_pal(dev: i32, grid: (i32, i32, i32), block: (i32, i32, i32), body: fn() -> ()) = amdgpu_pal_with_lmem(dev, grid, block, 0, body); + fn @pipeline(body: fn(i32) -> ()) = @|initiation_interval: i32, lower: i32, upper: i32| thorin_pipeline(initiation_interval, lower, upper, body); fn @parallel(body: fn(i32) -> ()) = @|num_threads: i32, lower: i32, upper: i32| thorin_parallel(num_threads, lower, upper, body); fn @spawn(body: fn() -> ()) = @|| thorin_spawn(body); diff --git a/platforms/impala/intrinsics_thorin.impala b/platforms/impala/intrinsics_thorin.impala index d83079db..c7a4c0af 100644 --- a/platforms/impala/intrinsics_thorin.impala +++ b/platforms/impala/intrinsics_thorin.impala @@ -10,10 +10,12 
@@ extern "thorin" { fn insert[T, U](T, i32, U) -> T; //fn shuffle[T](T, T, T) -> T; - fn cuda(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> (); - fn nvvm(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> (); - fn opencl(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> (); - fn amdgpu(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> (); + fn "cuda" cuda_with_lmem(i32, (i32, i32, i32), (i32, i32, i32), i32, fn() -> ()) -> (); + fn "nvvm" nvvm_with_lmem(i32, (i32, i32, i32), (i32, i32, i32), i32, fn() -> ()) -> (); + fn "opencl" opencl_with_lmem(i32, (i32, i32, i32), (i32, i32, i32), i32, fn() -> ()) -> (); + fn "amdgpu_hsa" amdgpu_hsa_with_lmem(i32, (i32, i32, i32), (i32, i32, i32), i32, fn() -> ()) -> (); + fn "amdgpu_pal" amdgpu_pal_with_lmem(i32, (i32, i32, i32), (i32, i32, i32), i32, fn() -> ()) -> (); + fn local_memory_base() -> &mut[3][u8]; fn reserve_shared[T](i32) -> &mut[3][T]; fn hls(dev: i32, body: fn() -> ()) -> (); @@ -42,3 +44,9 @@ extern "thorin" { fn vectorize(vector_length: i32, body: fn(i32) -> ()) -> (); } + +fn @cuda(dev: i32, grid: (i32, i32, i32), block: (i32, i32, i32), body: fn() -> ()) { cuda_with_lmem(dev, grid, block, 0, body) } +fn @nvvm(dev: i32, grid: (i32, i32, i32), block: (i32, i32, i32), body: fn() -> ()) { nvvm_with_lmem(dev, grid, block, 0, body) } +fn @opencl(dev: i32, grid: (i32, i32, i32), block: (i32, i32, i32), body: fn() -> ()) { opencl_with_lmem(dev, grid, block, 0, body) } +fn @amdgpu_hsa(dev: i32, grid: (i32, i32, i32), block: (i32, i32, i32), body: fn() -> ()) { amdgpu_hsa_with_lmem(dev, grid, block, 0, body) } +fn @amdgpu_pal(dev: i32, grid: (i32, i32, i32), block: (i32, i32, i32), body: fn() -> ()) { amdgpu_pal_with_lmem(dev, grid, block, 0, body) } diff --git a/src/anydsl_runtime.cpp b/src/anydsl_runtime.cpp index e369f1bc..a4bf6252 100644 --- a/src/anydsl_runtime.cpp +++ b/src/anydsl_runtime.cpp @@ -118,6 +118,7 @@ void anydsl_copy( void anydsl_launch_kernel( int32_t mask, const char* file_name, 
const char* kernel_name, const uint32_t* grid, const uint32_t* block, + uint32_t lmem, void** arg_data, const uint32_t* arg_sizes, const uint32_t* arg_aligns, @@ -129,6 +130,7 @@ void anydsl_launch_kernel( kernel_name, grid, block, + lmem, { arg_data, arg_sizes, diff --git a/src/anydsl_runtime.h b/src/anydsl_runtime.h index 0f3d331e..2b9dc71e 100644 --- a/src/anydsl_runtime.h +++ b/src/anydsl_runtime.h @@ -37,6 +37,7 @@ AnyDSL_runtime_API void anydsl_copy(int32_t, const void*, int64_t, int32_t, void AnyDSL_runtime_API void anydsl_launch_kernel( int32_t, const char*, const char*, const uint32_t*, const uint32_t*, + uint32_t, void**, const uint32_t*, const uint32_t*, const uint32_t*, const uint8_t*, uint32_t); AnyDSL_runtime_API void anydsl_synchronize(int32_t); diff --git a/src/cuda_platform.cpp b/src/cuda_platform.cpp index 32d9e7e1..cd2c245b 100644 --- a/src/cuda_platform.cpp +++ b/src/cuda_platform.cpp @@ -217,7 +217,9 @@ void CudaPlatform::launch_kernel(DeviceId dev, const LaunchParams& launch_params launch_params.grid[1] / launch_params.block[1], launch_params.grid[2] / launch_params.block[2], launch_params.block[0], launch_params.block[1], launch_params.block[2], - 0, nullptr, launch_params.args.data, nullptr); + launch_params.lmem, + nullptr, + launch_params.args.data, nullptr); CHECK_CUDA(err, "cuLaunchKernel()"); if (runtime_->profiling_enabled()) { diff --git a/src/hsa_platform.cpp b/src/hsa_platform.cpp index 5f68d1c9..c7c7ed70 100644 --- a/src/hsa_platform.cpp +++ b/src/hsa_platform.cpp @@ -409,7 +409,8 @@ void HSAPlatform::launch_kernel(DeviceId dev, const LaunchParams& launch_params) aql.kernel_object = kernel_info.kernel; aql.kernarg_address = kernel_info.kernarg_segment; aql.private_segment_size = kernel_info.private_segment_size; - aql.group_segment_size = kernel_info.group_segment_size; + // Round the static group segment size up to a multiple of 16, then append the launch-time local memory request. + aql.group_segment_size = (kernel_info.group_segment_size + 15) / 16 * 16 + launch_params.lmem; // write to command queue const
uint64_t index = hsa_queue_load_write_index_relaxed(queue); diff --git a/src/opencl_platform.cpp b/src/opencl_platform.cpp index 2b9d59ec..ef6ae0bc 100644 --- a/src/opencl_platform.cpp +++ b/src/opencl_platform.cpp @@ -375,7 +375,8 @@ void OpenCLPlatform::launch_kernel(DeviceId dev, const LaunchParams& launch_para cl_mem struct_buf = clCreateBuffer(devices_[dev].ctx, flags, launch_params.args.sizes[i], launch_params.args.data[i], &err); CHECK_OPENCL(err, "clCreateBuffer()"); kernel_structs[i] = struct_buf; - clSetKernelArg(kernel, i, sizeof(cl_mem), &kernel_structs[i]); + err = clSetKernelArg(kernel, i, sizeof(cl_mem), &kernel_structs[i]); + CHECK_OPENCL(err, "clSetKernelArg()"); } else { #ifdef CL_VERSION_2_0 if (launch_params.args.types[i] == KernelArgType::Ptr && devices_[dev].version_major == 2) { @@ -391,6 +392,11 @@ void OpenCLPlatform::launch_kernel(DeviceId dev, const LaunchParams& launch_para } } + if (launch_params.lmem != 0) { + cl_int err = clSetKernelArg(kernel, launch_params.num_args, launch_params.lmem, nullptr); + CHECK_OPENCL(err, "clSetKernelArg()"); + } + size_t global_work_size[] = {launch_params.grid [0], launch_params.grid [1], launch_params.grid [2]}; size_t local_work_size[] = {launch_params.block[0], launch_params.block[1], launch_params.block[2]}; diff --git a/src/pal_platform.cpp b/src/pal_platform.cpp index 98962d42..d154cdce 100644 --- a/src/pal_platform.cpp +++ b/src/pal_platform.cpp @@ -215,6 +215,7 @@ void PALPlatform::launch_kernel(DeviceId dev, const LaunchParams& launch_params) Pal::PipelineBindParams params = {}; params.pipelineBindPoint = Pal::PipelineBindPoint::Compute; params.pPipeline = pipeline; + params.cs.ldsBytesPerTg = launch_params.lmem; // TODO: add static LDS size constexpr Pal::HwPipePoint pipe_point = Pal::HwPipePostCs; Pal::BarrierInfo barrier_info = {}; diff --git a/src/runtime.h b/src/runtime.h index a9af9891..79a9731a 100644 --- a/src/runtime.h +++ b/src/runtime.h @@ -35,6 +35,7 @@ struct LaunchParams { const 
char* kernel_name; const uint32_t* grid; const uint32_t* block; + uint32_t lmem; ParamsArgs args; uint32_t num_args; };