Open
Description
With an FP16 (`Float16`) input, the following example
using CUDA, KernelAbstractions, Atomix
"""
    index_fun_fixed(arr; backend=get_backend(arr))

Allocate an array like `arr`, fill it with zeros, run `my_kernel_fixed!` on `backend`
over an `ndrange` spanning the first two dimensions of `arr`, and return the output array.
"""
function index_fun_fixed(arr; backend=get_backend(arr))
    # `fill!` returns the array it was given, so allocation and zeroing can be chained.
    result = fill!(similar(arr), 0)
    launch! = my_kernel_fixed!(backend)
    launch!(result, arr; ndrange=(size(arr, 1), size(arr, 2)))
    return result
end
# Kernel over a 2-D index space. The work-item at global index (r, c) walks every
# position `row` along the first dimension of `out` and atomically adds `arr[r, c]`
# to `out[row, r]`.
# NOTE(review): `out` is indexed with `r` in its second dimension, which presumably
# assumes `arr`/`out` are square — confirm against the intended use.
@kernel function my_kernel_fixed!(out, arr)
    r, c = @index(Global, NTuple)
    nrows = size(out, 1)
    for row in 1:nrows
        Atomix.@atomic out[row, r] += arr[r, c]
    end
end
# Reproducer input: a 50×50 Float16 array of zeros, uploaded to the GPU for the call.
img_f16 = zeros(Float16, 50, 50)
index_fun_fixed(CuArray(img_f16))
throws the following LLVM error while the kernel is being compiled:
ERROR: LLVM error: Cannot select: 0x434d1250: f16,ch = AtomicLoadFAdd<(load store seq_cst (s16) on %ir.15, addrspace 1)> 0x43223230:1, 0x434d0fe0, 0x43223230, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ /home/arl/.julia/packages/UnsafeAtomicsLLVM/R7s6h/src/atomics.jl:259 @[ /home/arl/.julia/packages/UnsafeAtomicsLLVM/R7s6h/src/atomics.jl:259 @[ /home/arl/.julia/packages/UnsafeAtomicsLLVM/R7s6h/src/atomics.jl:363 @[ /home/arl/.julia/packages/UnsafeAtomicsLLVM/R7s6h/src/internal.jl:20 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:33 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ]
0x434d0fe0: i64 = add 0x43222f58, Constant:i64<-2>, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:147 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:52 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:103 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ]
0x43222f58: i64 = add 0x434cd998, 0x43222598, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:147 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:52 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:103 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ]
0x434cd998: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %5, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:147 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:52 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:103 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ]
0x41eb8eb8: i64 = Register %5
0x43222598: i64 = shl 0x43222600, Constant:i32<1>, int.jl:88 @[ abstractarray.jl:1244 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:52 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:103 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ]
0x43222600: i64 = add 0x43222bb0, 0x434d0a98, int.jl:87 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
0x43222bb0: i64 = mul 0x434cde78, 0x41eb9538, int.jl:88 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
0x434cde78: i64 = AssertZext 0x41eb97a8, ValueType:ch:i63, int.jl:88 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
0x41eb97a8: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %23, int.jl:88 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
0x434d08f8: i64 = Register %23
0x41eb9538: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %24, int.jl:88 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
0x43222c18: i64 = Register %24
0x434d0a98: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %12, int.jl:87 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
0x434cde10: i64 = Register %12
0x41eb9190: i32 = Constant<1>
0x41eb8aa8: i64 = Constant<-2>
0x43223230: f16,ch = load<(load (s16) from %ir.64, !tbaa !511, addrspace 1)> 0x45ac35c8, 0x434d1180, undef:i64, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ none:0 @[ none:0 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:85 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:91 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:85 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:164 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:175 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ]
0x434d1180: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %27, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ none:0 @[ none:0 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:85 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:91 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:85 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:164 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:175 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ]
0x43222a78: i64 = Register %27
0x43222ce8: i64 = undef
In function: _Z20gpu_my_kernel_fixed_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES4_IS5_EEE7NDRangeILi2ES0_S0_S2_ILi2ES3_IS4_IS5_ES4_IS5_EEES2_ILi2ES3_IS4_IS5_ES4_IS5_EEEEE13CuDeviceArrayI7Float16Li2ELi1EES7_IS8_Li2ELi1EE
Stacktrace:
[1] handle_error(reason::Cstring)
@ LLVM ~/.julia/packages/LLVM/6cDbl/src/core/context.jl:168
[2] LLVMTargetMachineEmitToMemoryBuffer(T::LLVM.TargetMachine, M::LLVM.Module, codegen::LLVM.API.LLVMCodeGenFileType, ErrorMessage::Base.RefValue{…}, OutMemBuf::Base.RefValue{…})
@ LLVM.API ~/.julia/packages/LLVM/6cDbl/lib/15/libLLVM.jl:5318
[3] emit(tm::LLVM.TargetMachine, mod::LLVM.Module, filetype::LLVM.API.LLVMCodeGenFileType)
@ LLVM ~/.julia/packages/LLVM/6cDbl/src/targetmachine.jl:45
[4] mcgen
@ ~/.julia/packages/GPUCompiler/nWT2N/src/mcgen.jl:84 [inlined]
[5] mcgen(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, mod::LLVM.Module, format::LLVM.API.LLVMCodeGenFileType)
@ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/compilation.jl:131
[6] macro expansion
@ ~/.julia/packages/TimerOutputs/Lw5SP/src/TimerOutput.jl:253 [inlined]
[7] macro expansion
@ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:481 [inlined]
[8] macro expansion
@ ~/.julia/packages/TimerOutputs/Lw5SP/src/TimerOutput.jl:253 [inlined]
[9] macro expansion
@ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:478 [inlined]
[10] emit_asm(job::GPUCompiler.CompilerJob, ir::LLVM.Module; strip::Bool, validate::Bool, format::LLVM.API.LLVMCodeGenFileType)
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/utils.jl:103
[11] emit_asm
@ ~/.julia/packages/GPUCompiler/nWT2N/src/utils.jl:97 [inlined]
[12]
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:156
[13] codegen
@ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:115 [inlined]
[14]
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:111
[15] compile
@ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:103 [inlined]
[16] #1145
@ ~/.julia/packages/CUDA/75aiI/src/compiler/compilation.jl:254 [inlined]
[17] JuliaContext(f::CUDA.var"#1145#1148"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}}; kwargs::@Kwargs{})
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:52
[18] JuliaContext(f::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:42
[19] compile(job::GPUCompiler.CompilerJob)
@ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/compilation.jl:253
[20] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/execution.jl:128
[21] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/execution.jl:103
[22] macro expansion
@ ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:369 [inlined]
[23] macro expansion
@ ./lock.jl:267 [inlined]
[24] cufunction(f::typeof(gpu_my_kernel_fixed!), tt::Type{Tuple{…}}; kwargs::@Kwargs{always_inline::Bool, maxthreads::Nothing})
@ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:364
[25] macro expansion
@ ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:112 [inlined]
[26] (::KernelAbstractions.Kernel{…})(::CuArray{…}, ::Vararg{…}; ndrange::Tuple{…}, workgroupsize::Nothing)
@ CUDA.CUDAKernels ~/.julia/packages/CUDA/75aiI/src/CUDAKernels.jl:103
[27] Kernel
@ ~/.julia/packages/CUDA/75aiI/src/CUDAKernels.jl:89 [inlined]
[28] #index_fun_fixed#1
@ ./REPL[96]:5 [inlined]
[29] index_fun_fixed(arr::CuArray{Float16, 2, CUDA.DeviceMemory})
@ Main ./REPL[96]:1
[30] top-level scope
@ REPL[110]:1
Some type information was truncated. Use `show(err)` to see complete types.
The same example works fine with FP32 (`Float32`) inputs.
Julia and package versions:
julia> versioninfo()
Julia Version 1.10.4
Commit 48d4fd48430 (2024-06-04 10:41 UTC)
Build Info:
Official https://julialang.org/ release
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 24 × AMD Ryzen 9 5900X 12-Core Processor
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-15.0.7 (ORCJIT, znver3)
Threads: 1 default, 0 interactive, 1 GC (on 24 virtual cores)
(TestKA) pkg> st
Project TestKA v0.1.10
Status `~/Projects/TestKA.jl/Project.toml`
[a9b6321e] Atomix v0.1.0
[052768ef] CUDA v5.4.2
[63c18a36] KernelAbstractions v0.9.21
Metadata
Metadata
Assignees
Labels
No labels