Skip to content

LLVM error when using Atomix.@atomic with FP16 #486

Open
@ymtoo

Description

@ymtoo

With an FP16 input, the following example

using CUDA, KernelAbstractions, Atomix

"""
    index_fun_fixed(arr; backend=get_backend(arr))

Run `my_kernel_fixed!` on `arr` and return the array it writes into.

The output is created with `similar(arr)` and filled with `0` before the launch.
The kernel is instantiated for `backend` and launched with an `ndrange` made of
the first two extents of `arr`.
"""
function index_fun_fixed(arr; backend=get_backend(arr))
	result = fill!(similar(arr), 0)
	launch! = my_kernel_fixed!(backend)
	nrows = size(arr, 1)
	ncols = size(arr, 2)
	launch!(result, arr; ndrange=(nrows, ncols))
	return result
end

# Kernel used by the reproducer: for the global index pair (i, j), atomically
# adds `arr[i, j]` into `out[k, i]` for every `k` in `1:size(out, 1)`.
@kernel function my_kernel_fixed!(out, arr)
	# Two-component global index of this work item.
	i, j = @index(Global, NTuple)
	for k in 1:size(out, 1)
		# This atomic add is the statement the reported LLVM error points at:
		# with Float16 elements the backend cannot select `AtomicLoadFAdd` on f16.
		Atomix.@atomic out[k, i] += arr[i, j]
	end
end

# Reproducer input: a 50×50 array of Float16 zeros, wrapped in a `CuArray`
# before being passed to `index_fun_fixed`.
img_f16 = zeros(Float16, (50, 50))
index_fun_fixed(CuArray(img_f16))

throws an error.

ERROR: LLVM error: Cannot select: 0x434d1250: f16,ch = AtomicLoadFAdd<(load store seq_cst (s16) on %ir.15, addrspace 1)> 0x43223230:1, 0x434d0fe0, 0x43223230, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ /home/arl/.julia/packages/UnsafeAtomicsLLVM/R7s6h/src/atomics.jl:259 @[ /home/arl/.julia/packages/UnsafeAtomicsLLVM/R7s6h/src/atomics.jl:259 @[ /home/arl/.julia/packages/UnsafeAtomicsLLVM/R7s6h/src/atomics.jl:363 @[ /home/arl/.julia/packages/UnsafeAtomicsLLVM/R7s6h/src/internal.jl:20 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:33 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ]
  0x434d0fe0: i64 = add 0x43222f58, Constant:i64<-2>, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:147 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:52 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:103 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ]
    0x43222f58: i64 = add 0x434cd998, 0x43222598, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:147 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:52 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:103 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ]
      0x434cd998: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %5, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:147 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:52 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:103 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ]
        0x41eb8eb8: i64 = Register %5
      0x43222598: i64 = shl 0x43222600, Constant:i32<1>, int.jl:88 @[ abstractarray.jl:1244 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:52 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:103 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ]
        0x43222600: i64 = add 0x43222bb0, 0x434d0a98, int.jl:87 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
          0x43222bb0: i64 = mul 0x434cde78, 0x41eb9538, int.jl:88 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
            0x434cde78: i64 = AssertZext 0x41eb97a8, ValueType:ch:i63, int.jl:88 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
              0x41eb97a8: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %23, int.jl:88 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
                0x434d08f8: i64 = Register %23
            0x41eb9538: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %24, int.jl:88 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
              0x43222c18: i64 = Register %24
          0x434d0a98: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %12, int.jl:87 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
            0x434cde10: i64 = Register %12
        0x41eb9190: i32 = Constant<1>
    0x41eb8aa8: i64 = Constant<-2>
  0x43223230: f16,ch = load<(load (s16) from %ir.64, !tbaa !511, addrspace 1)> 0x45ac35c8, 0x434d1180, undef:i64, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ none:0 @[ none:0 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:85 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:91 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:85 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:164 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:175 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ]
    0x434d1180: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %27, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ none:0 @[ none:0 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:85 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:91 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:85 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:164 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:175 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ]
      0x43222a78: i64 = Register %27
    0x43222ce8: i64 = undef
In function: _Z20gpu_my_kernel_fixed_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES4_IS5_EEE7NDRangeILi2ES0_S0_S2_ILi2ES3_IS4_IS5_ES4_IS5_EEES2_ILi2ES3_IS4_IS5_ES4_IS5_EEEEE13CuDeviceArrayI7Float16Li2ELi1EES7_IS8_Li2ELi1EE
Stacktrace:
  [1] handle_error(reason::Cstring)
    @ LLVM ~/.julia/packages/LLVM/6cDbl/src/core/context.jl:168
  [2] LLVMTargetMachineEmitToMemoryBuffer(T::LLVM.TargetMachine, M::LLVM.Module, codegen::LLVM.API.LLVMCodeGenFileType, ErrorMessage::Base.RefValue{…}, OutMemBuf::Base.RefValue{…})
    @ LLVM.API ~/.julia/packages/LLVM/6cDbl/lib/15/libLLVM.jl:5318
  [3] emit(tm::LLVM.TargetMachine, mod::LLVM.Module, filetype::LLVM.API.LLVMCodeGenFileType)
    @ LLVM ~/.julia/packages/LLVM/6cDbl/src/targetmachine.jl:45
  [4] mcgen
    @ ~/.julia/packages/GPUCompiler/nWT2N/src/mcgen.jl:84 [inlined]
  [5] mcgen(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, mod::LLVM.Module, format::LLVM.API.LLVMCodeGenFileType)
    @ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/compilation.jl:131
  [6] macro expansion
    @ ~/.julia/packages/TimerOutputs/Lw5SP/src/TimerOutput.jl:253 [inlined]
  [7] macro expansion
    @ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:481 [inlined]
  [8] macro expansion
    @ ~/.julia/packages/TimerOutputs/Lw5SP/src/TimerOutput.jl:253 [inlined]
  [9] macro expansion
    @ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:478 [inlined]
 [10] emit_asm(job::GPUCompiler.CompilerJob, ir::LLVM.Module; strip::Bool, validate::Bool, format::LLVM.API.LLVMCodeGenFileType)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/utils.jl:103
 [11] emit_asm
    @ ~/.julia/packages/GPUCompiler/nWT2N/src/utils.jl:97 [inlined]
 [12] 
    @ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:156
 [13] codegen
    @ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:115 [inlined]
 [14] 
    @ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:111
 [15] compile
    @ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:103 [inlined]
 [16] #1145
    @ ~/.julia/packages/CUDA/75aiI/src/compiler/compilation.jl:254 [inlined]
 [17] JuliaContext(f::CUDA.var"#1145#1148"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}}; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:52
 [18] JuliaContext(f::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:42
 [19] compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/compilation.jl:253
 [20] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/execution.jl:128
 [21] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/execution.jl:103
 [22] macro expansion
    @ ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:369 [inlined]
 [23] macro expansion
    @ ./lock.jl:267 [inlined]
 [24] cufunction(f::typeof(gpu_my_kernel_fixed!), tt::Type{Tuple{…}}; kwargs::@Kwargs{always_inline::Bool, maxthreads::Nothing})
    @ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:364
 [25] macro expansion
    @ ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:112 [inlined]
 [26] (::KernelAbstractions.Kernel{…})(::CuArray{…}, ::Vararg{…}; ndrange::Tuple{…}, workgroupsize::Nothing)
    @ CUDA.CUDAKernels ~/.julia/packages/CUDA/75aiI/src/CUDAKernels.jl:103
 [27] Kernel
    @ ~/.julia/packages/CUDA/75aiI/src/CUDAKernels.jl:89 [inlined]
 [28] #index_fun_fixed#1
    @ ./REPL[96]:5 [inlined]
 [29] index_fun_fixed(arr::CuArray{Float16, 2, CUDA.DeviceMemory})
    @ Main ./REPL[96]:1
 [30] top-level scope
    @ REPL[110]:1
Some type information was truncated. Use `show(err)` to see complete types.

It works fine on FP32 inputs.

Julia and package versions:

julia> versioninfo()
Julia Version 1.10.4
Commit 48d4fd48430 (2024-06-04 10:41 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 24 × AMD Ryzen 9 5900X 12-Core Processor
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-15.0.7 (ORCJIT, znver3)
Threads: 1 default, 0 interactive, 1 GC (on 24 virtual cores)

(TestKA) pkg> st
Project TestKA v0.1.10
Status `~/Projects/TestKA.jl/Project.toml`
  [a9b6321e] Atomix v0.1.0
  [052768ef] CUDA v5.4.2
  [63c18a36] KernelAbstractions v0.9.21

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions