diff --git a/src/sorting.jl b/src/sorting.jl index c4fc933cc5..cedcf281c3 100644 --- a/src/sorting.jl +++ b/src/sorting.jl @@ -73,8 +73,10 @@ Uses block y index to decide which values to operate on. sync_threads() blockIdx_yz = (blockIdx().z - 1i32) * gridDim().y + blockIdx().y idx0 = lo + (blockIdx_yz - 1i32) * blockDim().x + threadIdx().x - val = idx0 <= hi ? values[idx0] : one(eltype(values)) - comparison = flex_lt(pivot, val, parity, lt, by) + @inbounds if idx0 <= hi + val = values[idx0] + comparison = flex_lt(pivot, val, parity, lt, by) + end @inbounds if idx0 <= hi sums[threadIdx().x] = 1 & comparison @@ -85,9 +87,11 @@ Uses block y index to decide which values to operate on. cumsum!(sums) - dest_idx = @inbounds comparison ? blockDim().x - sums[end] + sums[threadIdx().x] : threadIdx().x - sums[threadIdx().x] - @inbounds if idx0 <= hi && dest_idx <= length(swap) - swap[dest_idx] = val + @inbounds if idx0 <= hi + dest_idx = @inbounds comparison ? blockDim().x - sums[end] + sums[threadIdx().x] : threadIdx().x - sums[threadIdx().x] + if dest_idx <= length(swap) + swap[dest_idx] = val + end end sync_threads() @@ -180,10 +184,8 @@ Must only run on 1 SM. c = n_eff() - d to_move = min(b, c) sync_threads() - swap = if threadIdx().x <= to_move - vals[lo + a + threadIdx().x] - else - zero(eltype(vals)) # unused value + if threadIdx().x <= to_move + swap = vals[lo + a + threadIdx().x] end sync_threads() if threadIdx().x <= to_move @@ -215,7 +217,6 @@ function bitonic_median(vals :: AbstractArray{T}, swap, lo, L, stride, lt::F1, b @inbounds swap[threadIdx().x] = vals[lo + threadIdx().x * stride] sync_threads() - old_val = zero(eltype(swap)) log_blockDim = begin out = 0 @@ -269,10 +270,8 @@ elements spaced by `stride`. Good for sampling pivot values as well as short sor for level in 0:L # get left/right neighbor depending on even/odd level buddy = threadIdx().x - 1i32 + 2i32 * (1i32 & (threadIdx().x % 2i32 != level % 2i32)) - buddy_val = if 1 <= buddy <= L && threadIdx().x <= L - swap[buddy] - else - zero(eltype(swap)) # unused value + if 1 <= buddy <= L && threadIdx().x <= L + buddy_val = swap[buddy] end sync_threads() if 1 <= buddy <= L && threadIdx().x <= L @@ -738,7 +737,7 @@ Each view is indexed along block x dim: one view per pseudo-block @inbounds swap[threadIdx().x, threadIdx().y] = vals[index+one(I)] end sync_threads() - return @view swap[:, threadIdx().y] + return @inbounds @view swap[:, threadIdx().y] end """ diff --git a/test/sorting.jl b/test/sorting.jl index 6a2f48c05b..34245abd1d 100644 --- a/test/sorting.jl +++ b/test/sorting.jl @@ -280,6 +280,9 @@ end end end +# XXX: some tests here make compute-sanitizer hang, but only on CI. +# maybe related to the container set-up? try again once we use Sandbox.jl. + @testset "interface" begin @testset "quicksort" begin # pre-sorted @@ -302,6 +305,7 @@ end @test check_sort!(Float64, 10000, x -> rand(Float64); alg=CUDA.QuickSort) @test check_sort!(Float32, 10000, x -> rand(Float32); alg=CUDA.QuickSort) @test check_sort!(Float16, 10000, x -> rand(Float16); alg=CUDA.QuickSort) + @not_if_sanitize @test check_sort!(Tuple{Int,Int}, 10000, x -> (rand(Int), rand(Int)); alg=CUDA.QuickSort) # non-uniform distributions @test check_sort!(UInt8, 100000, x -> round(255 * rand() ^ 2); alg=CUDA.QuickSort) @@ -345,6 +349,7 @@ end @test check_sort!(Float64, 10000, x -> rand(Float64); alg=CUDA.BitonicSort) @test check_sort!(Float32, 10000, x -> rand(Float32); alg=CUDA.BitonicSort) @test check_sort!(Float16, 10000, x -> rand(Float16); alg=CUDA.BitonicSort) + @not_if_sanitize @test check_sort!(Tuple{Int,Int}, 10000, x -> (rand(Int), rand(Int)); alg=CUDA.BitonicSort) # test various sizes @test check_sort!(Float32, 1, x -> rand(Float32); alg=CUDA.BitonicSort)