@@ -73,8 +73,10 @@ Uses block y index to decide which values to operate on.
73
73
sync_threads ()
74
74
blockIdx_yz = (blockIdx (). z - 1 i32) * gridDim (). y + blockIdx (). y
75
75
idx0 = lo + (blockIdx_yz - 1 i32) * blockDim (). x + threadIdx (). x
76
- val = idx0 <= hi ? values[idx0] : one (eltype (values))
77
- comparison = flex_lt (pivot, val, parity, lt, by)
76
+ @inbounds if idx0 <= hi
77
+ val = values[idx0]
78
+ comparison = flex_lt (pivot, val, parity, lt, by)
79
+ end
78
80
79
81
@inbounds if idx0 <= hi
80
82
sums[threadIdx (). x] = 1 & comparison
@@ -85,9 +87,11 @@ Uses block y index to decide which values to operate on.
85
87
86
88
cumsum! (sums)
87
89
88
- dest_idx = @inbounds comparison ? blockDim (). x - sums[end ] + sums[threadIdx (). x] : threadIdx (). x - sums[threadIdx (). x]
89
- @inbounds if idx0 <= hi && dest_idx <= length (swap)
90
- swap[dest_idx] = val
90
+ @inbounds if idx0 <= hi
91
+ dest_idx = @inbounds comparison ? blockDim (). x - sums[end ] + sums[threadIdx (). x] : threadIdx (). x - sums[threadIdx (). x]
92
+ if dest_idx <= length (swap)
93
+ swap[dest_idx] = val
94
+ end
91
95
end
92
96
sync_threads ()
93
97
@@ -180,10 +184,8 @@ Must only run on 1 SM.
180
184
c = n_eff () - d
181
185
to_move = min (b, c)
182
186
sync_threads ()
183
- swap = if threadIdx (). x <= to_move
184
- vals[lo + a + threadIdx (). x]
185
- else
186
- zero (eltype (vals)) # unused value
187
+ if threadIdx (). x <= to_move
188
+ swap = vals[lo + a + threadIdx (). x]
187
189
end
188
190
sync_threads ()
189
191
if threadIdx (). x <= to_move
@@ -215,7 +217,6 @@ function bitonic_median(vals :: AbstractArray{T}, swap, lo, L, stride, lt::F1, b
215
217
216
218
@inbounds swap[threadIdx (). x] = vals[lo + threadIdx (). x * stride]
217
219
sync_threads ()
218
- old_val = zero (eltype (swap))
219
220
220
221
log_blockDim = begin
221
222
out = 0
@@ -269,10 +270,8 @@ elements spaced by `stride`. Good for sampling pivot values as well as short sor
269
270
for level in 0 : L
270
271
# get left/right neighbor depending on even/odd level
271
272
buddy = threadIdx (). x - 1 i32 + 2 i32 * (1 i32 & (threadIdx (). x % 2 i32 != level % 2 i32))
272
- buddy_val = if 1 <= buddy <= L && threadIdx (). x <= L
273
- swap[buddy]
274
- else
275
- zero (eltype (swap)) # unused value
273
+ if 1 <= buddy <= L && threadIdx (). x <= L
274
+ buddy_val = swap[buddy]
276
275
end
277
276
sync_threads ()
278
277
if 1 <= buddy <= L && threadIdx (). x <= L
@@ -738,7 +737,7 @@ Each view is indexed along block x dim: one view per pseudo-block
738
737
@inbounds swap[threadIdx (). x, threadIdx (). y] = vals[index+ one (I)]
739
738
end
740
739
sync_threads ()
741
- return @view swap[:, threadIdx (). y]
740
+ return @inbounds @ view swap[:, threadIdx (). y]
742
741
end
743
742
744
743
"""
0 commit comments