1 change: 1 addition & 0 deletions Project.toml
@@ -5,6 +5,7 @@ version = "0.8.0"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
119 changes: 119 additions & 0 deletions examples/histogram.jl
@@ -0,0 +1,119 @@
using KernelAbstractions, Test
using Atomix: @atomic, @atomicswap, @atomicreplace
include(joinpath(@__DIR__, "utils.jl")) # Load backend


# Serial CPU implementation used as the correctness baseline
function create_histogram(input)
    histogram_output = zeros(Int, maximum(input))
    for i in eachindex(input)
        histogram_output[input[i]] += 1
    end
    return histogram_output
end

# This is a 1D histogram kernel where the histogramming happens in shared
# memory (shmem)
@kernel function histogram_kernel!(histogram_output, input)
    tid = @index(Global, Linear)
    lid = @index(Local, Linear)

    @uniform gs = @groupsize()[1]
    @uniform N = length(histogram_output)

    shared_histogram = @localmem Int (gs)

    # This loop walks through all input elements and assigns each one to a
    # location in shmem. Note that if there is not enough shmem to cover every
    # bin, we process the bins in separate shmem blocks. For example, if shmem
    # holds 256 bins but a value of 312 is possible, we use 2 separate shmem
    # blocks: one covering bins 1->256, and another covering bins 257->512
    @uniform max_element = 1
    for min_element = 1:gs:N

        # Setting shared_histogram to 0
        @inbounds shared_histogram[lid] = 0
        @synchronize()

        max_element = min_element + gs
        if max_element > N
            max_element = N + 1
        end

        # Defining bin on shared memory and writing to it if possible
        bin = input[tid]
        if bin >= min_element && bin < max_element
            bin -= min_element - 1
            # @atomic on the CPU backend works through a raw pointer, so keep
            # the shmem buffer rooted for the duration of the update
            GC.@preserve shared_histogram begin
                @atomic shared_histogram[bin] += 1
            end
        end

        @synchronize()

        # Flush this shmem block back into the global histogram
        if (lid + min_element - 1) <= N
            @atomic histogram_output[lid + min_element - 1] += shared_histogram[lid]
        end

    end

end

function histogram!(histogram_output, input;
                    numcores = 4, numthreads = 256)

    if isa(input, Array)
        kernel! = histogram_kernel!(CPU(), numcores)
    else
        kernel! = histogram_kernel!(CUDADevice(), numthreads)
    end

    kernel!(histogram_output, input, ndrange=size(input))
end

@testset "histogram tests" begin

    rand_input = [rand(1:128) for i = 1:1000]
    linear_input = [i for i = 1:1024]
    all_2 = [2 for i = 1:512]

    histogram_rand_baseline = create_histogram(rand_input)
    histogram_linear_baseline = create_histogram(linear_input)
    histogram_2_baseline = create_histogram(all_2)

    if Base.VERSION >= v"1.7.0"
        CPU_rand_histogram = zeros(Int, 128)
        CPU_linear_histogram = zeros(Int, 1024)
        CPU_2_histogram = zeros(Int, 2)

        wait(histogram!(CPU_rand_histogram, rand_input))
        wait(histogram!(CPU_linear_histogram, linear_input))
        wait(histogram!(CPU_2_histogram, all_2))

        @test isapprox(CPU_rand_histogram, histogram_rand_baseline)
        @test isapprox(CPU_linear_histogram, histogram_linear_baseline)
        @test isapprox(CPU_2_histogram, histogram_2_baseline)
    end

    if has_cuda_gpu()
        CUDA.allowscalar(false)

        GPU_rand_input = CuArray(rand_input)
        GPU_linear_input = CuArray(linear_input)
        GPU_2_input = CuArray(all_2)

        GPU_rand_histogram = CuArray(zeros(Int, 128))
        GPU_linear_histogram = CuArray(zeros(Int, 1024))
        GPU_2_histogram = CuArray(zeros(Int, 2))

        wait(histogram!(GPU_rand_histogram, GPU_rand_input))
        wait(histogram!(GPU_linear_histogram, GPU_linear_input))
        wait(histogram!(GPU_2_histogram, GPU_2_input))

        @test isapprox(Array(GPU_rand_histogram), histogram_rand_baseline)
        @test isapprox(Array(GPU_linear_histogram), histogram_linear_baseline)
        @test isapprox(Array(GPU_2_histogram), histogram_2_baseline)
    end

end
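
For quick reference, a condensed sketch of the pattern the example above exercises: an atomic read-modify-write inside a @kernel body, launched through the event-based API shown in this PR. The kernel and variable names below are illustrative, not part of the diff, and the CPU path assumes Julia 1.7+ (matching the version guard in the tests).

using KernelAbstractions
using Atomix: @atomic

@kernel function count_kernel!(counts, input)
    i = @index(Global, Linear)
    @atomic counts[input[i]] += 1  # concurrent increments of the same bin stay correct
end

input = rand(1:8, 1000)
counts = zeros(Int, 8)
event = count_kernel!(CPU(), 64)(counts, input, ndrange = length(input))
wait(event)
@assert sum(counts) == length(input)
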
1 change: 1 addition & 0 deletions lib/CUDAKernels/Project.toml
@@ -8,6 +8,7 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"

[compat]
Adapt = "3.0"
1 change: 1 addition & 0 deletions lib/CUDAKernels/src/CUDAKernels.jl
@@ -5,6 +5,7 @@ import StaticArrays
import StaticArrays: MArray
import Adapt
import KernelAbstractions
import UnsafeAtomicsLLVM

export CUDADevice

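
Reading this diff, the import of UnsafeAtomicsLLVM appears to be the whole CUDA-side integration: it extends the atomics layer that Atomix lowers to with LLVM intrinsics usable inside GPU kernels, so simply loading it makes @atomic work on CUDADevice. That is an inference from the absence of other changes in this file, not something the PR states explicitly.
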
1 change: 1 addition & 0 deletions src/KernelAbstractions.jl
@@ -5,6 +5,7 @@ export @Const, @localmem, @private, @uniform, @synchronize, @index, @groupsize,
export Device, GPU, CPU, Event, MultiEvent, NoneEvent
export async_copy!

import Atomix: @atomic, @atomicswap, @atomicreplace

using LinearAlgebra
using MacroTools
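
For reference, the three macros imported here mirror Base's field-atomics syntax but target array elements. A minimal sketch on a plain Julia array, per Atomix's documented behavior (the exact return shapes are an assumption and may vary across Atomix versions):

using Atomix: @atomic, @atomicswap, @atomicreplace

v = zeros(Int, 3)
@atomic v[1] += 1                  # atomic read-modify-write
old = @atomicswap v[2] = 5         # atomically store 5; returns the previous value (0)
res = @atomicreplace v[3] 0 => 7   # compare-and-swap; res.success == true, v[3] == 7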