Merge pull request #44 from arhik/main
atomics support
arhik authored Jul 12, 2024
2 parents e130cff + bfe0dd6 commit 2f8c002
Showing 15 changed files with 521 additions and 36 deletions.
43 changes: 43 additions & 0 deletions examples/atomic.jl
@@ -0,0 +1,43 @@
using Revise
using WGPUCompute
using Test

# Silly example
# Need to come up with a better example

empty!(task_local_storage())

function atomiccount_kernel(hist::WgpuArray{T, N1}, x::WgpuArray{T, N2}, iSize::UInt32) where {T, N1, N2}
    gId = workgroupId.x*workgroupId.y + localId.x
    stride = workgroupDims.x*workgroupCount.x
    @wgpuatomic a::UInt32    # local atomic accumulator
    val = x[gId]
    a = hist[val]            # seed from the current bin count
    while gId < iSize        # grid-stride loop over the input
        val = x[gId]
        a += T(1)
        gId += stride
    end
    hist[val] = a            # write the count back to the last-seen bin
end

function atomiccount(x::WgpuArray{T, N1}, hist::WgpuArray{S, N2}) where {T, S, N1, N2}
    y = WgpuArray{UInt32}(undef, nbins)
    copyto!(y, hist)
    @wgpukernel(
        launch=true,
        workgroupSizes=(64,),
        workgroupCount=(1,),
        shmem=(),
        atomiccount_kernel(y, x, reduce(*, size(x)) |> UInt32)
    )
    return y
end

nbins = 10
x = WgpuArray{UInt32}(rand(UInt32, 64) .% nbins)
count = WgpuArray{UInt32}(zeros(UInt32, nbins))

z = atomiccount(x, count)

# histogram(x)
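The same binning used by the CPU check in examples/histogram.jl below can serve as a baseline here. A minimal sketch (count_cpu is a hypothetical name; since the kernel above is acknowledged as a placeholder, its output need not match a true histogram):

# CPU per-bin counts for x (sketch), using the 1-based binning from
# examples/histogram.jl below; a true histogram baseline to compare against.
count_cpu = zeros(UInt32, nbins)
for v in (x |> collect)
    count_cpu[v % nbins + 1] += UInt32(1)
end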
17 changes: 15 additions & 2 deletions examples/cast_kernel.jl
@@ -1,4 +1,5 @@
using WGPUCompute
using Test

function cast_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
    xdim = workgroupDims.x
@@ -9,14 +10,26 @@ function cast_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
    out[gId] = S(ceil(x[gId]))
end

function cast_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
    gId = xDims.x*globalId.y + globalId.x
    out[gId] = S(ceil(x[gId]))
end

function cast(S::DataType, x::WgpuArray{T, N}) where {T, N}
    y = WgpuArray{S}(undef, size(x))
    @wgpukernel launch=true workgroupSizes=(4, 4) workgroupCount=(2, 2) shmem=() cast_kernel(x, y)
    return y
end

x = rand(Float32, 8, 8) .- 0.5f0

x_gpu = WgpuArray{Float32}(x)
z_gpu = cast(UInt32, x_gpu)
z_cpu = z_gpu |> collect

z = UInt32.(x .> 0.0)

@test z ≈ z_cpu

# TODO Bool cast is not working yet
# y = cast(Bool, x)
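The reference line z = UInt32.(x .> 0.0) works because, for inputs in (-0.5, 0.5), ceil maps negatives to -0.0 and positives to 1.0, so it agrees with the kernel's S(ceil(x[gId])). A minimal CPU sketch of that equivalence (x_ref is an illustrative name):

# CPU equivalent of the kernel's S(ceil(x[gId])) for S = UInt32,
# valid for inputs in (-0.5, 0.5): UInt32(-0.0f0) == 0, UInt32(1.0f0) == 1.
x_ref = rand(Float32, 8, 8) .- 0.5f0
@assert UInt32.(ceil.(x_ref)) == UInt32.(x_ref .> 0.0f0)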
18 changes: 12 additions & 6 deletions examples/clamp_kernel.jl
@@ -1,20 +1,26 @@
using Revise
using WGPUCompute
using Test

function clamp_kernel(x::WgpuArray{T, N}, out::WgpuArray{T, N}, minval::T, maxval::T) where {T, N}
    gId = xDims.x * globalId.y + globalId.x
    value = x[gId]
    out[gId] = clamp(value, minval, maxval)
end


function Base.clamp(x::WgpuArray{T, N}, minValue::T, maxValue::T) where {T, N}
    y = similar(x)
    @wgpukernel launch=true workgroupSizes=size(y) workgroupCount=(1, 1) shmem=() clamp_kernel(x, y, minValue, maxValue)
    return y
end

x = WgpuArray{Float32, 2}(rand(16, 16))

y = Base.clamp(x, 0.2f0, 0.5f0)
y_cpu = y |> collect

@testset "Clamp minimum and maximum" begin
    @test minimum(y_cpu) == 0.2f0
    @test maximum(y_cpu) == 0.5f0
end
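Beyond checking the endpoints, the GPU result can be compared elementwise against Base's broadcast clamp; a minimal sketch, assuming the kernel mirrors CPU clamp semantics exactly (x_cpu is an illustrative name):

# Elementwise cross-check against the CPU clamp (sketch).
x_cpu = x |> collect
@test y_cpu == clamp.(x_cpu, 0.2f0, 0.5f0)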
43 changes: 43 additions & 0 deletions examples/divfree_reduce_kernel.jl
@@ -0,0 +1,43 @@
using Revise
using WGPUCompute
using Test

empty!(task_local_storage())

function divfree_reduce_kernel(x::WgpuArray{T,N}, out::WgpuArray{T,N}, op::Function) where {T,N}
    gId = xDims.x * globalId.y + globalId.x
    W = Float32(xDims.x * xDims.y)
    steps = UInt32(ceil(log2(W)))
    out[gId] = x[gId]
    base = 2.0f0
    for itr in 0:steps
        exponent = Float32(steps - itr - 1)
        baseexp = pow(base, exponent)
        stride = UInt32(baseexp)
        if localId.x < stride
            out[gId] = op(out[gId], out[gId + stride])
        end
    end
end

function divfree_reduce(x::WgpuArray{T,N}, op::Function) where {T,N}
    y = WgpuArray{T}(undef, size(x))
    @wgpukernel(
        launch = true,
        workgroupSizes = (8, 8),
        workgroupCount = (1, 1),
        shmem = (),
        divfree_reduce_kernel(x, y, op)
    )
    return (y |> collect)
end

x = WgpuArray{Float32}(rand(Float32, 8, 8))
z = divfree_reduce(x, +)

x_cpu = (x |> collect)

sum_cpu = sum(x_cpu)
sum_gpu = (z|>collect)[1]

@test sum_cpu ≈ sum_gpu
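The divergence-free pattern halves the stride each step instead of branching on thread parity: for 64 elements the strides are 32, 16, 8, 4, 2, 1, and only invocations with localId.x < stride do any work, so active invocations stay contiguous. A serial CPU re-enactment of that schedule (a sketch with illustrative names v and stride; the GPU version additionally relies on the steps executing in order):

# Halving-stride tree reduction on the CPU (sketch, op = +).
v = vec(x |> collect)
stride = length(v) ÷ 2          # 32 for the 8x8 input
while stride >= 1
    for i in 1:stride           # "threads" below the stride combine pairs
        v[i] += v[i + stride]
    end
    stride ÷= 2
end
@assert v[1] ≈ sum(x |> collect)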
13 changes: 13 additions & 0 deletions examples/gpuarrays.jl
@@ -0,0 +1,13 @@
using Revise
using WGPUCompute

x = WgpuArray{UInt32}(rand(UInt32, 10, 10))
y = WgpuArray{WAtomic{UInt32}}(undef, 10, 10)

cntxt = WGPUCompute.WgpuKernelContext()

# Raw device copy between the two buffers, then the higher-level copyto!
# in both directions between the plain and atomic-typed arrays.
Base.unsafe_copyto!(WGPUCompute.device(y), pointer(y, 1), pointer(x, 1), reduce(*, size(x)))

copyto!(y, x)

copyto!(x, y)

42 changes: 42 additions & 0 deletions examples/histogram.jl
@@ -0,0 +1,42 @@
using Revise
using WGPUCompute
using Test

empty!(task_local_storage())

function histogram_kernel(hist::WgpuArray{WAtomic{T}, N1}, x::WgpuArray{T, N2}, iSize::UInt32) where {T, N1, N2}
    gId = workgroupId.x*workgroupId.y + localId.x
    stride = workgroupDims.x*workgroupCount.x
    while gId < iSize    # grid-stride loop: atomically bump each value's bin
        val = x[gId]
        hist[val] += T(1)
        gId += stride
    end
end

function histogram(x::WgpuArray{T, N1}, hist::WgpuArray{S, N2}) where {T, S, N1, N2}
    y = WgpuArray{WAtomic{UInt32}}(undef, nbins)
    copyto!(y, hist)
    @wgpukernel(
        launch=true,
        workgroupSizes=(64,),
        workgroupCount=(1,),
        shmem=(),
        histogram_kernel(y, x, reduce(*, size(x)) |> UInt32)
    )
    copyto!(hist, y)
    return hist
end

nbins = 10
x = WgpuArray{UInt32}(rand(UInt32, 64) .% nbins)
hist = WgpuArray{UInt32}(zeros(UInt32, nbins))

z = histogram(x, hist)

hist_cpu = zeros(UInt32, nbins)
for i in (x |> collect)
    hist_cpu[i%nbins + 1] += 1
end

@test hist_cpu ≈ hist |> collect
26 changes: 26 additions & 0 deletions examples/localarray.jl
@@ -0,0 +1,26 @@
using Revise
using WGPUCompute
using Test

function localarray_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
    a = Vec4{Float32}(1.0f0, 2.0f0, 3.0f0, 4.0f0)
    gId = xDims.x*globalId.y + globalId.x
    out[gId] = S(ceil(x[gId]))
end

function localarray(S::DataType, x::WgpuArray{T, N}) where {T, N}
    y = WgpuArray{S}(undef, size(x))
    @wgpukernel launch=true workgroupSizes=(4, 4) workgroupCount=(2, 2) shmem=() localarray_kernel(x, y)
    return y
end

x = rand(Float32, 8, 8) .- 0.5f0

x_gpu = WgpuArray{Float32}(x)
z_gpu = localarray(UInt32, x_gpu)
z_cpu = z_gpu |> collect

z = UInt32.(x .> 0.0)

@test z ≈ z_cpu

2 changes: 1 addition & 1 deletion examples/matmul_kernel.jl
@@ -46,4 +46,4 @@ out = matmul(x, y)

xcpu*ycpu

@test (xcpu*ycpu) ≈ (out |> collect)
43 changes: 43 additions & 0 deletions examples/naive_reduce_mul.jl
@@ -0,0 +1,43 @@
using Revise
using WGPUCompute
using Test

empty!(task_local_storage())

function naive_reduce_kernel(x::WgpuArray{T,N}, out::WgpuArray{T,N}, op::Function) where {T,N}
    gId = xDims.x * globalId.y + globalId.x
    W = Float32(xDims.x * xDims.y)
    steps = UInt32(ceil(log2(W)))
    out[gId] = x[gId]
    base = 2.0f0
    for itr in 0:steps
        if gId%2 == 0
            exponent = Float32(itr)
            baseexp = pow(base, exponent)
            stride = UInt32(baseexp)
            out[gId] = op(out[gId], out[gId + stride])
        end
    end
end

function naive_reduce(x::WgpuArray{T,N}, op::Function) where {T,N}
    y = WgpuArray{T}(undef, size(x))
    @wgpukernel(
        launch = true,
        workgroupSizes = (8, 8),
        workgroupCount = (1, 1),
        shmem = (),
        naive_reduce_kernel(x, y, op)
    )
    return (y |> collect)
end

x = WgpuArray{Float32}(rand(Float32, 8, 8))
z = naive_reduce(x, *)

x_cpu = (x |> collect)

mul_cpu = reduce(*, x_cpu)
mul_gpu = (z|>collect)[1]

@test mul_cpu ≈ mul_gpu
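For reference, a doubling-stride reduction normally pairs index i with i + stride only when i is a multiple of 2*stride; the kernel above uses a looser gId % 2 == 0 guard. A serial CPU sketch of the stricter scheme, with 1-based Julia indexing and illustrative names v and stride:

# Doubling-stride tree reduction on the CPU (sketch, op = *).
v = vec(x |> collect)
stride = 1
while stride < length(v)
    for i in 0:2*stride:length(v)-1-stride    # 0-based ids that own a pair
        v[i + 1] = v[i + 1] * v[i + stride + 1]
    end
    stride *= 2
end
@assert v[1] ≈ reduce(*, x |> collect)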
43 changes: 43 additions & 0 deletions examples/naive_reduce_plus.jl
@@ -0,0 +1,43 @@
using Revise
using WGPUCompute
using Test

empty!(task_local_storage())

function naive_reduce_kernel(x::WgpuArray{T,N}, out::WgpuArray{T,N}, op::Function) where {T,N}
    gId = xDims.x * globalId.y + globalId.x
    W = Float32(xDims.x * xDims.y)
    steps = UInt32(ceil(log2(W)))
    out[gId] = x[gId]
    base = 2.0f0
    for itr in 0:steps
        if gId%2 == 0
            exponent = Float32(itr)
            baseexp = pow(base, exponent)
            stride = UInt32(baseexp)
            out[gId] = op(out[gId], out[gId + stride])
        end
    end
end

function naive_reduce(x::WgpuArray{T,N}, op::Function) where {T,N}
    y = WgpuArray{T}(undef, size(x))
    @wgpukernel(
        launch = true,
        workgroupSizes = (8, 8),
        workgroupCount = (1, 1),
        shmem = (),
        naive_reduce_kernel(x, y, op)
    )
    return (y |> collect)
end

x = WgpuArray{Float32}(rand(Float32, 8, 8))
z = naive_reduce(x, +)

x_cpu = (x |> collect)

sum_cpu = sum(x_cpu)
sum_gpu = (z|>collect)[1]

@test sum_cpu ≈ sum_gpu
3 changes: 2 additions & 1 deletion examples/reduce_kernel.jl
@@ -13,7 +13,8 @@ function naive_reduce_kernel(x::WgpuArray{T,N}, out::WgpuArray{T,N}) where {T,N}
    for itr in 0:steps
        if gId%2 == 0
            exponent = Float32(itr)
            baseexp = pow(base, exponent)
            stride = UInt32(baseexp)
            out[gId] += out[gId + stride]
        end
    end