Merge pull request #44 from arhik/main
atomics support
arhik authored Jul 12, 2024
2 parents e130cff + bfe0dd6 commit 2f8c002
Showing 15 changed files with 521 additions and 36 deletions.
43 changes: 43 additions & 0 deletions examples/atomic.jl
@@ -0,0 +1,43 @@
using Revise
using WGPUCompute
using Test

# Silly example
# Need to come up with a better example

empty!(task_local_storage())

function atomiccount_kernel(hist::WgpuArray{T, N1}, x::WgpuArray{T, N2}, iSize::UInt32) where {T, N1, N2}
    gId = workgroupId.x*workgroupId.y + localId.x
    stride = workgroupDims.x*workgroupCount.x
    @wgpuatomic a::UInt32    # local atomic accumulator
    val = x[gId]
    a = hist[val]            # seed from the current bin count
    while gId < iSize        # grid-stride loop over the input
        val = x[gId]
        a += T(1)
        gId += stride
    end
    hist[val] = a            # write the count back to the last-seen bin
end

function atomiccount(x::WgpuArray{T, N1}, hist::WgpuArray{S, N2}) where {T, S, N1, N2}
    y = WgpuArray{UInt32}(undef, nbins)
    copyto!(y, hist)
    @wgpukernel(
        launch=true,
        workgroupSizes=(64,),
        workgroupCount=(1,),
        shmem=(),
        atomiccount_kernel(y, x, reduce(*, size(x)) |> UInt32)
    )
    return y
end

nbins = 10
x = WgpuArray{UInt32}(rand(UInt32, 64) .% nbins)
count = WgpuArray{UInt32}(zeros(UInt32, nbins))

z = atomiccount(x, count)

# histogram(x)
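The same binning used by the CPU check in examples/histogram.jl below can serve as a baseline here. A minimal sketch (count_cpu is a hypothetical name; since the kernel above is acknowledged as a placeholder, its output need not match a true histogram):

# CPU per-bin counts for x (sketch), using the 1-based binning from
# examples/histogram.jl below; a true histogram baseline to compare against.
count_cpu = zeros(UInt32, nbins)
for v in (x |> collect)
    count_cpu[v % nbins + 1] += UInt32(1)
end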
17 changes: 15 additions & 2 deletions examples/cast_kernel.jl
@@ -1,4 +1,5 @@
using WGPUCompute
using Test

function cast_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
    xdim = workgroupDims.x
@@ -9,14 +10,26 @@ function cast_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
    out[gId] = S(ceil(x[gId]))
end

function cast_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
    gId = xDims.x*globalId.y + globalId.x
    out[gId] = S(ceil(x[gId]))
end

function cast(S::DataType, x::WgpuArray{T, N}) where {T, N}
    y = WgpuArray{S}(undef, size(x))
    @wgpukernel launch=true workgroupSizes=(4, 4) workgroupCount=(2, 2) shmem=() cast_kernel(x, y)
    return y
end

x = rand(Float32, 8, 8) .- 0.5f0

x_gpu = WgpuArray{Float32}(x)
z_gpu = cast(UInt32, x_gpu)
z_cpu = z_gpu |> collect

z = UInt32.(x .> 0.0)

@test z ≈ z_cpu

# TODO Bool cast is not working yet
# y = cast(Bool, x)
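The reference line z = UInt32.(x .> 0.0) works because, for inputs in (-0.5, 0.5), ceil maps negatives to -0.0 and positives to 1.0, so it agrees with the kernel's S(ceil(x[gId])). A minimal CPU sketch of that equivalence (x_ref is an illustrative name):

# CPU equivalent of the kernel's S(ceil(x[gId])) for S = UInt32,
# valid for inputs in (-0.5, 0.5): UInt32(-0.0f0) == 0, UInt32(1.0f0) == 1.
x_ref = rand(Float32, 8, 8) .- 0.5f0
@assert UInt32.(ceil.(x_ref)) == UInt32.(x_ref .> 0.0f0)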
18 changes: 12 additions & 6 deletions examples/clamp_kernel.jl
@@ -1,20 +1,26 @@
using Revise
using WGPUCompute
using Test

function clamp_kernel(x::WgpuArray{T, N}, out::WgpuArray{T, N}, minval::T, maxval::T) where {T, N}
    gId = xDims.x * globalId.y + globalId.x
    value = x[gId]
    out[gId] = clamp(value, minval, maxval)
end


function Base.clamp(x::WgpuArray{T, N}, minValue::T, maxValue::T) where {T, N}
    y = similar(x)
    @wgpukernel launch=true workgroupSizes=size(y) workgroupCount=(1, 1) shmem=() clamp_kernel(x, y, minValue, maxValue)
    return y
end

x = WgpuArray{Float32, 2}(rand(16, 16))

y = Base.clamp(x, 0.2f0, 0.5f0)
y_cpu = y |> collect

@testset "Clamp minimum and maximum" begin
    @test minimum(y_cpu) == 0.2f0
    @test maximum(y_cpu) == 0.5f0
end
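Beyond checking the endpoints, the GPU result can be compared elementwise against Base's broadcast clamp; a minimal sketch, assuming the kernel mirrors CPU clamp semantics exactly (x_cpu is an illustrative name):

# Elementwise cross-check against the CPU clamp (sketch).
x_cpu = x |> collect
@test y_cpu == clamp.(x_cpu, 0.2f0, 0.5f0)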
43 changes: 43 additions & 0 deletions examples/divfree_reduce_kernel.jl
@@ -0,0 +1,43 @@
using Revise
using WGPUCompute
using Test

empty!(task_local_storage())

function divfree_reduce_kernel(x::WgpuArray{T,N}, out::WgpuArray{T,N}, op::Function) where {T,N}
    gId = xDims.x * globalId.y + globalId.x
    W = Float32(xDims.x * xDims.y)
    steps = UInt32(ceil(log2(W)))
    out[gId] = x[gId]
    base = 2.0f0
    for itr in 0:steps
        exponent = Float32(steps - itr - 1)
        baseexp = pow(base, exponent)
        stride = UInt32(baseexp)
        if localId.x < stride
            out[gId] = op(out[gId], out[gId + stride])
        end
    end
end

function divfree_reduce(x::WgpuArray{T,N}, op::Function) where {T,N}
    y = WgpuArray{T}(undef, size(x))
    @wgpukernel(
        launch = true,
        workgroupSizes = (8, 8),
        workgroupCount = (1, 1),
        shmem = (),
        divfree_reduce_kernel(x, y, op)
    )
    return (y |> collect)
end

x = WgpuArray{Float32}(rand(Float32, 8, 8))
z = divfree_reduce(x, +)

x_cpu = (x |> collect)

sum_cpu = sum(x_cpu)
sum_gpu = (z|>collect)[1]

@test sum_cpu ≈ sum_gpu
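The divergence-free pattern halves the stride each step instead of branching on thread parity: for 64 elements the strides are 32, 16, 8, 4, 2, 1, and only invocations with localId.x < stride do any work, so active invocations stay contiguous. A serial CPU re-enactment of that schedule (a sketch with illustrative names v and stride; the GPU version additionally relies on the steps executing in order):

# Halving-stride tree reduction on the CPU (sketch, op = +).
v = vec(x |> collect)
stride = length(v) ÷ 2          # 32 for the 8x8 input
while stride >= 1
    for i in 1:stride           # "threads" below the stride combine pairs
        v[i] += v[i + stride]
    end
    stride ÷= 2
end
@assert v[1] ≈ sum(x |> collect)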
13 changes: 13 additions & 0 deletions examples/gpuarrays.jl
@@ -0,0 +1,13 @@
using Revise
using WGPUCompute

x = WgpuArray{UInt32}(rand(UInt32, 10, 10))
y = WgpuArray{WAtomic{UInt32}}(undef, 10, 10)

cntxt = WGPUCompute.WgpuKernelContext()

# Raw device copy between the two buffers, then the higher-level copyto!
# in both directions between the plain and atomic-typed arrays.
Base.unsafe_copyto!(WGPUCompute.device(y), pointer(y, 1), pointer(x, 1), reduce(*, size(x)))

copyto!(y, x)

copyto!(x, y)

42 changes: 42 additions & 0 deletions examples/histogram.jl
@@ -0,0 +1,42 @@
using Revise
using WGPUCompute
using Test

empty!(task_local_storage())

function histogram_kernel(hist::WgpuArray{WAtomic{T}, N1}, x::WgpuArray{T, N2}, iSize::UInt32) where {T, N1, N2}
    gId = workgroupId.x*workgroupId.y + localId.x
    stride = workgroupDims.x*workgroupCount.x
    while gId < iSize    # grid-stride loop: atomically bump each value's bin
        val = x[gId]
        hist[val] += T(1)
        gId += stride
    end
end

function histogram(x::WgpuArray{T, N1}, hist::WgpuArray{S, N2}) where {T, S, N1, N2}
    y = WgpuArray{WAtomic{UInt32}}(undef, nbins)
    copyto!(y, hist)
    @wgpukernel(
        launch=true,
        workgroupSizes=(64,),
        workgroupCount=(1,),
        shmem=(),
        histogram_kernel(y, x, reduce(*, size(x)) |> UInt32)
    )
    copyto!(hist, y)
    return hist
end

nbins = 10
x = WgpuArray{UInt32}(rand(UInt32, 64) .% nbins)
hist = WgpuArray{UInt32}(zeros(UInt32, nbins))

z = histogram(x, hist)

hist_cpu = zeros(UInt32, nbins)
for i in (x |> collect)
    hist_cpu[i%nbins + 1] += 1
end

@test hist_cpu ≈ hist |> collect
26 changes: 26 additions & 0 deletions examples/localarray.jl
@@ -0,0 +1,26 @@
using Revise
using WGPUCompute
using Test

function localarray_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
    a = Vec4{Float32}(1.0f0, 2.0f0, 3.0f0, 4.0f0)
    gId = xDims.x*globalId.y + globalId.x
    out[gId] = S(ceil(x[gId]))
end

function localarray(S::DataType, x::WgpuArray{T, N}) where {T, N}
    y = WgpuArray{S}(undef, size(x))
    @wgpukernel launch=true workgroupSizes=(4, 4) workgroupCount=(2, 2) shmem=() localarray_kernel(x, y)
    return y
end

x = rand(Float32, 8, 8) .- 0.5f0

x_gpu = WgpuArray{Float32}(x)
z_gpu = localarray(UInt32, x_gpu)
z_cpu = z_gpu |> collect

z = UInt32.(x .> 0.0)

@test z ≈ z_cpu

2 changes: 1 addition & 1 deletion examples/matmul_kernel.jl
@@ -46,4 +46,4 @@ out = matmul(x, y)

xcpu*ycpu

@test (xcpu*ycpu) ≈ (out |> collect)
43 changes: 43 additions & 0 deletions examples/naive_reduce_mul.jl
@@ -0,0 +1,43 @@
using Revise
using WGPUCompute
using Test

empty!(task_local_storage())

function naive_reduce_kernel(x::WgpuArray{T,N}, out::WgpuArray{T,N}, op::Function) where {T,N}
    gId = xDims.x * globalId.y + globalId.x
    W = Float32(xDims.x * xDims.y)
    steps = UInt32(ceil(log2(W)))
    out[gId] = x[gId]
    base = 2.0f0
    for itr in 0:steps
        if gId%2 == 0
            exponent = Float32(itr)
            baseexp = pow(base, exponent)
            stride = UInt32(baseexp)
            out[gId] = op(out[gId], out[gId + stride])
        end
    end
end

function naive_reduce(x::WgpuArray{T,N}, op::Function) where {T,N}
    y = WgpuArray{T}(undef, size(x))
    @wgpukernel(
        launch = true,
        workgroupSizes = (8, 8),
        workgroupCount = (1, 1),
        shmem = (),
        naive_reduce_kernel(x, y, op)
    )
    return (y |> collect)
end

x = WgpuArray{Float32}(rand(Float32, 8, 8))
z = naive_reduce(x, *)

x_cpu = (x |> collect)

mul_cpu = reduce(*, x_cpu)
mul_gpu = (z|>collect)[1]

@test mul_cpu ≈ mul_gpu
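For reference, a doubling-stride reduction normally pairs index i with i + stride only when i is a multiple of 2*stride; the kernel above uses a looser gId % 2 == 0 guard. A serial CPU sketch of the stricter scheme, with 1-based Julia indexing and illustrative names v and stride:

# Doubling-stride tree reduction on the CPU (sketch, op = *).
v = vec(x |> collect)
stride = 1
while stride < length(v)
    for i in 0:2*stride:length(v)-1-stride    # 0-based ids that own a pair
        v[i + 1] = v[i + 1] * v[i + stride + 1]
    end
    stride *= 2
end
@assert v[1] ≈ reduce(*, x |> collect)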
43 changes: 43 additions & 0 deletions examples/naive_reduce_plus.jl
@@ -0,0 +1,43 @@
using Revise
using WGPUCompute
using Test

empty!(task_local_storage())

function naive_reduce_kernel(x::WgpuArray{T,N}, out::WgpuArray{T,N}, op::Function) where {T,N}
    gId = xDims.x * globalId.y + globalId.x
    W = Float32(xDims.x * xDims.y)
    steps = UInt32(ceil(log2(W)))
    out[gId] = x[gId]
    base = 2.0f0
    for itr in 0:steps
        if gId%2 == 0
            exponent = Float32(itr)
            baseexp = pow(base, exponent)
            stride = UInt32(baseexp)
            out[gId] = op(out[gId], out[gId + stride])
        end
    end
end

function naive_reduce(x::WgpuArray{T,N}, op::Function) where {T,N}
    y = WgpuArray{T}(undef, size(x))
    @wgpukernel(
        launch = true,
        workgroupSizes = (8, 8),
        workgroupCount = (1, 1),
        shmem = (),
        naive_reduce_kernel(x, y, op)
    )
    return (y |> collect)
end

x = WgpuArray{Float32}(rand(Float32, 8, 8))
z = naive_reduce(x, +)

x_cpu = (x |> collect)

sum_cpu = sum(x_cpu)
sum_gpu = (z|>collect)[1]

@test sum_cpu ≈ sum_gpu
3 changes: 2 additions & 1 deletion examples/reduce_kernel.jl
@@ -13,7 +13,8 @@ function naive_reduce_kernel(x::WgpuArray{T,N}, out::WgpuArray{T,N}) where {T,N}
    for itr in 0:steps
        if gId%2 == 0
            exponent = Float32(itr)
            baseexp = pow(base, exponent)
            stride = UInt32(baseexp)
            out[gId] += out[gId + stride]
        end
    end