small optimizations to GPU kernels
bjarthur committed Aug 16, 2024
1 parent aec8854 commit 3bd69d7
Showing 4 changed files with 64 additions and 34 deletions.
26 changes: 16 additions & 10 deletions src/gpu/loop.jl
@@ -11,19 +11,25 @@ function update_inputs(bspike,
     istride = blockDim().x * gridDim().x
     jstride = blockDim().y * gridDim().y

-    @inbounds for i=i0:istride:length(bspike)
-        bspike[i] || continue
-        for j=j0:jstride:max(size(w0Index,1),size(wpIndexOut,1))
-            @static if p.K>0
-                if j<=size(w0Index,1)
-                    CUDA.@atomic inputsE[w0Index[j,i]] += max(w0Weights[j,i], charge0)
-                    CUDA.@atomic inputsI[w0Index[j,i]] += min(w0Weights[j,i], charge0)
-                end
-            end
-            if j<=size(wpIndexOut,1)
-                CUDA.@atomic inputsP[wpIndexOut[j,i]] += wpWeightOut[j+1,i+1]
-            end
-        end
-    end
+    i = i0
+    jmax = max(size(w0Index,1), size(wpIndexOut,1))
+    @inbounds while i <= length(bspike)
+        if bspike[i]
+            j = j0
+            while j <= jmax
+                @static if p.K>0
+                    if j <= size(w0Index,1)
+                        CUDA.@atomic inputsE[w0Index[j,i]] += max(w0Weights[j,i], charge0)
+                        CUDA.@atomic inputsI[w0Index[j,i]] += min(w0Weights[j,i], charge0)
+                    end
+                end
+                if j <= size(wpIndexOut,1)
+                    CUDA.@atomic inputsP[wpIndexOut[j,i]] += wpWeightOut[j+1,i+1]
+                end
+                j += jstride
+            end
+        end
+        i += istride
+    end
     return nothing
 end
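Every kernel touched by this commit gets the same treatment: the range-based grid-stride for loop is replaced by an explicit counter and a while loop, presumably to avoid constructing and iterating a StepRange inside the kernel. A minimal, self-contained sketch of the pattern (the kernel name, array sizes, and launch dimensions below are illustrative, not taken from this repository):

# Sketch of the grid-stride while-loop pattern used throughout this commit.
# saxpy_kernel! is an illustrative kernel, not code from the repository.
using CUDA

function saxpy_kernel!(y, a, x)
    i0 = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    istride = blockDim().x * gridDim().x
    i = i0
    # explicit counter instead of `for i = i0:istride:length(y)`
    @inbounds while i <= length(y)
        y[i] += a * x[i]
        i += istride
    end
    return nothing
end

x = CUDA.rand(Float32, 10_000)
y = CUDA.rand(Float32, 10_000)
@cuda threads=256 blocks=40 saxpy_kernel!(y, 2f0, x)

Each thread starts at its global index i0 and advances by the total number of launched threads, so any grid size covers the whole array.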
20 changes: 14 additions & 6 deletions src/gpu/rls-small.jl
@@ -50,8 +50,10 @@ function copyto_raug(raug, LX, rX, r, wpIndexIn, ci)
     i0 = threadIdx().x + (blockIdx().x - 1) * blockDim().x
     istride = blockDim().x * gridDim().x

-    @inbounds for i=i0:istride:size(raug,1)
+    i = i0
+    @inbounds while i <= size(raug,1)
         raug[i] = i<=LX ? rX[i] : r[wpIndexIn[i,ci]]
+        i += istride
     end
     return nothing
 end
@@ -69,12 +71,18 @@ function update_Pinv_with_raughist(Pinv, raughist, LX, wpIndexIn, ci)
     istride = blockDim().x * gridDim().x
     jstride = blockDim().y * gridDim().y

-    @inbounds for i=i0:istride:size(Pinv,1), j=j0:jstride:size(Pinv,2)
-        irrX = i<=LX ? i : wpIndexIn[i,ci]
-        jrrX = j<=LX ? j : wpIndexIn[j,ci]
-        for h=1:size(raughist,2)
-            Pinv[i,j] += raughist[irrX,h] * raughist[jrrX,h]
+    i = i0
+    @inbounds while i <= size(Pinv,1)
+        j = j0
+        while j <= size(Pinv,2)
+            irrX = i<=LX ? i : wpIndexIn[i,ci]
+            jrrX = j<=LX ? j : wpIndexIn[j,ci]
+            for h=1:size(raughist,2)
+                Pinv[i,j] += raughist[irrX,h] * raughist[jrrX,h]
+            end
+            j += jstride
         end
+        i += istride
     end
     return nothing
 end
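The accumulation in update_Pinv_with_raughist is an outer-product sum over the columns of the rate history, with rows up to LX indexing raughist directly and the remaining rows gathered through wpIndexIn. A CPU sketch of the same arithmetic, written only to clarify the indexing (argument shapes are assumed from the kernel):

# CPU reference for the accumulation in update_Pinv_with_raughist (sketch only).
function update_Pinv_with_raughist_cpu!(Pinv, raughist, LX, wpIndexIn, ci)
    for i in 1:size(Pinv,1), j in 1:size(Pinv,2)
        irrX = i <= LX ? i : wpIndexIn[i,ci]   # rows <= LX index raughist directly
        jrrX = j <= LX ? j : wpIndexIn[j,ci]   # the rest are gathered through wpIndexIn
        for h in 1:size(raughist,2)            # sum outer products over the history
            Pinv[i,j] += raughist[irrX,h] * raughist[jrrX,h]
        end
    end
    return Pinv
end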
42 changes: 26 additions & 16 deletions src/gpu/variables.jl
@@ -75,20 +75,24 @@ scratch = Scratch{CuMatrix{_TTimeInt, CUDA.Mem.DeviceBuffer},
                   CuVector{TVoltage, CUDA.Mem.DeviceBuffer},
                   CuVector{_TNoise, CUDA.Mem.DeviceBuffer}}()

+const WARP_SIZE = CUDA.attribute(device(), CUDA.DEVICE_ATTRIBUTE_WARP_SIZE)
+const MAX_GRID_DIM_X = CUDA.attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_GRID_DIM_X)
+const MAX_GRID_DIM_Y = CUDA.attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y)
+
 function configurator(kernel, dim1)
     config = launch_configuration(kernel.fun)
-    xthreads = min(32, dim1)
-    xblocks = min(config.blocks, cld(dim1, xthreads))
-    return (xthreads, ), (xblocks<<2, )
+    xthreads = min(config.threads, dim1)
+    xblocks = min(MAX_GRID_DIM_X, cld(dim1, xthreads))
+    return (xthreads,), (xblocks,)
 end

 function configurator(kernel, dim1, dim2)
     config = launch_configuration(kernel.fun)
-    xthreads = min(32, dim1)
+    xthreads = min(WARP_SIZE, dim1)
     ythreads = min(fld(config.threads, xthreads), cld(dim1*dim2, xthreads))
-    xblocks = min(config.blocks, cld(dim1, xthreads))
-    yblocks = min(cld(config.blocks, xblocks), cld(dim2, ythreads))
-    return (xthreads, ythreads), (xblocks<<2, yblocks<<2)
+    xblocks = min(MAX_GRID_DIM_X, cld(dim1, xthreads))
+    yblocks = min(MAX_GRID_DIM_Y, cld(dim2, ythreads))
+    return (xthreads, ythreads), (xblocks, yblocks)
 end

 function generate_Pinv!(Pinv, ci, wpWeightIn, charge0, LX, penmu, penlamFF, penlambda)
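The new constants query the device limits once at load time, and the configurators now derive thread and block counts from the occupancy API and the hardware grid limits instead of hard-coding 32 threads and shifting config.blocks left by two. A sketch of how a configurator like this is paired with CUDA.jl's compile-then-launch pattern (fill_kernel! and v are placeholders, not code from this repository):

# Sketch: using the one-argument configurator with CUDA.jl's occupancy API.
using CUDA

function fill_kernel!(v, val)
    i0 = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    istride = blockDim().x * gridDim().x
    i = i0
    @inbounds while i <= length(v)
        v[i] = val
        i += istride
    end
    return nothing
end

v = CUDA.zeros(Float32, 1_000_000)
kernel = @cuda launch=false fill_kernel!(v, 1f0)    # compile without launching
threads, blocks = configurator(kernel, length(v))   # occupancy-derived launch sizes
kernel(v, 1f0; threads, blocks)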
@@ -98,16 +102,22 @@ function generate_Pinv!(Pinv, ci, wpWeightIn, charge0, LX, penmu, penlamFF, penlambda)
     istride = blockDim().x * gridDim().x
     jstride = blockDim().y * gridDim().y

-    @inbounds for i=i0:istride:size(Pinv,1), j=j0:jstride:size(Pinv,2)
-        Pinv[i,j] = 0
-        if i==j
-            Pinv[i,j] = i<=LX ? penlamFF : penlambda
-        end
-        if i>LX && j>LX
-            Pinv[i,j] += penmu * (
-                (wpWeightIn[i,ci] > charge0 && wpWeightIn[j,ci] > charge0) ||
-                (wpWeightIn[i,ci] < charge0 && wpWeightIn[j,ci] < charge0) )
-        end
+    i = i0
+    @inbounds while i <= size(Pinv,1)
+        j = j0
+        while j <= size(Pinv,2)
+            Pinv[i,j] = 0
+            if i==j
+                Pinv[i,j] = i<=LX ? penlamFF : penlambda
+            end
+            if i>LX && j>LX
+                Pinv[i,j] += penmu * (
+                    (wpWeightIn[i,ci] > charge0 && wpWeightIn[j,ci] > charge0) ||
+                    (wpWeightIn[i,ci] < charge0 && wpWeightIn[j,ci] < charge0) )
+            end
+            j += jstride
+        end
+        i += istride
     end
     return nothing
 end
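For reference, generate_Pinv! fills Pinv from two parts: a diagonal ridge penalty (penlamFF for the first LX rows, penlambda otherwise) plus a penmu term between pairs of rows beyond LX whose weights lie on the same side of charge0. A CPU sketch of the same logic (shapes assumed from the kernel arguments):

# CPU reference for generate_Pinv! (sketch only; shapes assumed from the kernel).
function generate_Pinv_cpu!(Pinv, ci, wpWeightIn, charge0, LX, penmu, penlamFF, penlambda)
    for i in 1:size(Pinv,1), j in 1:size(Pinv,2)
        # diagonal ridge penalty, different for the first LX rows
        Pinv[i,j] = i == j ? (i <= LX ? penlamFF : penlambda) : 0
        if i > LX && j > LX
            # extra penalty when both weights lie on the same side of charge0
            same_side = (wpWeightIn[i,ci] > charge0 && wpWeightIn[j,ci] > charge0) ||
                        (wpWeightIn[i,ci] < charge0 && wpWeightIn[j,ci] < charge0)
            Pinv[i,j] += penmu * same_side
        end
    end
    return Pinv
end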
10 changes: 8 additions & 2 deletions src/gpu/wpWeightIn2Out.jl
@@ -6,8 +6,14 @@ function wpWeightIn2Out!(wpWeightOut, wpIndexIn, wpIndexConvert, wpWeightIn)
     istride = blockDim().x * gridDim().x
     jstride = blockDim().y * gridDim().y

-    @inbounds for i=i0:istride:size(wpWeightIn,1), j=j0:jstride:size(wpWeightIn,2)
-        wpWeightOut[wpIndexConvert[i,j],wpIndexIn[i,j]] = wpWeightIn[i,j]
+    i = i0
+    @inbounds while i <= size(wpWeightIn,1)
+        j = j0
+        while j <= size(wpWeightIn,2)
+            wpWeightOut[wpIndexConvert[i,j],wpIndexIn[i,j]] = wpWeightIn[i,j]
+            j += jstride
+        end
+        i += istride
     end
     return nothing
 end
