Skip to content

Commit

Permalink
Merge pull request #1488 from timfelle/math/add4
Browse files Browse the repository at this point in the history
  • Loading branch information
njansson authored Sep 19, 2024
2 parents 9d60f50 + 59ebadd commit 26102eb
Show file tree
Hide file tree
Showing 9 changed files with 227 additions and 28 deletions.
14 changes: 14 additions & 0 deletions src/math/bcknd/device/cuda/math.cu
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,20 @@ extern "C" {
CUDA_CHECK(cudaGetLastError());
}

/**
* Fortran wrapper for add4
* Vector addition \f$ a = b + c + d \f$
*/
void cuda_add4(void *a, void *b, void *c, void *d, int *n) {

const dim3 nthrds(1024, 1, 1);
const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);

add4_kernel<real><<<nblcks, nthrds, 0,
(cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c, (real *) d, *n);
CUDA_CHECK(cudaGetLastError());

}
/**
* Fortran wrapper for add2s1
* Vector addition with scalar multiplication \f$ a = c_1 a + b \f$
Expand Down
18 changes: 18 additions & 0 deletions src/math/bcknd/device/cuda/math_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,24 @@ __global__ void add3_kernel(T * __restrict__ a,
}
}

/**
* Device kernel for add4
*/
template< typename T >
__global__ void add4_kernel(T * __restrict__ a,
const T * __restrict__ b,
const T * __restrict__ c,
const T * __restrict__ d,
const int n) {

const int idx = blockIdx.x * blockDim.x + threadIdx.x;
const int str = blockDim.x * gridDim.x;

for (int i = idx; i < n; i += str) {
a[i] = b[i] + c[i] + d[i];
}
}

/**
* Device kernel for add2s1
*/
Expand Down
61 changes: 54 additions & 7 deletions src/math/bcknd/device/device_math.F90
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ subroutine hip_masked_copy(a_d, b_d, mask_d, n, m) &
integer(c_int) :: n, m
end subroutine hip_masked_copy
end interface

interface
subroutine hip_cfill_mask(a_d, c, size, mask_d, mask_size) &
bind(c, name='hip_cfill_mask')
Expand Down Expand Up @@ -146,6 +146,17 @@ subroutine hip_add2(a_d, b_d, n) &
end subroutine hip_add2
end interface

interface
subroutine hip_add4(a_d, b_d, c_d, d_d, n) &
bind(c, name='hip_add4')
use, intrinsic :: iso_c_binding
import c_rp
implicit none
type(c_ptr), value :: a_d, b_d, c_d, d_d
integer(c_int) :: n
end subroutine hip_add4
end interface

interface
subroutine hip_add2s1(a_d, b_d, c1, n) &
bind(c, name='hip_add2s1')
Expand Down Expand Up @@ -478,6 +489,17 @@ subroutine cuda_add2(a_d, b_d, n) &
end subroutine cuda_add2
end interface

interface
subroutine cuda_add4(a_d, b_d, c_d, d_d, n) &
bind(c, name='cuda_add4')
use, intrinsic :: iso_c_binding
import c_rp
implicit none
type(c_ptr), value :: a_d, b_d, c_d, d_d
integer(c_int) :: n
end subroutine cuda_add4
end interface

interface
subroutine cuda_add2s1(a_d, b_d, c1, n) &
bind(c, name='cuda_add2s1')
Expand Down Expand Up @@ -818,6 +840,16 @@ subroutine opencl_add2(a_d, b_d, n) &
end subroutine opencl_add2
end interface

interface
subroutine opencl_add4(a_d, b_d, c_d, d_d, n) &
bind(c, name='opencl_add4')
use, intrinsic :: iso_c_binding
implicit none
type(c_ptr), value :: a_d, b_d, c_d, d_d
integer(c_int) :: n
end subroutine opencl_add4
end interface

interface
subroutine opencl_add2s1(a_d, b_d, c1, n) &
bind(c, name='opencl_add2s1')
Expand Down Expand Up @@ -1034,13 +1066,14 @@ end function opencl_glsum
#endif

public :: device_copy, device_rzero, device_rone, device_cmult, device_cmult2,&
device_cadd, device_cfill, device_add2, device_add2s1, device_add2s2, &
device_addsqr2s2, device_add3s2, device_invcol1, device_invcol2, &
device_col2, device_col3, device_subcol3, device_sub2, device_sub3, &
device_addcol3, device_addcol4, device_vdot3, device_vlsc3, device_glsc3, &
device_cadd, device_cadd2, device_cfill, device_add2, device_add3, &
device_add4, device_add2s1, device_add2s2, device_addsqr2s2, &
device_add3s2, device_invcol1, device_invcol2, device_col2, &
device_col3, device_subcol3, device_sub2, device_sub3, device_addcol3, &
device_addcol4, device_vdot3, device_vlsc3, device_glsc3, &
device_glsc3_many, device_add2s2_many, device_glsc2, device_glsum, &
device_masked_copy, device_cfill_mask, device_add3, device_cadd2
device_masked_copy, device_cfill_mask

contains

!> Copy a vector \f$ a = b \f$
Expand Down Expand Up @@ -1217,6 +1250,20 @@ subroutine device_add2(a_d, b_d, n)
#endif
end subroutine device_add2

subroutine device_add4(a_d, b_d, c_d, d_d, n)
type(c_ptr) :: a_d, b_d, c_d, d_d
integer :: n
#ifdef HAVE_HIP
call hip_add4(a_d, b_d, c_d, d_d, n)
#elif HAVE_CUDA
call cuda_add4(a_d, b_d, c_d, d_d, n)
#elif HAVE_OPENCL
call opencl_add4(a_d, b_d, c_d, d_d, n)
#else
call neko_error('No device backend configured')
#endif
end subroutine device_add4

subroutine device_add2s1(a_d, b_d, c1, n)
type(c_ptr) :: a_d, b_d
real(kind=rp) :: c1
Expand Down
15 changes: 15 additions & 0 deletions src/math/bcknd/device/hip/math.hip
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,21 @@ extern "C" {
HIP_CHECK(hipGetLastError());
}

/**
* Fortran wrapper for add4
* Vector addition \f$ a = b + c + d\f$
*/
void hip_add4(void *a, void *b, void *c, void *d, int *n) {

const dim3 nthrds(1024, 1, 1);
const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);

hipLaunchKernelGGL(HIP_KERNEL_NAME(add4_kernel<real>),
nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
(real *) a, (real *) b, (real *) c, (real *) d, *n);
HIP_CHECK(hipGetLastError());
}

/**
* Fortran wrapper for add2s1
* Vector addition with scalar multiplication \f$ a = c_1 a + b \f$
Expand Down
18 changes: 18 additions & 0 deletions src/math/bcknd/device/hip/math_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,24 @@ __global__ void add3_kernel(T * __restrict__ a,
}
}

/**
* Device kernel for add4
*/
template< typename T >
__global__ void add4_kernel(T * __restrict__ a,
const T * __restrict__ b,
const T * __restrict__ c,
const T * __restrict__ d,
const int n) {

const int idx = blockIdx.x * blockDim.x + threadIdx.x;
const int str = blockDim.x * gridDim.x;

for (int i = idx; i < n; i += str) {
a[i] = b[i] + c[i] + d[i];
}
}

/**
* Device kernel for add2s1
*/
Expand Down
28 changes: 28 additions & 0 deletions src/math/bcknd/device/opencl/math.c
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,34 @@ void opencl_add3(void *a, void *b, void *c, int *n) {
0, NULL, NULL));
}

/**
* Fortran wrapper for add4
* Vector addition \f$ a = b + c + d \f$
*/
void opencl_add4(void *a, void *b, void *c, void *d, int *n) {
cl_int err;

if (math_program == NULL)
opencl_kernel_jit(math_kernel, (cl_program *) &math_program);

cl_kernel kernel = clCreateKernel(math_program, "add4_kernel", &err);
CL_CHECK(err);

CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));

const int nb = ((*n) + 256 - 1) / 256;
const size_t global_item_size = 256 * nb;
const size_t local_item_size = 256;

CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
NULL, &global_item_size, &local_item_size,
0, NULL, NULL));
}

/**
* Fortran wrapper for add2s1
* Vector addition with scalar multiplication \f$ a = c_1 a + b \f$
Expand Down
17 changes: 17 additions & 0 deletions src/math/bcknd/device/opencl/math_kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,23 @@ __kernel void cfill_kernel(__global real * __restrict__ a,
}
}

/**
* Device kernel for add4
*/
__kernel void add4_kernel(__global real * __restrict__ a,
__global const real * __restrict__ b,
__global const real * __restrict__ c,
__global const real * __restrict__ d,
const int n) {

const int idx = get_global_id(0);
const int str = get_global_size(0);

for (int i = idx; i < n; i += str) {
a[i] = d[i] + c[i] + d[i];
}
}

/**
* Device kernel for add2
*/
Expand Down
54 changes: 48 additions & 6 deletions src/math/field_math.f90
Original file line number Diff line number Diff line change
Expand Up @@ -62,14 +62,14 @@ module field_math
use num_types, only: rp
use field, only: field_t
use math, only: rzero, rone, copy, cmult, cadd, cfill, invcol1, vdot3, add2, &
sub2, sub3, add2s1, add2s2, addsqr2s2, cmult2, invcol2, col2, col3, &
subcol3, add3s2, addcol3, addcol4, glsum, glsc2, glsc3
add3, add4, sub2, sub3, add2s1, add2s2, addsqr2s2, cmult2, invcol2, &
col2, col3, subcol3, add3s2, addcol3, addcol4, glsum, glsc2, glsc3
use device_math, only: device_rzero, device_rone, device_copy, device_cmult, &
device_cadd, device_cfill, device_invcol1, device_vdot3, device_add2, &
device_sub2, device_sub3, device_add2s1, device_add2s2, &
device_addsqr2s2, device_cmult2, device_invcol2, device_col2, &
device_col3, device_subcol3, device_add3s2, device_addcol3, &
device_addcol4, device_glsum, device_glsc2, device_glsc3
device_add3, device_add4, device_sub2, device_sub3, device_add2s1, &
device_add2s2, device_addsqr2s2, device_cmult2, device_invcol2, &
device_col2, device_col3, device_subcol3, device_add3s2, &
device_addcol3, device_addcol4, device_glsum, device_glsc2, device_glsc3
implicit none
private

Expand Down Expand Up @@ -271,6 +271,48 @@ subroutine field_add2(a, b, n)

end subroutine field_add2

!> Vector addition \f$ a = b + c \f$
subroutine field_add3(a, b, c, n)
integer, intent(in), optional :: n
type(field_t), intent(inout) :: a
type(field_t), intent(in) :: b, c
integer :: size

if (present(n)) then
size = n
else
size = a%size()
end if

if (NEKO_BCKND_DEVICE .eq. 1) then
call device_add3(a%x_d, b%x_d, c%x_d, size)
else
call add3(a%x, b%x, c%x, size)
end if

end subroutine field_add3

!> Vector addition \f$ a = b + c + d \f$
subroutine field_add4(a, b, c, d, n)
integer, intent(in), optional :: n
type(field_t), intent(inout) :: a
type(field_t), intent(in) :: b, c, d
integer :: size

if (present(n)) then
size = n
else
size = a%size()
end if

if (NEKO_BCKND_DEVICE .eq. 1) then
call device_add4(a%x_d, b%x_d, c%x_d, d%x_d, size)
else
call add4(a%x, b%x, c%x, d%x, size)
end if

end subroutine field_add4

!> Vector substraction \f$ a = a - b \f$
subroutine field_sub2(a, b, n)
integer, intent(in), optional :: n
Expand Down
Loading

0 comments on commit 26102eb

Please sign in to comment.