Merge pull request #1488 from timfelle/math/add4

ExtremeFLOW · Sep 19, 2024 · 26102eb · 26102eb
2 parents 9d60f50 + 59ebadd
commit 26102eb
Show file tree

Hide file tree

Showing 9 changed files with 227 additions and 28 deletions.
diff --git a/src/math/bcknd/device/cuda/math.cu b/src/math/bcknd/device/cuda/math.cu
@@ -188,6 +188,20 @@ extern "C" {
     CUDA_CHECK(cudaGetLastError());
   }
 
+  /**
+   * Fortran wrapper for add4
+   * Vector addition \f$ a = b + c + d \f$
+   */
+  void cuda_add4(void *a, void *b, void *c, void *d, int *n) {
+
+    const dim3 nthrds(1024, 1, 1);
+    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
+
+    add4_kernel<real><<<nblcks, nthrds, 0,
+      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c, (real *) d, *n);
+    CUDA_CHECK(cudaGetLastError());
+
+  }
   /**
    * Fortran wrapper for add2s1
    * Vector addition with scalar multiplication \f$ a = c_1 a + b \f$

diff --git a/src/math/bcknd/device/cuda/math_kernel.h b/src/math/bcknd/device/cuda/math_kernel.h
@@ -183,6 +183,24 @@ __global__ void add3_kernel(T * __restrict__ a,
   }
 }
 
+/**
+ * Device kernel for add4
+ */
+template< typename T >
+__global__ void add4_kernel(T * __restrict__ a,
+                            const T * __restrict__ b,
+                            const T * __restrict__ c,
+                            const T * __restrict__ d,
+                            const int n) {
+
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int str = blockDim.x * gridDim.x;
+
+  for (int i = idx; i < n; i += str) {
+    a[i] = b[i] + c[i] + d[i];
+  }
+}
+
 /**
  * Device kernel for add2s1
  */

diff --git a/src/math/bcknd/device/device_math.F90 b/src/math/bcknd/device/device_math.F90
@@ -56,7 +56,7 @@ subroutine hip_masked_copy(a_d, b_d, mask_d, n, m) &
        integer(c_int) :: n, m
      end subroutine hip_masked_copy
   end interface
-  
+
   interface
      subroutine hip_cfill_mask(a_d, c, size, mask_d, mask_size) &
           bind(c, name='hip_cfill_mask')
@@ -146,6 +146,17 @@ subroutine hip_add2(a_d, b_d, n) &
      end subroutine hip_add2
   end interface
 
+  interface
+     subroutine hip_add4(a_d, b_d, c_d, d_d, n) &
+          bind(c, name='hip_add4')
+       use, intrinsic :: iso_c_binding
+       import c_rp
+       implicit none
+       type(c_ptr), value :: a_d, b_d, c_d, d_d
+       integer(c_int) :: n
+     end subroutine hip_add4
+  end interface
+
   interface
      subroutine hip_add2s1(a_d, b_d, c1, n) &
           bind(c, name='hip_add2s1')
@@ -478,6 +489,17 @@ subroutine cuda_add2(a_d, b_d, n) &
      end subroutine cuda_add2
   end interface
 
+  interface
+     subroutine cuda_add4(a_d, b_d, c_d, d_d, n) &
+          bind(c, name='cuda_add4')
+       use, intrinsic :: iso_c_binding
+       import c_rp
+       implicit none
+       type(c_ptr), value :: a_d, b_d, c_d, d_d
+       integer(c_int) :: n
+     end subroutine cuda_add4
+  end interface
+
   interface
      subroutine cuda_add2s1(a_d, b_d, c1, n) &
           bind(c, name='cuda_add2s1')
@@ -818,6 +840,16 @@ subroutine opencl_add2(a_d, b_d, n) &
      end subroutine opencl_add2
   end interface
 
+  interface
+     subroutine opencl_add4(a_d, b_d, c_d, d_d, n) &
+          bind(c, name='opencl_add4')
+       use, intrinsic :: iso_c_binding
+       implicit none
+       type(c_ptr), value :: a_d, b_d, c_d, d_d
+       integer(c_int) :: n
+     end subroutine opencl_add4
+  end interface
+
   interface
      subroutine opencl_add2s1(a_d, b_d, c1, n) &
           bind(c, name='opencl_add2s1')
@@ -1034,13 +1066,14 @@ end function opencl_glsum
 #endif
 
   public :: device_copy, device_rzero, device_rone, device_cmult, device_cmult2,&
-       device_cadd, device_cfill, device_add2, device_add2s1, device_add2s2, &
-       device_addsqr2s2, device_add3s2, device_invcol1, device_invcol2, &
-       device_col2, device_col3, device_subcol3, device_sub2, device_sub3, &
-       device_addcol3, device_addcol4, device_vdot3, device_vlsc3, device_glsc3, &
+       device_cadd, device_cadd2, device_cfill, device_add2, device_add3, &
+       device_add4, device_add2s1, device_add2s2, device_addsqr2s2, &
+       device_add3s2, device_invcol1, device_invcol2, device_col2, &
+       device_col3, device_subcol3, device_sub2, device_sub3, device_addcol3, &
+       device_addcol4, device_vdot3, device_vlsc3, device_glsc3, &
        device_glsc3_many, device_add2s2_many, device_glsc2, device_glsum, &
-       device_masked_copy, device_cfill_mask, device_add3, device_cadd2
-  
+       device_masked_copy, device_cfill_mask
+
 contains
 
   !> Copy a vector \f$ a = b \f$
@@ -1217,6 +1250,20 @@ subroutine device_add2(a_d, b_d, n)
 #endif
   end subroutine device_add2
 
+  subroutine device_add4(a_d, b_d, c_d, d_d, n)
+    type(c_ptr) :: a_d, b_d, c_d, d_d
+    integer :: n
+#ifdef HAVE_HIP
+    call hip_add4(a_d, b_d, c_d, d_d, n)
+#elif HAVE_CUDA
+    call cuda_add4(a_d, b_d, c_d, d_d, n)
+#elif HAVE_OPENCL
+    call opencl_add4(a_d, b_d, c_d, d_d, n)
+#else
+    call neko_error('No device backend configured')
+#endif
+  end subroutine device_add4
+
   subroutine device_add2s1(a_d, b_d, c1, n)
     type(c_ptr) :: a_d, b_d
     real(kind=rp) :: c1

diff --git a/src/math/bcknd/device/hip/math.hip b/src/math/bcknd/device/hip/math.hip
@@ -177,6 +177,21 @@ extern "C" {
     HIP_CHECK(hipGetLastError());
   }
 
+  /**
+   * Fortran wrapper for add4
+   * Vector addition \f$ a = b + c + d\f$
+   */
+  void hip_add4(void *a, void *b, void *c, void *d, int *n) {
+
+    const dim3 nthrds(1024, 1, 1);
+    const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(add4_kernel<real>),
+                       nblcks, nthrds, 0, (hipStream_t) glb_cmd_queue,
+                       (real *) a, (real *) b, (real *) c, (real *) d, *n);
+    HIP_CHECK(hipGetLastError());
+  }
+
   /**
    * Fortran wrapper for add2s1
    * Vector addition with scalar multiplication \f$ a = c_1 a + b \f$

diff --git a/src/math/bcknd/device/hip/math_kernel.h b/src/math/bcknd/device/hip/math_kernel.h
@@ -183,6 +183,24 @@ __global__ void add3_kernel(T * __restrict__ a,
   }
 }
 
+/**
+ * Device kernel for add4
+ */
+template< typename T >
+__global__ void add4_kernel(T * __restrict__ a,
+                            const T * __restrict__ b,
+                            const T * __restrict__ c,
+                            const T * __restrict__ d,
+                            const int n) {
+
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int str = blockDim.x * gridDim.x;
+
+  for (int i = idx; i < n; i += str) {
+    a[i] = b[i] + c[i] + d[i];
+  }
+}
+
 /**
  * Device kernel for add2s1
  */

diff --git a/src/math/bcknd/device/opencl/math.c b/src/math/bcknd/device/opencl/math.c
@@ -318,6 +318,34 @@ void opencl_add3(void *a, void *b, void *c, int *n) {
                                   0, NULL, NULL));
 }
 
+/**
+ * Fortran wrapper for add4
+ * Vector addition \f$ a = b + c + d \f$
+ */
+void opencl_add4(void *a, void *b, void *c, void *d, int *n) {
+  cl_int err;
+
+  if (math_program == NULL)
+    opencl_kernel_jit(math_kernel, (cl_program *) &math_program);
+
+  cl_kernel kernel = clCreateKernel(math_program, "add4_kernel", &err);
+  CL_CHECK(err);
+
+  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a));
+  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b));
+  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c));
+  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d));
+  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
+
+  const int nb = ((*n) + 256 - 1) / 256;
+  const size_t global_item_size = 256 * nb;
+  const size_t local_item_size = 256;
+
+  CL_CHECK(clEnqueueNDRangeKernel((cl_command_queue) glb_cmd_queue, kernel, 1,
+                                  NULL, &global_item_size, &local_item_size,
+                                  0, NULL, NULL));
+}
+
 /**
  * Fortran wrapper for add2s1
  * Vector addition with scalar multiplication \f$ a = c_1 a + b \f$

diff --git a/src/math/bcknd/device/opencl/math_kernel.cl b/src/math/bcknd/device/opencl/math_kernel.cl
@@ -143,6 +143,23 @@ __kernel void cfill_kernel(__global real * __restrict__ a,
   } 
 }
 
+/**
+ * Device kernel for add4
+ */
+__kernel void add4_kernel(__global real * __restrict__ a,
+                          __global const real * __restrict__ b,
+                          __global const real * __restrict__ c,
+                          __global const real * __restrict__ d,
+                          const int n) {
+
+  const int idx = get_global_id(0);
+  const int str = get_global_size(0);
+
+  for (int i = idx; i < n; i += str) {
+    a[i] = d[i] + c[i] + d[i];
+  } 
+}
+
 /**
  * Device kernel for add2
  */

diff --git a/src/math/field_math.f90 b/src/math/field_math.f90
@@ -62,14 +62,14 @@ module field_math
   use num_types, only: rp
   use field, only: field_t
   use math, only: rzero, rone, copy, cmult, cadd, cfill, invcol1, vdot3, add2, &
-       sub2, sub3, add2s1, add2s2, addsqr2s2, cmult2, invcol2, col2, col3, &
-       subcol3, add3s2, addcol3, addcol4, glsum, glsc2, glsc3
+       add3, add4, sub2, sub3, add2s1, add2s2, addsqr2s2, cmult2, invcol2, &
+       col2, col3, subcol3, add3s2, addcol3, addcol4, glsum, glsc2, glsc3
   use device_math, only: device_rzero, device_rone, device_copy, device_cmult, &
        device_cadd, device_cfill, device_invcol1, device_vdot3, device_add2, &
-       device_sub2, device_sub3, device_add2s1, device_add2s2, &
-       device_addsqr2s2, device_cmult2, device_invcol2, device_col2, &
-       device_col3, device_subcol3, device_add3s2, device_addcol3, &
-       device_addcol4, device_glsum, device_glsc2, device_glsc3
+       device_add3, device_add4, device_sub2, device_sub3, device_add2s1, &
+       device_add2s2, device_addsqr2s2, device_cmult2, device_invcol2, &
+       device_col2, device_col3, device_subcol3, device_add3s2, &
+       device_addcol3, device_addcol4, device_glsum, device_glsc2, device_glsc3
   implicit none
   private
 
@@ -271,6 +271,48 @@ subroutine field_add2(a, b, n)
 
   end subroutine field_add2
 
+  !> Vector addition \f$ a = b + c \f$
+  subroutine field_add3(a, b, c, n)
+    integer, intent(in), optional :: n
+    type(field_t), intent(inout) :: a
+    type(field_t), intent(in) :: b, c
+    integer :: size
+
+    if (present(n)) then
+       size = n
+    else
+       size = a%size()
+    end if
+
+    if (NEKO_BCKND_DEVICE .eq. 1) then
+       call device_add3(a%x_d, b%x_d, c%x_d, size)
+    else
+       call add3(a%x, b%x, c%x, size)
+    end if
+
+  end subroutine field_add3
+
+  !> Vector addition \f$ a = b + c + d \f$
+  subroutine field_add4(a, b, c, d, n)
+    integer, intent(in), optional :: n
+    type(field_t), intent(inout) :: a
+    type(field_t), intent(in) :: b, c, d
+    integer :: size
+
+    if (present(n)) then
+       size = n
+    else
+       size = a%size()
+    end if
+
+    if (NEKO_BCKND_DEVICE .eq. 1) then
+       call device_add4(a%x_d, b%x_d, c%x_d, d%x_d, size)
+    else
+       call add4(a%x, b%x, c%x, d%x, size)
+    end if
+
+  end subroutine field_add4
+
   !> Vector substraction \f$ a = a - b \f$
   subroutine field_sub2(a, b, n)
     integer, intent(in), optional :: n