From 4d03281329f050f4497b1f5ff51428ff99be0708 Mon Sep 17 00:00:00 2001
From: Niclas Jansson <njansson@kth.se>
Date: Wed, 29 May 2024 08:46:30 +0200
Subject: [PATCH] Use standard language constructs to guide vectorisation

---
 src/common/bcknd/cpu/rhs_maker_cpu.f90 | 32 +++++++++++++-------------
 src/fluid/bcknd/cpu/pnpn_res_cpu.f90   | 12 +++++-----
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/common/bcknd/cpu/rhs_maker_cpu.f90 b/src/common/bcknd/cpu/rhs_maker_cpu.f90
index d52a2308ab1..1998f4e70c7 100644
--- a/src/common/bcknd/cpu/rhs_maker_cpu.f90
+++ b/src/common/bcknd/cpu/rhs_maker_cpu.f90
@@ -36,14 +36,14 @@ subroutine rhs_maker_sumab_cpu(u, v, w, uu, vv, ww, uulag, vvlag, wwlag, ab, nab
 
     n = uu%dof%size()
 
-    do i = 1, n
+    do concurrent (i = 1:n)
        u%x(i,1,1,1) = ab(1) * uu%x(i,1,1,1) + ab(2) * uulag%lf(1)%x(i,1,1,1)
        v%x(i,1,1,1) = ab(1) * vv%x(i,1,1,1) + ab(2) * vvlag%lf(1)%x(i,1,1,1)
        w%x(i,1,1,1) = ab(1) * ww%x(i,1,1,1) + ab(2) * wwlag%lf(1)%x(i,1,1,1)
     end do
 
     if (nab .eq. 3) then
-       do i = 1, n
+       do concurrent (i = 1:n)
           u%x(i,1,1,1) = u%x(i,1,1,1) + ab(3) * uulag%lf(2)%x(i,1,1,1)
           v%x(i,1,1,1) = v%x(i,1,1,1) + ab(3) * vvlag%lf(2)%x(i,1,1,1)
           w%x(i,1,1,1) = w%x(i,1,1,1) + ab(3) * wwlag%lf(2)%x(i,1,1,1)
@@ -68,7 +68,7 @@ subroutine rhs_maker_ext_cpu(fx_lag, fy_lag, fz_lag, &
     call neko_scratch_registry%request_field(temp2, temp_indices(2))
     call neko_scratch_registry%request_field(temp3, temp_indices(3))
 
-    do i = 1, n
+    do concurrent (i = 1:n)
        temp1%x(i,1,1,1) = ext_coeffs(2) * fx_lag%x(i,1,1,1) + &
                           ext_coeffs(3) * fx_laglag%x(i,1,1,1)
        temp2%x(i,1,1,1) = ext_coeffs(2) * fy_lag%x(i,1,1,1) + &
@@ -77,7 +77,7 @@ subroutine rhs_maker_ext_cpu(fx_lag, fy_lag, fz_lag, &
                           ext_coeffs(3) * fz_laglag%x(i,1,1,1)
     end do
 
-    do i = 1, n
+    do concurrent (i = 1:n)
        fx_laglag%x(i,1,1,1) = fx_lag%x(i,1,1,1)
        fy_laglag%x(i,1,1,1) = fy_lag%x(i,1,1,1)
        fz_laglag%x(i,1,1,1) = fz_lag%x(i,1,1,1)
@@ -86,7 +86,7 @@ subroutine rhs_maker_ext_cpu(fx_lag, fy_lag, fz_lag, &
        fz_lag%x(i,1,1,1) = fz(i)
     end do
 
-    do i = 1, n
+    do concurrent (i = 1:n)
        fx(i) = (ext_coeffs(1) * fx(i) + temp1%x(i,1,1,1)) * rho
        fy(i) = (ext_coeffs(1) * fy(i) + temp2%x(i,1,1,1)) * rho
        fz(i) = (ext_coeffs(1) * fz(i) + temp3%x(i,1,1,1)) * rho
@@ -108,17 +108,17 @@ subroutine scalar_rhs_maker_ext_cpu(fs_lag, fs_laglag, fs, rho, ext_coeffs, n)
 
     call neko_scratch_registry%request_field(temp1, temp_index)
 
-    do i = 1, n
+    do concurrent (i = 1:n)
        temp1%x(i,1,1,1) = ext_coeffs(2) * fs_lag%x(i,1,1,1) + &
                           ext_coeffs(3) * fs_laglag%x(i,1,1,1)
     end do
 
-    do i = 1, n
+    do concurrent (i = 1:n)
        fs_laglag%x(i,1,1,1) = fs_lag%x(i,1,1,1)
        fs_lag%x(i,1,1,1) = fs(i)
     end do
 
-    do i = 1, n
+    do concurrent (i = 1:n)
        fs(i) = (ext_coeffs(1) * fs(i) + temp1%x(i,1,1,1)) * rho
     end do
 
@@ -145,27 +145,27 @@ subroutine rhs_maker_bdf_cpu(ulag, vlag, wlag, bfx, bfy, bfz, &
     call neko_scratch_registry%request_field(tb2, temp_indices(5))
     call neko_scratch_registry%request_field(tb3, temp_indices(6))
 
-    do i = 1, n
+    do concurrent (i = 1:n)
        tb1%x(i,1,1,1) = u%x(i,1,1,1) * B(i) * bd(2)
        tb2%x(i,1,1,1) = v%x(i,1,1,1) * B(i) * bd(2)
        tb3%x(i,1,1,1) = w%x(i,1,1,1) * B(i) * bd(2)
     end do
 
     do ilag = 2, nbd
-       do i = 1, n
+       do concurrent (i = 1:n)
           ta1%x(i,1,1,1) = ulag%lf(ilag-1)%x(i,1,1,1) * B(i) * bd(ilag+1)
           ta2%x(i,1,1,1) = vlag%lf(ilag-1)%x(i,1,1,1) * B(i) * bd(ilag+1)
           ta3%x(i,1,1,1) = wlag%lf(ilag-1)%x(i,1,1,1) * B(i) * bd(ilag+1)
        end do
 
-       do i = 1, n
+       do concurrent (i = 1:n)
           tb1%x(i,1,1,1) = tb1%x(i,1,1,1) + ta1%x(i,1,1,1)
           tb2%x(i,1,1,1) = tb2%x(i,1,1,1) + ta2%x(i,1,1,1)
           tb3%x(i,1,1,1) = tb3%x(i,1,1,1) + ta3%x(i,1,1,1)
        end do
     end do
 
-    do i = 1, n
+    do concurrent (i = 1:n)
        bfx(i) = bfx(i) + tb1%x(i,1,1,1) * (rho / dt)
        bfy(i) = bfy(i) + tb2%x(i,1,1,1) * (rho / dt)
        bfz(i) = bfz(i) + tb3%x(i,1,1,1) * (rho / dt)
@@ -189,21 +189,21 @@ subroutine scalar_rhs_maker_bdf_cpu(s_lag, fs, s, B, rho, dt, bd, nbd, n)
     call neko_scratch_registry%request_field(temp1, temp_indices(1))
     call neko_scratch_registry%request_field(temp2, temp_indices(2))
 
-    do i = 1, n
+    do concurrent (i = 1:n)
        temp2%x(i,1,1,1) = s%x(i,1,1,1) * B(i) * bd(2)
     end do
 
     do ilag = 2, nbd
-       do i = 1, n
+       do concurrent (i = 1:n)
           temp1%x(i,1,1,1) = s_lag%lf(ilag-1)%x(i,1,1,1) * B(i) * bd(ilag+1)
        end do
 
-       do i = 1, n
+       do concurrent (i = 1:n)
           temp2%x(i,1,1,1) = temp2%x(i,1,1,1) + temp1%x(i,1,1,1)
        end do
     end do
 
-    do i = 1, n
+    do concurrent (i = 1:n)
        fs(i) = fs(i) + temp2%x(i,1,1,1) * (rho / dt)
     end do
 
diff --git a/src/fluid/bcknd/cpu/pnpn_res_cpu.f90 b/src/fluid/bcknd/cpu/pnpn_res_cpu.f90
index 31df4c94f03..9b842fc6c20 100644
--- a/src/fluid/bcknd/cpu/pnpn_res_cpu.f90
+++ b/src/fluid/bcknd/cpu/pnpn_res_cpu.f90
@@ -36,7 +36,7 @@ subroutine pnpn_prs_res_cpu_compute(p, p_res, u, v, w, u_e, v_e, w_e, f_x, &
     type(gs_t), intent(inout) :: gs_Xh
     type(facet_normal_t), intent(inout) :: bc_prs_surface
     type(facet_normal_t), intent(inout) :: bc_sym_surface
-    class(Ax_t), intent(inout) :: Ax
+    class(ax_t), intent(inout) :: Ax
     real(kind=rp), intent(inout) :: bd
     real(kind=rp), intent(in) :: dt
     real(kind=rp), intent(in) :: mu
@@ -67,7 +67,7 @@ subroutine pnpn_prs_res_cpu_compute(p, p_res, u, v, w, u_e, v_e, w_e, f_x, &
     call curl(ta1, ta2, ta3, u_e, v_e, w_e, work1, work2, c_Xh)
     call curl(wa1, wa2, wa3, ta1, ta2, ta3, work1, work2, c_Xh)
 
-    do i = 1, n
+    do concurrent (i = 1:n)
        ta1%x(i,1,1,1) = f_x%x(i,1,1,1) / rho &
             - ((wa1%x(i,1,1,1) * (mu / rho)) * c_Xh%B(i,1,1,1))
        ta2%x(i,1,1,1) = f_y%x(i,1,1,1) / rho &
@@ -80,7 +80,7 @@ subroutine pnpn_prs_res_cpu_compute(p, p_res, u, v, w, u_e, v_e, w_e, f_x, &
     call gs_Xh%op(ta2, GS_OP_ADD)
     call gs_Xh%op(ta3, GS_OP_ADD)
 
-    do i = 1, n
+    do concurrent (i = 1:n)
        ta1%x(i,1,1,1) = ta1%x(i,1,1,1) * c_Xh%Binv(i,1,1,1)
        ta2%x(i,1,1,1) = ta2%x(i,1,1,1) * c_Xh%Binv(i,1,1,1)
        ta3%x(i,1,1,1) = ta3%x(i,1,1,1) * c_Xh%Binv(i,1,1,1)
@@ -100,7 +100,7 @@ subroutine pnpn_prs_res_cpu_compute(p, p_res, u, v, w, u_e, v_e, w_e, f_x, &
     !
     ! Surface velocity terms
     !
-    do i = 1, n
+    do concurrent (i = 1:n)
        wa1%x(i,1,1,1) = 0.0_rp
        wa2%x(i,1,1,1) = 0.0_rp
        wa3%x(i,1,1,1) = 0.0_rp
@@ -109,7 +109,7 @@ subroutine pnpn_prs_res_cpu_compute(p, p_res, u, v, w, u_e, v_e, w_e, f_x, &
     call bc_sym_surface%apply_surfvec(wa1%x,wa2%x,wa3%x,ta1%x, ta2%x, ta3%x, n)
 
     dtbd = bd / dt
-    do i = 1, n
+    do concurrent (i = 1:n)
        ta1%x(i,1,1,1) = 0.0_rp
        ta2%x(i,1,1,1) = 0.0_rp
        ta3%x(i,1,1,1) = 0.0_rp
@@ -117,7 +117,7 @@ subroutine pnpn_prs_res_cpu_compute(p, p_res, u, v, w, u_e, v_e, w_e, f_x, &
 
     call bc_prs_surface%apply_surfvec(ta1%x, ta2%x, ta3%x, u%x, v%x, w%x, n)
 
-    do i = 1, n
+    do concurrent (i = 1:n)
        p_res%x(i,1,1,1) = p_res%x(i,1,1,1) &
             - (dtbd * (ta1%x(i,1,1,1) + ta2%x(i,1,1,1) + ta3%x(i,1,1,1)))&
             - (wa1%x(i,1,1,1) + wa2%x(i,1,1,1) + wa3%x(i,1,1,1))