From 4d03281329f050f4497b1f5ff51428ff99be0708 Mon Sep 17 00:00:00 2001 From: Niclas Jansson Date: Wed, 29 May 2024 08:46:30 +0200 Subject: [PATCH] Use standard language constructs to guide vectorisation --- src/common/bcknd/cpu/rhs_maker_cpu.f90 | 32 +++++++++++++------------- src/fluid/bcknd/cpu/pnpn_res_cpu.f90 | 12 +++++----- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/common/bcknd/cpu/rhs_maker_cpu.f90 b/src/common/bcknd/cpu/rhs_maker_cpu.f90 index d52a2308ab1..1998f4e70c7 100644 --- a/src/common/bcknd/cpu/rhs_maker_cpu.f90 +++ b/src/common/bcknd/cpu/rhs_maker_cpu.f90 @@ -36,14 +36,14 @@ subroutine rhs_maker_sumab_cpu(u, v, w, uu, vv, ww, uulag, vvlag, wwlag, ab, nab n = uu%dof%size() - do i = 1, n + do concurrent (i = 1:n) u%x(i,1,1,1) = ab(1) * uu%x(i,1,1,1) + ab(2) * uulag%lf(1)%x(i,1,1,1) v%x(i,1,1,1) = ab(1) * vv%x(i,1,1,1) + ab(2) * vvlag%lf(1)%x(i,1,1,1) w%x(i,1,1,1) = ab(1) * ww%x(i,1,1,1) + ab(2) * wwlag%lf(1)%x(i,1,1,1) end do if (nab .eq. 3) then - do i = 1, n + do concurrent (i = 1:n) u%x(i,1,1,1) = u%x(i,1,1,1) + ab(3) * uulag%lf(2)%x(i,1,1,1) v%x(i,1,1,1) = v%x(i,1,1,1) + ab(3) * vvlag%lf(2)%x(i,1,1,1) w%x(i,1,1,1) = w%x(i,1,1,1) + ab(3) * wwlag%lf(2)%x(i,1,1,1) @@ -68,7 +68,7 @@ subroutine rhs_maker_ext_cpu(fx_lag, fy_lag, fz_lag, & call neko_scratch_registry%request_field(temp2, temp_indices(2)) call neko_scratch_registry%request_field(temp3, temp_indices(3)) - do i = 1, n + do concurrent (i = 1:n) temp1%x(i,1,1,1) = ext_coeffs(2) * fx_lag%x(i,1,1,1) + & ext_coeffs(3) * fx_laglag%x(i,1,1,1) temp2%x(i,1,1,1) = ext_coeffs(2) * fy_lag%x(i,1,1,1) + & @@ -77,7 +77,7 @@ subroutine rhs_maker_ext_cpu(fx_lag, fy_lag, fz_lag, & ext_coeffs(3) * fz_laglag%x(i,1,1,1) end do - do i = 1, n + do concurrent (i = 1:n) fx_laglag%x(i,1,1,1) = fx_lag%x(i,1,1,1) fy_laglag%x(i,1,1,1) = fy_lag%x(i,1,1,1) fz_laglag%x(i,1,1,1) = fz_lag%x(i,1,1,1) @@ -86,7 +86,7 @@ subroutine rhs_maker_ext_cpu(fx_lag, fy_lag, fz_lag, & fz_lag%x(i,1,1,1) = fz(i) end do - do i = 1, n + do concurrent (i = 1:n) fx(i) = (ext_coeffs(1) * fx(i) + temp1%x(i,1,1,1)) * rho fy(i) = (ext_coeffs(1) * fy(i) + temp2%x(i,1,1,1)) * rho fz(i) = (ext_coeffs(1) * fz(i) + temp3%x(i,1,1,1)) * rho @@ -108,17 +108,17 @@ subroutine scalar_rhs_maker_ext_cpu(fs_lag, fs_laglag, fs, rho, ext_coeffs, n) call neko_scratch_registry%request_field(temp1, temp_index) - do i = 1, n + do concurrent (i = 1:n) temp1%x(i,1,1,1) = ext_coeffs(2) * fs_lag%x(i,1,1,1) + & ext_coeffs(3) * fs_laglag%x(i,1,1,1) end do - do i = 1, n + do concurrent (i = 1:n) fs_laglag%x(i,1,1,1) = fs_lag%x(i,1,1,1) fs_lag%x(i,1,1,1) = fs(i) end do - do i = 1, n + do concurrent (i = 1:n) fs(i) = (ext_coeffs(1) * fs(i) + temp1%x(i,1,1,1)) * rho end do @@ -145,27 +145,27 @@ subroutine rhs_maker_bdf_cpu(ulag, vlag, wlag, bfx, bfy, bfz, & call neko_scratch_registry%request_field(tb2, temp_indices(5)) call neko_scratch_registry%request_field(tb3, temp_indices(6)) - do i = 1, n + do concurrent (i = 1:n) tb1%x(i,1,1,1) = u%x(i,1,1,1) * B(i) * bd(2) tb2%x(i,1,1,1) = v%x(i,1,1,1) * B(i) * bd(2) tb3%x(i,1,1,1) = w%x(i,1,1,1) * B(i) * bd(2) end do do ilag = 2, nbd - do i = 1, n + do concurrent (i = 1:n) ta1%x(i,1,1,1) = ulag%lf(ilag-1)%x(i,1,1,1) * B(i) * bd(ilag+1) ta2%x(i,1,1,1) = vlag%lf(ilag-1)%x(i,1,1,1) * B(i) * bd(ilag+1) ta3%x(i,1,1,1) = wlag%lf(ilag-1)%x(i,1,1,1) * B(i) * bd(ilag+1) end do - do i = 1, n + do concurrent (i = 1:n) tb1%x(i,1,1,1) = tb1%x(i,1,1,1) + ta1%x(i,1,1,1) tb2%x(i,1,1,1) = tb2%x(i,1,1,1) + ta2%x(i,1,1,1) tb3%x(i,1,1,1) = tb3%x(i,1,1,1) + ta3%x(i,1,1,1) end do end do - do i = 1, n + do concurrent (i = 1:n) bfx(i) = bfx(i) + tb1%x(i,1,1,1) * (rho / dt) bfy(i) = bfy(i) + tb2%x(i,1,1,1) * (rho / dt) bfz(i) = bfz(i) + tb3%x(i,1,1,1) * (rho / dt) @@ -189,21 +189,21 @@ subroutine scalar_rhs_maker_bdf_cpu(s_lag, fs, s, B, rho, dt, bd, nbd, n) call neko_scratch_registry%request_field(temp1, temp_indices(1)) call neko_scratch_registry%request_field(temp2, temp_indices(2)) - do i = 1, n + do concurrent (i = 1:n) temp2%x(i,1,1,1) = s%x(i,1,1,1) * B(i) * bd(2) end do do ilag = 2, nbd - do i = 1, n + do concurrent (i = 1:n) temp1%x(i,1,1,1) = s_lag%lf(ilag-1)%x(i,1,1,1) * B(i) * bd(ilag+1) end do - do i = 1, n + do concurrent (i = 1:n) temp2%x(i,1,1,1) = temp2%x(i,1,1,1) + temp1%x(i,1,1,1) end do end do - do i = 1, n + do concurrent (i = 1:n) fs(i) = fs(i) + temp2%x(i,1,1,1) * (rho / dt) end do diff --git a/src/fluid/bcknd/cpu/pnpn_res_cpu.f90 b/src/fluid/bcknd/cpu/pnpn_res_cpu.f90 index 31df4c94f03..9b842fc6c20 100644 --- a/src/fluid/bcknd/cpu/pnpn_res_cpu.f90 +++ b/src/fluid/bcknd/cpu/pnpn_res_cpu.f90 @@ -36,7 +36,7 @@ subroutine pnpn_prs_res_cpu_compute(p, p_res, u, v, w, u_e, v_e, w_e, f_x, & type(gs_t), intent(inout) :: gs_Xh type(facet_normal_t), intent(inout) :: bc_prs_surface type(facet_normal_t), intent(inout) :: bc_sym_surface - class(Ax_t), intent(inout) :: Ax + class(ax_t), intent(inout) :: Ax real(kind=rp), intent(inout) :: bd real(kind=rp), intent(in) :: dt real(kind=rp), intent(in) :: mu @@ -67,7 +67,7 @@ subroutine pnpn_prs_res_cpu_compute(p, p_res, u, v, w, u_e, v_e, w_e, f_x, & call curl(ta1, ta2, ta3, u_e, v_e, w_e, work1, work2, c_Xh) call curl(wa1, wa2, wa3, ta1, ta2, ta3, work1, work2, c_Xh) - do i = 1, n + do concurrent (i = 1:n) ta1%x(i,1,1,1) = f_x%x(i,1,1,1) / rho & - ((wa1%x(i,1,1,1) * (mu / rho)) * c_Xh%B(i,1,1,1)) ta2%x(i,1,1,1) = f_y%x(i,1,1,1) / rho & @@ -80,7 +80,7 @@ subroutine pnpn_prs_res_cpu_compute(p, p_res, u, v, w, u_e, v_e, w_e, f_x, & call gs_Xh%op(ta2, GS_OP_ADD) call gs_Xh%op(ta3, GS_OP_ADD) - do i = 1, n + do concurrent (i = 1:n) ta1%x(i,1,1,1) = ta1%x(i,1,1,1) * c_Xh%Binv(i,1,1,1) ta2%x(i,1,1,1) = ta2%x(i,1,1,1) * c_Xh%Binv(i,1,1,1) ta3%x(i,1,1,1) = ta3%x(i,1,1,1) * c_Xh%Binv(i,1,1,1) @@ -100,7 +100,7 @@ subroutine pnpn_prs_res_cpu_compute(p, p_res, u, v, w, u_e, v_e, w_e, f_x, & ! ! Surface velocity terms ! - do i = 1, n + do concurrent (i = 1:n) wa1%x(i,1,1,1) = 0.0_rp wa2%x(i,1,1,1) = 0.0_rp wa3%x(i,1,1,1) = 0.0_rp @@ -109,7 +109,7 @@ subroutine pnpn_prs_res_cpu_compute(p, p_res, u, v, w, u_e, v_e, w_e, f_x, & call bc_sym_surface%apply_surfvec(wa1%x,wa2%x,wa3%x,ta1%x, ta2%x, ta3%x, n) dtbd = bd / dt - do i = 1, n + do concurrent (i = 1:n) ta1%x(i,1,1,1) = 0.0_rp ta2%x(i,1,1,1) = 0.0_rp ta3%x(i,1,1,1) = 0.0_rp @@ -117,7 +117,7 @@ subroutine pnpn_prs_res_cpu_compute(p, p_res, u, v, w, u_e, v_e, w_e, f_x, & call bc_prs_surface%apply_surfvec(ta1%x, ta2%x, ta3%x, u%x, v%x, w%x, n) - do i = 1, n + do concurrent (i = 1:n) p_res%x(i,1,1,1) = p_res%x(i,1,1,1) & - (dtbd * (ta1%x(i,1,1,1) + ta2%x(i,1,1,1) + ta3%x(i,1,1,1)))& - (wa1%x(i,1,1,1) + wa2%x(i,1,1,1) + wa3%x(i,1,1,1))