diff --git a/cgyro/src/cgyro_check_memory.F90 b/cgyro/src/cgyro_check_memory.F90 index 304b7e798..84a821f28 100644 --- a/cgyro/src/cgyro_check_memory.F90 +++ b/cgyro/src/cgyro_check_memory.F90 @@ -106,9 +106,6 @@ subroutine cgyro_check_memory(datafile) call cgyro_alloc_add_4d(io,n_field,nc,nv_loc,nt_loc,16,'omega_ss') call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'omega_cap_h') call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'omega_h') - if (triad_print_flag == 1) then - call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'diss_r') - endif call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'h_x') call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'h0_x') call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'g_x') @@ -136,12 +133,6 @@ subroutine cgyro_check_memory(datafile) call cgyro_alloc_add_3d(io,n_radial,nt_loc,nsplitA*n_toroidal_procs,16,'fpackA') call cgyro_alloc_add_3d(io,n_radial,nt_loc,nsplitB*n_toroidal_procs,16,'fpackB') call cgyro_alloc_add_4d(io,n_field,n_radial,n_jtheta,n_toroidal,16,'gpack') - if (triad_print_flag == 1) then - call cgyro_alloc_add_4d(io,n_radial,nt_loc,nsplitA,n_toroidal_procs,16,'eA_nl') - call cgyro_alloc_add_4d(io,n_radial,nt_loc,nsplitB,n_toroidal_procs,16,'eB_nl') - call cgyro_alloc_add_3d(io,n_radial,nt_loc,nsplitA*n_toroidal_procs,16,'epackA') - call cgyro_alloc_add_3d(io,n_radial,nt_loc,nsplitB*n_toroidal_procs,16,'epackB') - endif endif write(io,*) diff --git a/cgyro/src/cgyro_cleanup.F90 b/cgyro/src/cgyro_cleanup.F90 index 177021040..7828e8bf5 100644 --- a/cgyro/src/cgyro_cleanup.F90 +++ b/cgyro/src/cgyro_cleanup.F90 @@ -43,33 +43,21 @@ subroutine cgyro_cleanup ccl_del_device(vel2) deallocate(vel2) endif - if(allocated(w_e)) then - ccl_del_device(w_e) - deallocate(w_e) - endif + if(allocated(w_e)) deallocate(w_e) if(allocated(e_deriv1_mat)) deallocate(e_deriv1_mat) if(allocated(e_deriv1_rot_mat)) deallocate(e_deriv1_rot_mat) if(allocated(xi)) then ccl_del_device(xi) deallocate(xi) endif - if(allocated(w_xi)) then - ccl_del_device(w_xi) - deallocate(w_xi) - endif - if(allocated(w_exi)) then - ccl_del_device(w_exi) - deallocate(w_exi) - endif + if(allocated(w_xi)) deallocate(w_xi) + if(allocated(w_exi)) deallocate(w_exi) if(allocated(xi_lor_mat)) deallocate(xi_lor_mat) if(allocated(xi_deriv_mat)) deallocate(xi_deriv_mat) if(allocated(theta)) deallocate(theta) if(allocated(thetab)) deallocate(thetab) - if(allocated(w_theta)) then - ccl_del_device(w_theta) - deallocate(w_theta) - endif + if(allocated(w_theta)) deallocate(w_theta) if(allocated(g_theta)) deallocate(g_theta) if(allocated(g_theta_geo)) deallocate(g_theta_geo) if(allocated(bmag)) deallocate(bmag) @@ -95,10 +83,7 @@ subroutine cgyro_cleanup if(allocated(lambda_rot)) deallocate(lambda_rot) if(allocated(dlambda_rot)) deallocate(dlambda_rot) if(allocated(dens_rot)) deallocate(dens_rot) - if(allocated(dens2_rot)) then - ccl_del_device(dens2_rot) - deallocate(dens2_rot) - endif + if(allocated(dens2_rot)) deallocate(dens2_rot) if(allocated(dens_ele_rot)) deallocate(dens_ele_rot) if(allocated(dens_avg_rot)) deallocate(dens_avg_rot) if(allocated(dlnndr_avg_rot)) deallocate(dlnndr_avg_rot) @@ -152,18 +137,6 @@ subroutine cgyro_cleanup ccl_del_device(source) deallocate(source) endif - if(allocated(triad)) then - ccl_del_device(triad) - deallocate(triad) - endif - if(allocated(triad_loc)) then - ccl_del_device(triad_loc) - deallocate(triad_loc) - endif - if(allocated(triad_loc_old)) then - ccl_del_device(triad_loc_old) - deallocate(triad_loc_old) - endif if(allocated(thfac_itor)) then ccl_del_device(thfac_itor) deallocate(thfac_itor) @@ -228,10 +201,6 @@ subroutine cgyro_cleanup ccl_del_device(omega_sbeta) deallocate(omega_sbeta) endif - if(allocated(diss_r)) then - ccl_del_device(diss_r) - deallocate(diss_r) - endif if(allocated(jvec_c)) then ccl_del_device(jvec_c) deallocate(jvec_c) @@ -292,14 +261,6 @@ subroutine cgyro_cleanup ccl_del_device(g_nl) deallocate(g_nl) endif - if(allocated(eA_nl)) then - ccl_del_device(eA_nl) - deallocate(eA_nl) - endif - if(allocated(eB_nl)) then - ccl_del_device(eB_nl) - deallocate(eB_nl) - endif if(allocated(fpackA)) then ccl_del_device(fpackA) deallocate(fpackA) @@ -312,14 +273,6 @@ subroutine cgyro_cleanup ccl_del_device(gpack) deallocate(gpack) endif - if(allocated(epackA)) then - ccl_del_device(epackA) - deallocate(epackA) - endif - if(allocated(epackB)) then - ccl_del_device(epackB) - deallocate(epackB) - endif if (allocated(cmat)) then ccl_del_bigdevice(cmat) deallocate(cmat) diff --git a/cgyro/src/cgyro_equilibrium.F90 b/cgyro/src/cgyro_equilibrium.F90 index a4edcfd6a..453ea1681 100644 --- a/cgyro/src/cgyro_equilibrium.F90 +++ b/cgyro/src/cgyro_equilibrium.F90 @@ -277,9 +277,9 @@ subroutine cgyro_equilibrium enddo #if defined(OMPGPU) -!$omp target enter data map(to:xi,omega_stream,w_theta,dens2_rot) +!$omp target enter data map(to:xi,omega_stream) #elif defined(_OPENACC) -!$acc enter data copyin(xi,omega_stream,w_theta,dens2_rot) +!$acc enter data copyin(xi,omega_stream) #endif #if defined(OMPGPU) || defined(_OPENACC) diff --git a/cgyro/src/cgyro_flux.f90 b/cgyro/src/cgyro_flux.f90 index b9622a165..bfdc2f728 100644 --- a/cgyro/src/cgyro_flux.f90 +++ b/cgyro/src/cgyro_flux.f90 @@ -194,9 +194,8 @@ subroutine cgyro_flux ! 2-1. Compute Triad energy transfer !------------------------------------------------------------- - if (triad_print_flag == 1) then - kx = 2*pi*rho/length - do is=1,n_species + kx = 2*pi*rho/length + do is=1,n_species ! Triad energy transfer : T_k triad_loc_old(is,:,itor,1) = triad_loc(is,:,itor,1) *temp(is)/dlntdr(is_ele) ! From Nonzonal Triad energy transfer : T_k [NZ(k',k")->k] @@ -215,8 +214,7 @@ subroutine cgyro_flux triad_loc_old(is,:,itor,7) = triad_loc(is,:,itor,6) *temp(is)/dlntdr(is_ele) ! Diss. (Coll. ) triad_loc_old(is,:,itor,8) = triad_loc(is,:,itor,7) *temp(is)/dlntdr(is_ele) - enddo - endif + enddo !----------------------------------------------------- @@ -281,16 +279,14 @@ subroutine cgyro_flux NEW_COMM_1, & i_err) - if (triad_print_flag == 1) then - ! Reduced complex triad(ns,kx), below, is still distributed over n - call MPI_ALLREDUCE(triad_loc_old(:,:,:,:), & + ! Reduced complex triad(ns,kx), below, is still distributed over n + call MPI_ALLREDUCE(triad_loc_old(:,:,:,:), & triad, & size(triad), & MPI_DOUBLE_COMPLEX, & MPI_SUM, & NEW_COMM_1, & i_err) - endif tave_step = tave_step + 1 diff --git a/cgyro/src/cgyro_init_arrays.F90 b/cgyro/src/cgyro_init_arrays.F90 index 09670ca24..7b0b63b46 100644 --- a/cgyro/src/cgyro_init_arrays.F90 +++ b/cgyro/src/cgyro_init_arrays.F90 @@ -448,14 +448,12 @@ subroutine cgyro_init_arrays + abs(omega_rot_drift_r(it,is)) & + abs(omega_rot_edrift_r(it))) - if (triad_print_flag == 1) then - ! (d/dr) upwind dissipation for triad energy transfer diagnostics - diss_r(ic,iv_loc,itor) = - (n_radial/length)*spectraldiss(u,nup_radial)*up_radial & - * (abs(omega_rdrift(it,is))*energy(ie)*(1.0+xi(ix)**2) & - + abs(omega_cdrift_r(it,is)*xi(ix))*vel(ie) & - + abs(omega_rot_drift_r(it,is)) & - + abs(omega_rot_edrift_r(it))) - endif + ! (d/dr) upwind dissipation for triad energy transfer diagnostics + diss_r(ic,iv_loc,itor) = - (n_radial/length)*spectraldiss(u,nup_radial)*up_radial & + * (abs(omega_rdrift(it,is))*energy(ie)*(1.0+xi(ix)**2) & + + abs(omega_cdrift_r(it,is)*xi(ix))*vel(ie) & + + abs(omega_rot_drift_r(it,is)) & + + abs(omega_rot_edrift_r(it))) ! omega_star carg = & @@ -489,13 +487,6 @@ subroutine cgyro_init_arrays #elif defined(_OPENACC) !$acc enter data copyin(omega_cap_h,omega_h,omega_s,omega_ss,omega_sbeta) #endif - if (triad_print_flag == 1) then -#if defined(OMPGPU) -!$omp target enter data map(to:diss_r) -#elif defined(_OPENACC) -!$acc enter data copyin(diss_r) -#endif - endif !------------------------------------------------------------------------- deallocate(gdlnndr,gdlntdr) diff --git a/cgyro/src/cgyro_init_manager.F90 b/cgyro/src/cgyro_init_manager.F90 index 7316e6951..8a2644e75 100644 --- a/cgyro/src/cgyro_init_manager.F90 +++ b/cgyro/src/cgyro_init_manager.F90 @@ -111,12 +111,6 @@ subroutine cgyro_init_manager enddo enddo -#if defined(OMPGPU) -!$omp target enter data map(to:w_exi,w_e,w_xi) -#elif defined(_OPENACC) -!$acc enter data copyin(w_exi,w_e,w_xi) -#endif - allocate(theta(n_theta)) allocate(thetab(n_theta,n_radial/box_size)) allocate(w_theta(n_theta)) @@ -185,6 +179,10 @@ subroutine cgyro_init_manager allocate( gflux(0:n_global,n_species,4,n_field,nt1:nt2)) allocate(gflux_loc(0:n_global,n_species,4,n_field,nt1:nt2)) + allocate( triad(n_species,n_radial,nt1:nt2,8)) + allocate(triad_loc(n_species,n_radial,nt1:nt2,7)) + allocate(triad_loc_old(n_species,n_radial,nt1:nt2,8)) + allocate(cflux_tave(n_species,4)) allocate(gflux_tave(n_species,4)) @@ -197,16 +195,6 @@ subroutine cgyro_init_manager #elif defined(_OPENACC) !$acc enter data create(fcoef,gcoef,field,field_loc,source) #endif - if (triad_print_flag == 1) then - allocate( triad(n_species,n_radial,nt1:nt2,8)) - allocate(triad_loc(n_species,n_radial,nt1:nt2,7)) - allocate(triad_loc_old(n_species,n_radial,nt1:nt2,8)) -#if defined(OMPGPU) -!$omp target enter data map(alloc:triad,triad_loc,triad_loc_old) -#elif defined(_OPENACC) -!$acc enter data create(triad,triad_loc,triad_loc_old) -#endif - endif if ((collision_model /= 5) .AND. (collision_field_model == 1)) then ! nc and nc_loc must be last, since it will be collated @@ -274,9 +262,7 @@ subroutine cgyro_init_manager allocate(cap_h_v(nc_loc,nt1:nt2,nv)) allocate(omega_cap_h(nc,nv_loc,nt1:nt2)) allocate(omega_h(nc,nv_loc,nt1:nt2)) - if (triad_print_flag == 1) then - allocate(diss_r(nc,nv_loc,nt1:nt2)) - endif + allocate(diss_r(nc,nv_loc,nt1:nt2)) allocate(omega_s(n_field,nc,nv_loc,nt1:nt2)) allocate(omega_ss(n_field,nc,nv_loc,nt1:nt2)) allocate(omega_sbeta(nc,nv_loc,nt1:nt2)) @@ -316,8 +302,10 @@ subroutine cgyro_init_manager ! Nonlinear arrays if (nonlinear_flag == 1) then allocate(fA_nl(n_radial,nt_loc,nsplitA,n_toroidal_procs)) + allocate(eA_nl(n_radial,nt_loc,nsplitA,n_toroidal_procs)) allocate(g_nl(n_field,n_radial,n_jtheta,n_toroidal)) allocate(fpackA(n_radial,nt_loc,nsplitA*n_toroidal_procs)) + allocate(epackA(n_radial,nt_loc,nsplitA*n_toroidal_procs)) allocate(gpack(n_field,n_radial,n_jtheta,n_toroidal)) allocate(jvec_c_nl(n_field,n_radial,n_jtheta,nv_loc,n_toroidal)) #if defined(OMPGPU) @@ -325,32 +313,16 @@ subroutine cgyro_init_manager #elif defined(_OPENACC) !$acc enter data create(fpackA,gpack,fA_nl,g_nl,jvec_c_nl) #endif - if (triad_print_flag == 1) then - allocate(eA_nl(n_radial,nt_loc,nsplitA,n_toroidal_procs)) - allocate(epackA(n_radial,nt_loc,nsplitA*n_toroidal_procs)) -#if defined(OMPGPU) -!$omp target enter data map(alloc:epackA,eA_nl) -#elif defined(_OPENACC) -!$acc enter data create(epackA,eA_nl) -#endif - endif if (nsplitB > 0) then ! nsplitB can be zero at large MPI allocate(fB_nl(n_radial,nt_loc,nsplitB,n_toroidal_procs)) allocate(fpackB(n_radial,nt_loc,nsplitB*n_toroidal_procs)) + allocate(eB_nl(n_radial,nt_loc,nsplitB,n_toroidal_procs)) + allocate(epackB(n_radial,nt_loc,nsplitB*n_toroidal_procs)) #if defined(OMPGPU) !$omp target enter data map(alloc:fpackB,fB_nl) #elif defined(_OPENACC) !$acc enter data create(fpackB,fB_nl) #endif - if (triad_print_flag == 1) then - allocate(epackB(n_radial,nt_loc,nsplitB*n_toroidal_procs)) - allocate(eB_nl(n_radial,nt_loc,nsplitB,n_toroidal_procs)) -#if defined(OMPGPU) -!$omp target enter data map(alloc:epackB,eB_nl) -#elif defined(_OPENACC) -!$acc enter data create(epackB,eB_nl) -#endif - endif endif endif diff --git a/cgyro/src/cgyro_kernel.F90 b/cgyro/src/cgyro_kernel.F90 index ee3bc3c24..95c08273f 100644 --- a/cgyro/src/cgyro_kernel.F90 +++ b/cgyro/src/cgyro_kernel.F90 @@ -135,13 +135,6 @@ subroutine cgyro_kernel ! wait for cap_h_c to be synched into system memory, used by cgyro_write_timedata !$acc wait(4) #endif - if (triad_print_flag == 1) then -#if defined(OMPGPU) -!$omp target update from(triad_loc) -#elif defined(_OPENACC) -!$acc update host(triad_loc) -#endif - endif call timer_lib_out('coll_mem') call timer_lib_in('io') diff --git a/cgyro/src/cgyro_nl_comm.F90 b/cgyro/src/cgyro_nl_comm.F90 index a4cd1d28f..4e0e74fae 100644 --- a/cgyro/src/cgyro_nl_comm.F90 +++ b/cgyro/src/cgyro_nl_comm.F90 @@ -326,24 +326,9 @@ subroutine cgyro_nl_fftw_comm1_r_triad(ij) real :: dv,dvr,dvp,rval,rval2 complex :: cprod,cprod2,thfac -#if defined(OMPGPU) -!$omp target teams distribute parallel do simd collapse(3) -#elif defined(_OPENACC) -!$acc parallel loop collapse(3) gang vector independent & -!$acc& present(triad_loc_old,triad_loc) & -!$acc& present(nt1,nt2,n_radial,n_species) default(none) -#else -!$omp parallel do private(ir,is) -#endif - do itor=nt1,nt2 - do ir=1,n_radial - do is=1,n_species - triad_loc_old(is,ir,itor,3) = triad_loc(is,ir,itor,3) - triad_loc_old(is,ir,itor,4) = triad_loc(is,ir,itor,4) - triad_loc(is,ir,itor,:) = 0.0 - enddo - enddo - enddo + triad_loc_old(:,:,nt1:nt2,3) = triad_loc(:,:,nt1:nt2,3) + triad_loc_old(:,:,nt1:nt2,4) = triad_loc(:,:,nt1:nt2,4) + triad_loc(:,:,nt1:nt2,:) = 0.0 call timer_lib_in('nl_comm') call parallel_slib_r_nc_wait(nsplitA,fA_nl,fpackA,fA_req) @@ -366,16 +351,11 @@ subroutine cgyro_nl_fftw_comm1_r_triad(ij) #if defined(OMPGPU) !$omp target teams distribute parallel do simd collapse(4) & !$omp& private(iexch0,itor0,isplit0,iexch_base) & -!$omp& private(id,itorbox,jr0,jc,itd,itd_class,thfac) & !$omp& private(ic_loc_m,my_psi) #elif defined(_OPENACC) !$acc parallel loop collapse(4) gang vector independent private(ic_loc_m,my_psi) & !$acc& private(iexch0,itor0,isplit0,iexch_base) & -!$acc& private(id,itorbox,jr0,jc,itd,itd_class,thfac) & -!$acc& present(h_x,g_x,cap_h_c,cap_h_ct,field,dvjvec_c,jvec_c) & -!$acc& present(ic_c,is_v,ix_v,ie_v,w_exi,w_theta,dens2_rot,z,temp) & -!$acc& present(omega_stream,vel,xi,thfac_itor,cderiv,uderiv) & -!$acc& present(px,rhs,fpackA,fpackB,epackA,epackB,diss_r,triad_loc) copyin(psi_mul,zf_scale) & +!$acc& present(ic_c,px,rhs,fpackA,fpackB) copyin(psi_mul,zf_scale) & !$acc& present(nt1,nt2,nv_loc,n_theta,n_radial,nsplit,nsplitA,nsplitB) copyin(ij) default(none) #else !$omp parallel do collapse(2) private(ic_loc_m,my_psi) & @@ -385,12 +365,12 @@ subroutine cgyro_nl_fftw_comm1_r_triad(ij) do itor=nt1,nt2 do iv_loc_m=1,nv_loc do ir=1,n_radial - do it=1,n_theta - itorbox = itor*box_size*sign_qs - jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial) - jr0(1) = n_theta*(ir-1) - jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial) + itorbox = itor*box_size*sign_qs + jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial) + jr0(1) = n_theta*(ir-1) + jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial) + do it=1,n_theta ic_loc_m = ic_c(ir,it) is = is_v(iv_loc_m +nv1 -1 ) @@ -510,16 +490,11 @@ subroutine cgyro_nl_fftw_comm1_r_triad(ij) #if defined(OMPGPU) !$omp target teams distribute parallel do simd collapse(4) & !$omp& private(iexch0,itor0,isplit0,iexch_base) & -!$omp& private(id,itorbox,jr0,jc,itd,itd_class,thfac) & !$omp& private(ic_loc_m,my_psi) #elif defined(_OPENACC) !$acc parallel loop collapse(4) gang vector independent private(ic_loc_m,my_psi) & !$acc& private(iexch0,itor0,isplit0,iexch_base) & -!$acc& private(id,itorbox,jr0,jc,itd,itd_class,thfac) & -!$acc& present(h_x,g_x,cap_h_c,cap_h_ct,field,dvjvec_c,jvec_c) & -!$acc& present(ic_c,is_v,ix_v,ie_v,w_exi,w_theta,dens2_rot,z,temp) & -!$acc& present(omega_stream,vel,xi,thfac_itor,cderiv,uderiv) & -!$acc& present(px,rhs,fpackA,epackA,diss_r,triad_loc) copyin(psi_mul,zf_scale) & +!$acc& present(ic_c,px,rhs,fpackA) copyin(psi_mul,zf_scale) & !$acc& present(nt1,nt2,nv_loc,n_theta,n_radial,nsplit,nsplitA) copyin(ij) default(none) #else !$omp parallel do collapse(2) private(ic_loc_m,my_psi) & @@ -529,12 +504,12 @@ subroutine cgyro_nl_fftw_comm1_r_triad(ij) do itor=nt1,nt2 do iv_loc_m=1,nv_loc do ir=1,n_radial - do it=1,n_theta - itorbox = itor*box_size*sign_qs - jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial) - jr0(1) = n_theta*(ir-1) - jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial) + itorbox = itor*box_size*sign_qs + jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial) + jr0(1) = n_theta*(ir-1) + jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial) + do it=1,n_theta ic_loc_m = ic_c(ir,it) is = is_v(iv_loc_m +nv1 -1 ) diff --git a/cgyro/src/cgyro_nl_fftw.F90 b/cgyro/src/cgyro_nl_fftw.F90 index a4c58206a..a2e065fde 100644 --- a/cgyro/src/cgyro_nl_fftw.F90 +++ b/cgyro/src/cgyro_nl_fftw.F90 @@ -378,63 +378,7 @@ subroutine cgyro_nl_fftw_mul(sz,uvm,uxm,vym,uym,vxm,inv_nxny) end subroutine -subroutine cgyro_nl_fftw_mul_sub_mean(ny,nx,nt,uvm,uxm,vym,uym,vxm,inv_nxny) - implicit none - - integer, intent(in) :: ny,nx,nt - real, dimension(*),intent(out) :: uvm - real, dimension(*),intent(in) :: uxm,vym,uym,vxm - real, intent(in) :: inv_nxny - - integer :: iy,ix,it - integer :: sz,sz_t - integer :: i,i_xt - real :: y_mean_ux, y_mean_uy, y_mean_vx, y_mean_vy - real :: r_ux, r_uy, r_vx, r_vy - real :: inv_ny - - sz = ny*nx*nt - sz_t = ny*nx - inv_ny = 1.0/ny - -#if defined(OMPGPU) -!$omp target teams distribute parallel do collapse(2) & -!$omp& private(iy,i_xt,y_mean_ux,y_mean_uy,y_mean_vx,y_mean_vy) & -!$omp& private(i,r_ux,r_uy,r_vx,r_vy) & -!$omp& map(to:uxm(1:sz),vym(1:sz),uym(1:sz),vxm(1:sz)) & -!$omp& map(from:uvm(1:sz)) -#else -!$acc parallel loop independent gang collapse(2) & -!$acc& private(iy,i_xt,y_mean_ux,y_mean_uy,y_mean_vx,y_mean_vy) & -!$acc& present(uvm,uxm,vym,uym,vxm) -#endif - do it=0,nt-1 - do ix=0,nx-1 - i_xt = (it*sz_t) + ix*ny; - y_mean_ux = sum(uxm(i_xt+1:i_xt+ny) ) * inv_ny - y_mean_uy = sum(uym(i_xt+1:i_xt+ny) ) * inv_ny - y_mean_vx = sum(vxm(i_xt+1:i_xt+ny) ) * inv_ny - y_mean_vy = sum(vym(i_xt+1:i_xt+ny) ) * inv_ny -#if (!defined(OMPGPU)) && defined(_OPENACC) -!$acc loop vector private(i,r_ux,r_uy,r_vx,r_vy) -#endif - do iy=1,ny - i = i_xt+iy - ! remove ky=0 - r_ux = uxm(i) - y_mean_ux - r_uy = uym(i) - y_mean_uy - r_vx = vxm(i) - y_mean_vx - r_vy = vym(i) - y_mean_vy - - ! compute and save uv - uvm(i) = (uxm(i)*vym(i)-uym(i)*vxm(i))*inv_nxny - enddo - enddo - enddo - -end subroutine - -subroutine cgyro_nl_fftw(i_triad) +subroutine cgyro_nl_fftw(ij) #if defined(HIPGPU) use hipfort @@ -454,7 +398,7 @@ subroutine cgyro_nl_fftw(i_triad) include 'fftw/fftw3.f' #endif !----------------------------------- - integer, intent(in) :: i_triad + integer, intent(in) :: ij !----------------------------------- integer :: j,p,iexch integer :: it,ir,itm,itl,ix,iy @@ -533,7 +477,7 @@ subroutine cgyro_nl_fftw(i_triad) call timer_lib_in('nl') #if !defined(OMPGPU) -!$acc data present(fA_nl,eA_nl) & +!$acc data present(fA_nl) & !$acc& present(fxmany,fymany,gxmany,gymany) & !$acc& present(uxmany,uymany,vxmany,vymany) & !$acc& present(uvmany) @@ -985,99 +929,8 @@ subroutine cgyro_nl_fftw(i_triad) enddo enddo - if (i_triad == 1) then - call timer_lib_out('nl_mem') - call timer_lib_in('nl') - ! 2. Poisson bracket in real space with Non_Zonal pairs - - call cgyro_nl_fftw_mul_sub_mean(size(uvmany,1),size(uvmany,2),nsplitA, & - uvmany, & - uxmany,vymany(:,:,1:nsplitA), & - uymany,vxmany(:,:,1:nsplitA), & - inv_nxny) - - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() - - ! ------------------ - ! Transform uv to fx - ! ------------------ - -#if defined(OMPGPU) - -#if defined(MKLGPU) -!$omp target data map(tofrom: uvmany,fxmany) -#else -!$omp target data use_device_ptr(uvmany,fxmany) -#endif - -#else -!$acc wait -!$acc host_data use_device(uvmany,fxmany) -#endif - -#if defined(HIPGPU) - rc = hipfftExecD2Z(hip_plan_r2c_manyA,c_loc(uvmany),c_loc(fxmany)) -#elif defined(MKLGPU) - !$omp dispatch - call dfftw_execute_dft_r2c(dfftw_plan_r2c_manyA,uvmany,fxmany) - rc = 0 -#else - rc = cufftExecD2Z(cu_plan_r2c_manyA,uvmany,fxmany) -#endif - -#ifdef HIPGPU - ! hipfftExec is asynchronous, will need the results below - rc = hipDeviceSynchronize() -#endif - -#if defined(OMPGPU) -!$omp end target data -#else -!$acc wait -!$acc end host_data -#endif - - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() - - call timer_lib_out('nl') - call timer_lib_in('nl_mem') - - ! tile for performance, since this is effectively a transpose -#if defined(OMPGPU) -!$omp target teams distribute parallel do collapse(5) & -!$omp& private(iy,ir,itm,itl,ix) -#else -!$acc parallel loop independent collapse(5) gang & -!$acc& private(iy,ir,itm,itl,ix) present(eA_nl,fxmany) -#endif - do j=1,nsplitA - do iy0=0,n_toroidal+(R_TORTILE-1)-1,R_TORTILE ! round up - do ir0=0,n_radial+(R_RADTILE-1)-1,R_RADTILE ! round up - do iy1=0,(R_TORTILE-1) ! tile - do ir1=0,(R_RADTILE-1) ! tile - iy = iy0 + iy1 - ir = 1 + ir0 + ir1 - if ((iy < n_toroidal) .and. (ir <= n_radial)) then - ! itor = iy+1 - itm = 1 + iy/nt_loc - itl = 1 + modulo(iy,nt_loc) - ix = ir-1-nx0/2 - if (ix < 0) ix = ix+nx - - eA_nl(ir,itl,j,itm) = fxmany(iy,ix,j) - endif - enddo - enddo - enddo - enddo - enddo - - endif ! i_triad==1 - #if !defined(OMPGPU) - ! end data fA_nl,eA_nl + ! end data fA_nl !$acc end data #endif @@ -1125,9 +978,6 @@ subroutine cgyro_nl_fftw(i_triad) ! start the async reverse comm ! can reuse the same req, no overlap with forward fA_req call parallel_slib_r_nc_async(nsplitA,fA_nl,fpackA,fA_req) - if (i_triad == 1) then - call parallel_slib_r_nc_async(nsplitA,eA_nl,epackA,eA_req) - end if fA_req_valid = .TRUE. if (nsplitB > 0) then @@ -1143,7 +993,7 @@ subroutine cgyro_nl_fftw(i_triad) call timer_lib_in('nl') #if !defined(OMPGPU) -!$acc data present(fB_nl,eB_nl) & +!$acc data present(fB_nl) & !$acc& present(fxmany,fymany,gxmany,gymany) & !$acc& present(uxmany,uymany,vxmany,vymany) & !$acc& present(uvmany) @@ -1392,105 +1242,8 @@ subroutine cgyro_nl_fftw(i_triad) enddo enddo - if (i_triad == 1) then - call timer_lib_out('nl_mem') - call timer_lib_in('nl') - ! 2. Poisson bracket in real space with Non_Zonal pairs - call cgyro_nl_fftw_mul(size(uvmany,1)*size(uvmany,2)*nsplitB, & - uvmany, & - uxmany,vymany(:,:,(nsplitA+1):nsplit), & - uymany,vxmany(:,:,(nsplitA+1):nsplit), & - inv_nxny) - - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() - - ! ------------------ - ! Transform uv to fx - ! ------------------ - -#if defined(OMPGPU) - -#if defined(MKLGPU) -!$omp target data map(tofrom: uvmany,fxmany) -#else -!$omp target data use_device_ptr(uvmany,fxmany) -#endif - -#else -!$acc wait -!$acc host_data use_device(uvmany,fxmany) -#endif - -#if defined(HIPGPU) - rc = hipfftExecD2Z(hip_plan_r2c_manyB,c_loc(uvmany),c_loc(fxmany)) -#elif defined(MKLGPU) - !$omp dispatch - call dfftw_execute_dft_r2c(dfftw_plan_r2c_manyB,uvmany,fxmany) - rc = 0 -#else - rc = cufftExecD2Z(cu_plan_r2c_manyB,uvmany,fxmany) -#endif - -#ifdef HIPGPU - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() - ! hipfftExec is asynchronous, will need the results below - rc = hipDeviceSynchronize() -#endif - -#if defined(OMPGPU) -!$omp end target data -#else - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() -!$acc wait -!$acc end host_data -#endif - - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() - - call timer_lib_out('nl') - call timer_lib_in('nl_mem') - - ! NOTE: The FFT will generate an unwanted n=0,p=-nr/2 component - ! that will be filtered in the main time-stepping loop - - ! tile for performance, since this is effectively a transpose -#if defined(OMPGPU) -!$omp target teams distribute parallel do collapse(5) & -!$omp& private(iy,ir,itm,itl,ix) -#else -!$acc parallel loop independent collapse(5) gang & -!$acc& private(iy,ir,itm,itl,ix) present(eB_nl,fxmany) -#endif - do j=1,nsplitB - do iy0=0,n_toroidal+(R_TORTILE-1)-1,R_TORTILE ! round up - do ir0=0,n_radial+(R_RADTILE-1)-1,R_RADTILE ! round up - do iy1=0,(R_TORTILE-1) ! tile - do ir1=0,(R_RADTILE-1) ! tile - iy = iy0 + iy1 - ir = 1 + ir0 + ir1 - if ((iy < n_toroidal) .and. (ir <= n_radial)) then - ! itor = iy+1 - itm = 1 + iy/nt_loc - itl = 1 + modulo(iy,nt_loc) - ix = ir-1-nx0/2 - if (ix < 0) ix = ix+nx - - eB_nl(ir,itl,j,itm) = fxmany(iy,ix,j) - endif - enddo - enddo - enddo - enddo - enddo - - endif ! if i_triad - #if !defined(OMPGPU) - ! end data fB_nl,eB_nl + ! end data fB_nl !$acc end data #endif @@ -1500,9 +1253,6 @@ subroutine cgyro_nl_fftw(i_triad) ! start the async reverse comm ! can reuse the same req, no overlap with forward fB_req call parallel_slib_r_nc_async(nsplitB,fB_nl,fpackB,fB_req) - if (i_triad == 1) then - call parallel_slib_r_nc_async(nsplitB,eB_nl,epackB,eB_req) - end if fB_req_valid = .TRUE. ! make sure reqs progress call cgyro_nl_fftw_comm_test() @@ -1522,24 +1272,16 @@ subroutine cgyro_nl_fftw_stepr(g_j, f_j, nl_idx, i_omp) implicit none - !----------------------------------- integer, intent(in) :: g_j, f_j integer,intent(in) :: nl_idx ! 1=>A, 2=>B integer,intent(in) :: i_omp - !----------------------------------- integer :: ix,iy integer :: ir,itm,itl,itor - real :: inv_nxny include 'fftw3.f03' - inv_nxny = 1.0/(nx*ny) ! Poisson bracket in real space - do ix=0,nx-1 - do iy=0,ny-1 - uv(iy,ix,i_omp) = (uxmany(iy,ix,f_j)*vymany(iy,ix,g_j)-uymany(iy,ix,f_j)*vxmany(iy,ix,g_j)) * inv_nxny - enddo - enddo + uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny) call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp)) @@ -1565,7 +1307,6 @@ subroutine cgyro_nl_fftw_stepr(g_j, f_j, nl_idx, i_omp) end subroutine cgyro_nl_fftw_stepr - ! assumes cgyro_nl_fftw_stepr has already been called subroutine cgyro_nl_fftw_stepr_triad(g_j, f_j, nl_idx, i_omp) use timer_lib @@ -1574,47 +1315,73 @@ subroutine cgyro_nl_fftw_stepr_triad(g_j, f_j, nl_idx, i_omp) implicit none - !----------------------------------- integer, intent(in) :: g_j, f_j integer,intent(in) :: nl_idx ! 1=>A, 2=>B integer,intent(in) :: i_omp - !----------------------------------- - real :: y_mean_ux, y_mean_uy, y_mean_vx, y_mean_vy - real :: r_ux, r_uy, r_vx, r_vy + real :: y_mean(nx) integer :: ix,iy integer :: ir,itm,itl,itor - real :: inv_nxny,inv_ny include 'fftw3.f03' - inv_ny = 1.0/ny - inv_nxny = 1.0/(nx*ny) - ! 1. Poisson bracket in real space was done in cgyro_nl_fftw_stepr + ! 1. Poisson bracket in real space + uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny) - ! 2. Poisson bracket in real space with Non_Zonal pairs + call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp)) - do ix=0,nx-1 - y_mean_ux = sum(uxmany(:,ix,f_j) ,dim=1 ) * inv_ny - y_mean_uy = sum(uymany(:,ix,f_j) ,dim=1 ) * inv_ny - y_mean_vx = sum(vxmany(:,ix,f_j) ,dim=1 ) * inv_ny - y_mean_vy = sum(vymany(:,ix,f_j) ,dim=1 ) * inv_ny - do iy=0,ny-1 - ! remove ky=0 - r_ux = uxmany(iy,ix,f_j) - y_mean_ux - r_uy = uymany(iy,ix,f_j) - y_mean_uy - r_vx = vxmany(iy,ix,f_j) - y_mean_vx - r_vy = vymany(iy,ix,f_j) - y_mean_vy - ! we could save, but we do not really need to (not used outside from this function) - !uxmany(iy,ix,f_j) = r_ux - !uymany(iy,ix,f_j) = r_y - !vxmany(iy,ix,f_j) = r_vx - !vymany(iy,ix,f_j) = r_vy - - ! compute and save uv - uv(iy,ix,i_omp) = (r_ux*r_vy-r_uy*r_vx) * inv_nxny + ! NOTE: The FFT will generate an unwanted n=0,p=-nr/2 component + ! that will be filtered in the main time-stepping loop + + ! this should really be accounted against nl_mem, but hard to do with OMP + do itm=1,n_toroidal_procs + do itl=1,nt_loc + itor=itl + (itm-1)*nt_loc + do ir=1,n_radial + ix = ir-1-nx0/2 + if (ix < 0) ix = ix+nx + iy = itor-1 + if (nl_idx==1) then + fA_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp) + else + fB_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp) + endif enddo + enddo enddo + + ! 2. Poisson bracket in real space with Non_Zonal pairs + + ! remove ky=0 in uxmany + y_mean = sum(uxmany(:,:,f_j) ,dim=1 ) / ny + do iy=0,ny-1 + uxmany(iy,:,f_j) = uxmany(iy,:,f_j) -y_mean + + end do + + ! remove ky=0 in uymany + y_mean = sum(uymany(:,:,f_j) ,dim=1 ) / ny ! =0 + do iy=0,ny-1 + uymany(iy,:,f_j) = uymany(iy,:,f_j) -y_mean + + end do + + ! remove ky=0 in vx + y_mean = sum(vxmany(:,:,g_j) ,dim=1 ) / ny + do iy=0,ny-1 + vxmany(iy,:,g_j) = vxmany(iy,:,g_j) -y_mean + + end do + + ! remove ky=0 in vy + y_mean = sum(vymany(:,:,g_j) ,dim=1 ) / ny ! =0 + do iy=0,ny-1 + vymany(iy,:,g_j) = vymany(iy,:,g_j) -y_mean + + end do + + uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny) + call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp)) do itm=1,n_toroidal_procs @@ -1636,7 +1403,7 @@ subroutine cgyro_nl_fftw_stepr_triad(g_j, f_j, nl_idx, i_omp) end subroutine cgyro_nl_fftw_stepr_triad ! NOTE: call cgyro_nl_fftw_comm1 before cgyro_nl_fftw -subroutine cgyro_nl_fftw(i_triad) +subroutine cgyro_nl_fftw(ij) use timer_lib use parallel_lib @@ -1647,9 +1414,9 @@ subroutine cgyro_nl_fftw(i_triad) !----------------------------------- - integer, intent(in) :: i_triad + integer, intent(in) :: ij !----------------------------------- - integer :: ix,iy + integer :: ix,iy,i_triad=0 integer :: ir,it,itm,itl,it_loc integer :: itor,mytm integer :: j,p @@ -1662,6 +1429,10 @@ subroutine cgyro_nl_fftw(i_triad) include 'fftw3.f03' + if (triad_print_flag == 1 .and. ij == 3) then + i_triad=1 + endif + call cgyro_nl_fftw_comm_test() ! time to wait for the FA_nl to become avaialble @@ -1803,14 +1574,10 @@ subroutine cgyro_nl_fftw(i_triad) endif if (j<=nsplitA) then - call cgyro_nl_fftw_stepr(j, j, 1, i_omp) - if (i_omp==1) then - ! use the main thread to progress the async MPI - call cgyro_nl_fftw_comm_test() - endif - if (i_triad == 1) then call cgyro_nl_fftw_stepr_triad(j, j, 1, i_omp) + else + call cgyro_nl_fftw_stepr(j, j, 1, i_omp) endif endif ! else we will do it in the next loop @@ -1893,14 +1660,10 @@ subroutine cgyro_nl_fftw(i_triad) call cgyro_nl_fftw_comm_test() endif - call cgyro_nl_fftw_stepr(nsplitA+j, j, 2, i_omp) - if (i_omp==1) then - ! use the main thread to progress the async MPI - call cgyro_nl_fftw_comm_test() - endif - if (i_triad == 1) then call cgyro_nl_fftw_stepr_triad(nsplitA+j, j, 2, i_omp) + else + call cgyro_nl_fftw_stepr(nsplitA+j, j, 2, i_omp) endif enddo ! j diff --git a/cgyro/src/cgyro_rhs.F90 b/cgyro/src/cgyro_rhs.F90 index b566b4328..b83770355 100644 --- a/cgyro/src/cgyro_rhs.F90 +++ b/cgyro/src/cgyro_rhs.F90 @@ -393,7 +393,6 @@ subroutine cgyro_rhs(ij,update_cap) integer, intent(in) :: ij logical, intent(in) :: update_cap !-------------------------------- - integer :: i_triad ! fields is ready by now call cgyro_rhs_comm_async_fd @@ -413,13 +412,9 @@ subroutine cgyro_rhs(ij,update_cap) ! Nonlinear evaluation [f,g] if (nonlinear_flag == 1) then - i_triad=0 - if (triad_print_flag == 1 .and. ij == 3) then - i_triad=1 - endif ! assumes someone already started the input comm ! and will finish the output comm - call cgyro_nl_fftw(i_triad) + call cgyro_nl_fftw(ij) endif call cgyro_upwind_complete diff --git a/cgyro/src/cgyro_step_collision.F90 b/cgyro/src/cgyro_step_collision.F90 index eb97d6207..c2297993c 100644 --- a/cgyro/src/cgyro_step_collision.F90 +++ b/cgyro/src/cgyro_step_collision.F90 @@ -1049,38 +1049,6 @@ subroutine cgyro_step_collision_gpu(use_simple) call timer_lib_in('coll') ! Compute H given h and [phi(h), apar(h)] - if (triad_print_flag == 1 ) then - -#if defined(OMPGPU) -!$omp target teams distribute parallel do simd collapse(3) & -!$omp& private(iv_loc,is,my_psi,my_ch) -#else -!$acc parallel loop collapse(3) gang vector private(iv_loc,is,my_psi,my_ch) & -!$acc& present(is_v,cap_h_c,cap_h_ct,cap_h_c,jvec_c,field,z,temp,h_x) & -!$acc& present(nt1,nt2,nv1,nv2,nc) default(none) -#endif - do itor=nt1,nt2 - do iv=nv1,nv2 - do ic=1,nc - iv_loc = iv-nv1+1 - is = is_v(iv) - - ! Save collisional diss. - my_ch = cap_h_ct(iv_loc,itor,ic) - my_psi = sum(jvec_c(:,ic,iv_loc,itor)*field(:,ic,itor)) - - my_psi = my_ch-my_psi*(z(is)/temp(is)) - cap_h_ct(iv_loc,itor,ic) = (cap_h_c(ic,iv_loc,itor) + my_ch) / 2.0 - cap_h_ct(iv_loc,itor,ic) = conjg(cap_h_ct(iv_loc,itor,ic)) & - * ( my_psi - h_x(ic,iv_loc,itor) ) - - h_x(ic,iv_loc,itor) = my_psi - cap_h_c(ic,iv_loc,itor) = my_ch - enddo - enddo - enddo - - else #if defined(OMPGPU) !$omp target teams distribute parallel do simd collapse(3) & @@ -1103,8 +1071,6 @@ subroutine cgyro_step_collision_gpu(use_simple) enddo enddo - end if ! (triad_print_flag == 1 ) - call timer_lib_out('coll') end subroutine cgyro_step_collision_gpu