diff --git a/cgyro/bin/cgyro_parse.py b/cgyro/bin/cgyro_parse.py
index d2f613d11..7f3c86c29 100644
--- a/cgyro/bin/cgyro_parse.py
+++ b/cgyro/bin/cgyro_parse.py
@@ -67,6 +67,7 @@
 x.add('MOMENT_PRINT_FLAG','0')
 x.add('GFLUX_PRINT_FLAG','0')
 x.add('FIELD_PRINT_FLAG','0')
+x.add('TRIAD_PRINT_FLAG','0')
 x.add('AMP0','0.0')
 x.add('AMP','0.1')
 x.add('GAMMA_E','0.0')
diff --git a/cgyro/bin/cgyro_plot b/cgyro/bin/cgyro_plot
index 97cd5d995..102ac03db 100755
--- a/cgyro/bin/cgyro_plot
+++ b/cgyro/bin/cgyro_plot
@@ -45,6 +45,8 @@
 then
    echo " = flux      (time-trace of fluxes for all species)"
    echo " = ky_flux   (bar plot of flux versus ky)"
    echo " = phi       (time-dependent ZF vs. finite-n intensities)"
+   echo " = triad     (triad energy transfer for all ky)"
+   echo " = triad_v2  (free energy balance of ky=0)"
    echo " = ky_phi    (time-dependent ky-phi intensities)"
    echo " = kx_phi    (average kx-phi intensities)"
    echo " = kx_shift  (kx-phi intensities in both domains)"
@@ -411,7 +413,7 @@
 PYROOT=$GACODE_ROOT/f2py/pygacode/cgyro
 PLT="-m pygacode.cgyro.data_plot_single $TEX $FONTSIZE"
-PFLAGS="$PLOT_TYPE $LX $LY $W $NORM $EXT $TIME $FIELD $MOMENT $TMAX $THETA $YMIN $YMAX $KXMIN $KXMAX $N $ABS $FC $LOC $NSCALE $CFLUX $SPECIES $BAR $IE $MESH"
+PFLAGS="$PLOT_TYPE $LX $LY $W $NORM $EXT $TIME $FIELD $MOMENT $TMAX $THETA $YMIN $YMAX $KXMIN $KXMAX $N $ABS $FC $LOC $NSCALE $CFLUX $SPECIES $BAR $IE $MESH $DN"
 case "$VIS_TYPE" in
@@ -443,7 +445,7 @@
 case "$PLOT_TYPE" in
 template)
    cp $PYROOT/plot_template.py .
    ;;
-freq | ky_freq | error | geo | ball | ky_phi | phi | flux | rcorr_phi | low | zf | ky_flux | shift | corrug | kx_phi | kxky_phi | kx_shift | poly_phi | xflux | hb | hbcut | hball )
+freq | ky_freq | error | geo | ball | ky_phi | phi | triad | triad_v2 | flux | rcorr_phi | low | zf | ky_flux | shift | corrug | kx_phi | kxky_phi | kx_shift | poly_phi | xflux | hb | hbcut | hball )
    python $PLT $PFLAGS
    ;;
 fluct)
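Usage note for the two plot types added above: once a run has been produced with TRIAD_PRINT_FLAG=1 in input.cgyro, the new diagnostics should be reachable through the usual interface, e.g.

   cgyro_plot -plot triad -e <simdir>
   cgyro_plot -plot triad_v2 -e <simdir>

(a sketch following the cgyro_plot conventions shown elsewhere in this patch; <simdir> stands for the simulation directory).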
diff --git a/cgyro/src/cgyro_flux.f90 b/cgyro/src/cgyro_flux.f90
index 52599e119..b5c8be3df 100644
--- a/cgyro/src/cgyro_flux.f90
+++ b/cgyro/src/cgyro_flux.f90
@@ -44,6 +44,7 @@ subroutine cgyro_flux
   complex :: cprod
   real, parameter :: x_fraction=0.2
   real :: u
+  real :: kx
 !$omp parallel do private(iv_loc,iv,is,ix,ie,dv,vpar,ic,ir,it,erot,cprod,cn) &
 !$omp& private(prod1,prod2,prod3,l,icl,dvr,u,flux_norm) &
@@ -188,6 +189,34 @@ subroutine cgyro_flux
            cflux_loc(:,:,:,itor) = cflux_loc(:,:,:,itor)+2*sin(u)*real(gflux_loc(l,:,:,:,itor))/u
         enddo
+
+        !-------------------------------------------------------------
+        ! 2-1. Compute triad energy transfer
+        !-------------------------------------------------------------
+
+        kx = 2*pi*rho/length
+        do is=1,n_species
+           ! Triad energy transfer : T_k
+           triad_loc_old(is,:,itor,1) = triad_loc(is,:,itor,1)*temp(is)/dlntdr(is_ele)
+           ! Nonzonal triad energy transfer : T_k [NZ(k',k") -> k]
+           triad_loc_old(is,:,itor,2) = triad_loc(is,:,itor,2)*temp(is)/dlntdr(is_ele)
+           ! dEntropy/dt : dS_k/dt
+           triad_loc_old(is,:,itor,3) = (triad_loc(is,:,itor,3)-triad_loc_old(is,:,itor,3))/delta_t
+           triad_loc_old(is,:,itor,3) = triad_loc_old(is,:,itor,3)*temp(is)/dlntdr(is_ele)*0.5
+           ! dWk_perp/dt (ky = 0)
+           triad_loc_old(is,:,itor,4) = (triad_loc(is,:,itor,4)-triad_loc_old(is,:,itor,4))/delta_t
+           triad_loc_old(is,:,itor,4) = triad_loc_old(is,:,itor,4)*(kx*lambda_star)**2/dlntdr(is_ele)*0.5
+           ! Entropy : S_k
+           triad_loc_old(is,:,itor,5) = triad_loc(is,:,itor,3)*temp(is)/dlntdr(is_ele)*0.5
+           ! Diss. (radial)
+           triad_loc_old(is,:,itor,6) = triad_loc(is,:,itor,5)*temp(is)/dlntdr(is_ele)
+           ! Diss. (theta)
+           triad_loc_old(is,:,itor,7) = triad_loc(is,:,itor,6)*temp(is)/dlntdr(is_ele)
+           ! Diss. (coll.)
+           triad_loc_old(is,:,itor,8) = triad_loc(is,:,itor,7)*temp(is)/dlntdr(is_ele)
+        enddo
+
      !-----------------------------------------------------
      ! 3. Renormalize fluxes to GB or quasilinear forms
      !~----------------------------------------------------
@@ -216,7 +245,7 @@ subroutine cgyro_flux
         ! GyroBohm normalizations
         gflux_loc(:,:,:,:,itor) = gflux_loc(:,:,:,:,itor)/rho**2
         cflux_loc(:,:,:,itor) = cflux_loc(:,:,:,itor)/rho**2
-
+        triad_loc_old(:,:,itor,:) = triad_loc_old(:,:,itor,:)/rho**2
      endif
   enddo
@@ -240,6 +269,15 @@ subroutine cgyro_flux
        NEW_COMM_1, &
        i_err)
+
+  ! Reduced complex triad(ns,kx), below, is still distributed over n
+  call MPI_ALLREDUCE(triad_loc_old(:,:,:,:), &
+       triad, &
+       size(triad), &
+       MPI_DOUBLE_COMPLEX, &
+       MPI_SUM, &
+       NEW_COMM_1, &
+       i_err)
+
   ! Reduced real cflux(ky), below, is still distributed over n
   call MPI_ALLREDUCE(cflux_loc, &
diff --git a/cgyro/src/cgyro_globals.F90 b/cgyro/src/cgyro_globals.F90
index 61ec763d2..ea0c88f50 100644
--- a/cgyro/src/cgyro_globals.F90
+++ b/cgyro/src/cgyro_globals.F90
@@ -80,6 +80,7 @@ module cgyro_globals
   integer :: moment_print_flag
   integer :: gflux_print_flag
   integer :: field_print_flag
+  integer :: triad_print_flag
   real :: amp0
   real :: amp
   real :: gamma_e
@@ -192,7 +193,7 @@ module cgyro_globals
   integer :: nsplit,nsplitA,nsplitB
   integer :: ns1,ns2
   integer, dimension(:), allocatable :: recv_status
-  integer :: fA_req, fB_req, g_req
+  integer :: fA_req, fB_req, eA_req, eB_req, g_req
   logical :: fA_req_valid, fB_req_valid, g_req_valid
  !
Thetas present in the process after NL AllToAll integer :: n_jtheta @@ -243,6 +244,7 @@ module cgyro_globals character(len=12) :: binfile_hb = 'bin.cgyro.hb' character(len=17) :: binfile_ky_flux = 'bin.cgyro.ky_flux' character(len=18) :: binfile_ky_cflux = 'bin.cgyro.ky_cflux' + character(len=15) :: binfile_triad = 'bin.cgyro.triad' character(len=15), dimension(3) :: binfile_fieldb = & (/'bin.cgyro.phib ','bin.cgyro.aparb','bin.cgyro.bparb'/) character(len=16), dimension(3) :: binfile_kxky = & @@ -344,11 +346,14 @@ module cgyro_globals complex, dimension(:,:,:), allocatable :: h0_x complex, dimension(:,:,:), allocatable :: h0_old complex, dimension(:,:,:,:), allocatable :: fA_nl,fB_nl + complex, dimension(:,:,:,:), allocatable :: eA_nl,eB_nl complex, dimension(:,:,:,:), allocatable :: g_nl complex, dimension(:,:,:), allocatable :: fpackA,fpackB + complex, dimension(:,:,:), allocatable :: epackA,epackB complex, dimension(:,:,:,:), allocatable :: gpack complex, dimension(:,:,:), allocatable :: omega_cap_h complex, dimension(:,:,:), allocatable :: omega_h + complex, dimension(:,:,:), allocatable :: diss_r complex, dimension(:,:,:,:), allocatable :: omega_s,omega_ss complex, dimension(:,:,:), allocatable :: omega_sbeta complex, dimension(:,:,:), allocatable :: cap_h_c @@ -382,6 +387,7 @@ module cgyro_globals real, dimension(:,:,:,:), allocatable :: cflux complex, dimension(:,:,:,:,:), allocatable :: gflux_loc complex, dimension(:,:,:,:,:), allocatable :: gflux + complex, dimension(:,:,:,:), allocatable :: triad,triad_loc,triad_loc_old real, dimension(:,:), allocatable :: cflux_tave, gflux_tave real :: tave_min, tave_max integer :: tave_step diff --git a/cgyro/src/cgyro_init_arrays.F90 b/cgyro/src/cgyro_init_arrays.F90 index 1377dd3fb..acd4d99e0 100644 --- a/cgyro/src/cgyro_init_arrays.F90 +++ b/cgyro/src/cgyro_init_arrays.F90 @@ -447,7 +447,14 @@ subroutine cgyro_init_arrays + abs(omega_cdrift_r(it,is)*xi(ix))*vel(ie) & + abs(omega_rot_drift_r(it,is)) & + abs(omega_rot_edrift_r(it))) - + + ! (d/dr) upwind diss. + diss_r(ic,iv_loc,itor) = - (n_radial/length)*spectraldiss(u,nup_radial)*up_radial & + * (abs(omega_rdrift(it,is))*energy(ie)*(1.0+xi(ix)**2) & + + abs(omega_cdrift_r(it,is)*xi(ix))*vel(ie) & + + abs(omega_rot_drift_r(it,is)) & + + abs(omega_rot_edrift_r(it))) + ! omega_star carg = & -i_c*k_theta_base*itor*rho*(dlnndr(is)+dlntdr(is)*(energy(ie)-1.5)) & diff --git a/cgyro/src/cgyro_init_manager.F90 b/cgyro/src/cgyro_init_manager.F90 index 956fb3372..8a2644e75 100644 --- a/cgyro/src/cgyro_init_manager.F90 +++ b/cgyro/src/cgyro_init_manager.F90 @@ -178,6 +178,11 @@ subroutine cgyro_init_manager allocate(cflux_loc(n_species,4,n_field,nt1:nt2)) allocate( gflux(0:n_global,n_species,4,n_field,nt1:nt2)) allocate(gflux_loc(0:n_global,n_species,4,n_field,nt1:nt2)) + + allocate( triad(n_species,n_radial,nt1:nt2,8)) + allocate(triad_loc(n_species,n_radial,nt1:nt2,7)) + allocate(triad_loc_old(n_species,n_radial,nt1:nt2,8)) + allocate(cflux_tave(n_species,4)) allocate(gflux_tave(n_species,4)) @@ -257,6 +262,7 @@ subroutine cgyro_init_manager allocate(cap_h_v(nc_loc,nt1:nt2,nv)) allocate(omega_cap_h(nc,nv_loc,nt1:nt2)) allocate(omega_h(nc,nv_loc,nt1:nt2)) + allocate(diss_r(nc,nv_loc,nt1:nt2)) allocate(omega_s(n_field,nc,nv_loc,nt1:nt2)) allocate(omega_ss(n_field,nc,nv_loc,nt1:nt2)) allocate(omega_sbeta(nc,nv_loc,nt1:nt2)) @@ -296,8 +302,10 @@ subroutine cgyro_init_manager ! 
Nonlinear arrays if (nonlinear_flag == 1) then allocate(fA_nl(n_radial,nt_loc,nsplitA,n_toroidal_procs)) + allocate(eA_nl(n_radial,nt_loc,nsplitA,n_toroidal_procs)) allocate(g_nl(n_field,n_radial,n_jtheta,n_toroidal)) allocate(fpackA(n_radial,nt_loc,nsplitA*n_toroidal_procs)) + allocate(epackA(n_radial,nt_loc,nsplitA*n_toroidal_procs)) allocate(gpack(n_field,n_radial,n_jtheta,n_toroidal)) allocate(jvec_c_nl(n_field,n_radial,n_jtheta,nv_loc,n_toroidal)) #if defined(OMPGPU) @@ -308,6 +316,8 @@ subroutine cgyro_init_manager if (nsplitB > 0) then ! nsplitB can be zero at large MPI allocate(fB_nl(n_radial,nt_loc,nsplitB,n_toroidal_procs)) allocate(fpackB(n_radial,nt_loc,nsplitB*n_toroidal_procs)) + allocate(eB_nl(n_radial,nt_loc,nsplitB,n_toroidal_procs)) + allocate(epackB(n_radial,nt_loc,nsplitB*n_toroidal_procs)) #if defined(OMPGPU) !$omp target enter data map(alloc:fpackB,fB_nl) #elif defined(_OPENACC) diff --git a/cgyro/src/cgyro_nl_comm.F90 b/cgyro/src/cgyro_nl_comm.F90 index 674d75e94..6392d334d 100644 --- a/cgyro/src/cgyro_nl_comm.F90 +++ b/cgyro/src/cgyro_nl_comm.F90 @@ -304,6 +304,316 @@ subroutine cgyro_nl_fftw_comm1_r(ij) end subroutine cgyro_nl_fftw_comm1_r +subroutine cgyro_nl_fftw_comm1_r_triad(ij) + use timer_lib + use parallel_lib + use cgyro_globals + + implicit none + + !----------------------------------- + integer, intent(in) :: ij + !----------------------------------- + + integer :: is,ix,ie + integer :: id,itd,itd_class,jr0(0:2),itorbox,jc + integer :: ir,it,iv_loc_m,ic_loc_m,itor + integer :: iexch0,itor0,isplit0,iexch_base + complex :: my_psi + real :: psi_mul + + real :: dv,dvr,dvp,rval,rval2 + complex :: cprod,cprod2,thfac + + triad_loc_old(:,:,:,3)=triad_loc(:,:,:,3) + triad_loc_old(:,:,:,4)=triad_loc(:,:,:,4) + triad_loc(:,:,:,:)=0.0 + + call timer_lib_in('nl_comm') + call parallel_slib_r_nc_wait(nsplitA,fA_nl,fpackA,fA_req) + call parallel_slib_r_nc_wait(nsplitA,eA_nl,epackA,eA_req) + fA_req_valid = .FALSE. + if (nsplitB > 0) then + ! no major compute to overlap + call parallel_slib_r_nc_wait(nsplitB,fB_nl,fpackB,fB_req) + call parallel_slib_r_nc_wait(nsplitB,eB_nl,epackB,eB_req) + fB_req_valid = .FALSE. + endif + call timer_lib_out('nl_comm') + + call timer_lib_in('nl') + + psi_mul = (q*rho/rmin)*(2*pi/length) + + if (nsplitB > 0) then + +#if defined(OMPGPU) +!$omp target teams distribute parallel do simd collapse(4) & +!$omp& private(iexch0,itor0,isplit0,iexch_base) & +!$omp& private(ic_loc_m,my_psi) +#elif defined(_OPENACC) +!$acc parallel loop collapse(4) gang vector independent private(ic_loc_m,my_psi) & +!$acc& private(iexch0,itor0,isplit0,iexch_base) & +!$acc& present(ic_c,px,rhs,fpackA,fpackB) copyin(psi_mul,zf_scale) & +!$acc& present(nt1,nt2,nv_loc,n_theta,n_radial,nsplit,nsplitA,nsplitB) copyin(ij) default(none) +#else +!$omp parallel do collapse(2) private(ic_loc_m,my_psi) & +!$omp& private(iexch0,itor0,isplit0,iexch_base,is,ix,ie,dv,dvr,dvp,rval,rval2,cprod,cprod2) & +!$omp& private(id,itorbox,jr0,jc,itd,itd_class,thfac) +#endif + do itor=nt1,nt2 + do iv_loc_m=1,nv_loc + do ir=1,n_radial + itorbox = itor*box_size*sign_qs + jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial) + jr0(1) = n_theta*(ir-1) + jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial) + + do it=1,n_theta + ic_loc_m = ic_c(ir,it) + + is = is_v(iv_loc_m +nv1 -1 ) + ix = ix_v(iv_loc_m +nv1 -1 ) + ie = ie_v(iv_loc_m +nv1 -1 ) + dv = w_exi(ie,ix) + dvr = w_theta(it)*dens2_rot(it,is)*dv + dvp = w_theta(it)*dv*(ir-1-nx0/2)**2 + + ! 
Density moment + cprod = w_theta(it)*cap_h_c(ic_loc_m,iv_loc_m,itor)*dvjvec_c(1,ic_loc_m,iv_loc_m,itor)/z(is) + cprod = -(dvr*z(is)/temp(is)*field(1,ic_loc_m,itor)-cprod) + cprod2= ( jvec_c(1,ic_loc_m,iv_loc_m,itor)*z(is)/temp(is) )*conjg(field(1,ic_loc_m,itor) ) + + iexch0 = (iv_loc_m-1) + (it-1)*nv_loc + itor0 = iexch0/nsplit + isplit0 = modulo(iexch0,nsplit) + if (isplit0 < nsplitA) then + iexch_base = 1+itor0*nsplitA + my_psi = fpackA(ir,itor-nt1+1,iexch_base+isplit0) + + ! 1. Triad energy transfer (all) + triad_loc(is,ir,itor,1) = triad_loc(is,ir,itor,1) & + + fpackA(ir,itor-nt1+1,iexch_base+isplit0)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul + ! 2. Triad energy transfer ( {NZF-NZF} coupling ) + if (itor == 0) then + ! New : Diag. direct ZF production [A. Ishizawa PRL 2019 ] + triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2) & + + epackA(ir,itor-nt1+1,iexch_base+isplit0)*cprod2*dvr*psi_mul + else + triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2) & + + epackA(ir,itor-nt1+1,iexch_base+isplit0)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul + endif + else + iexch_base = 1+itor0*nsplitB + my_psi = fpackB(ir,itor-nt1+1,iexch_base+(isplit0-nsplitA)) + + ! 1. Triad energy transfer (all) + triad_loc(is,ir,itor,1) = triad_loc(is,ir,itor,1) & + + fpackB(ir,itor-nt1+1,iexch_base+(isplit0-nsplitA))*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul + ! 2. Triad energy transfer ( {NZF-NZF} coupling ) + if (itor == 0) then + ! New : Diag. direct ZF production [A. Ishizawa PRL 2019 ] + triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2) & + + epackB(ir,itor-nt1+1,iexch_base+(isplit0-nsplitA))*cprod2*dvr*psi_mul + else + triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2) & + + epackB(ir,itor-nt1+1,iexch_base+(isplit0-nsplitA))*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul + endif + endif + + ! 3. Entropy + triad_loc(is,ir,itor,3) = triad_loc(is,ir,itor,3) & + + cap_h_c(ic_loc_m,iv_loc_m,itor)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr & + - field(1,ic_loc_m,itor)*conjg(field(1,ic_loc_m,itor))*(z(is)/temp(is))**2*dvr & + - 2.0*cprod*conjg(field(1,ic_loc_m,itor))*(z(is)/temp(is)) + ! 4. Field potential + triad_loc(is,ir,itor,4) = triad_loc(is,ir,itor,4) & + + sum( field(:,ic_loc_m,itor)*conjg(field(:,ic_loc_m,itor)) )*dvp + ! 5. Diss. (radial) + triad_loc(is,ir,itor,5) = triad_loc(is,ir,itor,5) & + + diss_r(ic_loc_m,iv_loc_m,itor)*h_x(ic_loc_m,iv_loc_m,itor)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr + ! 6. Diss. (theta ) + rval = omega_stream(it,is,itor)*vel(ie)*xi(ix) + rval2 = abs(omega_stream(it,is,itor)) + cprod = 0.0 + cprod2= 0.0 + + !icd_c(ic, id, itor) = ic_c(jr,modulo(it+id-1,n_theta)+1) + !jc = icd_c(ic, id, itor) + !dtheta(ic, id, itor) := cderiv(id)*thfac + !dtheta_up(ic, id, itor) := uderiv(id)*thfac*up_theta + itd = n_theta+it-nup_theta + itd_class = 0 + jc = jr0(itd_class)+itd + thfac = thfac_itor(itd_class,itor) + + do id=-nup_theta,nup_theta + if (itd > n_theta) then + ! move to next itd_class of compute + itd = itd - n_theta + itd_class = itd_class + 1 + jc = jr0(itd_class)+itd + thfac = thfac_itor(itd_class,itor) + endif + + cprod2 = cprod2 - rval* thfac*cderiv(id) *cap_h_c(jc,iv_loc_m,itor) + cprod = cprod - rval2* uderiv(id)*up_theta *g_x(jc,iv_loc_m,itor) + itd = itd + 1 + jc = jc + 1 + enddo + + triad_loc(is,ir,itor,6) = triad_loc(is,ir,itor,6) + cprod*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr + ! 7. Diss. (Coll. 
= Implicit advance - theta_streaming ) + triad_loc(is,ir,itor,7) = triad_loc(is,ir,itor,7) & + + ( cap_h_ct(iv_loc_m,itor,ic_loc_m)/delta_t + cprod2*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor)) )*dvr + + + if ( (itor == 0) .and. (ir == 1 .or. px(ir) == 0) ) then + ! filter + my_psi = (0.0,0.0) + endif + + if (itor == 0) then + my_psi = my_psi*zf_scale + endif + + ! RHS -> -[f,g] = [f,g]_{r,-alpha} + rhs(ic_loc_m,iv_loc_m,itor,ij) = rhs(ic_loc_m,iv_loc_m,itor,ij)+psi_mul*my_psi + enddo + enddo + enddo + enddo + + else ! nsplitB==0 + +#if defined(OMPGPU) +!$omp target teams distribute parallel do simd collapse(4) & +!$omp& private(iexch0,itor0,isplit0,iexch_base) & +!$omp& private(ic_loc_m,my_psi) +#elif defined(_OPENACC) +!$acc parallel loop collapse(4) gang vector independent private(ic_loc_m,my_psi) & +!$acc& private(iexch0,itor0,isplit0,iexch_base) & +!$acc& present(ic_c,px,rhs,fpackA) copyin(psi_mul,zf_scale) & +!$acc& present(nt1,nt2,nv_loc,n_theta,n_radial,nsplit,nsplitA) copyin(ij) default(none) +#else +!$omp parallel do collapse(2) private(ic_loc_m,my_psi) & +!$omp& private(iexch0,itor0,isplit0,iexch_base,is,ix,ie,dv,dvr,dvp,rval,rval2,cprod,cprod2) & +!$omp& private(id,itorbox,jr0,jc,itd,itd_class,thfac) +#endif + do itor=nt1,nt2 + do iv_loc_m=1,nv_loc + do ir=1,n_radial + itorbox = itor*box_size*sign_qs + jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial) + jr0(1) = n_theta*(ir-1) + jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial) + + do it=1,n_theta + ic_loc_m = ic_c(ir,it) + + is = is_v(iv_loc_m +nv1 -1 ) + ix = ix_v(iv_loc_m +nv1 -1 ) + ie = ie_v(iv_loc_m +nv1 -1 ) + dv = w_exi(ie,ix) + dvr = w_theta(it)*dens2_rot(it,is)*dv + dvp = w_theta(it)*dv*(ir-1-nx0/2)**2 + + ! Density moment + cprod = w_theta(it)*cap_h_c(ic_loc_m,iv_loc_m,itor)*dvjvec_c(1,ic_loc_m,iv_loc_m,itor)/z(is) + cprod = -(dvr*z(is)/temp(is)*field(1,ic_loc_m,itor)-cprod) + cprod2= ( jvec_c(1,ic_loc_m,iv_loc_m,itor)*z(is)/temp(is) )*conjg(field(1,ic_loc_m,itor) ) + + + iexch0 = (iv_loc_m-1) + (it-1)*nv_loc + itor0 = iexch0/nsplit + isplit0 = modulo(iexch0,nsplit) + iexch_base = 1+itor0*nsplitA + my_psi = fpackA(ir,itor-nt1+1,iexch_base+isplit0) + + + ! 1. Triad energy transfer (all) + triad_loc(is,ir,itor,1) = triad_loc(is,ir,itor,1) & + + fpackA(ir,itor-nt1+1,iexch_base+isplit0)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul + ! 2. Triad energy transfer ( {NZF-NZF} coupling ) + if (itor == 0) then + ! New : Diag. direct ZF production [A. Ishizawa PRL 2019 ] + triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2) & + + epackA(ir,itor-nt1+1,iexch_base+isplit0)*cprod2*dvr*psi_mul + else + triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2) & + + epackA(ir,itor-nt1+1,iexch_base+isplit0)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul + endif + + ! 3. Entropy + triad_loc(is,ir,itor,3) = triad_loc(is,ir,itor,3) & + + cap_h_c(ic_loc_m,iv_loc_m,itor)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr & + - field(1,ic_loc_m,itor)*conjg(field(1,ic_loc_m,itor))*(z(is)/temp(is))**2*dvr & + - 2.0*cprod*conjg(field(1,ic_loc_m,itor))*(z(is)/temp(is)) + ! 4. Field potential + triad_loc(is,ir,itor,4) = triad_loc(is,ir,itor,4) & + + sum( field(:,ic_loc_m,itor)*conjg(field(:,ic_loc_m,itor)) )*dvp + ! 5. Diss. (radial) + triad_loc(is,ir,itor,5) = triad_loc(is,ir,itor,5) & + + diss_r(ic_loc_m,iv_loc_m,itor)*h_x(ic_loc_m,iv_loc_m,itor)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr + ! 6. Diss. 
(theta ) + rval = omega_stream(it,is,itor)*vel(ie)*xi(ix) + rval2 = abs(omega_stream(it,is,itor)) + cprod = 0.0 + cprod2= 0.0 + + !icd_c(ic, id, itor) = ic_c(jr,modulo(it+id-1,n_theta)+1) + !jc = icd_c(ic, id, itor) + !dtheta(ic, id, itor) := cderiv(id)*thfac + !dtheta_up(ic, id, itor) := uderiv(id)*thfac*up_theta + itd = n_theta+it-nup_theta + itd_class = 0 + jc = jr0(itd_class)+itd + thfac = thfac_itor(itd_class,itor) + + do id=-nup_theta,nup_theta + if (itd > n_theta) then + ! move to next itd_class of compute + itd = itd - n_theta + itd_class = itd_class + 1 + jc = jr0(itd_class)+itd + thfac = thfac_itor(itd_class,itor) + endif + + cprod2 = cprod2 - rval* thfac*cderiv(id) *cap_h_c(jc,iv_loc_m,itor) + cprod = cprod - rval2* uderiv(id)*up_theta *g_x(jc,iv_loc_m,itor) + itd = itd + 1 + jc = jc + 1 + enddo + + triad_loc(is,ir,itor,6) = triad_loc(is,ir,itor,6) + cprod*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr + ! 7. Diss. (Coll. = Implicit advance - theta_streaming ) + triad_loc(is,ir,itor,7) = triad_loc(is,ir,itor,7) & + + ( cap_h_ct(iv_loc_m,itor,ic_loc_m)/delta_t + cprod2*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor)) )*dvr + + + if ( (itor == 0) .and. (ir == 1 .or. px(ir) == 0) ) then + ! filter + my_psi = (0.0,0.0) + endif + + if (itor == 0) then + my_psi = my_psi*zf_scale + endif + + ! RHS -> -[f,g] = [f,g]_{r,-alpha} + rhs(ic_loc_m,iv_loc_m,itor,ij) = rhs(ic_loc_m,iv_loc_m,itor,ij)+psi_mul*my_psi + enddo + enddo + enddo + enddo + + endif ! if nsplitB>0 + + call timer_lib_out('nl') + +end subroutine cgyro_nl_fftw_comm1_r_triad + + ! ! Comm2 is a transpose ! Reminder: nc ~= n_radial*n_theta diff --git a/cgyro/src/cgyro_nl_fftw.F90 b/cgyro/src/cgyro_nl_fftw.F90 index 4c5407d85..d56143bbf 100644 --- a/cgyro/src/cgyro_nl_fftw.F90 +++ b/cgyro/src/cgyro_nl_fftw.F90 @@ -1,5 +1,5 @@ !----------------------------------------------------------------- -! cgyro_nl_fftw.F90 +! cgyro_nl_fftw.f90 ! ! PURPOSE: ! Evaluate nonlinear bracket with dealiased FFT. It is natural @@ -10,460 +10,183 @@ ! NOTE: Need to be careful with (p=-nr/2,n=0) component. !----------------------------------------------------------------- -#if defined(_OPENACC) || defined(OMPGPU) -#define CGYRO_GPU_FFT -#endif - -!----------------------------------------------------------------- -! cgyro_nl_fftw_init -!----------------------------------------------------------------- +subroutine cgyro_nl_fftw_init + use cgyro_globals + implicit none + include 'fftw3.f03' -#ifdef CGYRO_GPU_FFT + ! Create plans once and for all, with global arrays fx,ux + plan_c2r = fftw_plan_dft_c2r_2d(nx,ny,fx(:,:,1),uxmany(:,:,1),FFTW_PATIENT) + plan_r2c = fftw_plan_dft_r2c_2d(nx,ny,uv(:,:,1),fx(:,:,1),FFTW_PATIENT) -#if defined(MKLGPU) -include 'fftw/offload/fftw3_omp_offload.f90' -#endif +end subroutine cgyro_nl_fftw_init -subroutine cgyro_nl_fftw_init +subroutine cgyro_nl_fftw_stepr(g_j, f_j, nl_idx, i_omp) -#if defined(HIPGPU) - use hipfort_hipfft -#elif defined(MKLGPU) - use fftw3_omp_offload -#else - use cufft -#endif + use timer_lib + use parallel_lib use cgyro_globals implicit none -#if defined(MKLGPU) - include 'fftw/fftw3.f' -#endif - - integer :: howmany,istatus - integer, parameter :: irank = 2 - integer, dimension(irank) :: ndim,inembed,onembed - integer :: idist,odist - integer, parameter :: istride = 1 - integer, parameter :: ostride = 1 - -#if !defined(MKLGPU) - integer, parameter :: singlePrecision = selected_real_kind(6,30) -#endif - - !------------------------------------------------------------------- - ! 2D - ! 
input[ b*idist + (x * inembed[1] + y)*istride ] - ! output[ b*odist + (x * onembed[1] + y)*ostride ] - ! isign is the sign of the exponent in the formula that defines - ! Fourier transform -1 == FFTW_FORWARD - ! 1 == FFTW_BACKWARD - !------------------------------------------------------------------- - -#if defined(MKLGPU) - ! oneMKL offload uses the reverse ordering - ndim(2) = nx - ndim(1) = ny -#else - ndim(1) = nx - ndim(2) = ny -#endif - idist = size(fxmany,1)*size(fxmany,2) - odist = size(uxmany,1)*size(uxmany,2) - inembed = size(fxmany,1) - onembed = size(uxmany,1) -#if defined(MKLGPU) - inembed(2) = size(fxmany,2) - onembed(2) = size(uxmany,2) -#endif - -#if defined(HIPGPU) - hip_plan_c2r_manyA = c_null_ptr - istatus = hipfftPlanMany(& - hip_plan_c2r_manyA, & - irank, & - ndim, & - inembed, & - istride, & - idist, & - onembed, & - ostride, & - odist, & - merge(HIPFFT_C2R,HIPFFT_Z2D,kind(uxmany) == singlePrecision), & - nsplitA) - - if (nsplitB > 0) then ! no fft if nsplitB==0 - hip_plan_c2r_manyB = c_null_ptr - istatus = hipfftPlanMany(& - hip_plan_c2r_manyB, & - irank, & - ndim, & - inembed, & - istride, & - idist, & - onembed, & - ostride, & - odist, & - merge(HIPFFT_C2R,HIPFFT_Z2D,kind(uxmany) == singlePrecision), & - nsplitB) - endif - hip_plan_c2r_manyG = c_null_ptr - istatus = hipfftPlanMany(& - hip_plan_c2r_manyG, & - irank, & - ndim, & - inembed, & - istride, & - idist, & - onembed, & - ostride, & - odist, & - merge(HIPFFT_C2R,HIPFFT_Z2D,kind(uxmany) == singlePrecision), & - nsplit) -#elif defined(MKLGPU) - dfftw_plan_c2r_manyA = 0 -!$omp target data map(tofrom: fymany,uymany) - !$omp dispatch - call dfftw_plan_many_dft_c2r(& - dfftw_plan_c2r_manyA, & - irank, & - ndim, & - nsplitA, & - fymany, & - inembed, & - istride, & - idist, & - uymany, & - onembed, & - ostride, & - odist, & - FFTW_ESTIMATE) - - if (nsplitB > 0) then ! no fft if nsplitB==0 - dfftw_plan_c2r_manyB = 0 - !$omp dispatch - call dfftw_plan_many_dft_c2r(& - dfftw_plan_c2r_manyB, & - irank, & - ndim, & - nsplitB, & - fymany, & - inembed, & - istride, & - idist, & - uymany, & - onembed, & - ostride, & - odist, & - FFTW_ESTIMATE) - endif -!$omp end target data - - dfftw_plan_c2r_manyG = 0 -!$omp target data map(tofrom: gymany,vymany) - !$omp dispatch - call dfftw_plan_many_dft_c2r(& - dfftw_plan_c2r_manyG, & - irank, & - ndim, & - nsplit, & - gymany, & - inembed, & - istride, & - idist, & - vymany, & - onembed, & - ostride, & - odist, & - FFTW_ESTIMATE) -!$omp end target data - -#else - istatus = cufftPlanMany(& - cu_plan_c2r_manyA, & - irank, & - ndim, & - inembed, & - istride, & - idist, & - onembed, & - ostride, & - odist, & - merge(CUFFT_C2R,CUFFT_Z2D,kind(uxmany) == singlePrecision), & - nsplitA) - - if (nsplitB > 0) then ! no fft if nsplitB==0 - istatus = cufftPlanMany(& - cu_plan_c2r_manyB, & - irank, & - ndim, & - inembed, & - istride, & - idist, & - onembed, & - ostride, & - odist, & - merge(CUFFT_C2R,CUFFT_Z2D,kind(uxmany) == singlePrecision), & - nsplitB) - endif + integer, intent(in) :: g_j, f_j + integer,intent(in) :: nl_idx ! 
1=>A, 2=>B + integer,intent(in) :: i_omp + integer :: ix,iy + integer :: ir,itm,itl,itor - istatus = cufftPlanMany(& - cu_plan_c2r_manyG, & - irank, & - ndim, & - inembed, & - istride, & - idist, & - onembed, & - ostride, & - odist, & - merge(CUFFT_C2R,CUFFT_Z2D,kind(uxmany) == singlePrecision), & - nsplit) -#endif - - idist = size(uxmany,1)*size(uxmany,2) - odist = size(fxmany,1)*size(fxmany,2) - inembed = size(uxmany,1) - onembed = size(fxmany,1) -#if defined(MKLGPU) - inembed(2) = size(uxmany,2) - onembed(2) = size(fxmany,2) -#endif - -#if defined(HIPGPU) - hip_plan_r2c_manyA = c_null_ptr - istatus = hipfftPlanMany(& - hip_plan_r2c_manyA, & - irank, & - ndim, & - inembed, & - istride, & - idist, & - onembed, & - ostride, & - odist, & - merge(HIPFFT_R2C,HIPFFT_D2Z,kind(uxmany) == singlePrecision), & - nsplitA) - - if (nsplitB > 0) then ! no fft if nsplitB==0 - hip_plan_r2c_manyB = c_null_ptr - istatus = hipfftPlanMany(& - hip_plan_r2c_manyB, & - irank, & - ndim, & - inembed, & - istride, & - idist, & - onembed, & - ostride, & - odist, & - merge(HIPFFT_R2C,HIPFFT_D2Z,kind(uxmany) == singlePrecision), & - nsplitB) - endif -#elif defined(MKLGPU) - dfftw_plan_r2c_manyA = 0 -!$omp target data map(tofrom: uvmany,fxmany) - !$omp dispatch - call dfftw_plan_many_dft_r2c(& - dfftw_plan_r2c_manyA, & - irank, & - ndim, & - nsplitA, & - uvmany, & - inembed, & - istride, & - idist, & - fxmany, & - onembed, & - ostride, & - odist, & - FFTW_ESTIMATE) - - if (nsplitB > 0) then ! no fft if nsplitB==0 - dfftw_plan_r2c_manyB = 0 - !$omp dispatch - call dfftw_plan_many_dft_r2c(& - dfftw_plan_r2c_manyB, & - irank, & - ndim, & - nsplitB, & - uvmany, & - inembed, & - istride, & - idist, & - fxmany, & - onembed, & - ostride, & - odist, & - FFTW_ESTIMATE) - endif -!$omp end target data - -#else - istatus = cufftPlanMany(& - cu_plan_r2c_manyA, & - irank, & - ndim, & - inembed, & - istride, & - idist, & - onembed, & - ostride, & - odist, & - merge(CUFFT_R2C,CUFFT_D2Z,kind(uxmany) == singlePrecision), & - nsplitA) - - if (nsplitB > 0) then ! no fft if nsplitB==0 - istatus = cufftPlanMany(& - cu_plan_r2c_manyB, & - irank, & - ndim, & - inembed, & - istride, & - idist, & - onembed, & - ostride, & - odist, & - merge(CUFFT_R2C,CUFFT_D2Z,kind(uxmany) == singlePrecision), & - nsplitB) - endif -#endif + include 'fftw3.f03' -end subroutine cgyro_nl_fftw_init + ! Poisson bracket in real space + uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny) -#else /* if not defined CGYRO_GPU_FFT */ + call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp)) -subroutine cgyro_nl_fftw_init + ! NOTE: The FFT will generate an unwanted n=0,p=-nr/2 component + ! that will be filtered in the main time-stepping loop + + ! this should really be accounted against nl_mem, but hard to do with OMP + do itm=1,n_toroidal_procs + do itl=1,nt_loc + itor=itl + (itm-1)*nt_loc + do ir=1,n_radial + ix = ir-1-nx0/2 + if (ix < 0) ix = ix+nx + iy = itor-1 + if (nl_idx==1) then + fA_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp) + else + fB_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp) + endif + enddo + enddo + enddo + +end subroutine cgyro_nl_fftw_stepr + +subroutine cgyro_nl_fftw_stepr_triad(g_j, f_j, nl_idx, i_omp) + + use timer_lib + use parallel_lib use cgyro_globals + implicit none + + integer, intent(in) :: g_j, f_j + integer,intent(in) :: nl_idx ! 1=>A, 2=>B + integer,intent(in) :: i_omp + real :: y_mean(nx) + integer :: ix,iy + integer :: ir,itm,itl,itor + include 'fftw3.f03' - ! 
Create plans once and for all, with global arrays fx,ux - plan_c2r = fftw_plan_dft_c2r_2d(nx,ny,fx(:,:,1),uxmany(:,:,1),FFTW_PATIENT) - plan_r2c = fftw_plan_dft_r2c_2d(nx,ny,uv(:,:,1),fx(:,:,1),FFTW_PATIENT) + ! 1. Poisson bracket in real space + uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny) -end subroutine cgyro_nl_fftw_init + call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp)) -#endif /* CGYRO_GPU_FFT */ + ! NOTE: The FFT will generate an unwanted n=0,p=-nr/2 component + ! that will be filtered in the main time-stepping loop -!----------------------------------------------------------------- -! cgyro_nl_fftw_init -! (and all the helper sub-functions) -!----------------------------------------------------------------- + ! this should really be accounted against nl_mem, but hard to do with OMP + do itm=1,n_toroidal_procs + do itl=1,nt_loc + itor=itl + (itm-1)*nt_loc + do ir=1,n_radial + ix = ir-1-nx0/2 + if (ix < 0) ix = ix+nx + iy = itor-1 + if (nl_idx==1) then + fA_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp) + else + fB_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp) + endif + enddo + enddo + enddo -#ifdef CGYRO_GPU_FFT -subroutine cgyro_nl_fftw_mul(sz,uvm,uxm,vym,uym,vxm,inv_nxny) - implicit none + ! 2. Poisson bracket in real space with Non_Zonal pairs + + ! remove ky=0 in uxmany + y_mean = sum(uxmany(:,:,f_j) ,dim=1 ) / ny + do iy=0,ny-1 + uxmany(iy,:,f_j) = uxmany(iy,:,f_j) -y_mean + + end do + + ! remove ky=0 in uymany + y_mean = sum(uymany(:,:,f_j) ,dim=1 ) / ny ! =0 + do iy=0,ny-1 + uymany(iy,:,f_j) = uymany(iy,:,f_j) -y_mean + + end do + + ! remove ky=0 in vx + y_mean = sum(vxmany(:,:,g_j) ,dim=1 ) / ny + do iy=0,ny-1 + vxmany(iy,:,g_j) = vxmany(iy,:,g_j) -y_mean + + end do + + ! remove ky=0 in vy + y_mean = sum(vymany(:,:,g_j) ,dim=1 ) / ny ! =0 + do iy=0,ny-1 + vymany(iy,:,g_j) = vymany(iy,:,g_j) -y_mean + + end do + + uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny) + + call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp)) - integer, intent(in) :: sz - real, dimension(*),intent(out) :: uvm - real, dimension(*),intent(in) :: uxm,vym,uym,vxm - real, intent(in) :: inv_nxny - - integer :: i - -#if defined(OMPGPU) -!$omp target teams distribute parallel do simd & -!$omp& map(to:uxm(1:sz),vym(1:sz),uym(1:sz),vxm(1:sz)) & -!$omp& map(from:uvm(1:sz)) -#else -!$acc parallel loop independent gang vector & -!$acc& present(uvm,uxm,vym,uym,vxm) private(i) -#endif - do i=1,sz - uvm(i) = (uxm(i)*vym(i)-uym(i)*vxm(i))*inv_nxny + do itm=1,n_toroidal_procs + do itl=1,nt_loc + itor=itl + (itm-1)*nt_loc + do ir=1,n_radial + ix = ir-1-nx0/2 + if (ix < 0) ix = ix+nx + iy = itor-1 + if (nl_idx==1) then + eA_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp) + else + eB_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp) + endif + enddo + enddo enddo -end subroutine +end subroutine cgyro_nl_fftw_stepr_triad + -subroutine cgyro_nl_fftw +! 
NOTE: call cgyro_nl_fftw_comm1 before cgyro_nl_fftw +subroutine cgyro_nl_fftw(ij) -#if defined(HIPGPU) - use hipfort - use hipfort_hipfft -#elif defined(MKLGPU) - use fftw3_omp_offload -#else - use cufft -#endif use timer_lib use parallel_lib use cgyro_nl_comm use cgyro_globals implicit none -#if defined(MKLGPU) - include 'fftw/fftw3.f' -#endif + + !----------------------------------- + integer, intent(in) :: ij !----------------------------------- - integer :: j,p,iexch - integer :: it,ir,itm,itl,ix,iy + integer :: ix,iy + integer :: ir,it,itm,itl,it_loc integer :: itor,mytm - integer :: i1,i2 - integer :: it_loc - integer :: ierr - integer :: rc - complex :: f0,g0 + integer :: j,p + integer :: i_omp integer :: jtheta_min - integer :: iy0, iy1, ir0, ir1 - - real :: inv_nxny - -#ifdef GACODE_GPU_AMD - ! AMD GPU (MI250X) optimal - integer, parameter :: F_RADTILE = 8 - integer, parameter :: F_TORTILE = 16 - integer, parameter :: R_RADTILE = 32 - integer, parameter :: R_TORTILE = 8 -#else - ! NVIDIA GPU (A100) optimal - integer, parameter :: F_RADTILE = 8 - integer, parameter :: F_TORTILE = 16 - integer, parameter :: R_RADTILE = 16 - integer, parameter :: R_TORTILE = 8 -#endif - - call timer_lib_in('nl_mem') - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() - ! we can zero the elements we know are zero while we wait -#if defined(OMPGPU) - !no async for OMPGPU for now -!$omp target teams distribute parallel do simd collapse(3) -#else -!$acc parallel loop gang vector independent collapse(3) async(2) & -!$acc& private(j,ix,iy) & -!$acc& present(nsplitA,ny2,nx0,nx2) -#endif - do j=1,nsplitA - do ix=nx2,nx0-1 - do iy=0,ny2 - fxmany(iy,ix,j) = 0 - fymany(iy,ix,j) = 0 - enddo - enddo - enddo + complex :: f0,g0 -#if defined(OMPGPU) - !no async for OMPGPU for now -!$omp target teams distribute parallel do simd collapse(3) -#else -!$acc parallel loop gang vector independent collapse(3) async(2) & -!$acc& private(j,ix,iy) & -!$acc& present(nsplitA,ny2,n_toroidal,nx) -#endif - do j=1,nsplitA - do ix=0,nx-1 - do iy=n_toroidal,ny2 - fxmany(iy,ix,j) = 0 - fymany(iy,ix,j) = 0 - enddo - enddo - enddo - call timer_lib_out('nl_mem') + integer, external :: omp_get_thread_num + + include 'fftw3.f03' + + call cgyro_nl_fftw_comm_test() ! time to wait for the FA_nl to become avaialble call timer_lib_in('nl_comm') @@ -474,188 +197,57 @@ subroutine cgyro_nl_fftw call timer_lib_out('nl_comm') call timer_lib_in('nl') -#if !defined(OMPGPU) -!$acc data present(fA_nl) & -!$acc& present(fxmany,fymany,gxmany,gymany) & -!$acc& present(uxmany,uymany,vxmany,vymany) & -!$acc& present(uvmany) -#endif ! f_nl is (radial, nt_loc, theta, nv_loc1, toroidal_procs) ! where nv_loc1 * toroidal_procs >= nv_loc - - ! no tiling, does not seem to help -#if defined(OMPGPU) - !no async for OMPGPU for now -!$omp target teams distribute parallel do simd collapse(4) & -!$omp& private(j,ir,p,ix,itor,iy,f0,itm,itl) -#else -!$acc parallel loop gang vector independent collapse(4) async(2) & -!$acc& private(j,ir,p,ix,itor,iy,f0,itm,itl) -#endif +!$omp parallel do schedule(dynamic,1) private(itm,itl,itor,iy,ir,p,ix,f0,i_omp,j) do j=1,nsplitA - do ir=1,n_radial - do itm=1,n_toroidal_procs - do itl=1,nt_loc + i_omp = omp_get_thread_num()+1 + + ! zero elements not otherwise set below + fx(0:ny2,nx2:nx0-1,i_omp) = 0.0 + fy(0:ny2,nx2:nx0-1,i_omp) = 0.0 + + ! 
Array mapping + do ir=1,n_radial + p = ir-1-nx0/2 + ix = p + if (ix < 0) ix = ix+nx + do itm=1,n_toroidal_procs + do itl=1,nt_loc itor=itl + (itm-1)*nt_loc iy = itor-1 - p = ir-1-nx0/2 - ix = p - if (ix < 0) ix = ix+nx - f0 = i_c*fA_nl(ir,itl,j,itm) - fxmany(iy,ix,j) = p*f0 - fymany(iy,ix,j) = iy*f0 - enddo - enddo - enddo - enddo - -#if defined(OMPGPU) - !no async for OMPGPU for now -#else - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() -!$acc wait(2) -#endif - - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() - - ! Average elements so as to ensure - ! f(kx,ky=0) = f(-kx,ky=0)^* - ! This symmetry is required for complex input to c2r -#if defined(OMPGPU) - !no async for OMPGPU for now -!$omp target teams distribute parallel do simd collapse(2) & -!$omp& private(j,ix,f0) -#else -!$acc parallel loop gang vector independent collapse(2) async(2) & -!$acc& private(j,ix,f0) & -!$acc& present(fxmany) & -!$acc& present(nsplitA,nx) -#endif - do j=1,nsplitA - do ix=1,nx/2-1 - f0 = 0.5*( fxmany(0,ix,j)+conjg(fxmany(0,nx-ix,j)) ) - fxmany(0,ix ,j) = f0 - fxmany(0,nx-ix,j) = conjg(f0) - enddo - enddo - - ! -------------------------------------- - ! perform many Fourier Transforms at once - ! -------------------------------------- - -#if defined(OMPGPU) - -#if defined(MKLGPU) -!$omp target data map(tofrom: fymany,uymany) -#else -!$omp target data use_device_ptr(fymany,uymany) -#endif - -#else -!$acc host_data use_device(fymany,uymany) -#endif - -#if defined(HIPGPU) - rc = hipfftExecZ2D(hip_plan_c2r_manyA,c_loc(fymany),c_loc(uymany)) -#elif defined(MKLGPU) - !$omp dispatch - call dfftw_execute_dft_c2r(dfftw_plan_c2r_manyA,fymany,uymany) - rc = 0 -#else - rc = cufftExecZ2D(cu_plan_c2r_manyA,fymany,uymany) -#endif - -#if defined(OMPGPU) -!$omp end target data -#else -!$acc end host_data -#endif - -#if defined(OMPGPU) - !no async for OMPGPU for now -#else - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() -!$acc wait(2) -#endif - ! fxmany is complete now - - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() + fx(iy,ix,i_omp) = p*f0 + fy(iy,ix,i_omp) = iy*f0 + enddo + enddo + if ((ix/=0) .and. (ix<(nx/2))) then ! happens after ix>nx/2 + ! Average elements so as to ensure + ! f(kx,ky=0) = f(-kx,ky=0)^* + ! This symmetry is required for complex input to c2r + f0 = 0.5*( fx(0,ix,i_omp)+conjg(fx(0,nx-ix,i_omp)) ) + fx(0,ix ,i_omp) = f0 + fx(0,nx-ix,i_omp) = conjg(f0) + endif + fx(n_toroidal:ny2,ix,i_omp) = 0.0 + fy(n_toroidal:ny2,ix,i_omp) = 0.0 + enddo -#if defined(OMPGPU) - -#if defined(MKLGPU) -!$omp target data map(tofrom: fxmany,uxmany) -#else -!$omp target data use_device_ptr(fxmany,uxmany) -#endif - -#else -!$acc host_data use_device(fxmany,uxmany) -#endif - -#if defined(HIPGPU) - rc = hipfftExecZ2D(hip_plan_c2r_manyA,c_loc(fxmany),c_loc(uxmany)) -#elif defined(MKLGPU) - !$omp dispatch - call dfftw_execute_dft_c2r(dfftw_plan_c2r_manyA,fxmany,uxmany) - rc = 0 -#else - rc = cufftExecZ2D(cu_plan_c2r_manyA,fxmany,uxmany) -#endif - -#if defined(OMPGPU) -!$omp end target data -#else -!$acc end host_data -#endif + if (i_omp==1) then + ! use the main thread to progress the async MPI + call cgyro_nl_fftw_comm_test() + endif - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() + call fftw_execute_dft_c2r(plan_c2r,fx(:,:,i_omp),uxmany(:,:,j)) + if (i_omp==1) then + ! use the main thread to progress the async MPI + call cgyro_nl_fftw_comm_test() + endif -#if !defined(OMPGPU) -!$acc data present(g_nl) & -!$acc& present(gxmany,gymany) -#endif - ! 
we can zero the elements we know are zero while we wait for comm -#if defined(OMPGPU) - !no async for OMPGPU for now -!$omp target teams distribute parallel do simd collapse(3) -#else -!$acc parallel loop gang vector independent collapse(3) async(2) & -!$acc& private(j,ix,iy) & -!$acc& present(nsplit,ny2,nx0,nx2) -#endif - do j=1,nsplit - do ix=nx2,nx0-1 - do iy=0,ny2 - gxmany(iy,ix,j) = 0 - gymany(iy,ix,j) = 0 - enddo - enddo - enddo + call fftw_execute_dft_c2r(plan_c2r,fy(:,:,i_omp),uymany(:,:,j)) + enddo ! j -#if defined(OMPGPU) - !no async for OMPGPU for now -!$omp target teams distribute parallel do simd collapse(3) -#else -!$acc parallel loop gang vector independent collapse(3) async(2) & -!$acc& private(j,ix,iy) & -!$acc& present(nsplit,ny2,n_toroidal,nx) -#endif - do j=1,nsplit - do ix=0,nx-1 - do iy=n_toroidal,ny2 - gxmany(iy,ix,j) = 0 - gymany(iy,ix,j) = 0 - enddo - enddo - enddo call timer_lib_out('nl') call timer_lib_in('nl_comm') @@ -670,307 +262,77 @@ subroutine cgyro_nl_fftw ! g_nl is (n_field,n_radial,n_jtheta,nt_loc,n_toroidal_procs) ! jcev_c_nl is (n_field,n_radial,n_jtheta,nv_loc,nt_loc,n_toroidal_procs) - - ! tile for performance, since this is effectively a transpose -#if defined(OMPGPU) - !no async for OMPGPU for now -!$omp target teams distribute parallel do simd collapse(5) & -!$omp& private(j,p,ix,itor,mytm,iy,g0,it,iv_loc,it_loc,jtheta_min,itm,itl,ir) -#else -!$acc parallel loop gang vector independent collapse(5) async(2) & -!$acc& private(j,p,ix,itor,mytm,iy,g0,it,iv_loc,it_loc,jtheta_min,itm,itl,ir) & -!$acc& present(g_nl,jvec_c_nl) & -!$acc& present(nsplit,n_radial,n_toroidal_procs,nt_loc,nt1,n_theta,nv_loc,nx0) -#endif +!$omp parallel do schedule(dynamic,1) & +!$omp& private(itor,mytm,itm,itl,iy,ir,p,ix,g0,i_omp,j,it,iv_loc,it_loc,jtheta_min) do j=1,nsplit - do iy0=0,n_toroidal+(F_TORTILE-1)-1,F_TORTILE ! round up - do ir0=0,n_radial+(F_RADTILE-1)-1,F_RADTILE ! round up - do iy1=0,(F_TORTILE-1) ! tile - do ir1=0,(F_RADTILE-1) ! tile - iy = iy0+iy1 - ir = 1 + ir0+ir1 - if ((iy < n_toroidal) .and. (ir <= n_radial)) then - itor = iy+1 - itm = 1 + iy/nt_loc - itl = 1 + modulo(iy,nt_loc) + i_omp = omp_get_thread_num()+1 + + ! zero elements not otherwise set below + gx(0:ny2,nx2:nx0-1,i_omp) = 0.0 + gy(0:ny2,nx2:nx0-1,i_omp) = 0.0 + + ! Array mapping + do ir=1,n_radial + p = ir-1-nx0/2 + ix = p + if (ix < 0) ix = ix+nx + do itm=1,n_toroidal_procs + do itl=1,nt_loc + itor = itl + (itm-1)*nt_loc mytm = 1 + nt1/nt_loc !my toroidal proc number it = 1+((mytm-1)*nsplit+j-1)/nv_loc iv_loc = 1+modulo((mytm-1)*nsplit+j-1,nv_loc) jtheta_min = 1+((mytm-1)*nsplit)/nv_loc it_loc = it-jtheta_min+1 - iy = itor-1 - - p = ir-1-nx0/2 - ix = p - if (ix < 0) ix = ix+nx + iy = itor-1 if (it > n_theta) then g0 = (0.0,0.0) else - g0 = i_c*sum( jvec_c_nl(1:n_field,ir,it_loc,iv_loc,itor)*g_nl(1:n_field,ir,it_loc,itor)) + g0 = i_c*sum( jvec_c_nl(:,ir,it_loc,iv_loc,itor)*g_nl(:,ir,it_loc,itor)) endif - gxmany(iy,ix,j) = p*g0 - gymany(iy,ix,j) = iy*g0 - endif - enddo + gx(iy,ix,i_omp) = p*g0 + gy(iy,ix,i_omp) = iy*g0 + enddo + enddo + if ((ix/=0) .and. (ix<(nx/2))) then ! happens after ix>nx/2 + ! Average elements so as to ensure + ! g(kx,ky=0) = g(-kx,ky=0)^* + ! 
This symmetry is required for complex input to c2r + g0 = 0.5*( gx(0,ix,i_omp)+conjg(gx(0,nx-ix,i_omp)) ) + gx(0,ix ,i_omp) = g0 + gx(0,nx-ix,i_omp) = conjg(g0) + endif + gx(n_toroidal:ny2,ix,i_omp) = 0.0 + gy(n_toroidal:ny2,ix,i_omp) = 0.0 enddo - enddo - enddo - enddo - -#if defined(OMPGPU) - !no async for OMPGPU for now -#else - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() -!$acc wait(2) -#endif - - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() - - ! Average elements so as to ensure - ! g(kx,ky=0) = g(-kx,ky=0)^* - ! This symmetry is required for complex input to c2r -#if defined(OMPGPU) - !no async for OMPGPU for now -!$omp target teams distribute parallel do simd collapse(2) & -!$omp& private(j,ix,g0) -#else -!$acc parallel loop gang vector independent collapse(2) async(2) & -!$acc& private(j,ix,g0) & -!$acc& present(gxmany) & -!$acc& present(nsplit,nx) -#endif - do j=1,nsplit - do ix=1,nx/2-1 - g0 = 0.5*( gxmany(0,ix,j)+conjg(gxmany(0,nx-ix,j)) ) - gxmany(0,ix ,j) = g0 - gxmany(0,nx-ix,j) = conjg(g0) - enddo - enddo - -#if !defined(OMPGPU) -!$acc end data -#endif - -#if defined(OMPGPU) - -#if defined(MKLGPU) -!$omp target data map(tofrom: gymany,vymany) -#else -!$omp target data use_device_ptr(gymany,vymany) -#endif - -#else -!$acc host_data use_device(gymany,vymany) -#endif - -#if defined(HIPGPU) - rc = hipfftExecZ2D(hip_plan_c2r_manyG,c_loc(gymany),c_loc(vymany)) -#elif defined(MKLGPU) - !$omp dispatch - call dfftw_execute_dft_c2r(dfftw_plan_c2r_manyG,gymany,vymany) - rc = 0 -#else - rc = cufftExecZ2D(cu_plan_c2r_manyG,gymany,vymany) -#endif - -#if defined(OMPGPU) -!$omp end target data -#else -!$acc end host_data -#endif - -#if defined(OMPGPU) - !no async for OMPGPU for now -#else - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() -!$acc wait(2) -#endif - - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() - ! gxmany is complete now - -#if defined(OMPGPU) - -#if defined(MKLGPU) -!$omp target data map(tofrom: gxmany,vxmany) -#else -!$omp target data use_device_ptr(gxmany,vxmany) -#endif - -#else -!$acc host_data use_device(gxmany,vxmany) -#endif - -#if defined(HIPGPU) - rc = hipfftExecZ2D(hip_plan_c2r_manyG,c_loc(gxmany),c_loc(vxmany)) -#elif defined(MKLGPU) - !$omp dispatch - call dfftw_execute_dft_c2r(dfftw_plan_c2r_manyG,gxmany,vxmany) - rc = 0 -#else - rc = cufftExecZ2D(cu_plan_c2r_manyG,gxmany,vxmany) -#endif - -#ifdef HIPGPU - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() - ! hipfftExec is asynchronous, will need the results below - rc = hipDeviceSynchronize() -#endif - -#if defined(OMPGPU) -!$omp end target data -#else - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() -!$acc wait -!$acc end host_data -#endif - - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() - - ! Poisson bracket in real space - ! uv = (ux*vy-uy*vx)/(nx*ny) - - inv_nxny = 1.0/(nx*ny) - call cgyro_nl_fftw_mul(size(uvmany,1)*size(uvmany,2)*nsplitA, & - uvmany, & - uxmany,vymany(:,:,1:nsplitA), & - uymany,vxmany(:,:,1:nsplitA), & - inv_nxny) - - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() - - ! ------------------ - ! Transform uv to fx - ! 
------------------ - -#if defined(OMPGPU) - -#if defined(MKLGPU) -!$omp target data map(tofrom: uvmany,fxmany) -#else -!$omp target data use_device_ptr(uvmany,fxmany) -#endif - -#else -!$acc wait -!$acc host_data use_device(uvmany,fxmany) -#endif - -#if defined(HIPGPU) - rc = hipfftExecD2Z(hip_plan_r2c_manyA,c_loc(uvmany),c_loc(fxmany)) -#elif defined(MKLGPU) - !$omp dispatch - call dfftw_execute_dft_r2c(dfftw_plan_r2c_manyA,uvmany,fxmany) - rc = 0 -#else - rc = cufftExecD2Z(cu_plan_r2c_manyA,uvmany,fxmany) -#endif - -#ifdef HIPGPU - ! hipfftExec is asynchronous, will need the results below - rc = hipDeviceSynchronize() -#endif - -#if defined(OMPGPU) -!$omp end target data -#else -!$acc wait -!$acc end host_data -#endif - - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() - - call timer_lib_out('nl') - call timer_lib_in('nl_mem') - - ! NOTE: The FFT will generate an unwanted n=0,p=-nr/2 component - ! that will be filtered in the main time-stepping loop + if (i_omp==1) then + ! use the main thread to progress the async MPI + call cgyro_nl_fftw_comm_test() + endif - ! tile for performance, since this is effectively a transpose -#if defined(OMPGPU) -!$omp target teams distribute parallel do collapse(5) & -!$omp& private(iy,ir,itm,itl,ix) -#else -!$acc parallel loop independent collapse(5) gang & -!$acc& private(iy,ir,itm,itl,ix) present(fA_nl,fxmany) -#endif - do j=1,nsplitA - do iy0=0,n_toroidal+(R_TORTILE-1)-1,R_TORTILE ! round up - do ir0=0,n_radial+(R_RADTILE-1)-1,R_RADTILE ! round up - do iy1=0,(R_TORTILE-1) ! tile - do ir1=0,(R_RADTILE-1) ! tile - iy = iy0 + iy1 - ir = 1 + ir0 + ir1 - if ((iy < n_toroidal) .and. (ir <= n_radial)) then - ! itor = iy+1 - itm = 1 + iy/nt_loc - itl = 1 + modulo(iy,nt_loc) - ix = ir-1-nx0/2 - if (ix < 0) ix = ix+nx + call fftw_execute_dft_c2r(plan_c2r,gx(:,:,i_omp),vxmany(:,:,j)) - fA_nl(ir,itl,j,itm) = fxmany(iy,ix,j) + if (i_omp==1) then + ! use the main thread to progress the async MPI + call cgyro_nl_fftw_comm_test() endif - enddo - enddo - enddo - enddo - enddo -#if !defined(OMPGPU) - ! end data fA_nl -!$acc end data -#endif + call fftw_execute_dft_c2r(plan_c2r,gy(:,:,i_omp),vymany(:,:,j)) - if (nsplitB > 0) then - ! we can zero the elements we know are zero while we waita - ! assuming nsplitB<=nsplitA -#if defined(OMPGPU) - !no async for OMPGPU for now -!$omp target teams distribute parallel do simd collapse(3) -#else -!$acc parallel loop gang vector independent collapse(3) async(2) & -!$acc& private(j,ix,iy) & -!$acc& present(nsplitB,ny2,nx0,nx2) -#endif - do j=1,nsplitB - do ix=nx2,nx0-1 - do iy=0,ny2 - fxmany(iy,ix,j) = 0 - fymany(iy,ix,j) = 0 - enddo - enddo - enddo + if (i_omp==1) then + ! use the main thread to progress the async MPI + call cgyro_nl_fftw_comm_test() + endif -#if defined(OMPGPU) - !no async for OMPGPU for now -!$omp target teams distribute parallel do simd collapse(3) -#else -!$acc parallel loop gang vector independent collapse(3) async(2) & -!$acc& private(j,ix,iy) & -!$acc& present(nsplitB,ny2,n_toroidal,nx) -#endif - do j=1,nsplitB - do ix=0,nx-1 - do iy=n_toroidal,ny2 - fxmany(iy,ix,j) = 0 - fymany(iy,ix,j) = 0 - enddo - enddo - enddo - endif ! if nsplitB>0 + if (j<=nsplitA) then + call cgyro_nl_fftw_stepr(j, j, 1, i_omp) + endif + ! else we will do it in the next loop + enddo ! j - call timer_lib_out('nl_mem') + call timer_lib_out('nl') call timer_lib_in('nl_comm') ! 
start the async reverse comm

@@ -983,330 +345,86 @@ subroutine cgyro_nl_fftw
      call parallel_slib_f_nc_wait(nsplitB,fpackB,fB_nl,fB_req)
      fB_req_valid = .FALSE.
   endif
+  ! make sure reqs progress
   call cgyro_nl_fftw_comm_test()
 
   call timer_lib_out('nl_comm')
 
   if (nsplitB > 0) then
-
-    call timer_lib_in('nl')
-#if !defined(OMPGPU)
-!$acc data present(fB_nl) &
-!$acc& present(fxmany,fymany,gxmany,gymany) &
-!$acc& present(uxmany,uymany,vxmany,vymany) &
-!$acc& present(uvmany)
-#endif
+  call timer_lib_in('nl')
 
   ! f_nl is (radial, nt_loc, theta, nv_loc1, toroidal_procs)
   ! where nv_loc1 * toroidal_procs >= nv_loc
+!$omp parallel do schedule(dynamic,1) private(itm,itl,itor,iy,ir,p,ix,f0,i_omp,j)
+  do j=1,nsplitB
+     i_omp = omp_get_thread_num()+1
 
-    ! no tiling, does not seem to help
-#if defined(OMPGPU)
-    !no async for OMPGPU for now
-!$omp target teams distribute parallel do simd collapse(4) &
-!$omp& private(j,ir,p,ix,itor,iy,f0,g0,itm,itl)
-#else
-!$acc parallel loop gang vector independent collapse(4) async(2) &
-!$acc& private(j,ir,p,ix,itor,iy,f0,itm,itl)
-#endif
-    do j=1,nsplitB
-       do ir=1,n_radial
-          do itm=1,n_toroidal_procs
-             do itl=1,nt_loc
+     ! zero elements not otherwise set below
+     fx(0:ny2,nx2:nx0-1,i_omp) = 0.0
+     fy(0:ny2,nx2:nx0-1,i_omp) = 0.0
+
+     ! Array mapping
+     do ir=1,n_radial
+        p = ir-1-nx0/2
+        ix = p
+        if (ix < 0) ix = ix+nx
+        do itm=1,n_toroidal_procs
+           do itl=1,nt_loc
              itor=itl + (itm-1)*nt_loc
              iy = itor-1
-                p = ir-1-nx0/2
-                ix = p
-                if (ix < 0) ix = ix+nx
             f0 = i_c*fB_nl(ir,itl,j,itm)
-                fxmany(iy,ix,j) = p*f0
-                fymany(iy,ix,j) = iy*f0
-             enddo
-          enddo
-       enddo
-    enddo
-
-#if defined(OMPGPU)
-    !no async for OMPGPU for now
-#else
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-!$acc wait(2)
-#endif
-
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-
-    ! Average elements so as to ensure
-    !   f(kx,ky=0) = f(-kx,ky=0)^*
-    ! This symmetry is required for complex input to c2r
-#if defined(OMPGPU)
-    !no async for OMPGPU for now
-!$omp target teams distribute parallel do simd collapse(2) &
-!$omp& private(j,ix,f0)
-#else
-!$acc parallel loop gang vector independent collapse(2) async(2) &
-!$acc& private(j,ix,f0) &
-!$acc& present(fxmany) &
-!$acc& present(nsplitB,nx)
-#endif
-    do j=1,nsplitB
-       do ix=1,nx/2-1
-          f0 = 0.5*( fxmany(0,ix,j)+conjg(fxmany(0,nx-ix,j)) )
-          fxmany(0,ix   ,j) = f0
-          fxmany(0,nx-ix,j) = conjg(f0)
-       enddo
-    enddo
-
-    ! --------------------------------------
-    ! perform many Fourier Transforms at once
-    ! --------------------------------------
-
-#if defined(OMPGPU)
-
-#if defined(MKLGPU)
-!$omp target data map(tofrom: fymany,uymany)
-#else
-!$omp target data use_device_ptr(fymany,uymany)
-#endif
-
-#else
-!$acc host_data use_device(fymany,uymany)
-#endif
-
-#if defined(HIPGPU)
-    rc = hipfftExecZ2D(hip_plan_c2r_manyB,c_loc(fymany),c_loc(uymany))
-#elif defined(MKLGPU)
-    !$omp dispatch
-    call dfftw_execute_dft_c2r(dfftw_plan_c2r_manyB,fymany,uymany)
-    rc = 0
-#else
-    rc = cufftExecZ2D(cu_plan_c2r_manyB,fymany,uymany)
-#endif
-
-#if defined(OMPGPU)
-!$omp end target data
-#else
-!$acc end host_data
-#endif
-
-#if defined(OMPGPU)
-    !no async for OMPGPU for now
-#else
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-!$acc wait(2)
-#endif
-    ! fxmany is complete now
-
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-
-#if defined(OMPGPU)
-
-#if defined(MKLGPU)
-!$omp target data map(tofrom: fxmany,uxmany)
-#else
-!$omp target data use_device_ptr(fxmany,uxmany)
-#endif
-
-#else
-!$acc host_data use_device(fxmany,uxmany)
-#endif
-
-#if defined(HIPGPU)
-    rc = hipfftExecZ2D(hip_plan_c2r_manyB,c_loc(fxmany),c_loc(uxmany))
-#elif defined(MKLGPU)
-    !$omp dispatch
-    call dfftw_execute_dft_c2r(dfftw_plan_c2r_manyB,fxmany,uxmany)
-    rc = 0
-#else
-    rc = cufftExecZ2D(cu_plan_c2r_manyB,fxmany,uxmany)
-#endif
-
-#ifdef HIPGPU
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-    ! hipfftExec is asynchronous, will need the results below
-    rc = hipDeviceSynchronize()
-#endif
-
-#if defined(OMPGPU)
-!$omp end target data
-#else
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-!$acc wait
-!$acc end host_data
-#endif
-
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-
-    ! Poisson bracket in real space
-    !   uv = (ux*vy-uy*vx)/(nx*ny)
-
-    inv_nxny = 1.0/(nx*ny)
-
-    call cgyro_nl_fftw_mul(size(uvmany,1)*size(uvmany,2)*nsplitB, &
-         uvmany, &
-         uxmany,vymany(:,:,(nsplitA+1):nsplit), &
-         uymany,vxmany(:,:,(nsplitA+1):nsplit), &
-         inv_nxny)
-
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-
-    ! ------------------
-    ! Transform uv to fx
-    ! ------------------
-
-#if defined(OMPGPU)
-
-#if defined(MKLGPU)
-!$omp target data map(tofrom: uvmany,fxmany)
-#else
-!$omp target data use_device_ptr(uvmany,fxmany)
-#endif
-
-#else
-!$acc wait
-!$acc host_data use_device(uvmany,fxmany)
-#endif
-
-#if defined(HIPGPU)
-    rc = hipfftExecD2Z(hip_plan_r2c_manyB,c_loc(uvmany),c_loc(fxmany))
-#elif defined(MKLGPU)
-    !$omp dispatch
-    call dfftw_execute_dft_r2c(dfftw_plan_r2c_manyB,uvmany,fxmany)
-    rc = 0
-#else
-    rc = cufftExecD2Z(cu_plan_r2c_manyB,uvmany,fxmany)
-#endif
-
-#ifdef HIPGPU
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-    ! hipfftExec is asynchronous, will need the results below
-    rc = hipDeviceSynchronize()
-#endif
-
-#if defined(OMPGPU)
-!$omp end target data
-#else
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-!$acc wait
-!$acc end host_data
-#endif
-
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-
-    call timer_lib_out('nl')
-    call timer_lib_in('nl_mem')
-
-    ! NOTE: The FFT will generate an unwanted n=0,p=-nr/2 component
-    !       that will be filtered in the main time-stepping loop
+              fx(iy,ix,i_omp) = p*f0
+              fy(iy,ix,i_omp) = iy*f0
+           enddo
+        enddo
+        if ((ix/=0) .and. (ix<(nx/2))) then ! the partner at nx-ix was filled earlier (large ix comes first)
+           ! Average elements so as to ensure
+           !   f(kx,ky=0) = f(-kx,ky=0)^*
+           ! This symmetry is required for complex input to c2r
+           f0 = 0.5*( fx(0,ix,i_omp)+conjg(fx(0,nx-ix,i_omp)) )
+           fx(0,ix   ,i_omp) = f0
+           fx(0,nx-ix,i_omp) = conjg(f0)
+        endif
+        fx(n_toroidal:ny2,ix,i_omp) = 0.0
+        fy(n_toroidal:ny2,ix,i_omp) = 0.0
+     enddo
 
-    ! tile for performance, since this is effectively a transpose
-#if defined(OMPGPU)
-!$omp target teams distribute parallel do collapse(5) &
-!$omp& private(iy,ir,itm,itl,ix)
-#else
-!$acc parallel loop independent collapse(5) gang &
-!$acc& private(iy,ir,itm,itl,ix) present(fB_nl,fxmany)
-#endif
-    do j=1,nsplitB
-       do iy0=0,n_toroidal+(R_TORTILE-1)-1,R_TORTILE  ! round up
-          do ir0=0,n_radial+(R_RADTILE-1)-1,R_RADTILE ! round up
-             do iy1=0,(R_TORTILE-1) ! tile
-                do ir1=0,(R_RADTILE-1) ! tile
-                   iy = iy0 + iy1
-                   ir = 1 + ir0 + ir1
-                   if ((iy < n_toroidal) .and. (ir <= n_radial)) then
-                      itor = iy+1
-                      itm = 1 + iy/nt_loc
-                      itl = 1 + modulo(iy,nt_loc)
-                      ix = ir-1-nx0/2
-                      if (ix < 0) ix = ix+nx
+     if (i_omp==1) then
+        ! use the main thread to progress the async MPI
+        call cgyro_nl_fftw_comm_test()
+     endif
 
-                      fB_nl(ir,itl,j,itm) = fxmany(iy,ix,j)
+     call fftw_execute_dft_c2r(plan_c2r,fx(:,:,i_omp),uxmany(:,:,j))
+     if (i_omp==1) then
+        ! use the main thread to progress the async MPI
+        call cgyro_nl_fftw_comm_test()
      endif
-                enddo
-             enddo
-          enddo
-       enddo
-    enddo
-#if !defined(OMPGPU)
-    ! end data fB_nl
-!$acc end data
-#endif
+     call fftw_execute_dft_c2r(plan_c2r,fy(:,:,i_omp),uymany(:,:,j))
+     if (i_omp==1) then
+        ! use the main thread to progress the async MPI
+        call cgyro_nl_fftw_comm_test()
+     endif
 
-    call timer_lib_out('nl_mem')
+     call cgyro_nl_fftw_stepr(nsplitA+j, j, 2, i_omp)
+  enddo ! j
 
-    call timer_lib_in('nl_comm')
-    ! start the async reverse comm
-    ! can reuse the same req, no overlap with forward fB_req
-    call parallel_slib_r_nc_async(nsplitB,fB_nl,fpackB,fB_req)
-    fB_req_valid = .TRUE.
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-    call timer_lib_out('nl_comm')
+  call timer_lib_out('nl')
 
-  endif ! nsplitB>0
+  call timer_lib_in('nl_comm')
+  ! start the async reverse comm
+  ! can reuse the same req, no overlap with forward fB_req
+  call parallel_slib_r_nc_async(nsplitB,fB_nl,fpackB,fB_req)
+  fB_req_valid = .TRUE.
+  ! make sure reqs progress
+  call cgyro_nl_fftw_comm_test()
+  call timer_lib_out('nl_comm')
+  endif ! if nsplitB>0
 
 end subroutine cgyro_nl_fftw
 
-#else /* if not definedCGYRO_GPU_FFT */
-
-subroutine cgyro_nl_fftw_stepr(g_j, f_j, nl_idx, i_omp)
-
-  use timer_lib
-  use parallel_lib
-  use cgyro_globals
-
-  implicit none
-
-  integer, intent(in) :: g_j, f_j
-  integer,intent(in) :: nl_idx ! 1=>A, 2=>B
-  integer,intent(in) :: i_omp
-  integer :: ix,iy
-  integer :: ir,itm,itl,itor
-
-  include 'fftw3.f03'
-
-  ! Poisson bracket in real space
-  !   uv = (ux*vy-uy*vx)/(nx*ny)
-  uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny)
-
-  call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp))
-
-  ! NOTE: The FFT will generate an unwanted n=0,p=-nr/2 component
-  !       that will be filtered in the main time-stepping loop
-
-  ! this should really be accounted against nl_mem, but hard to do with OMP
-  do itm=1,n_toroidal_procs
-     do itl=1,nt_loc
-        itor=itl + (itm-1)*nt_loc
-        do ir=1,n_radial
-           ix = ir-1-nx0/2
-           if (ix < 0) ix = ix+nx
-           iy = itor-1
-           if (nl_idx==1) then
-              fA_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp)
-           else
-              fB_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp)
-           endif
-        enddo
-     enddo
-  enddo
-
-end subroutine cgyro_nl_fftw_stepr
-
-! NOTE: call cgyro_nl_fftw_comm1 before cgyro_nl_fftw
-subroutine cgyro_nl_fftw
+subroutine cgyro_nl_fftw_triad(ij)
 
   use timer_lib
   use parallel_lib
@@ -1315,6 +433,8 @@ subroutine cgyro_nl_fftw
 
   implicit none
 
+  !-----------------------------------
+  integer, intent(in) :: ij
   !-----------------------------------
   integer :: ix,iy
   integer :: ir,it,itm,itl,it_loc
@@ -1470,7 +590,7 @@ subroutine cgyro_nl_fftw
        endif
 
        if (j<=nsplitA) then
-          call cgyro_nl_fftw_stepr(j, j, 1, i_omp)
+          call cgyro_nl_fftw_stepr_triad(j, j, 1, i_omp)
        endif
        ! else we will do it in the next loop
     enddo ! j
@@ -1481,6 +601,7 @@ subroutine cgyro_nl_fftw
     ! start the async reverse comm
     ! can reuse the same req, no overlap with forward fA_req
     call parallel_slib_r_nc_async(nsplitA,fA_nl,fpackA,fA_req)
+    call parallel_slib_r_nc_async(nsplitA,eA_nl,epackA,eA_req)
     fA_req_valid = .TRUE.
 
  if (nsplitB > 0) then
 
@@ -1549,7 +670,7 @@ subroutine cgyro_nl_fftw
           call cgyro_nl_fftw_comm_test()
        endif
 
-       call cgyro_nl_fftw_stepr(nsplitA+j, j, 2, i_omp)
+       call cgyro_nl_fftw_stepr_triad(nsplitA+j, j, 2, i_omp)
     enddo ! j
 
     call timer_lib_out('nl')
@@ -1558,13 +679,11 @@ subroutine cgyro_nl_fftw
     ! start the async reverse comm
     ! can reuse the same req, no overlap with forward fB_req
     call parallel_slib_r_nc_async(nsplitB,fB_nl,fpackB,fB_req)
+    call parallel_slib_r_nc_async(nsplitB,eB_nl,epackB,eB_req)
     fB_req_valid = .TRUE.
     ! make sure reqs progress
     call cgyro_nl_fftw_comm_test()
     call timer_lib_out('nl_comm')
  endif ! if nsplitB>0
-end subroutine cgyro_nl_fftw
-
-#endif /* CGYRO_GPU_FFT */
-
+end subroutine cgyro_nl_fftw_triad
diff --git a/cgyro/src/cgyro_read_input.f90 b/cgyro/src/cgyro_read_input.f90
index be77de7dd..bfbe4bf71 100644
--- a/cgyro/src/cgyro_read_input.f90
+++ b/cgyro/src/cgyro_read_input.f90
@@ -70,6 +70,7 @@ subroutine cgyro_read_input
   call cgyro_readbc_int(moment_print_flag,'MOMENT_PRINT_FLAG')
   call cgyro_readbc_int(gflux_print_flag,'GFLUX_PRINT_FLAG')
   call cgyro_readbc_int(field_print_flag,'FIELD_PRINT_FLAG')
+  call cgyro_readbc_int(triad_print_flag,'TRIAD_PRINT_FLAG')
   call cgyro_readbc_real(amp0)
   call cgyro_readbc_real(amp)
   call cgyro_readbc_real(gamma_e)
diff --git a/cgyro/src/cgyro_rhs.F90 b/cgyro/src/cgyro_rhs.F90
index 2f762bd16..b12c6cca5 100644
--- a/cgyro/src/cgyro_rhs.F90
+++ b/cgyro/src/cgyro_rhs.F90
@@ -35,7 +35,11 @@ subroutine cgyro_rhs_r_comm_async(ij)
   integer, intent(in) :: ij
 
   if (nonlinear_flag == 1) then
-     call cgyro_nl_fftw_comm1_r(ij)
+     if (triad_print_flag == 1 .and. ij == 4) then
+        call cgyro_nl_fftw_comm1_r_triad(ij)
+     else
+        call cgyro_nl_fftw_comm1_r(ij)
+     endif
   endif
 
 end subroutine cgyro_rhs_r_comm_async
@@ -410,7 +414,11 @@ subroutine cgyro_rhs(ij,update_cap)
   if (nonlinear_flag == 1) then
      ! assumes someone already started the input comm
      ! and will finish the output comm
-     call cgyro_nl_fftw()
+     if (triad_print_flag == 1 .and. ij == 4) then
+        call cgyro_nl_fftw_triad(ij)
+     else
+        call cgyro_nl_fftw()
+     endif
   endif
 
   call cgyro_upwind_complete
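For orientation: the branch added to cgyro_rhs.F90 routes only the final RK stage (ij == 4) through the triad variants, so the instrumented bracket evaluation runs once per time step rather than at every stage. A minimal Python sketch of the same dispatch decision; the function name and return strings are illustrative, not part of the patch:

    # Illustrative model of the stage dispatch added in cgyro_rhs.F90.
    def select_nl_kernel(nonlinear_flag: int, triad_print_flag: int, ij: int) -> str:
        if nonlinear_flag != 1:
            return 'none'                  # linear run: no nonlinear bracket
        if triad_print_flag == 1 and ij == 4:
            return 'cgyro_nl_fftw_triad'   # diagnostic path, once per step
        return 'cgyro_nl_fftw'             # standard fast path

    assert select_nl_kernel(1, 1, 4) == 'cgyro_nl_fftw_triad'
    assert select_nl_kernel(1, 1, 2) == 'cgyro_nl_fftw'
    assert select_nl_kernel(1, 0, 4) == 'cgyro_nl_fftw'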
diff --git a/cgyro/src/cgyro_step_collision.F90 b/cgyro/src/cgyro_step_collision.F90
index f422e88a4..228360d2f 100644
--- a/cgyro/src/cgyro_step_collision.F90
+++ b/cgyro/src/cgyro_step_collision.F90
@@ -334,6 +334,7 @@ subroutine cgyro_step_collision_cpu(use_simple)
 
   ! Compute H given h and [phi(h), apar(h)]
 
+ if (triad_print_flag == 1) then
 !$omp parallel do collapse(2) &
 !$omp& private(iv_loc,is,ic,iv,my_psi,my_ch) firstprivate(nc)
  do itor=nt1,nt2
@@ -341,14 +342,44 @@ subroutine cgyro_step_collision_cpu(use_simple)
        iv_loc = iv-nv1+1
        is = is_v(iv)
        do ic=1,nc
+
+          ! Save collisional diss.
+          my_ch = cap_h_ct(iv_loc,itor,ic)
+          my_psi = sum(jvec_c(:,ic,iv_loc,itor)*field(:,ic,itor))
+
+          my_psi = my_ch-my_psi*(z(is)/temp(is))
+          cap_h_ct(iv_loc,itor,ic) = (cap_h_c(ic,iv_loc,itor) + my_ch) / 2.0
+          cap_h_ct(iv_loc,itor,ic) = conjg(cap_h_ct(iv_loc,itor,ic)) &
+                                     * ( my_psi - h_x(ic,iv_loc,itor) )
+
+          h_x(ic,iv_loc,itor) = my_psi
+          cap_h_c(ic,iv_loc,itor) = my_ch
+
+       enddo
+    enddo
+ enddo
+
+ else
+
+!$omp parallel do collapse(2) &
+!$omp& private(iv_loc,is,ic,iv,my_psi,my_ch) firstprivate(nc)
+ do itor=nt1,nt2
+    do iv=nv1,nv2
+       iv_loc = iv-nv1+1
+       is = is_v(iv)
+       do ic=1,nc
+
           my_ch = cap_h_ct(iv_loc,itor,ic)
           my_psi = sum(jvec_c(:,ic,iv_loc,itor)*field(:,ic,itor))
 
           h_x(ic,iv_loc,itor) = my_ch-my_psi*(z(is)/temp(is))
           cap_h_c(ic,iv_loc,itor) = my_ch
+
        enddo
     enddo
  enddo
+ end if ! (triad_print_flag == 1)
 
  call timer_lib_out('coll')
 
 end subroutine cgyro_step_collision_cpu
diff --git a/cgyro/src/cgyro_write_timedata.f90 b/cgyro/src/cgyro_write_timedata.f90
index 3d6b808a1..36864a9ab 100644
--- a/cgyro/src/cgyro_write_timedata.f90
+++ b/cgyro/src/cgyro_write_timedata.f90
@@ -70,6 +70,14 @@ subroutine cgyro_write_timedata
      enddo
   endif
 
+  if (nonlinear_flag == 1 .and. triad_print_flag == 1) then
+     ! Triad energy transfer for all kxky
+     call cgyro_write_distributed_bcomplex(&
+          trim(path)//binfile_triad,&
+          size(triad(:,:,:,:)),&
+          triad(:,:,:,:))
+  endif
+
   if (field_print_flag == 1) then
      p_field = n_field
   else
diff --git a/f2py/pygacode/cgyro/data.py b/f2py/pygacode/cgyro/data.py
index 0e67a3395..15e59cb6e 100644
--- a/f2py/pygacode/cgyro/data.py
+++ b/f2py/pygacode/cgyro/data.py
@@ -302,6 +302,13 @@ def getbigfield(self):
          if fmt != 'null':
             print('INFO: (data.py) Read data in '+fmt+'.cgyro.kxky_v. '+t)
             self.kxky_v = np.reshape(data[0:nd],(2,self.n_radial,self.theta_plot,self.n_species,self.n_n,nt),'F')
+
+      # 5. triad
+      t,fmt,data = self.extract('.cgyro.triad')
+      if fmt != 'null':
+         print('INFO: (data.py) Read data in '+fmt+'.cgyro.triad. '+t)
+         nd = 2*self.n_n*self.n_radial*self.n_species*nt*8
+         self.triad = np.reshape(data[0:nd],(2,self.n_species,self.n_radial,8,self.n_n,nt),'F')
 
    #-----------------------------------------------------------------
    def getgeo(self):
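With the reshape above, the triad record can also be consumed directly, without the plotting layer. A minimal sketch, assuming a finished nonlinear run with TRIAD_PRINT_FLAG=1 in an illustrative directory ./mydir:

    import numpy as np
    from pygacode.cgyro.data import cgyrodata

    sim = cgyrodata('./mydir/')        # run directory is an assumption
    sim.getbigfield()                  # populates sim.triad from *.cgyro.triad
    # sim.triad has shape (2, n_species, n_radial, 8, n_n, n_time);
    # axis 0 holds (Re, Im), axis 3 the eight stored components
    # (transfer, NZ-pair transfer, dS/dt, dW_perp/dt, entropy, 3 dissipations).
    T = sim.triad[0] + 1j*sim.triad[1]
    Tk = T[:, :, 0, :, :].real         # component 0: triad transfer T_k
    # net transfer summed over species, kx and ky, as a time trace
    print(Tk.sum(axis=(0, 1, 2)))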
diff --git a/f2py/pygacode/cgyro/data_plot.py b/f2py/pygacode/cgyro/data_plot.py
index 309ee7106..99f36af79 100644
--- a/f2py/pygacode/cgyro/data_plot.py
+++ b/f2py/pygacode/cgyro/data_plot.py
@@ -433,6 +433,200 @@ def plot_phi(self,xin):
 
       return head,self.t,y0,yn
 
+
+   def plot_triad(self,xin):
+
+      w = xin['w']
+      theta = xin['theta']
+      field = xin['field']
+      ymin = xin['ymin']
+      ymax = xin['ymax']
+      absnorm = xin['abs']
+      moment = xin['moment']
+      spec = xin['spec']
+
+      t = self.getnorm(xin['norm'])
+
+      if xin['fig'] is None:
+         fig = plt.figure(MYDIR,figsize=(xin['lx'],xin['ly']))
+
+      self.getbigfield()
+
+      f = self.triad[0,spec,:,:,:,:]+1j*self.triad[1,spec,:,:,:,:]
+      # 0- Triad , 1- non-zonal pairs Triad , 2- d(Entropy)/dt, 3- W_k_{perp}/dt, 4- Entropy
+      # 5- Diss.(radial) , 6- Diss.(theta) , 7- Diss.(Collision)
+      f = f[:,0,:,:]
+
+      #======================================
+      # Set figure size and axes
+      ax = fig.add_subplot(111)
+      ax.grid(which="both",ls=":")
+      ax.grid(which="major",ls=":")
+      ax.set_xlabel(self.tstr)
+      ax.set_title(r'$\mathrm{Triad~energy~transfer~intensity}$'+ r'$-(L_{T_e}/Q_{GBD})$')
+      #======================================
+
+      if spec==1:
+         ft=r'{T_e}'
+      elif spec==0:
+         ft=r'{T_D}'
+      else:
+         ft=r'{T_%d}'%spec
+
+      # n=0 intensity
+      T0 = np.sum(f[:,0,:].real,axis=0)
+      lab0=r'$\left\langle \left|'+ft+r'_0\right|\right\rangle$'
+      ax.plot(t,T0,label=lab0,linewidth=4)
+
+      # n!=0 intensity
+      for i in range(1,self.n_n):
+         yn = np.sum(f[:,i,:].real,axis=(0)) *2.0 # (KY<0 domain)
+         labn=r'$\left\langle \left|'+ft+r'_{%d}\right|\right\rangle$'%i
+         if i>10:
+            ax.plot(t,yn,':',label=labn,linewidth=2)
+         else:
+            ax.plot(t,yn,label=labn,linewidth=2)
+
+      # total intensity
+      Tn = np.sum(f[:,1:,:].real,axis=(0,1))*2.0 + T0
+      ax.plot(t,Tn,'r',label='Total',linewidth=4)
+
+      ax.set_xlim(0,t[-1])
+      if ymax != 'auto':
+         ax.set_ylim(top=float(ymax))
+      if ymin != 'auto':
+         ax.set_ylim(bottom=float(ymin))
+      ax.legend(loc=3)
+
+      # JC: labels only correct for default norm
+      head = '(cs/a) t Phi_0/rho_* Phi_n/rho_*'
+
+      fig.tight_layout(pad=0.3)
+
+      return head
+
+
+   def plot_triad_v2(self,xin):
+
+      w = xin['w']
+      theta = xin['theta']
+      field = xin['field']
+      ymin = xin['ymin']
+      ymax = xin['ymax']
+      absnorm = xin['abs']
+      moment = xin['moment']
+      spec = xin['spec']
+      dn = xin['dn']
+
+      t = self.getnorm(xin['norm'])
+
+      if xin['fig'] is None:
+         fig = plt.figure(MYDIR,figsize=(xin['lx'],xin['ly']))
+
+      self.getbigfield()
+
+      if spec==-1:
+         f = (self.triad[0,:,:,:,:,:] + 1j*self.triad[1,:,:,:,:,:]).sum(axis=0)
+      else:
+         f = self.triad[0,spec,:,:,:,:] + 1j*self.triad[1,spec,:,:,:,:]
+      # 0- Triad , 1- non-zonal pairs Triad , 2- d(Entropy)/dt, 3- W_k_{perp}/dt, 4- Entropy
+      # 5- Diss.(radial) , 6- Diss.(theta) , 7- Diss.(Collision)
+      T   = f[:,0,:,:].real
+      Ent = f[:,2,:,:].real
+      Wkt = self.triad[0,0,:,3,:,:].real
+      diss_r = f[:,5,:,:].real
+      diss_th= f[:,6,:,:].real
+      diss_c = f[:,7,:,:].real
+
+      ## get heat flux
+      usec = self.getflux(cflux='auto')
+
+      ys = np.sum(self.ky_flux,axis=(2,3))
+      y  = ys[:,1,:] # 'Q'
+      yn = ys[:,0,:] # 'Gamma'
+
+      #======================================
+      # Set figure size and axes
+      ax = fig.add_subplot(111)
+      ax.grid(which="both",ls=":")
+      ax.grid(which="major",ls=":")
+      ax.set_xlabel(self.tstr)
+      ax.set_title(r'$\mathrm{Triad~energy~transfer~intensity}$'+ r'$-(L_{T_e}/Q_{GBD})$')
+      #======================================
+
+      # n=0 intensity
+      print('HINT: adjust -dn to match the experimental dn (rho/a and Lx/a will shrink)')
+      T0   = np.sum(T[:,0,:],axis=0)
+      Ent0 = np.sum(Ent[:,0,:],axis=0)
+      Wkt0 = np.sum(Wkt[:,0,:],axis=0) *dn**2
+      diss_r0 = np.sum(diss_r[:,0,:],axis=0)
+      diss_th0= np.sum(diss_th[:,0,:],axis=0)
+      diss_c0 = np.sum(diss_c[:,0,:],axis=0)
+
+      if spec == 1: # electron
+         ft = r'{L_{T_e} T_{NZF->ZF,e}/Q_{GBD}}'
+         Sft= r'{L_{T_e} {dS_e \over dt} /Q_{GBD}}'
+         Qft= r'{q_e/Q_{GBD}}'
+         Wft= r'{L_{T_e} W_{k_{\perp}}/Q_{GBD}}'
+         Q = y[spec,:]
+         G = (self.dlnndr[1] - 1.5*self.dlntdr[1] )*yn[spec,:] / self.dlntdr[1]
+
+      elif spec == 0: # main ion
+         ft = r'{L_{T_e} T_{NZF->ZF,D} /Q_{GBD}}'
+         Sft= r'{L_{T_e} {dS_D \over dt} /Q_{GBD}}'
+         Qft= r'{L_{T_e} q_D/Q_{GBD}L_{T_D}}'
+         Wft= r'{L_{T_e} W_{k_{\perp}}/Q_{GBD}}'
+         Q = y[spec,:] * (self.dlntdr[0] /self.dlntdr[1] )
+         G = (self.dlnndr[0] - 1.5*self.dlntdr[0] )*yn[spec,:] / self.dlntdr[1] *self.temp[0]
+
+      elif spec == -1: # total
+         ft = r'{L_{T_e} \Sigma_s T_{NZF->ZF,s} /Q_{GBD}}'
+         Sft= r'{L_{T_e} \Sigma_s {dS_s \over dt} /Q_{GBD}}'
+         Qft= r'{L_{T_e} \Sigma_s q_s/Q_{GBD}L_{T_s}}'
+         Wft= r'{L_{T_e} W_{k_{\perp}}/Q_{GBD}}'
+         Q = (y[:,:] * self.dlntdr[:,np.newaxis]).sum(axis=0) /self.dlntdr[1]
+         G = ( (self.dlnndr[:,np.newaxis] - 1.5*self.dlntdr[:,np.newaxis]) \
+               *yn[:,:] *self.temp[:,np.newaxis] ).sum(axis=0) / self.dlntdr[1]
+
+      diss_rt = r'D_r'
+      diss_tht= r'D_\theta'
+      diss_ct = r'D_c'
+
+      lab0=r'$\left\langle \left|'+ft+r'\right|\right\rangle$'
+      lab1=r'$\left\langle \left|'+Sft+r'\right|\right\rangle$'
+      lab2=r'$\left\langle \left|'+Wft+r'\right|\right\rangle$'
+      lab3=r'$\left\langle \left|'+Qft+r'\right|\right\rangle$'
+      lab4=r'$\left\langle \left|'+diss_rt +r'\right|\right\rangle$'
+      lab5=r'$\left\langle \left|'+diss_tht+r'\right|\right\rangle$'
+      lab6=r'$\left\langle \left|'+diss_ct +r'\right|\right\rangle$'
+
+      ax.plot(t ,T0 ,label=lab0,linewidth=2)
+      ax.plot(t ,Ent0 ,label=lab1,linewidth=2)
+      ax.plot(t ,Wkt0 ,label=lab2,linewidth=2)
+      #ax.plot(t ,G+Q ,label=lab3,linewidth=2) # q_s = (1/Ln-1.5/LT)T_s*Gamma_s + Q_s
+      ax.plot(t ,diss_r0 ,label=lab4,linewidth=2)
+      ax.plot(t ,diss_th0 ,label=lab5,linewidth=2)
+      ax.plot(t ,diss_c0 ,label=lab6,linewidth=2)
+      if spec == -1: ax.plot(t ,(T0 - Ent0) ,':k', label=r'$ZF: T0-S0 $',linewidth=2)
+      if spec == -1: ax.plot(t ,-(diss_r0+diss_th0+diss_c0) ,':r', label=r'$ZF: Dissipations $',linewidth=2)
+
+      ax.set_xlim(0,t[-1])
+      if ymax != 'auto':
+         ax.set_ylim(top=float(ymax))
+      if ymin != 'auto':
+         ax.set_ylim(bottom=float(ymin))
+      ax.legend(loc=1, ncol=3, framealpha=0)
+
+      # JC: labels only correct for default norm
+      head = '(cs/a) t Phi_0/rho_* Phi_n/rho_*'
+
+      fig.tight_layout(pad=0.3)
+
+      return head
+
+
    def plot_flux(self,xin):
 
       w = xin['w']
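At ky = 0 the curves drawn by plot_triad_v2 form a closed budget: once the run is saturated, the transfer into the zonal component minus the zonal entropy growth should be balanced by the radial, theta and collisional dissipation channels (components 0, 2 and 5-7 of the record). A hedged numpy version of that check, reusing the assumed ./mydir run from the sketch above:

    import numpy as np
    from pygacode.cgyro.data import cgyrodata

    sim = cgyrodata('./mydir/')            # illustrative run directory
    sim.getbigfield()
    tot = sim.triad[0].sum(axis=0)         # real part, summed over species
    T0  = tot[:, 0, 0, :].sum(axis=0)      # transfer into ky=0, summed over kx
    dS0 = tot[:, 2, 0, :].sum(axis=0)      # d(entropy)/dt at ky=0
    D0  = tot[:, 5:8, 0, :].sum(axis=(0, 1))  # radial + theta + collisional
    # residual of the zonal free-energy balance; small once saturated
    print(np.abs((T0 - dS0) + D0).max())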
diff --git a/f2py/pygacode/cgyro/data_plot_single.py b/f2py/pygacode/cgyro/data_plot_single.py
index 42406e7a4..0ac7df2f5 100644
--- a/f2py/pygacode/cgyro/data_plot_single.py
+++ b/f2py/pygacode/cgyro/data_plot_single.py
@@ -50,6 +50,7 @@
 xin['bar']    = bool(int(sys.argv[25]))
 xin['ie']     = int(sys.argv[26])
 xin['mesh']   = int(sys.argv[27])
+xin['dn']     = int(sys.argv[28])
 
 # plotting control
 ftype = xin['ftype']
@@ -84,6 +85,14 @@
 
    head,x,y1,y2 = data_in.plot_phi(xin)
 
+elif plot_type == 'triad':
+
+   head = data_in.plot_triad(xin)
+
+elif plot_type == 'triad_v2':
+
+   head = data_in.plot_triad_v2(xin)
+
 elif plot_type == 'flux':
 
    if ftype == 'nox' or ftype == 'dump':
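With the driver wired up as above, the new panels are reachable via cgyro_plot -plot triad and cgyro_plot -plot triad_v2, or by calling the plot methods directly. A direct-call sketch, assuming the usual cgyrodata_plot entry point; only the xin keys are taken from the code above, the values are illustrative defaults:

    import matplotlib.pyplot as plt
    from pygacode.cgyro.data_plot import cgyrodata_plot

    sim = cgyrodata_plot('./mydir/')   # illustrative run directory
    xin = {'w': 0.5, 'theta': 0, 'field': 0, 'moment': 'phi',
           'spec': 1, 'abs': 0, 'ymin': 'auto', 'ymax': 'auto',
           'norm': 'elec',             # value assumed, matching the usual default
           'fig': None, 'lx': 12, 'ly': 6,
           'dn': 1}                    # only used by plot_triad_v2
    sim.plot_triad(xin)                # per-ky transfer time traces
    plt.show()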