diff --git a/cgyro/bin/cgyro_parse.py b/cgyro/bin/cgyro_parse.py
index d2f613d11..7f3c86c29 100644
--- a/cgyro/bin/cgyro_parse.py
+++ b/cgyro/bin/cgyro_parse.py
@@ -67,6 +67,7 @@
 x.add('MOMENT_PRINT_FLAG','0')
 x.add('GFLUX_PRINT_FLAG','0')
 x.add('FIELD_PRINT_FLAG','0')
+x.add('TRIAD_PRINT_FLAG','0')
 x.add('AMP0','0.0')
 x.add('AMP','0.1')
 x.add('GAMMA_E','0.0')
diff --git a/cgyro/bin/cgyro_plot b/cgyro/bin/cgyro_plot
index 97cd5d995..102ac03db 100755
--- a/cgyro/bin/cgyro_plot
+++ b/cgyro/bin/cgyro_plot
@@ -45,6 +45,8 @@
 then
    echo " = flux      (time-trace of fluxes for all species)"
    echo " = ky_flux   (bar plot of flux versus ky)"
    echo " = phi       (time-dependent ZF vs. finite-n intensities)"
+   echo " = triad     (triad energy transfer for all ky)"
+   echo " = triad_v2  (free energy balance of ky=0)"
    echo " = ky_phi    (time-dependent ky-phi intensities)"
    echo " = kx_phi    (average kx-phi intensities)"
    echo " = kx_shift  (kx-phi intensities in both domains)"
@@ -411,7 +413,7 @@
 PYROOT=$GACODE_ROOT/f2py/pygacode/cgyro
 PLT="-m pygacode.cgyro.data_plot_single $TEX $FONTSIZE"
-PFLAGS="$PLOT_TYPE $LX $LY $W $NORM $EXT $TIME $FIELD $MOMENT $TMAX $THETA $YMIN $YMAX $KXMIN $KXMAX $N $ABS $FC $LOC $NSCALE $CFLUX $SPECIES $BAR $IE $MESH"
+PFLAGS="$PLOT_TYPE $LX $LY $W $NORM $EXT $TIME $FIELD $MOMENT $TMAX $THETA $YMIN $YMAX $KXMIN $KXMAX $N $ABS $FC $LOC $NSCALE $CFLUX $SPECIES $BAR $IE $MESH $DN"
 case "$VIS_TYPE" in
@@ -443,7 +445,7 @@
 case "$PLOT_TYPE" in
 template)
    cp $PYROOT/plot_template.py .
    ;;
-freq | ky_freq | error | geo | ball | ky_phi | phi | flux | rcorr_phi | low | zf | ky_flux | shift | corrug | kx_phi | kxky_phi | kx_shift | poly_phi | xflux | hb | hbcut | hball )
+freq | ky_freq | error | geo | ball | ky_phi | phi | triad | triad_v2 | flux | rcorr_phi | low | zf | ky_flux | shift | corrug | kx_phi | kxky_phi | kx_shift | poly_phi | xflux | hb | hbcut | hball )
    python $PLT $PFLAGS
    ;;
 fluct)
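Usage note for the two plot types added above: once a run has been produced with TRIAD_PRINT_FLAG=1 in input.cgyro, the new diagnostics should be reachable through the usual interface, e.g.

   cgyro_plot -plot triad -e <simdir>
   cgyro_plot -plot triad_v2 -e <simdir>

(a sketch following the cgyro_plot conventions shown elsewhere in this patch; <simdir> stands for the simulation directory).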
diff --git a/cgyro/src/cgyro_flux.f90 b/cgyro/src/cgyro_flux.f90
index 52599e119..b5c8be3df 100644
--- a/cgyro/src/cgyro_flux.f90
+++ b/cgyro/src/cgyro_flux.f90
@@ -44,6 +44,7 @@ subroutine cgyro_flux
   complex :: cprod
   real, parameter :: x_fraction=0.2
   real :: u
+  real :: kx
 !$omp parallel do private(iv_loc,iv,is,ix,ie,dv,vpar,ic,ir,it,erot,cprod,cn) &
 !$omp& private(prod1,prod2,prod3,l,icl,dvr,u,flux_norm) &
@@ -188,6 +189,34 @@ subroutine cgyro_flux
            cflux_loc(:,:,:,itor) = cflux_loc(:,:,:,itor)+2*sin(u)*real(gflux_loc(l,:,:,:,itor))/u
         enddo
+
+        !-------------------------------------------------------------
+        ! 2-1. Compute triad energy transfer
+        !-------------------------------------------------------------
+
+        kx = 2*pi*rho/length
+        do is=1,n_species
+           ! Triad energy transfer : T_k
+           triad_loc_old(is,:,itor,1) = triad_loc(is,:,itor,1)*temp(is)/dlntdr(is_ele)
+           ! Nonzonal triad energy transfer : T_k [NZ(k',k") -> k]
+           triad_loc_old(is,:,itor,2) = triad_loc(is,:,itor,2)*temp(is)/dlntdr(is_ele)
+           ! dEntropy/dt : dS_k/dt
+           triad_loc_old(is,:,itor,3) = (triad_loc(is,:,itor,3)-triad_loc_old(is,:,itor,3))/delta_t
+           triad_loc_old(is,:,itor,3) = triad_loc_old(is,:,itor,3)*temp(is)/dlntdr(is_ele)*0.5
+           ! dWk_perp/dt (ky = 0)
+           triad_loc_old(is,:,itor,4) = (triad_loc(is,:,itor,4)-triad_loc_old(is,:,itor,4))/delta_t
+           triad_loc_old(is,:,itor,4) = triad_loc_old(is,:,itor,4)*(kx*lambda_star)**2/dlntdr(is_ele)*0.5
+           ! Entropy : S_k
+           triad_loc_old(is,:,itor,5) = triad_loc(is,:,itor,3)*temp(is)/dlntdr(is_ele)*0.5
+           ! Diss. (radial)
+           triad_loc_old(is,:,itor,6) = triad_loc(is,:,itor,5)*temp(is)/dlntdr(is_ele)
+           ! Diss. (theta)
+           triad_loc_old(is,:,itor,7) = triad_loc(is,:,itor,6)*temp(is)/dlntdr(is_ele)
+           ! Diss. (coll.)
+           triad_loc_old(is,:,itor,8) = triad_loc(is,:,itor,7)*temp(is)/dlntdr(is_ele)
+        enddo
+
      !-----------------------------------------------------
      ! 3. Renormalize fluxes to GB or quasilinear forms
      !~----------------------------------------------------
@@ -216,7 +245,7 @@ subroutine cgyro_flux
         ! GyroBohm normalizations
         gflux_loc(:,:,:,:,itor) = gflux_loc(:,:,:,:,itor)/rho**2
         cflux_loc(:,:,:,itor) = cflux_loc(:,:,:,itor)/rho**2
-
+        triad_loc_old(:,:,itor,:) = triad_loc_old(:,:,itor,:)/rho**2
      endif
   enddo
@@ -240,6 +269,15 @@ subroutine cgyro_flux
        NEW_COMM_1, &
        i_err)
+
+  ! Reduced complex triad(ns,kx), below, is still distributed over n
+  call MPI_ALLREDUCE(triad_loc_old(:,:,:,:), &
+       triad, &
+       size(triad), &
+       MPI_DOUBLE_COMPLEX, &
+       MPI_SUM, &
+       NEW_COMM_1, &
+       i_err)
+
   ! Reduced real cflux(ky), below, is still distributed over n
   call MPI_ALLREDUCE(cflux_loc, &
diff --git a/cgyro/src/cgyro_globals.F90 b/cgyro/src/cgyro_globals.F90
index 61ec763d2..ea0c88f50 100644
--- a/cgyro/src/cgyro_globals.F90
+++ b/cgyro/src/cgyro_globals.F90
@@ -80,6 +80,7 @@ module cgyro_globals
   integer :: moment_print_flag
   integer :: gflux_print_flag
   integer :: field_print_flag
+  integer :: triad_print_flag
   real :: amp0
   real :: amp
   real :: gamma_e
@@ -192,7 +193,7 @@ module cgyro_globals
   integer :: nsplit,nsplitA,nsplitB
   integer :: ns1,ns2
   integer, dimension(:), allocatable :: recv_status
-  integer :: fA_req, fB_req, g_req
+  integer :: fA_req, fB_req, eA_req, eB_req, g_req
   logical :: fA_req_valid, fB_req_valid, g_req_valid
  !
Thetas present in the process after NL AllToAll integer :: n_jtheta @@ -243,6 +244,7 @@ module cgyro_globals character(len=12) :: binfile_hb = 'bin.cgyro.hb' character(len=17) :: binfile_ky_flux = 'bin.cgyro.ky_flux' character(len=18) :: binfile_ky_cflux = 'bin.cgyro.ky_cflux' + character(len=15) :: binfile_triad = 'bin.cgyro.triad' character(len=15), dimension(3) :: binfile_fieldb = & (/'bin.cgyro.phib ','bin.cgyro.aparb','bin.cgyro.bparb'/) character(len=16), dimension(3) :: binfile_kxky = & @@ -344,11 +346,14 @@ module cgyro_globals complex, dimension(:,:,:), allocatable :: h0_x complex, dimension(:,:,:), allocatable :: h0_old complex, dimension(:,:,:,:), allocatable :: fA_nl,fB_nl + complex, dimension(:,:,:,:), allocatable :: eA_nl,eB_nl complex, dimension(:,:,:,:), allocatable :: g_nl complex, dimension(:,:,:), allocatable :: fpackA,fpackB + complex, dimension(:,:,:), allocatable :: epackA,epackB complex, dimension(:,:,:,:), allocatable :: gpack complex, dimension(:,:,:), allocatable :: omega_cap_h complex, dimension(:,:,:), allocatable :: omega_h + complex, dimension(:,:,:), allocatable :: diss_r complex, dimension(:,:,:,:), allocatable :: omega_s,omega_ss complex, dimension(:,:,:), allocatable :: omega_sbeta complex, dimension(:,:,:), allocatable :: cap_h_c @@ -382,6 +387,7 @@ module cgyro_globals real, dimension(:,:,:,:), allocatable :: cflux complex, dimension(:,:,:,:,:), allocatable :: gflux_loc complex, dimension(:,:,:,:,:), allocatable :: gflux + complex, dimension(:,:,:,:), allocatable :: triad,triad_loc,triad_loc_old real, dimension(:,:), allocatable :: cflux_tave, gflux_tave real :: tave_min, tave_max integer :: tave_step diff --git a/cgyro/src/cgyro_init_arrays.F90 b/cgyro/src/cgyro_init_arrays.F90 index 1377dd3fb..acd4d99e0 100644 --- a/cgyro/src/cgyro_init_arrays.F90 +++ b/cgyro/src/cgyro_init_arrays.F90 @@ -447,7 +447,14 @@ subroutine cgyro_init_arrays + abs(omega_cdrift_r(it,is)*xi(ix))*vel(ie) & + abs(omega_rot_drift_r(it,is)) & + abs(omega_rot_edrift_r(it))) - + + ! (d/dr) upwind diss. + diss_r(ic,iv_loc,itor) = - (n_radial/length)*spectraldiss(u,nup_radial)*up_radial & + * (abs(omega_rdrift(it,is))*energy(ie)*(1.0+xi(ix)**2) & + + abs(omega_cdrift_r(it,is)*xi(ix))*vel(ie) & + + abs(omega_rot_drift_r(it,is)) & + + abs(omega_rot_edrift_r(it))) + ! omega_star carg = & -i_c*k_theta_base*itor*rho*(dlnndr(is)+dlntdr(is)*(energy(ie)-1.5)) & diff --git a/cgyro/src/cgyro_init_manager.F90 b/cgyro/src/cgyro_init_manager.F90 index 956fb3372..8a2644e75 100644 --- a/cgyro/src/cgyro_init_manager.F90 +++ b/cgyro/src/cgyro_init_manager.F90 @@ -178,6 +178,11 @@ subroutine cgyro_init_manager allocate(cflux_loc(n_species,4,n_field,nt1:nt2)) allocate( gflux(0:n_global,n_species,4,n_field,nt1:nt2)) allocate(gflux_loc(0:n_global,n_species,4,n_field,nt1:nt2)) + + allocate( triad(n_species,n_radial,nt1:nt2,8)) + allocate(triad_loc(n_species,n_radial,nt1:nt2,7)) + allocate(triad_loc_old(n_species,n_radial,nt1:nt2,8)) + allocate(cflux_tave(n_species,4)) allocate(gflux_tave(n_species,4)) @@ -257,6 +262,7 @@ subroutine cgyro_init_manager allocate(cap_h_v(nc_loc,nt1:nt2,nv)) allocate(omega_cap_h(nc,nv_loc,nt1:nt2)) allocate(omega_h(nc,nv_loc,nt1:nt2)) + allocate(diss_r(nc,nv_loc,nt1:nt2)) allocate(omega_s(n_field,nc,nv_loc,nt1:nt2)) allocate(omega_ss(n_field,nc,nv_loc,nt1:nt2)) allocate(omega_sbeta(nc,nv_loc,nt1:nt2)) @@ -296,8 +302,10 @@ subroutine cgyro_init_manager ! 
Nonlinear arrays if (nonlinear_flag == 1) then allocate(fA_nl(n_radial,nt_loc,nsplitA,n_toroidal_procs)) + allocate(eA_nl(n_radial,nt_loc,nsplitA,n_toroidal_procs)) allocate(g_nl(n_field,n_radial,n_jtheta,n_toroidal)) allocate(fpackA(n_radial,nt_loc,nsplitA*n_toroidal_procs)) + allocate(epackA(n_radial,nt_loc,nsplitA*n_toroidal_procs)) allocate(gpack(n_field,n_radial,n_jtheta,n_toroidal)) allocate(jvec_c_nl(n_field,n_radial,n_jtheta,nv_loc,n_toroidal)) #if defined(OMPGPU) @@ -308,6 +316,8 @@ subroutine cgyro_init_manager if (nsplitB > 0) then ! nsplitB can be zero at large MPI allocate(fB_nl(n_radial,nt_loc,nsplitB,n_toroidal_procs)) allocate(fpackB(n_radial,nt_loc,nsplitB*n_toroidal_procs)) + allocate(eB_nl(n_radial,nt_loc,nsplitB,n_toroidal_procs)) + allocate(epackB(n_radial,nt_loc,nsplitB*n_toroidal_procs)) #if defined(OMPGPU) !$omp target enter data map(alloc:fpackB,fB_nl) #elif defined(_OPENACC) diff --git a/cgyro/src/cgyro_nl_comm.F90 b/cgyro/src/cgyro_nl_comm.F90 index 674d75e94..6392d334d 100644 --- a/cgyro/src/cgyro_nl_comm.F90 +++ b/cgyro/src/cgyro_nl_comm.F90 @@ -304,6 +304,316 @@ subroutine cgyro_nl_fftw_comm1_r(ij) end subroutine cgyro_nl_fftw_comm1_r +subroutine cgyro_nl_fftw_comm1_r_triad(ij) + use timer_lib + use parallel_lib + use cgyro_globals + + implicit none + + !----------------------------------- + integer, intent(in) :: ij + !----------------------------------- + + integer :: is,ix,ie + integer :: id,itd,itd_class,jr0(0:2),itorbox,jc + integer :: ir,it,iv_loc_m,ic_loc_m,itor + integer :: iexch0,itor0,isplit0,iexch_base + complex :: my_psi + real :: psi_mul + + real :: dv,dvr,dvp,rval,rval2 + complex :: cprod,cprod2,thfac + + triad_loc_old(:,:,:,3)=triad_loc(:,:,:,3) + triad_loc_old(:,:,:,4)=triad_loc(:,:,:,4) + triad_loc(:,:,:,:)=0.0 + + call timer_lib_in('nl_comm') + call parallel_slib_r_nc_wait(nsplitA,fA_nl,fpackA,fA_req) + call parallel_slib_r_nc_wait(nsplitA,eA_nl,epackA,eA_req) + fA_req_valid = .FALSE. + if (nsplitB > 0) then + ! no major compute to overlap + call parallel_slib_r_nc_wait(nsplitB,fB_nl,fpackB,fB_req) + call parallel_slib_r_nc_wait(nsplitB,eB_nl,epackB,eB_req) + fB_req_valid = .FALSE. + endif + call timer_lib_out('nl_comm') + + call timer_lib_in('nl') + + psi_mul = (q*rho/rmin)*(2*pi/length) + + if (nsplitB > 0) then + +#if defined(OMPGPU) +!$omp target teams distribute parallel do simd collapse(4) & +!$omp& private(iexch0,itor0,isplit0,iexch_base) & +!$omp& private(ic_loc_m,my_psi) +#elif defined(_OPENACC) +!$acc parallel loop collapse(4) gang vector independent private(ic_loc_m,my_psi) & +!$acc& private(iexch0,itor0,isplit0,iexch_base) & +!$acc& present(ic_c,px,rhs,fpackA,fpackB) copyin(psi_mul,zf_scale) & +!$acc& present(nt1,nt2,nv_loc,n_theta,n_radial,nsplit,nsplitA,nsplitB) copyin(ij) default(none) +#else +!$omp parallel do collapse(2) private(ic_loc_m,my_psi) & +!$omp& private(iexch0,itor0,isplit0,iexch_base,is,ix,ie,dv,dvr,dvp,rval,rval2,cprod,cprod2) & +!$omp& private(id,itorbox,jr0,jc,itd,itd_class,thfac) +#endif + do itor=nt1,nt2 + do iv_loc_m=1,nv_loc + do ir=1,n_radial + itorbox = itor*box_size*sign_qs + jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial) + jr0(1) = n_theta*(ir-1) + jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial) + + do it=1,n_theta + ic_loc_m = ic_c(ir,it) + + is = is_v(iv_loc_m +nv1 -1 ) + ix = ix_v(iv_loc_m +nv1 -1 ) + ie = ie_v(iv_loc_m +nv1 -1 ) + dv = w_exi(ie,ix) + dvr = w_theta(it)*dens2_rot(it,is)*dv + dvp = w_theta(it)*dv*(ir-1-nx0/2)**2 + + ! 
Density moment + cprod = w_theta(it)*cap_h_c(ic_loc_m,iv_loc_m,itor)*dvjvec_c(1,ic_loc_m,iv_loc_m,itor)/z(is) + cprod = -(dvr*z(is)/temp(is)*field(1,ic_loc_m,itor)-cprod) + cprod2= ( jvec_c(1,ic_loc_m,iv_loc_m,itor)*z(is)/temp(is) )*conjg(field(1,ic_loc_m,itor) ) + + iexch0 = (iv_loc_m-1) + (it-1)*nv_loc + itor0 = iexch0/nsplit + isplit0 = modulo(iexch0,nsplit) + if (isplit0 < nsplitA) then + iexch_base = 1+itor0*nsplitA + my_psi = fpackA(ir,itor-nt1+1,iexch_base+isplit0) + + ! 1. Triad energy transfer (all) + triad_loc(is,ir,itor,1) = triad_loc(is,ir,itor,1) & + + fpackA(ir,itor-nt1+1,iexch_base+isplit0)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul + ! 2. Triad energy transfer ( {NZF-NZF} coupling ) + if (itor == 0) then + ! New : Diag. direct ZF production [A. Ishizawa PRL 2019 ] + triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2) & + + epackA(ir,itor-nt1+1,iexch_base+isplit0)*cprod2*dvr*psi_mul + else + triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2) & + + epackA(ir,itor-nt1+1,iexch_base+isplit0)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul + endif + else + iexch_base = 1+itor0*nsplitB + my_psi = fpackB(ir,itor-nt1+1,iexch_base+(isplit0-nsplitA)) + + ! 1. Triad energy transfer (all) + triad_loc(is,ir,itor,1) = triad_loc(is,ir,itor,1) & + + fpackB(ir,itor-nt1+1,iexch_base+(isplit0-nsplitA))*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul + ! 2. Triad energy transfer ( {NZF-NZF} coupling ) + if (itor == 0) then + ! New : Diag. direct ZF production [A. Ishizawa PRL 2019 ] + triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2) & + + epackB(ir,itor-nt1+1,iexch_base+(isplit0-nsplitA))*cprod2*dvr*psi_mul + else + triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2) & + + epackB(ir,itor-nt1+1,iexch_base+(isplit0-nsplitA))*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul + endif + endif + + ! 3. Entropy + triad_loc(is,ir,itor,3) = triad_loc(is,ir,itor,3) & + + cap_h_c(ic_loc_m,iv_loc_m,itor)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr & + - field(1,ic_loc_m,itor)*conjg(field(1,ic_loc_m,itor))*(z(is)/temp(is))**2*dvr & + - 2.0*cprod*conjg(field(1,ic_loc_m,itor))*(z(is)/temp(is)) + ! 4. Field potential + triad_loc(is,ir,itor,4) = triad_loc(is,ir,itor,4) & + + sum( field(:,ic_loc_m,itor)*conjg(field(:,ic_loc_m,itor)) )*dvp + ! 5. Diss. (radial) + triad_loc(is,ir,itor,5) = triad_loc(is,ir,itor,5) & + + diss_r(ic_loc_m,iv_loc_m,itor)*h_x(ic_loc_m,iv_loc_m,itor)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr + ! 6. Diss. (theta ) + rval = omega_stream(it,is,itor)*vel(ie)*xi(ix) + rval2 = abs(omega_stream(it,is,itor)) + cprod = 0.0 + cprod2= 0.0 + + !icd_c(ic, id, itor) = ic_c(jr,modulo(it+id-1,n_theta)+1) + !jc = icd_c(ic, id, itor) + !dtheta(ic, id, itor) := cderiv(id)*thfac + !dtheta_up(ic, id, itor) := uderiv(id)*thfac*up_theta + itd = n_theta+it-nup_theta + itd_class = 0 + jc = jr0(itd_class)+itd + thfac = thfac_itor(itd_class,itor) + + do id=-nup_theta,nup_theta + if (itd > n_theta) then + ! move to next itd_class of compute + itd = itd - n_theta + itd_class = itd_class + 1 + jc = jr0(itd_class)+itd + thfac = thfac_itor(itd_class,itor) + endif + + cprod2 = cprod2 - rval* thfac*cderiv(id) *cap_h_c(jc,iv_loc_m,itor) + cprod = cprod - rval2* uderiv(id)*up_theta *g_x(jc,iv_loc_m,itor) + itd = itd + 1 + jc = jc + 1 + enddo + + triad_loc(is,ir,itor,6) = triad_loc(is,ir,itor,6) + cprod*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr + ! 7. Diss. (Coll. 
= Implicit advance - theta_streaming ) + triad_loc(is,ir,itor,7) = triad_loc(is,ir,itor,7) & + + ( cap_h_ct(iv_loc_m,itor,ic_loc_m)/delta_t + cprod2*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor)) )*dvr + + + if ( (itor == 0) .and. (ir == 1 .or. px(ir) == 0) ) then + ! filter + my_psi = (0.0,0.0) + endif + + if (itor == 0) then + my_psi = my_psi*zf_scale + endif + + ! RHS -> -[f,g] = [f,g]_{r,-alpha} + rhs(ic_loc_m,iv_loc_m,itor,ij) = rhs(ic_loc_m,iv_loc_m,itor,ij)+psi_mul*my_psi + enddo + enddo + enddo + enddo + + else ! nsplitB==0 + +#if defined(OMPGPU) +!$omp target teams distribute parallel do simd collapse(4) & +!$omp& private(iexch0,itor0,isplit0,iexch_base) & +!$omp& private(ic_loc_m,my_psi) +#elif defined(_OPENACC) +!$acc parallel loop collapse(4) gang vector independent private(ic_loc_m,my_psi) & +!$acc& private(iexch0,itor0,isplit0,iexch_base) & +!$acc& present(ic_c,px,rhs,fpackA) copyin(psi_mul,zf_scale) & +!$acc& present(nt1,nt2,nv_loc,n_theta,n_radial,nsplit,nsplitA) copyin(ij) default(none) +#else +!$omp parallel do collapse(2) private(ic_loc_m,my_psi) & +!$omp& private(iexch0,itor0,isplit0,iexch_base,is,ix,ie,dv,dvr,dvp,rval,rval2,cprod,cprod2) & +!$omp& private(id,itorbox,jr0,jc,itd,itd_class,thfac) +#endif + do itor=nt1,nt2 + do iv_loc_m=1,nv_loc + do ir=1,n_radial + itorbox = itor*box_size*sign_qs + jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial) + jr0(1) = n_theta*(ir-1) + jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial) + + do it=1,n_theta + ic_loc_m = ic_c(ir,it) + + is = is_v(iv_loc_m +nv1 -1 ) + ix = ix_v(iv_loc_m +nv1 -1 ) + ie = ie_v(iv_loc_m +nv1 -1 ) + dv = w_exi(ie,ix) + dvr = w_theta(it)*dens2_rot(it,is)*dv + dvp = w_theta(it)*dv*(ir-1-nx0/2)**2 + + ! Density moment + cprod = w_theta(it)*cap_h_c(ic_loc_m,iv_loc_m,itor)*dvjvec_c(1,ic_loc_m,iv_loc_m,itor)/z(is) + cprod = -(dvr*z(is)/temp(is)*field(1,ic_loc_m,itor)-cprod) + cprod2= ( jvec_c(1,ic_loc_m,iv_loc_m,itor)*z(is)/temp(is) )*conjg(field(1,ic_loc_m,itor) ) + + + iexch0 = (iv_loc_m-1) + (it-1)*nv_loc + itor0 = iexch0/nsplit + isplit0 = modulo(iexch0,nsplit) + iexch_base = 1+itor0*nsplitA + my_psi = fpackA(ir,itor-nt1+1,iexch_base+isplit0) + + + ! 1. Triad energy transfer (all) + triad_loc(is,ir,itor,1) = triad_loc(is,ir,itor,1) & + + fpackA(ir,itor-nt1+1,iexch_base+isplit0)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul + ! 2. Triad energy transfer ( {NZF-NZF} coupling ) + if (itor == 0) then + ! New : Diag. direct ZF production [A. Ishizawa PRL 2019 ] + triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2) & + + epackA(ir,itor-nt1+1,iexch_base+isplit0)*cprod2*dvr*psi_mul + else + triad_loc(is,ir,itor,2) = triad_loc(is,ir,itor,2) & + + epackA(ir,itor-nt1+1,iexch_base+isplit0)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr*psi_mul + endif + + ! 3. Entropy + triad_loc(is,ir,itor,3) = triad_loc(is,ir,itor,3) & + + cap_h_c(ic_loc_m,iv_loc_m,itor)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr & + - field(1,ic_loc_m,itor)*conjg(field(1,ic_loc_m,itor))*(z(is)/temp(is))**2*dvr & + - 2.0*cprod*conjg(field(1,ic_loc_m,itor))*(z(is)/temp(is)) + ! 4. Field potential + triad_loc(is,ir,itor,4) = triad_loc(is,ir,itor,4) & + + sum( field(:,ic_loc_m,itor)*conjg(field(:,ic_loc_m,itor)) )*dvp + ! 5. Diss. (radial) + triad_loc(is,ir,itor,5) = triad_loc(is,ir,itor,5) & + + diss_r(ic_loc_m,iv_loc_m,itor)*h_x(ic_loc_m,iv_loc_m,itor)*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr + ! 6. Diss. 
(theta ) + rval = omega_stream(it,is,itor)*vel(ie)*xi(ix) + rval2 = abs(omega_stream(it,is,itor)) + cprod = 0.0 + cprod2= 0.0 + + !icd_c(ic, id, itor) = ic_c(jr,modulo(it+id-1,n_theta)+1) + !jc = icd_c(ic, id, itor) + !dtheta(ic, id, itor) := cderiv(id)*thfac + !dtheta_up(ic, id, itor) := uderiv(id)*thfac*up_theta + itd = n_theta+it-nup_theta + itd_class = 0 + jc = jr0(itd_class)+itd + thfac = thfac_itor(itd_class,itor) + + do id=-nup_theta,nup_theta + if (itd > n_theta) then + ! move to next itd_class of compute + itd = itd - n_theta + itd_class = itd_class + 1 + jc = jr0(itd_class)+itd + thfac = thfac_itor(itd_class,itor) + endif + + cprod2 = cprod2 - rval* thfac*cderiv(id) *cap_h_c(jc,iv_loc_m,itor) + cprod = cprod - rval2* uderiv(id)*up_theta *g_x(jc,iv_loc_m,itor) + itd = itd + 1 + jc = jc + 1 + enddo + + triad_loc(is,ir,itor,6) = triad_loc(is,ir,itor,6) + cprod*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor))*dvr + ! 7. Diss. (Coll. = Implicit advance - theta_streaming ) + triad_loc(is,ir,itor,7) = triad_loc(is,ir,itor,7) & + + ( cap_h_ct(iv_loc_m,itor,ic_loc_m)/delta_t + cprod2*conjg(cap_h_c(ic_loc_m,iv_loc_m,itor)) )*dvr + + + if ( (itor == 0) .and. (ir == 1 .or. px(ir) == 0) ) then + ! filter + my_psi = (0.0,0.0) + endif + + if (itor == 0) then + my_psi = my_psi*zf_scale + endif + + ! RHS -> -[f,g] = [f,g]_{r,-alpha} + rhs(ic_loc_m,iv_loc_m,itor,ij) = rhs(ic_loc_m,iv_loc_m,itor,ij)+psi_mul*my_psi + enddo + enddo + enddo + enddo + + endif ! if nsplitB>0 + + call timer_lib_out('nl') + +end subroutine cgyro_nl_fftw_comm1_r_triad + + ! ! Comm2 is a transpose ! Reminder: nc ~= n_radial*n_theta diff --git a/cgyro/src/cgyro_nl_fftw.F90 b/cgyro/src/cgyro_nl_fftw.F90 index 4c5407d85..d56143bbf 100644 --- a/cgyro/src/cgyro_nl_fftw.F90 +++ b/cgyro/src/cgyro_nl_fftw.F90 @@ -1,5 +1,5 @@ !----------------------------------------------------------------- -! cgyro_nl_fftw.F90 +! cgyro_nl_fftw.f90 ! ! PURPOSE: ! Evaluate nonlinear bracket with dealiased FFT. It is natural @@ -10,460 +10,183 @@ ! NOTE: Need to be careful with (p=-nr/2,n=0) component. !----------------------------------------------------------------- -#if defined(_OPENACC) || defined(OMPGPU) -#define CGYRO_GPU_FFT -#endif - -!----------------------------------------------------------------- -! cgyro_nl_fftw_init -!----------------------------------------------------------------- +subroutine cgyro_nl_fftw_init + use cgyro_globals + implicit none + include 'fftw3.f03' -#ifdef CGYRO_GPU_FFT + ! Create plans once and for all, with global arrays fx,ux + plan_c2r = fftw_plan_dft_c2r_2d(nx,ny,fx(:,:,1),uxmany(:,:,1),FFTW_PATIENT) + plan_r2c = fftw_plan_dft_r2c_2d(nx,ny,uv(:,:,1),fx(:,:,1),FFTW_PATIENT) -#if defined(MKLGPU) -include 'fftw/offload/fftw3_omp_offload.f90' -#endif +end subroutine cgyro_nl_fftw_init -subroutine cgyro_nl_fftw_init +subroutine cgyro_nl_fftw_stepr(g_j, f_j, nl_idx, i_omp) -#if defined(HIPGPU) - use hipfort_hipfft -#elif defined(MKLGPU) - use fftw3_omp_offload -#else - use cufft -#endif + use timer_lib + use parallel_lib use cgyro_globals implicit none -#if defined(MKLGPU) - include 'fftw/fftw3.f' -#endif - - integer :: howmany,istatus - integer, parameter :: irank = 2 - integer, dimension(irank) :: ndim,inembed,onembed - integer :: idist,odist - integer, parameter :: istride = 1 - integer, parameter :: ostride = 1 - -#if !defined(MKLGPU) - integer, parameter :: singlePrecision = selected_real_kind(6,30) -#endif - - !------------------------------------------------------------------- - ! 2D - ! 
input[ b*idist + (x * inembed[1] + y)*istride ] - ! output[ b*odist + (x * onembed[1] + y)*ostride ] - ! isign is the sign of the exponent in the formula that defines - ! Fourier transform -1 == FFTW_FORWARD - ! 1 == FFTW_BACKWARD - !------------------------------------------------------------------- - -#if defined(MKLGPU) - ! oneMKL offload uses the reverse ordering - ndim(2) = nx - ndim(1) = ny -#else - ndim(1) = nx - ndim(2) = ny -#endif - idist = size(fxmany,1)*size(fxmany,2) - odist = size(uxmany,1)*size(uxmany,2) - inembed = size(fxmany,1) - onembed = size(uxmany,1) -#if defined(MKLGPU) - inembed(2) = size(fxmany,2) - onembed(2) = size(uxmany,2) -#endif - -#if defined(HIPGPU) - hip_plan_c2r_manyA = c_null_ptr - istatus = hipfftPlanMany(& - hip_plan_c2r_manyA, & - irank, & - ndim, & - inembed, & - istride, & - idist, & - onembed, & - ostride, & - odist, & - merge(HIPFFT_C2R,HIPFFT_Z2D,kind(uxmany) == singlePrecision), & - nsplitA) - - if (nsplitB > 0) then ! no fft if nsplitB==0 - hip_plan_c2r_manyB = c_null_ptr - istatus = hipfftPlanMany(& - hip_plan_c2r_manyB, & - irank, & - ndim, & - inembed, & - istride, & - idist, & - onembed, & - ostride, & - odist, & - merge(HIPFFT_C2R,HIPFFT_Z2D,kind(uxmany) == singlePrecision), & - nsplitB) - endif - hip_plan_c2r_manyG = c_null_ptr - istatus = hipfftPlanMany(& - hip_plan_c2r_manyG, & - irank, & - ndim, & - inembed, & - istride, & - idist, & - onembed, & - ostride, & - odist, & - merge(HIPFFT_C2R,HIPFFT_Z2D,kind(uxmany) == singlePrecision), & - nsplit) -#elif defined(MKLGPU) - dfftw_plan_c2r_manyA = 0 -!$omp target data map(tofrom: fymany,uymany) - !$omp dispatch - call dfftw_plan_many_dft_c2r(& - dfftw_plan_c2r_manyA, & - irank, & - ndim, & - nsplitA, & - fymany, & - inembed, & - istride, & - idist, & - uymany, & - onembed, & - ostride, & - odist, & - FFTW_ESTIMATE) - - if (nsplitB > 0) then ! no fft if nsplitB==0 - dfftw_plan_c2r_manyB = 0 - !$omp dispatch - call dfftw_plan_many_dft_c2r(& - dfftw_plan_c2r_manyB, & - irank, & - ndim, & - nsplitB, & - fymany, & - inembed, & - istride, & - idist, & - uymany, & - onembed, & - ostride, & - odist, & - FFTW_ESTIMATE) - endif -!$omp end target data - - dfftw_plan_c2r_manyG = 0 -!$omp target data map(tofrom: gymany,vymany) - !$omp dispatch - call dfftw_plan_many_dft_c2r(& - dfftw_plan_c2r_manyG, & - irank, & - ndim, & - nsplit, & - gymany, & - inembed, & - istride, & - idist, & - vymany, & - onembed, & - ostride, & - odist, & - FFTW_ESTIMATE) -!$omp end target data - -#else - istatus = cufftPlanMany(& - cu_plan_c2r_manyA, & - irank, & - ndim, & - inembed, & - istride, & - idist, & - onembed, & - ostride, & - odist, & - merge(CUFFT_C2R,CUFFT_Z2D,kind(uxmany) == singlePrecision), & - nsplitA) - - if (nsplitB > 0) then ! no fft if nsplitB==0 - istatus = cufftPlanMany(& - cu_plan_c2r_manyB, & - irank, & - ndim, & - inembed, & - istride, & - idist, & - onembed, & - ostride, & - odist, & - merge(CUFFT_C2R,CUFFT_Z2D,kind(uxmany) == singlePrecision), & - nsplitB) - endif + integer, intent(in) :: g_j, f_j + integer,intent(in) :: nl_idx ! 
1=>A, 2=>B + integer,intent(in) :: i_omp + integer :: ix,iy + integer :: ir,itm,itl,itor - istatus = cufftPlanMany(& - cu_plan_c2r_manyG, & - irank, & - ndim, & - inembed, & - istride, & - idist, & - onembed, & - ostride, & - odist, & - merge(CUFFT_C2R,CUFFT_Z2D,kind(uxmany) == singlePrecision), & - nsplit) -#endif - - idist = size(uxmany,1)*size(uxmany,2) - odist = size(fxmany,1)*size(fxmany,2) - inembed = size(uxmany,1) - onembed = size(fxmany,1) -#if defined(MKLGPU) - inembed(2) = size(uxmany,2) - onembed(2) = size(fxmany,2) -#endif - -#if defined(HIPGPU) - hip_plan_r2c_manyA = c_null_ptr - istatus = hipfftPlanMany(& - hip_plan_r2c_manyA, & - irank, & - ndim, & - inembed, & - istride, & - idist, & - onembed, & - ostride, & - odist, & - merge(HIPFFT_R2C,HIPFFT_D2Z,kind(uxmany) == singlePrecision), & - nsplitA) - - if (nsplitB > 0) then ! no fft if nsplitB==0 - hip_plan_r2c_manyB = c_null_ptr - istatus = hipfftPlanMany(& - hip_plan_r2c_manyB, & - irank, & - ndim, & - inembed, & - istride, & - idist, & - onembed, & - ostride, & - odist, & - merge(HIPFFT_R2C,HIPFFT_D2Z,kind(uxmany) == singlePrecision), & - nsplitB) - endif -#elif defined(MKLGPU) - dfftw_plan_r2c_manyA = 0 -!$omp target data map(tofrom: uvmany,fxmany) - !$omp dispatch - call dfftw_plan_many_dft_r2c(& - dfftw_plan_r2c_manyA, & - irank, & - ndim, & - nsplitA, & - uvmany, & - inembed, & - istride, & - idist, & - fxmany, & - onembed, & - ostride, & - odist, & - FFTW_ESTIMATE) - - if (nsplitB > 0) then ! no fft if nsplitB==0 - dfftw_plan_r2c_manyB = 0 - !$omp dispatch - call dfftw_plan_many_dft_r2c(& - dfftw_plan_r2c_manyB, & - irank, & - ndim, & - nsplitB, & - uvmany, & - inembed, & - istride, & - idist, & - fxmany, & - onembed, & - ostride, & - odist, & - FFTW_ESTIMATE) - endif -!$omp end target data - -#else - istatus = cufftPlanMany(& - cu_plan_r2c_manyA, & - irank, & - ndim, & - inembed, & - istride, & - idist, & - onembed, & - ostride, & - odist, & - merge(CUFFT_R2C,CUFFT_D2Z,kind(uxmany) == singlePrecision), & - nsplitA) - - if (nsplitB > 0) then ! no fft if nsplitB==0 - istatus = cufftPlanMany(& - cu_plan_r2c_manyB, & - irank, & - ndim, & - inembed, & - istride, & - idist, & - onembed, & - ostride, & - odist, & - merge(CUFFT_R2C,CUFFT_D2Z,kind(uxmany) == singlePrecision), & - nsplitB) - endif -#endif + include 'fftw3.f03' -end subroutine cgyro_nl_fftw_init + ! Poisson bracket in real space + uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny) -#else /* if not defined CGYRO_GPU_FFT */ + call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp)) -subroutine cgyro_nl_fftw_init + ! NOTE: The FFT will generate an unwanted n=0,p=-nr/2 component + ! that will be filtered in the main time-stepping loop + + ! this should really be accounted against nl_mem, but hard to do with OMP + do itm=1,n_toroidal_procs + do itl=1,nt_loc + itor=itl + (itm-1)*nt_loc + do ir=1,n_radial + ix = ir-1-nx0/2 + if (ix < 0) ix = ix+nx + iy = itor-1 + if (nl_idx==1) then + fA_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp) + else + fB_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp) + endif + enddo + enddo + enddo + +end subroutine cgyro_nl_fftw_stepr + +subroutine cgyro_nl_fftw_stepr_triad(g_j, f_j, nl_idx, i_omp) + + use timer_lib + use parallel_lib use cgyro_globals + implicit none + + integer, intent(in) :: g_j, f_j + integer,intent(in) :: nl_idx ! 1=>A, 2=>B + integer,intent(in) :: i_omp + real :: y_mean(nx) + integer :: ix,iy + integer :: ir,itm,itl,itor + include 'fftw3.f03' - ! 
Create plans once and for all, with global arrays fx,ux - plan_c2r = fftw_plan_dft_c2r_2d(nx,ny,fx(:,:,1),uxmany(:,:,1),FFTW_PATIENT) - plan_r2c = fftw_plan_dft_r2c_2d(nx,ny,uv(:,:,1),fx(:,:,1),FFTW_PATIENT) + ! 1. Poisson bracket in real space + uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny) -end subroutine cgyro_nl_fftw_init + call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp)) -#endif /* CGYRO_GPU_FFT */ + ! NOTE: The FFT will generate an unwanted n=0,p=-nr/2 component + ! that will be filtered in the main time-stepping loop -!----------------------------------------------------------------- -! cgyro_nl_fftw_init -! (and all the helper sub-functions) -!----------------------------------------------------------------- + ! this should really be accounted against nl_mem, but hard to do with OMP + do itm=1,n_toroidal_procs + do itl=1,nt_loc + itor=itl + (itm-1)*nt_loc + do ir=1,n_radial + ix = ir-1-nx0/2 + if (ix < 0) ix = ix+nx + iy = itor-1 + if (nl_idx==1) then + fA_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp) + else + fB_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp) + endif + enddo + enddo + enddo -#ifdef CGYRO_GPU_FFT -subroutine cgyro_nl_fftw_mul(sz,uvm,uxm,vym,uym,vxm,inv_nxny) - implicit none + ! 2. Poisson bracket in real space with Non_Zonal pairs + + ! remove ky=0 in uxmany + y_mean = sum(uxmany(:,:,f_j) ,dim=1 ) / ny + do iy=0,ny-1 + uxmany(iy,:,f_j) = uxmany(iy,:,f_j) -y_mean + + end do + + ! remove ky=0 in uymany + y_mean = sum(uymany(:,:,f_j) ,dim=1 ) / ny ! =0 + do iy=0,ny-1 + uymany(iy,:,f_j) = uymany(iy,:,f_j) -y_mean + + end do + + ! remove ky=0 in vx + y_mean = sum(vxmany(:,:,g_j) ,dim=1 ) / ny + do iy=0,ny-1 + vxmany(iy,:,g_j) = vxmany(iy,:,g_j) -y_mean + + end do + + ! remove ky=0 in vy + y_mean = sum(vymany(:,:,g_j) ,dim=1 ) / ny ! =0 + do iy=0,ny-1 + vymany(iy,:,g_j) = vymany(iy,:,g_j) -y_mean + + end do + + uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny) + + call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp)) - integer, intent(in) :: sz - real, dimension(*),intent(out) :: uvm - real, dimension(*),intent(in) :: uxm,vym,uym,vxm - real, intent(in) :: inv_nxny - - integer :: i - -#if defined(OMPGPU) -!$omp target teams distribute parallel do simd & -!$omp& map(to:uxm(1:sz),vym(1:sz),uym(1:sz),vxm(1:sz)) & -!$omp& map(from:uvm(1:sz)) -#else -!$acc parallel loop independent gang vector & -!$acc& present(uvm,uxm,vym,uym,vxm) private(i) -#endif - do i=1,sz - uvm(i) = (uxm(i)*vym(i)-uym(i)*vxm(i))*inv_nxny + do itm=1,n_toroidal_procs + do itl=1,nt_loc + itor=itl + (itm-1)*nt_loc + do ir=1,n_radial + ix = ir-1-nx0/2 + if (ix < 0) ix = ix+nx + iy = itor-1 + if (nl_idx==1) then + eA_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp) + else + eB_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp) + endif + enddo + enddo enddo -end subroutine +end subroutine cgyro_nl_fftw_stepr_triad + -subroutine cgyro_nl_fftw +! 
NOTE: call cgyro_nl_fftw_comm1 before cgyro_nl_fftw +subroutine cgyro_nl_fftw(ij) -#if defined(HIPGPU) - use hipfort - use hipfort_hipfft -#elif defined(MKLGPU) - use fftw3_omp_offload -#else - use cufft -#endif use timer_lib use parallel_lib use cgyro_nl_comm use cgyro_globals implicit none -#if defined(MKLGPU) - include 'fftw/fftw3.f' -#endif + + !----------------------------------- + integer, intent(in) :: ij !----------------------------------- - integer :: j,p,iexch - integer :: it,ir,itm,itl,ix,iy + integer :: ix,iy + integer :: ir,it,itm,itl,it_loc integer :: itor,mytm - integer :: i1,i2 - integer :: it_loc - integer :: ierr - integer :: rc - complex :: f0,g0 + integer :: j,p + integer :: i_omp integer :: jtheta_min - integer :: iy0, iy1, ir0, ir1 - - real :: inv_nxny - -#ifdef GACODE_GPU_AMD - ! AMD GPU (MI250X) optimal - integer, parameter :: F_RADTILE = 8 - integer, parameter :: F_TORTILE = 16 - integer, parameter :: R_RADTILE = 32 - integer, parameter :: R_TORTILE = 8 -#else - ! NVIDIA GPU (A100) optimal - integer, parameter :: F_RADTILE = 8 - integer, parameter :: F_TORTILE = 16 - integer, parameter :: R_RADTILE = 16 - integer, parameter :: R_TORTILE = 8 -#endif - - call timer_lib_in('nl_mem') - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() - ! we can zero the elements we know are zero while we wait -#if defined(OMPGPU) - !no async for OMPGPU for now -!$omp target teams distribute parallel do simd collapse(3) -#else -!$acc parallel loop gang vector independent collapse(3) async(2) & -!$acc& private(j,ix,iy) & -!$acc& present(nsplitA,ny2,nx0,nx2) -#endif - do j=1,nsplitA - do ix=nx2,nx0-1 - do iy=0,ny2 - fxmany(iy,ix,j) = 0 - fymany(iy,ix,j) = 0 - enddo - enddo - enddo + complex :: f0,g0 -#if defined(OMPGPU) - !no async for OMPGPU for now -!$omp target teams distribute parallel do simd collapse(3) -#else -!$acc parallel loop gang vector independent collapse(3) async(2) & -!$acc& private(j,ix,iy) & -!$acc& present(nsplitA,ny2,n_toroidal,nx) -#endif - do j=1,nsplitA - do ix=0,nx-1 - do iy=n_toroidal,ny2 - fxmany(iy,ix,j) = 0 - fymany(iy,ix,j) = 0 - enddo - enddo - enddo - call timer_lib_out('nl_mem') + integer, external :: omp_get_thread_num + + include 'fftw3.f03' + + call cgyro_nl_fftw_comm_test() ! time to wait for the FA_nl to become avaialble call timer_lib_in('nl_comm') @@ -474,188 +197,57 @@ subroutine cgyro_nl_fftw call timer_lib_out('nl_comm') call timer_lib_in('nl') -#if !defined(OMPGPU) -!$acc data present(fA_nl) & -!$acc& present(fxmany,fymany,gxmany,gymany) & -!$acc& present(uxmany,uymany,vxmany,vymany) & -!$acc& present(uvmany) -#endif ! f_nl is (radial, nt_loc, theta, nv_loc1, toroidal_procs) ! where nv_loc1 * toroidal_procs >= nv_loc - - ! no tiling, does not seem to help -#if defined(OMPGPU) - !no async for OMPGPU for now -!$omp target teams distribute parallel do simd collapse(4) & -!$omp& private(j,ir,p,ix,itor,iy,f0,itm,itl) -#else -!$acc parallel loop gang vector independent collapse(4) async(2) & -!$acc& private(j,ir,p,ix,itor,iy,f0,itm,itl) -#endif +!$omp parallel do schedule(dynamic,1) private(itm,itl,itor,iy,ir,p,ix,f0,i_omp,j) do j=1,nsplitA - do ir=1,n_radial - do itm=1,n_toroidal_procs - do itl=1,nt_loc + i_omp = omp_get_thread_num()+1 + + ! zero elements not otherwise set below + fx(0:ny2,nx2:nx0-1,i_omp) = 0.0 + fy(0:ny2,nx2:nx0-1,i_omp) = 0.0 + + ! 
Array mapping + do ir=1,n_radial + p = ir-1-nx0/2 + ix = p + if (ix < 0) ix = ix+nx + do itm=1,n_toroidal_procs + do itl=1,nt_loc itor=itl + (itm-1)*nt_loc iy = itor-1 - p = ir-1-nx0/2 - ix = p - if (ix < 0) ix = ix+nx - f0 = i_c*fA_nl(ir,itl,j,itm) - fxmany(iy,ix,j) = p*f0 - fymany(iy,ix,j) = iy*f0 - enddo - enddo - enddo - enddo - -#if defined(OMPGPU) - !no async for OMPGPU for now -#else - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() -!$acc wait(2) -#endif - - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() - - ! Average elements so as to ensure - ! f(kx,ky=0) = f(-kx,ky=0)^* - ! This symmetry is required for complex input to c2r -#if defined(OMPGPU) - !no async for OMPGPU for now -!$omp target teams distribute parallel do simd collapse(2) & -!$omp& private(j,ix,f0) -#else -!$acc parallel loop gang vector independent collapse(2) async(2) & -!$acc& private(j,ix,f0) & -!$acc& present(fxmany) & -!$acc& present(nsplitA,nx) -#endif - do j=1,nsplitA - do ix=1,nx/2-1 - f0 = 0.5*( fxmany(0,ix,j)+conjg(fxmany(0,nx-ix,j)) ) - fxmany(0,ix ,j) = f0 - fxmany(0,nx-ix,j) = conjg(f0) - enddo - enddo - - ! -------------------------------------- - ! perform many Fourier Transforms at once - ! -------------------------------------- - -#if defined(OMPGPU) - -#if defined(MKLGPU) -!$omp target data map(tofrom: fymany,uymany) -#else -!$omp target data use_device_ptr(fymany,uymany) -#endif - -#else -!$acc host_data use_device(fymany,uymany) -#endif - -#if defined(HIPGPU) - rc = hipfftExecZ2D(hip_plan_c2r_manyA,c_loc(fymany),c_loc(uymany)) -#elif defined(MKLGPU) - !$omp dispatch - call dfftw_execute_dft_c2r(dfftw_plan_c2r_manyA,fymany,uymany) - rc = 0 -#else - rc = cufftExecZ2D(cu_plan_c2r_manyA,fymany,uymany) -#endif - -#if defined(OMPGPU) -!$omp end target data -#else -!$acc end host_data -#endif - -#if defined(OMPGPU) - !no async for OMPGPU for now -#else - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() -!$acc wait(2) -#endif - ! fxmany is complete now - - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() + fx(iy,ix,i_omp) = p*f0 + fy(iy,ix,i_omp) = iy*f0 + enddo + enddo + if ((ix/=0) .and. (ix<(nx/2))) then ! happens after ix>nx/2 + ! Average elements so as to ensure + ! f(kx,ky=0) = f(-kx,ky=0)^* + ! This symmetry is required for complex input to c2r + f0 = 0.5*( fx(0,ix,i_omp)+conjg(fx(0,nx-ix,i_omp)) ) + fx(0,ix ,i_omp) = f0 + fx(0,nx-ix,i_omp) = conjg(f0) + endif + fx(n_toroidal:ny2,ix,i_omp) = 0.0 + fy(n_toroidal:ny2,ix,i_omp) = 0.0 + enddo -#if defined(OMPGPU) - -#if defined(MKLGPU) -!$omp target data map(tofrom: fxmany,uxmany) -#else -!$omp target data use_device_ptr(fxmany,uxmany) -#endif - -#else -!$acc host_data use_device(fxmany,uxmany) -#endif - -#if defined(HIPGPU) - rc = hipfftExecZ2D(hip_plan_c2r_manyA,c_loc(fxmany),c_loc(uxmany)) -#elif defined(MKLGPU) - !$omp dispatch - call dfftw_execute_dft_c2r(dfftw_plan_c2r_manyA,fxmany,uxmany) - rc = 0 -#else - rc = cufftExecZ2D(cu_plan_c2r_manyA,fxmany,uxmany) -#endif - -#if defined(OMPGPU) -!$omp end target data -#else -!$acc end host_data -#endif + if (i_omp==1) then + ! use the main thread to progress the async MPI + call cgyro_nl_fftw_comm_test() + endif - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() + call fftw_execute_dft_c2r(plan_c2r,fx(:,:,i_omp),uxmany(:,:,j)) + if (i_omp==1) then + ! use the main thread to progress the async MPI + call cgyro_nl_fftw_comm_test() + endif -#if !defined(OMPGPU) -!$acc data present(g_nl) & -!$acc& present(gxmany,gymany) -#endif - ! 
we can zero the elements we know are zero while we wait for comm -#if defined(OMPGPU) - !no async for OMPGPU for now -!$omp target teams distribute parallel do simd collapse(3) -#else -!$acc parallel loop gang vector independent collapse(3) async(2) & -!$acc& private(j,ix,iy) & -!$acc& present(nsplit,ny2,nx0,nx2) -#endif - do j=1,nsplit - do ix=nx2,nx0-1 - do iy=0,ny2 - gxmany(iy,ix,j) = 0 - gymany(iy,ix,j) = 0 - enddo - enddo - enddo + call fftw_execute_dft_c2r(plan_c2r,fy(:,:,i_omp),uymany(:,:,j)) + enddo ! j -#if defined(OMPGPU) - !no async for OMPGPU for now -!$omp target teams distribute parallel do simd collapse(3) -#else -!$acc parallel loop gang vector independent collapse(3) async(2) & -!$acc& private(j,ix,iy) & -!$acc& present(nsplit,ny2,n_toroidal,nx) -#endif - do j=1,nsplit - do ix=0,nx-1 - do iy=n_toroidal,ny2 - gxmany(iy,ix,j) = 0 - gymany(iy,ix,j) = 0 - enddo - enddo - enddo call timer_lib_out('nl') call timer_lib_in('nl_comm') @@ -670,307 +262,77 @@ subroutine cgyro_nl_fftw ! g_nl is (n_field,n_radial,n_jtheta,nt_loc,n_toroidal_procs) ! jcev_c_nl is (n_field,n_radial,n_jtheta,nv_loc,nt_loc,n_toroidal_procs) - - ! tile for performance, since this is effectively a transpose -#if defined(OMPGPU) - !no async for OMPGPU for now -!$omp target teams distribute parallel do simd collapse(5) & -!$omp& private(j,p,ix,itor,mytm,iy,g0,it,iv_loc,it_loc,jtheta_min,itm,itl,ir) -#else -!$acc parallel loop gang vector independent collapse(5) async(2) & -!$acc& private(j,p,ix,itor,mytm,iy,g0,it,iv_loc,it_loc,jtheta_min,itm,itl,ir) & -!$acc& present(g_nl,jvec_c_nl) & -!$acc& present(nsplit,n_radial,n_toroidal_procs,nt_loc,nt1,n_theta,nv_loc,nx0) -#endif +!$omp parallel do schedule(dynamic,1) & +!$omp& private(itor,mytm,itm,itl,iy,ir,p,ix,g0,i_omp,j,it,iv_loc,it_loc,jtheta_min) do j=1,nsplit - do iy0=0,n_toroidal+(F_TORTILE-1)-1,F_TORTILE ! round up - do ir0=0,n_radial+(F_RADTILE-1)-1,F_RADTILE ! round up - do iy1=0,(F_TORTILE-1) ! tile - do ir1=0,(F_RADTILE-1) ! tile - iy = iy0+iy1 - ir = 1 + ir0+ir1 - if ((iy < n_toroidal) .and. (ir <= n_radial)) then - itor = iy+1 - itm = 1 + iy/nt_loc - itl = 1 + modulo(iy,nt_loc) + i_omp = omp_get_thread_num()+1 + + ! zero elements not otherwise set below + gx(0:ny2,nx2:nx0-1,i_omp) = 0.0 + gy(0:ny2,nx2:nx0-1,i_omp) = 0.0 + + ! Array mapping + do ir=1,n_radial + p = ir-1-nx0/2 + ix = p + if (ix < 0) ix = ix+nx + do itm=1,n_toroidal_procs + do itl=1,nt_loc + itor = itl + (itm-1)*nt_loc mytm = 1 + nt1/nt_loc !my toroidal proc number it = 1+((mytm-1)*nsplit+j-1)/nv_loc iv_loc = 1+modulo((mytm-1)*nsplit+j-1,nv_loc) jtheta_min = 1+((mytm-1)*nsplit)/nv_loc it_loc = it-jtheta_min+1 - iy = itor-1 - - p = ir-1-nx0/2 - ix = p - if (ix < 0) ix = ix+nx + iy = itor-1 if (it > n_theta) then g0 = (0.0,0.0) else - g0 = i_c*sum( jvec_c_nl(1:n_field,ir,it_loc,iv_loc,itor)*g_nl(1:n_field,ir,it_loc,itor)) + g0 = i_c*sum( jvec_c_nl(:,ir,it_loc,iv_loc,itor)*g_nl(:,ir,it_loc,itor)) endif - gxmany(iy,ix,j) = p*g0 - gymany(iy,ix,j) = iy*g0 - endif - enddo + gx(iy,ix,i_omp) = p*g0 + gy(iy,ix,i_omp) = iy*g0 + enddo + enddo + if ((ix/=0) .and. (ix<(nx/2))) then ! happens after ix>nx/2 + ! Average elements so as to ensure + ! g(kx,ky=0) = g(-kx,ky=0)^* + ! 
This symmetry is required for complex input to c2r + g0 = 0.5*( gx(0,ix,i_omp)+conjg(gx(0,nx-ix,i_omp)) ) + gx(0,ix ,i_omp) = g0 + gx(0,nx-ix,i_omp) = conjg(g0) + endif + gx(n_toroidal:ny2,ix,i_omp) = 0.0 + gy(n_toroidal:ny2,ix,i_omp) = 0.0 enddo - enddo - enddo - enddo - -#if defined(OMPGPU) - !no async for OMPGPU for now -#else - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() -!$acc wait(2) -#endif - - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() - - ! Average elements so as to ensure - ! g(kx,ky=0) = g(-kx,ky=0)^* - ! This symmetry is required for complex input to c2r -#if defined(OMPGPU) - !no async for OMPGPU for now -!$omp target teams distribute parallel do simd collapse(2) & -!$omp& private(j,ix,g0) -#else -!$acc parallel loop gang vector independent collapse(2) async(2) & -!$acc& private(j,ix,g0) & -!$acc& present(gxmany) & -!$acc& present(nsplit,nx) -#endif - do j=1,nsplit - do ix=1,nx/2-1 - g0 = 0.5*( gxmany(0,ix,j)+conjg(gxmany(0,nx-ix,j)) ) - gxmany(0,ix ,j) = g0 - gxmany(0,nx-ix,j) = conjg(g0) - enddo - enddo - -#if !defined(OMPGPU) -!$acc end data -#endif - -#if defined(OMPGPU) - -#if defined(MKLGPU) -!$omp target data map(tofrom: gymany,vymany) -#else -!$omp target data use_device_ptr(gymany,vymany) -#endif - -#else -!$acc host_data use_device(gymany,vymany) -#endif - -#if defined(HIPGPU) - rc = hipfftExecZ2D(hip_plan_c2r_manyG,c_loc(gymany),c_loc(vymany)) -#elif defined(MKLGPU) - !$omp dispatch - call dfftw_execute_dft_c2r(dfftw_plan_c2r_manyG,gymany,vymany) - rc = 0 -#else - rc = cufftExecZ2D(cu_plan_c2r_manyG,gymany,vymany) -#endif - -#if defined(OMPGPU) -!$omp end target data -#else -!$acc end host_data -#endif - -#if defined(OMPGPU) - !no async for OMPGPU for now -#else - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() -!$acc wait(2) -#endif - - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() - ! gxmany is complete now - -#if defined(OMPGPU) - -#if defined(MKLGPU) -!$omp target data map(tofrom: gxmany,vxmany) -#else -!$omp target data use_device_ptr(gxmany,vxmany) -#endif - -#else -!$acc host_data use_device(gxmany,vxmany) -#endif - -#if defined(HIPGPU) - rc = hipfftExecZ2D(hip_plan_c2r_manyG,c_loc(gxmany),c_loc(vxmany)) -#elif defined(MKLGPU) - !$omp dispatch - call dfftw_execute_dft_c2r(dfftw_plan_c2r_manyG,gxmany,vxmany) - rc = 0 -#else - rc = cufftExecZ2D(cu_plan_c2r_manyG,gxmany,vxmany) -#endif - -#ifdef HIPGPU - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() - ! hipfftExec is asynchronous, will need the results below - rc = hipDeviceSynchronize() -#endif - -#if defined(OMPGPU) -!$omp end target data -#else - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() -!$acc wait -!$acc end host_data -#endif - - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() - - ! Poisson bracket in real space - ! uv = (ux*vy-uy*vx)/(nx*ny) - - inv_nxny = 1.0/(nx*ny) - call cgyro_nl_fftw_mul(size(uvmany,1)*size(uvmany,2)*nsplitA, & - uvmany, & - uxmany,vymany(:,:,1:nsplitA), & - uymany,vxmany(:,:,1:nsplitA), & - inv_nxny) - - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() - - ! ------------------ - ! Transform uv to fx - ! 
------------------ - -#if defined(OMPGPU) - -#if defined(MKLGPU) -!$omp target data map(tofrom: uvmany,fxmany) -#else -!$omp target data use_device_ptr(uvmany,fxmany) -#endif - -#else -!$acc wait -!$acc host_data use_device(uvmany,fxmany) -#endif - -#if defined(HIPGPU) - rc = hipfftExecD2Z(hip_plan_r2c_manyA,c_loc(uvmany),c_loc(fxmany)) -#elif defined(MKLGPU) - !$omp dispatch - call dfftw_execute_dft_r2c(dfftw_plan_r2c_manyA,uvmany,fxmany) - rc = 0 -#else - rc = cufftExecD2Z(cu_plan_r2c_manyA,uvmany,fxmany) -#endif - -#ifdef HIPGPU - ! hipfftExec is asynchronous, will need the results below - rc = hipDeviceSynchronize() -#endif - -#if defined(OMPGPU) -!$omp end target data -#else -!$acc wait -!$acc end host_data -#endif - - ! make sure reqs progress - call cgyro_nl_fftw_comm_test() - - call timer_lib_out('nl') - call timer_lib_in('nl_mem') - - ! NOTE: The FFT will generate an unwanted n=0,p=-nr/2 component - ! that will be filtered in the main time-stepping loop + if (i_omp==1) then + ! use the main thread to progress the async MPI + call cgyro_nl_fftw_comm_test() + endif - ! tile for performance, since this is effectively a transpose -#if defined(OMPGPU) -!$omp target teams distribute parallel do collapse(5) & -!$omp& private(iy,ir,itm,itl,ix) -#else -!$acc parallel loop independent collapse(5) gang & -!$acc& private(iy,ir,itm,itl,ix) present(fA_nl,fxmany) -#endif - do j=1,nsplitA - do iy0=0,n_toroidal+(R_TORTILE-1)-1,R_TORTILE ! round up - do ir0=0,n_radial+(R_RADTILE-1)-1,R_RADTILE ! round up - do iy1=0,(R_TORTILE-1) ! tile - do ir1=0,(R_RADTILE-1) ! tile - iy = iy0 + iy1 - ir = 1 + ir0 + ir1 - if ((iy < n_toroidal) .and. (ir <= n_radial)) then - ! itor = iy+1 - itm = 1 + iy/nt_loc - itl = 1 + modulo(iy,nt_loc) - ix = ir-1-nx0/2 - if (ix < 0) ix = ix+nx + call fftw_execute_dft_c2r(plan_c2r,gx(:,:,i_omp),vxmany(:,:,j)) - fA_nl(ir,itl,j,itm) = fxmany(iy,ix,j) + if (i_omp==1) then + ! use the main thread to progress the async MPI + call cgyro_nl_fftw_comm_test() endif - enddo - enddo - enddo - enddo - enddo -#if !defined(OMPGPU) - ! end data fA_nl -!$acc end data -#endif + call fftw_execute_dft_c2r(plan_c2r,gy(:,:,i_omp),vymany(:,:,j)) - if (nsplitB > 0) then - ! we can zero the elements we know are zero while we waita - ! assuming nsplitB<=nsplitA -#if defined(OMPGPU) - !no async for OMPGPU for now -!$omp target teams distribute parallel do simd collapse(3) -#else -!$acc parallel loop gang vector independent collapse(3) async(2) & -!$acc& private(j,ix,iy) & -!$acc& present(nsplitB,ny2,nx0,nx2) -#endif - do j=1,nsplitB - do ix=nx2,nx0-1 - do iy=0,ny2 - fxmany(iy,ix,j) = 0 - fymany(iy,ix,j) = 0 - enddo - enddo - enddo + if (i_omp==1) then + ! use the main thread to progress the async MPI + call cgyro_nl_fftw_comm_test() + endif -#if defined(OMPGPU) - !no async for OMPGPU for now -!$omp target teams distribute parallel do simd collapse(3) -#else -!$acc parallel loop gang vector independent collapse(3) async(2) & -!$acc& private(j,ix,iy) & -!$acc& present(nsplitB,ny2,n_toroidal,nx) -#endif - do j=1,nsplitB - do ix=0,nx-1 - do iy=n_toroidal,ny2 - fxmany(iy,ix,j) = 0 - fymany(iy,ix,j) = 0 - enddo - enddo - enddo - endif ! if nsplitB>0 + if (j<=nsplitA) then + call cgyro_nl_fftw_stepr(j, j, 1, i_omp) + endif + ! else we will do it in the next loop + enddo ! j - call timer_lib_out('nl_mem') + call timer_lib_out('nl') call timer_lib_in('nl_comm') ! 
start the async reverse comm

@@ -983,330 +345,86 @@ subroutine cgyro_nl_fftw
      call parallel_slib_f_nc_wait(nsplitB,fpackB,fB_nl,fB_req)
      fB_req_valid = .FALSE.
   endif
+  ! make sure reqs progress
   call cgyro_nl_fftw_comm_test()
 
   call timer_lib_out('nl_comm')
 
   if (nsplitB > 0) then
-
-    call timer_lib_in('nl')
-#if !defined(OMPGPU)
-!$acc data present(fB_nl) &
-!$acc& present(fxmany,fymany,gxmany,gymany) &
-!$acc& present(uxmany,uymany,vxmany,vymany) &
-!$acc& present(uvmany)
-#endif
+  call timer_lib_in('nl')
 
   ! f_nl is (radial, nt_loc, theta, nv_loc1, toroidal_procs)
   ! where nv_loc1 * toroidal_procs >= nv_loc
+!$omp parallel do schedule(dynamic,1) private(itm,itl,itor,iy,ir,p,ix,f0,i_omp,j)
+  do j=1,nsplitB
+     i_omp = omp_get_thread_num()+1
 
-    ! no tiling, does not seem to help
-#if defined(OMPGPU)
-    !no async for OMPGPU for now
-!$omp target teams distribute parallel do simd collapse(4) &
-!$omp& private(j,ir,p,ix,itor,iy,f0,g0,itm,itl)
-#else
-!$acc parallel loop gang vector independent collapse(4) async(2) &
-!$acc& private(j,ir,p,ix,itor,iy,f0,itm,itl)
-#endif
-    do j=1,nsplitB
-       do ir=1,n_radial
-          do itm=1,n_toroidal_procs
-             do itl=1,nt_loc
+     ! zero elements not otherwise set below
+     fx(0:ny2,nx2:nx0-1,i_omp) = 0.0
+     fy(0:ny2,nx2:nx0-1,i_omp) = 0.0
+
+     ! Array mapping
+     do ir=1,n_radial
+        p = ir-1-nx0/2
+        ix = p
+        if (ix < 0) ix = ix+nx
+        do itm=1,n_toroidal_procs
+           do itl=1,nt_loc
              itor=itl + (itm-1)*nt_loc
              iy = itor-1
-                p = ir-1-nx0/2
-                ix = p
-                if (ix < 0) ix = ix+nx
             f0 = i_c*fB_nl(ir,itl,j,itm)
-                fxmany(iy,ix,j) = p*f0
-                fymany(iy,ix,j) = iy*f0
-             enddo
-          enddo
-       enddo
-    enddo
-
-#if defined(OMPGPU)
-    !no async for OMPGPU for now
-#else
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-!$acc wait(2)
-#endif
-
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-
-    ! Average elements so as to ensure
-    !   f(kx,ky=0) = f(-kx,ky=0)^*
-    ! This symmetry is required for complex input to c2r
-#if defined(OMPGPU)
-    !no async for OMPGPU for now
-!$omp target teams distribute parallel do simd collapse(2) &
-!$omp& private(j,ix,f0)
-#else
-!$acc parallel loop gang vector independent collapse(2) async(2) &
-!$acc& private(j,ix,f0) &
-!$acc& present(fxmany) &
-!$acc& present(nsplitB,nx)
-#endif
-    do j=1,nsplitB
-       do ix=1,nx/2-1
-          f0 = 0.5*( fxmany(0,ix,j)+conjg(fxmany(0,nx-ix,j)) )
-          fxmany(0,ix   ,j) = f0
-          fxmany(0,nx-ix,j) = conjg(f0)
-       enddo
-    enddo
-
-    ! --------------------------------------
-    ! perform many Fourier Transforms at once
-    ! --------------------------------------
-
-#if defined(OMPGPU)
-
-#if defined(MKLGPU)
-!$omp target data map(tofrom: fymany,uymany)
-#else
-!$omp target data use_device_ptr(fymany,uymany)
-#endif
-
-#else
-!$acc host_data use_device(fymany,uymany)
-#endif
-
-#if defined(HIPGPU)
-    rc = hipfftExecZ2D(hip_plan_c2r_manyB,c_loc(fymany),c_loc(uymany))
-#elif defined(MKLGPU)
-    !$omp dispatch
-    call dfftw_execute_dft_c2r(dfftw_plan_c2r_manyB,fymany,uymany)
-    rc = 0
-#else
-    rc = cufftExecZ2D(cu_plan_c2r_manyB,fymany,uymany)
-#endif
-
-#if defined(OMPGPU)
-!$omp end target data
-#else
-!$acc end host_data
-#endif
-
-#if defined(OMPGPU)
-    !no async for OMPGPU for now
-#else
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-!$acc wait(2)
-#endif
-    ! fxmany is complete now
-
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-
-#if defined(OMPGPU)
-
-#if defined(MKLGPU)
-!$omp target data map(tofrom: fxmany,uxmany)
-#else
-!$omp target data use_device_ptr(fxmany,uxmany)
-#endif
-
-#else
-!$acc host_data use_device(fxmany,uxmany)
-#endif
-
-#if defined(HIPGPU)
-    rc = hipfftExecZ2D(hip_plan_c2r_manyB,c_loc(fxmany),c_loc(uxmany))
-#elif defined(MKLGPU)
-    !$omp dispatch
-    call dfftw_execute_dft_c2r(dfftw_plan_c2r_manyB,fxmany,uxmany)
-    rc = 0
-#else
-    rc = cufftExecZ2D(cu_plan_c2r_manyB,fxmany,uxmany)
-#endif
-
-#ifdef HIPGPU
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-    ! hipfftExec is asynchronous, will need the results below
-    rc = hipDeviceSynchronize()
-#endif
-
-#if defined(OMPGPU)
-!$omp end target data
-#else
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-!$acc wait
-!$acc end host_data
-#endif
-
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-
-    ! Poisson bracket in real space
-    !   uv = (ux*vy-uy*vx)/(nx*ny)
-
-    inv_nxny = 1.0/(nx*ny)
-
-    call cgyro_nl_fftw_mul(size(uvmany,1)*size(uvmany,2)*nsplitB, &
-         uvmany, &
-         uxmany,vymany(:,:,(nsplitA+1):nsplit), &
-         uymany,vxmany(:,:,(nsplitA+1):nsplit), &
-         inv_nxny)
-
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-
-    ! ------------------
-    ! Transform uv to fx
-    ! ------------------
-
-#if defined(OMPGPU)
-
-#if defined(MKLGPU)
-!$omp target data map(tofrom: uvmany,fxmany)
-#else
-!$omp target data use_device_ptr(uvmany,fxmany)
-#endif
-
-#else
-!$acc wait
-!$acc host_data use_device(uvmany,fxmany)
-#endif
-
-#if defined(HIPGPU)
-    rc = hipfftExecD2Z(hip_plan_r2c_manyB,c_loc(uvmany),c_loc(fxmany))
-#elif defined(MKLGPU)
-    !$omp dispatch
-    call dfftw_execute_dft_r2c(dfftw_plan_r2c_manyB,uvmany,fxmany)
-    rc = 0
-#else
-    rc = cufftExecD2Z(cu_plan_r2c_manyB,uvmany,fxmany)
-#endif
-
-#ifdef HIPGPU
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-    ! hipfftExec is asynchronous, will need the results below
-    rc = hipDeviceSynchronize()
-#endif
-
-#if defined(OMPGPU)
-!$omp end target data
-#else
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-!$acc wait
-!$acc end host_data
-#endif
-
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-
-    call timer_lib_out('nl')
-    call timer_lib_in('nl_mem')
-
-    ! NOTE: The FFT will generate an unwanted n=0,p=-nr/2 component
-    !       that will be filtered in the main time-stepping loop
+              fx(iy,ix,i_omp) = p*f0
+              fy(iy,ix,i_omp) = iy*f0
+           enddo
+        enddo
+        if ((ix/=0) .and. (ix<(nx/2))) then ! the partner at nx-ix was filled earlier (large ix comes first)
+           ! Average elements so as to ensure
+           !   f(kx,ky=0) = f(-kx,ky=0)^*
+           ! This symmetry is required for complex input to c2r
+           f0 = 0.5*( fx(0,ix,i_omp)+conjg(fx(0,nx-ix,i_omp)) )
+           fx(0,ix   ,i_omp) = f0
+           fx(0,nx-ix,i_omp) = conjg(f0)
+        endif
+        fx(n_toroidal:ny2,ix,i_omp) = 0.0
+        fy(n_toroidal:ny2,ix,i_omp) = 0.0
+     enddo
 
-    ! tile for performance, since this is effectively a transpose
-#if defined(OMPGPU)
-!$omp target teams distribute parallel do collapse(5) &
-!$omp& private(iy,ir,itm,itl,ix)
-#else
-!$acc parallel loop independent collapse(5) gang &
-!$acc& private(iy,ir,itm,itl,ix) present(fB_nl,fxmany)
-#endif
-    do j=1,nsplitB
-       do iy0=0,n_toroidal+(R_TORTILE-1)-1,R_TORTILE  ! round up
-          do ir0=0,n_radial+(R_RADTILE-1)-1,R_RADTILE ! round up
-             do iy1=0,(R_TORTILE-1) ! tile
-                do ir1=0,(R_RADTILE-1) ! tile
-                   iy = iy0 + iy1
-                   ir = 1 + ir0 + ir1
-                   if ((iy < n_toroidal) .and. (ir <= n_radial)) then
-                      itor = iy+1
-                      itm = 1 + iy/nt_loc
-                      itl = 1 + modulo(iy,nt_loc)
-                      ix = ir-1-nx0/2
-                      if (ix < 0) ix = ix+nx
+     if (i_omp==1) then
+        ! use the main thread to progress the async MPI
+        call cgyro_nl_fftw_comm_test()
+     endif
 
-                      fB_nl(ir,itl,j,itm) = fxmany(iy,ix,j)
+     call fftw_execute_dft_c2r(plan_c2r,fx(:,:,i_omp),uxmany(:,:,j))
+     if (i_omp==1) then
+        ! use the main thread to progress the async MPI
+        call cgyro_nl_fftw_comm_test()
      endif
-                enddo
-             enddo
-          enddo
-       enddo
-    enddo
-#if !defined(OMPGPU)
-    ! end data fB_nl
-!$acc end data
-#endif
+     call fftw_execute_dft_c2r(plan_c2r,fy(:,:,i_omp),uymany(:,:,j))
+     if (i_omp==1) then
+        ! use the main thread to progress the async MPI
+        call cgyro_nl_fftw_comm_test()
+     endif
 
-    call timer_lib_out('nl_mem')
+     call cgyro_nl_fftw_stepr(nsplitA+j, j, 2, i_omp)
+  enddo ! j
 
-    call timer_lib_in('nl_comm')
-    ! start the async reverse comm
-    ! can reuse the same req, no overlap with forward fB_req
-    call parallel_slib_r_nc_async(nsplitB,fB_nl,fpackB,fB_req)
-    fB_req_valid = .TRUE.
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-    call timer_lib_out('nl_comm')
+  call timer_lib_out('nl')
 
-  endif ! nsplitB>0
+  call timer_lib_in('nl_comm')
+  ! start the async reverse comm
+  ! can reuse the same req, no overlap with forward fB_req
+  call parallel_slib_r_nc_async(nsplitB,fB_nl,fpackB,fB_req)
+  fB_req_valid = .TRUE.
+  ! make sure reqs progress
+  call cgyro_nl_fftw_comm_test()
+  call timer_lib_out('nl_comm')
+  endif ! if nsplitB>0
 
 end subroutine cgyro_nl_fftw
 
-#else /* if not definedCGYRO_GPU_FFT */
-
-subroutine cgyro_nl_fftw_stepr(g_j, f_j, nl_idx, i_omp)
-
-  use timer_lib
-  use parallel_lib
-  use cgyro_globals
-
-  implicit none
-
-  integer, intent(in) :: g_j, f_j
-  integer,intent(in) :: nl_idx ! 1=>A, 2=>B
-  integer,intent(in) :: i_omp
-  integer :: ix,iy
-  integer :: ir,itm,itl,itor
-
-  include 'fftw3.f03'
-
-  ! Poisson bracket in real space
-  !   uv = (ux*vy-uy*vx)/(nx*ny)
-  uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny)
-
-  call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp))
-
-  ! NOTE: The FFT will generate an unwanted n=0,p=-nr/2 component
-  !       that will be filtered in the main time-stepping loop
-
-  ! this should really be accounted against nl_mem, but hard to do with OMP
-  do itm=1,n_toroidal_procs
-     do itl=1,nt_loc
-        itor=itl + (itm-1)*nt_loc
-        do ir=1,n_radial
-           ix = ir-1-nx0/2
-           if (ix < 0) ix = ix+nx
-           iy = itor-1
-           if (nl_idx==1) then
-              fA_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp)
-           else
-              fB_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp)
-           endif
-        enddo
-     enddo
-  enddo
-
-end subroutine cgyro_nl_fftw_stepr
-
-! NOTE: call cgyro_nl_fftw_comm1 before cgyro_nl_fftw
-subroutine cgyro_nl_fftw
+subroutine cgyro_nl_fftw_triad(ij)
 
   use timer_lib
   use parallel_lib
@@ -1315,6 +433,8 @@ subroutine cgyro_nl_fftw
 
   implicit none
 
+  !-----------------------------------
+  integer, intent(in) :: ij
   !-----------------------------------
   integer :: ix,iy
   integer :: ir,it,itm,itl,it_loc
@@ -1470,7 +590,7 @@ subroutine cgyro_nl_fftw
        endif
 
        if (j<=nsplitA) then
-          call cgyro_nl_fftw_stepr(j, j, 1, i_omp)
+          call cgyro_nl_fftw_stepr_triad(j, j, 1, i_omp)
        endif
        ! else we will do it in the next loop
     enddo ! j
@@ -1481,6 +601,7 @@ subroutine cgyro_nl_fftw
     ! start the async reverse comm
     ! can reuse the same req, no overlap with forward fA_req
     call parallel_slib_r_nc_async(nsplitA,fA_nl,fpackA,fA_req)
+    call parallel_slib_r_nc_async(nsplitA,eA_nl,epackA,eA_req)
     fA_req_valid = .TRUE.
 
  if (nsplitB > 0) then
 
@@ -1549,7 +670,7 @@ subroutine cgyro_nl_fftw
           call cgyro_nl_fftw_comm_test()
        endif
 
-       call cgyro_nl_fftw_stepr(nsplitA+j, j, 2, i_omp)
+       call cgyro_nl_fftw_stepr_triad(nsplitA+j, j, 2, i_omp)
     enddo ! j
 
     call timer_lib_out('nl')
@@ -1558,13 +679,11 @@ subroutine cgyro_nl_fftw
     ! start the async reverse comm
     ! can reuse the same req, no overlap with forward fB_req
     call parallel_slib_r_nc_async(nsplitB,fB_nl,fpackB,fB_req)
+    call parallel_slib_r_nc_async(nsplitB,eB_nl,epackB,eB_req)
     fB_req_valid = .TRUE.
     ! make sure reqs progress
     call cgyro_nl_fftw_comm_test()
     call timer_lib_out('nl_comm')
  endif ! if nsplitB>0
-end subroutine cgyro_nl_fftw
-
-#endif /* CGYRO_GPU_FFT */
-
+end subroutine cgyro_nl_fftw_triad
diff --git a/cgyro/src/cgyro_read_input.f90 b/cgyro/src/cgyro_read_input.f90
index be77de7dd..bfbe4bf71 100644
--- a/cgyro/src/cgyro_read_input.f90
+++ b/cgyro/src/cgyro_read_input.f90
@@ -70,6 +70,7 @@ subroutine cgyro_read_input
   call cgyro_readbc_int(moment_print_flag,'MOMENT_PRINT_FLAG')
   call cgyro_readbc_int(gflux_print_flag,'GFLUX_PRINT_FLAG')
   call cgyro_readbc_int(field_print_flag,'FIELD_PRINT_FLAG')
+  call cgyro_readbc_int(triad_print_flag,'TRIAD_PRINT_FLAG')
   call cgyro_readbc_real(amp0)
   call cgyro_readbc_real(amp)
   call cgyro_readbc_real(gamma_e)
diff --git a/cgyro/src/cgyro_rhs.F90 b/cgyro/src/cgyro_rhs.F90
index 2f762bd16..b12c6cca5 100644
--- a/cgyro/src/cgyro_rhs.F90
+++ b/cgyro/src/cgyro_rhs.F90
@@ -35,7 +35,11 @@ subroutine cgyro_rhs_r_comm_async(ij)
   integer, intent(in) :: ij
 
   if (nonlinear_flag == 1) then
-     call cgyro_nl_fftw_comm1_r(ij)
+     if (triad_print_flag == 1 .and. ij == 4) then
+        call cgyro_nl_fftw_comm1_r_triad(ij)
+     else
+        call cgyro_nl_fftw_comm1_r(ij)
+     endif
   endif
 
 end subroutine cgyro_rhs_r_comm_async
@@ -410,7 +414,11 @@ subroutine cgyro_rhs(ij,update_cap)
   if (nonlinear_flag == 1) then
      ! assumes someone already started the input comm
      ! and will finish the output comm
-     call cgyro_nl_fftw()
+     if (triad_print_flag == 1 .and. ij == 4) then
+        call cgyro_nl_fftw_triad(ij)
+     else
+        call cgyro_nl_fftw()
+     endif
   endif
 
   call cgyro_upwind_complete
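For orientation: the branch added to cgyro_rhs.F90 routes only the final RK stage (ij == 4) through the triad variants, so the instrumented bracket evaluation runs once per time step rather than at every stage. A minimal Python sketch of the same dispatch decision; the function name and return strings are illustrative, not part of the patch:

    # Illustrative model of the stage dispatch added in cgyro_rhs.F90.
    def select_nl_kernel(nonlinear_flag: int, triad_print_flag: int, ij: int) -> str:
        if nonlinear_flag != 1:
            return 'none'                  # linear run: no nonlinear bracket
        if triad_print_flag == 1 and ij == 4:
            return 'cgyro_nl_fftw_triad'   # diagnostic path, once per step
        return 'cgyro_nl_fftw'             # standard fast path

    assert select_nl_kernel(1, 1, 4) == 'cgyro_nl_fftw_triad'
    assert select_nl_kernel(1, 1, 2) == 'cgyro_nl_fftw'
    assert select_nl_kernel(1, 0, 4) == 'cgyro_nl_fftw'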
diff --git a/cgyro/src/cgyro_step_collision.F90 b/cgyro/src/cgyro_step_collision.F90
index f422e88a4..228360d2f 100644
--- a/cgyro/src/cgyro_step_collision.F90
+++ b/cgyro/src/cgyro_step_collision.F90
@@ -334,6 +334,7 @@ subroutine cgyro_step_collision_cpu(use_simple)
 
   ! Compute H given h and [phi(h), apar(h)]
 
+ if (triad_print_flag == 1) then
 !$omp parallel do collapse(2) &
 !$omp& private(iv_loc,is,ic,iv,my_psi,my_ch) firstprivate(nc)
  do itor=nt1,nt2
@@ -341,14 +342,44 @@ subroutine cgyro_step_collision_cpu(use_simple)
        iv_loc = iv-nv1+1
        is = is_v(iv)
        do ic=1,nc
+
+          ! Save collisional diss.
+          my_ch = cap_h_ct(iv_loc,itor,ic)
+          my_psi = sum(jvec_c(:,ic,iv_loc,itor)*field(:,ic,itor))
+
+          my_psi = my_ch-my_psi*(z(is)/temp(is))
+          cap_h_ct(iv_loc,itor,ic) = (cap_h_c(ic,iv_loc,itor) + my_ch) / 2.0
+          cap_h_ct(iv_loc,itor,ic) = conjg(cap_h_ct(iv_loc,itor,ic)) &
+                                     * ( my_psi - h_x(ic,iv_loc,itor) )
+
+          h_x(ic,iv_loc,itor) = my_psi
+          cap_h_c(ic,iv_loc,itor) = my_ch
+
+       enddo
+    enddo
+ enddo
+
+ else
+
+!$omp parallel do collapse(2) &
+!$omp& private(iv_loc,is,ic,iv,my_psi,my_ch) firstprivate(nc)
+ do itor=nt1,nt2
+    do iv=nv1,nv2
+       iv_loc = iv-nv1+1
+       is = is_v(iv)
+       do ic=1,nc
+
           my_ch = cap_h_ct(iv_loc,itor,ic)
           my_psi = sum(jvec_c(:,ic,iv_loc,itor)*field(:,ic,itor))
 
           h_x(ic,iv_loc,itor) = my_ch-my_psi*(z(is)/temp(is))
           cap_h_c(ic,iv_loc,itor) = my_ch
+
        enddo
     enddo
  enddo
+ end if ! (triad_print_flag == 1)
 
  call timer_lib_out('coll')
 
 end subroutine cgyro_step_collision_cpu
diff --git a/cgyro/src/cgyro_write_timedata.f90 b/cgyro/src/cgyro_write_timedata.f90
index 3d6b808a1..36864a9ab 100644
--- a/cgyro/src/cgyro_write_timedata.f90
+++ b/cgyro/src/cgyro_write_timedata.f90
@@ -70,6 +70,14 @@ subroutine cgyro_write_timedata
      enddo
   endif
 
+  if (nonlinear_flag == 1 .and. triad_print_flag == 1) then
+     ! Triad energy transfer for all kxky
+     call cgyro_write_distributed_bcomplex(&
+          trim(path)//binfile_triad,&
+          size(triad(:,:,:,:)),&
+          triad(:,:,:,:))
+  endif
+
   if (field_print_flag == 1) then
      p_field = n_field
   else
diff --git a/f2py/pygacode/cgyro/data.py b/f2py/pygacode/cgyro/data.py
index 0e67a3395..15e59cb6e 100644
--- a/f2py/pygacode/cgyro/data.py
+++ b/f2py/pygacode/cgyro/data.py
@@ -302,6 +302,13 @@ def getbigfield(self):
          if fmt != 'null':
             print('INFO: (data.py) Read data in '+fmt+'.cgyro.kxky_v. '+t)
             self.kxky_v = np.reshape(data[0:nd],(2,self.n_radial,self.theta_plot,self.n_species,self.n_n,nt),'F')
+
+      # 5. triad
+      t,fmt,data = self.extract('.cgyro.triad')
+      if fmt != 'null':
+         print('INFO: (data.py) Read data in '+fmt+'.cgyro.triad. '+t)
+         nd = 2*self.n_n*self.n_radial*self.n_species*nt*8
+         self.triad = np.reshape(data[0:nd],(2,self.n_species,self.n_radial,8,self.n_n,nt),'F')
 
    #-----------------------------------------------------------------
    def getgeo(self):
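With the reshape above, the triad record can also be consumed directly, without the plotting layer. A minimal sketch, assuming a finished nonlinear run with TRIAD_PRINT_FLAG=1 in an illustrative directory ./mydir:

    import numpy as np
    from pygacode.cgyro.data import cgyrodata

    sim = cgyrodata('./mydir/')        # run directory is an assumption
    sim.getbigfield()                  # populates sim.triad from *.cgyro.triad
    # sim.triad has shape (2, n_species, n_radial, 8, n_n, n_time);
    # axis 0 holds (Re, Im), axis 3 the eight stored components
    # (transfer, NZ-pair transfer, dS/dt, dW_perp/dt, entropy, 3 dissipations).
    T = sim.triad[0] + 1j*sim.triad[1]
    Tk = T[:, :, 0, :, :].real         # component 0: triad transfer T_k
    # net transfer summed over species, kx and ky, as a time trace
    print(Tk.sum(axis=(0, 1, 2)))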
diff --git a/f2py/pygacode/cgyro/data_plot.py b/f2py/pygacode/cgyro/data_plot.py
index 309ee7106..99f36af79 100644
--- a/f2py/pygacode/cgyro/data_plot.py
+++ b/f2py/pygacode/cgyro/data_plot.py
@@ -433,6 +433,200 @@ def plot_phi(self,xin):
 
       return head,self.t,y0,yn
 
+
+   def plot_triad(self,xin):
+
+      w = xin['w']
+      theta = xin['theta']
+      field = xin['field']
+      ymin = xin['ymin']
+      ymax = xin['ymax']
+      absnorm = xin['abs']
+      moment = xin['moment']
+      spec = xin['spec']
+
+      t = self.getnorm(xin['norm'])
+
+      if xin['fig'] is None:
+         fig = plt.figure(MYDIR,figsize=(xin['lx'],xin['ly']))
+
+      self.getbigfield()
+
+      f = self.triad[0,spec,:,:,:,:]+1j*self.triad[1,spec,:,:,:,:]
+      # 0- Triad , 1- non-zonal pairs Triad , 2- d(Entropy)/dt, 3- W_k_{perp}/dt, 4- Entropy
+      # 5- Diss.(radial) , 6- Diss.(theta) , 7- Diss.(Collision)
+      f = f[:,0,:,:]
+
+      #======================================
+      # Set figure size and axes
+      ax = fig.add_subplot(111)
+      ax.grid(which="both",ls=":")
+      ax.grid(which="major",ls=":")
+      ax.set_xlabel(self.tstr)
+      ax.set_title(r'$\mathrm{Triad~energy~transfer~intensity}$'+ r'$-(L_{T_e}/Q_{GBD})$')
+      #======================================
+
+      if spec==1:
+         ft=r'{T_e}'
+      elif spec==0:
+         ft=r'{T_D}'
+      else:
+         ft=r'{T_%d}'%spec
+
+      # n=0 intensity
+      T0 = np.sum(f[:,0,:].real,axis=0)
+      lab0=r'$\left\langle \left|'+ft+r'_0\right|\right\rangle$'
+      ax.plot(t,T0,label=lab0,linewidth=4)
+
+      # n!=0 intensity
+      for i in range(1,self.n_n):
+         yn = np.sum(f[:,i,:].real,axis=(0)) *2.0 # (KY<0 domain)
+         labn=r'$\left\langle \left|'+ft+r'_{%d}\right|\right\rangle$'%i
+         if i>10:
+            ax.plot(t,yn,':',label=labn,linewidth=2)
+         else:
+            ax.plot(t,yn,label=labn,linewidth=2)
+
+      # total intensity
+      Tn = np.sum(f[:,1:,:].real,axis=(0,1))*2.0 + T0
+      ax.plot(t,Tn,'r',label='Total',linewidth=4)
+
+      ax.set_xlim(0,t[-1])
+      if ymax != 'auto':
+         ax.set_ylim(top=float(ymax))
+      if ymin != 'auto':
+         ax.set_ylim(bottom=float(ymin))
+      ax.legend(loc=3)
+
+      # JC: labels only correct for default norm
+      head = '(cs/a) t Phi_0/rho_* Phi_n/rho_*'
+
+      fig.tight_layout(pad=0.3)
+
+      return head
+
+
+   def plot_triad_v2(self,xin):
+
+      w = xin['w']
+      theta = xin['theta']
+      field = xin['field']
+      ymin = xin['ymin']
+      ymax = xin['ymax']
+      absnorm = xin['abs']
+      moment = xin['moment']
+      spec = xin['spec']
+      dn = xin['dn']
+
+      t = self.getnorm(xin['norm'])
+
+      if xin['fig'] is None:
+         fig = plt.figure(MYDIR,figsize=(xin['lx'],xin['ly']))
+
+      self.getbigfield()
+
+      if spec==-1:
+         f = (self.triad[0,:,:,:,:,:] + 1j*self.triad[1,:,:,:,:,:]).sum(axis=0)
+      else:
+         f = self.triad[0,spec,:,:,:,:] + 1j*self.triad[1,spec,:,:,:,:]
+      # 0- Triad , 1- non-zonal pairs Triad , 2- d(Entropy)/dt, 3- W_k_{perp}/dt, 4- Entropy
+      # 5- Diss.(radial) , 6- Diss.(theta) , 7- Diss.(Collision)
+      T   = f[:,0,:,:].real
+      Ent = f[:,2,:,:].real
+      Wkt = self.triad[0,0,:,3,:,:].real
+      diss_r = f[:,5,:,:].real
+      diss_th= f[:,6,:,:].real
+      diss_c = f[:,7,:,:].real
+
+      ## get heat flux
+      usec = self.getflux(cflux='auto')
+
+      ys = np.sum(self.ky_flux,axis=(2,3))
+      y  = ys[:,1,:] # 'Q'
+      yn = ys[:,0,:] # 'Gamma'
+
+      #======================================
+      # Set figure size and axes
+      ax = fig.add_subplot(111)
+      ax.grid(which="both",ls=":")
+      ax.grid(which="major",ls=":")
+      ax.set_xlabel(self.tstr)
+      ax.set_title(r'$\mathrm{Triad~energy~transfer~intensity}$'+ r'$-(L_{T_e}/Q_{GBD})$')
+      #======================================
+
+      # n=0 intensity
+      print('HINT: adjust -dn to match the experimental dn (rho/a and Lx/a will shrink)')
+      T0   = np.sum(T[:,0,:],axis=0)
+      Ent0 = np.sum(Ent[:,0,:],axis=0)
+      Wkt0 = np.sum(Wkt[:,0,:],axis=0) *dn**2
+      diss_r0 = np.sum(diss_r[:,0,:],axis=0)
+      diss_th0= np.sum(diss_th[:,0,:],axis=0)
+      diss_c0 = np.sum(diss_c[:,0,:],axis=0)
+
+      if spec == 1: # electron
+         ft = r'{L_{T_e} T_{NZF->ZF,e}/Q_{GBD}}'
+         Sft= r'{L_{T_e} {dS_e \over dt} /Q_{GBD}}'
+         Qft= r'{q_e/Q_{GBD}}'
+         Wft= r'{L_{T_e} W_{k_{\perp}}/Q_{GBD}}'
+         Q = y[spec,:]
+         G = (self.dlnndr[1] - 1.5*self.dlntdr[1] )*yn[spec,:] / self.dlntdr[1]
+
+      elif spec == 0: # main ion
+         ft = r'{L_{T_e} T_{NZF->ZF,D} /Q_{GBD}}'
+         Sft= r'{L_{T_e} {dS_D \over dt} /Q_{GBD}}'
+         Qft= r'{L_{T_e} q_D/Q_{GBD}L_{T_D}}'
+         Wft= r'{L_{T_e} W_{k_{\perp}}/Q_{GBD}}'
+         Q = y[spec,:] * (self.dlntdr[0] /self.dlntdr[1] )
+         G = (self.dlnndr[0] - 1.5*self.dlntdr[0] )*yn[spec,:] / self.dlntdr[1] *self.temp[0]
+
+      elif spec == -1: # total
+         ft = r'{L_{T_e} \Sigma_s T_{NZF->ZF,s} /Q_{GBD}}'
+         Sft= r'{L_{T_e} \Sigma_s {dS_s \over dt} /Q_{GBD}}'
+         Qft= r'{L_{T_e} \Sigma_s q_s/Q_{GBD}L_{T_s}}'
+         Wft= r'{L_{T_e} W_{k_{\perp}}/Q_{GBD}}'
+         Q = (y[:,:] * self.dlntdr[:,np.newaxis]).sum(axis=0) /self.dlntdr[1]
+         G = ( (self.dlnndr[:,np.newaxis] - 1.5*self.dlntdr[:,np.newaxis]) \
+               *yn[:,:] *self.temp[:,np.newaxis] ).sum(axis=0) / self.dlntdr[1]
+
+      diss_rt = r'D_r'
+      diss_tht= r'D_\theta'
+      diss_ct = r'D_c'
+
+      lab0=r'$\left\langle \left|'+ft+r'\right|\right\rangle$'
+      lab1=r'$\left\langle \left|'+Sft+r'\right|\right\rangle$'
+      lab2=r'$\left\langle \left|'+Wft+r'\right|\right\rangle$'
+      lab3=r'$\left\langle \left|'+Qft+r'\right|\right\rangle$'
+      lab4=r'$\left\langle \left|'+diss_rt +r'\right|\right\rangle$'
+      lab5=r'$\left\langle \left|'+diss_tht+r'\right|\right\rangle$'
+      lab6=r'$\left\langle \left|'+diss_ct +r'\right|\right\rangle$'
+
+      ax.plot(t ,T0 ,label=lab0,linewidth=2)
+      ax.plot(t ,Ent0 ,label=lab1,linewidth=2)
+      ax.plot(t ,Wkt0 ,label=lab2,linewidth=2)
+      #ax.plot(t ,G+Q ,label=lab3,linewidth=2) # q_s = (1/Ln-1.5/LT)T_s*Gamma_s + Q_s
+      ax.plot(t ,diss_r0 ,label=lab4,linewidth=2)
+      ax.plot(t ,diss_th0 ,label=lab5,linewidth=2)
+      ax.plot(t ,diss_c0 ,label=lab6,linewidth=2)
+      if spec == -1: ax.plot(t ,(T0 - Ent0) ,':k', label=r'$ZF: T0-S0 $',linewidth=2)
+      if spec == -1: ax.plot(t ,-(diss_r0+diss_th0+diss_c0) ,':r', label=r'$ZF: Dissipations $',linewidth=2)
+
+      ax.set_xlim(0,t[-1])
+      if ymax != 'auto':
+         ax.set_ylim(top=float(ymax))
+      if ymin != 'auto':
+         ax.set_ylim(bottom=float(ymin))
+      ax.legend(loc=1, ncol=3, framealpha=0)
+
+      # JC: labels only correct for default norm
+      head = '(cs/a) t Phi_0/rho_* Phi_n/rho_*'
+
+      fig.tight_layout(pad=0.3)
+
+      return head
+
+
    def plot_flux(self,xin):
 
       w = xin['w']
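At ky = 0 the curves drawn by plot_triad_v2 form a closed budget: once the run is saturated, the transfer into the zonal component minus the zonal entropy growth should be balanced by the radial, theta and collisional dissipation channels (components 0, 2 and 5-7 of the record). A hedged numpy version of that check, reusing the assumed ./mydir run from the sketch above:

    import numpy as np
    from pygacode.cgyro.data import cgyrodata

    sim = cgyrodata('./mydir/')            # illustrative run directory
    sim.getbigfield()
    tot = sim.triad[0].sum(axis=0)         # real part, summed over species
    T0  = tot[:, 0, 0, :].sum(axis=0)      # transfer into ky=0, summed over kx
    dS0 = tot[:, 2, 0, :].sum(axis=0)      # d(entropy)/dt at ky=0
    D0  = tot[:, 5:8, 0, :].sum(axis=(0, 1))  # radial + theta + collisional
    # residual of the zonal free-energy balance; small once saturated
    print(np.abs((T0 - dS0) + D0).max())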
diff --git a/f2py/pygacode/cgyro/data_plot_single.py b/f2py/pygacode/cgyro/data_plot_single.py
index 42406e7a4..0ac7df2f5 100644
--- a/f2py/pygacode/cgyro/data_plot_single.py
+++ b/f2py/pygacode/cgyro/data_plot_single.py
@@ -50,6 +50,7 @@
 xin['bar']    = bool(int(sys.argv[25]))
 xin['ie']     = int(sys.argv[26])
 xin['mesh']   = int(sys.argv[27])
+xin['dn']     = int(sys.argv[28])
 
 # plotting control
 ftype = xin['ftype']
@@ -84,6 +85,14 @@
 
    head,x,y1,y2 = data_in.plot_phi(xin)
 
+elif plot_type == 'triad':
+
+   head = data_in.plot_triad(xin)
+
+elif plot_type == 'triad_v2':
+
+   head = data_in.plot_triad_v2(xin)
+
 elif plot_type == 'flux':
 
    if ftype == 'nox' or ftype == 'dump':
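With the driver wired up as above, the new panels are reachable via cgyro_plot -plot triad and cgyro_plot -plot triad_v2, or by calling the plot methods directly. A direct-call sketch, assuming the usual cgyrodata_plot entry point; only the xin keys are taken from the code above, the values are illustrative defaults:

    import matplotlib.pyplot as plt
    from pygacode.cgyro.data_plot import cgyrodata_plot

    sim = cgyrodata_plot('./mydir/')   # illustrative run directory
    xin = {'w': 0.5, 'theta': 0, 'field': 0, 'moment': 'phi',
           'spec': 1, 'abs': 0, 'ymin': 'auto', 'ymax': 'auto',
           'norm': 'elec',             # value assumed, matching the usual default
           'fig': None, 'lx': 12, 'ly': 6,
           'dn': 1}                    # only used by plot_triad_v2
    sim.plot_triad(xin)                # per-ky transfer time traces
    plt.show()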