diff --git a/cgyro/src/cgyro_check_memory.F90 b/cgyro/src/cgyro_check_memory.F90
index 304b7e798..84a821f28 100644
--- a/cgyro/src/cgyro_check_memory.F90
+++ b/cgyro/src/cgyro_check_memory.F90
@@ -106,9 +106,6 @@ subroutine cgyro_check_memory(datafile)
      call cgyro_alloc_add_4d(io,n_field,nc,nv_loc,nt_loc,16,'omega_ss')
      call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'omega_cap_h')
      call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'omega_h')
-     if (triad_print_flag == 1) then
-       call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'diss_r')
-     endif
      call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'h_x')
      call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'h0_x')
      call cgyro_alloc_add_3d(io,nc,nv_loc,nt_loc,16,'g_x')
@@ -136,12 +133,6 @@ subroutine cgyro_check_memory(datafile)
         call cgyro_alloc_add_3d(io,n_radial,nt_loc,nsplitA*n_toroidal_procs,16,'fpackA')
         call cgyro_alloc_add_3d(io,n_radial,nt_loc,nsplitB*n_toroidal_procs,16,'fpackB')
         call cgyro_alloc_add_4d(io,n_field,n_radial,n_jtheta,n_toroidal,16,'gpack')
-        if (triad_print_flag == 1) then
-          call cgyro_alloc_add_4d(io,n_radial,nt_loc,nsplitA,n_toroidal_procs,16,'eA_nl')
-          call cgyro_alloc_add_4d(io,n_radial,nt_loc,nsplitB,n_toroidal_procs,16,'eB_nl')
-          call cgyro_alloc_add_3d(io,n_radial,nt_loc,nsplitA*n_toroidal_procs,16,'epackA')
-          call cgyro_alloc_add_3d(io,n_radial,nt_loc,nsplitB*n_toroidal_procs,16,'epackB')
-        endif
      endif
 
      write(io,*)
diff --git a/cgyro/src/cgyro_cleanup.F90 b/cgyro/src/cgyro_cleanup.F90
index 177021040..7828e8bf5 100644
--- a/cgyro/src/cgyro_cleanup.F90
+++ b/cgyro/src/cgyro_cleanup.F90
@@ -43,33 +43,21 @@ subroutine cgyro_cleanup
      ccl_del_device(vel2)
      deallocate(vel2)
   endif
-  if(allocated(w_e))   then
-     ccl_del_device(w_e)
-     deallocate(w_e)
-  endif
+  if(allocated(w_e))           deallocate(w_e)
   if(allocated(e_deriv1_mat))  deallocate(e_deriv1_mat)
   if(allocated(e_deriv1_rot_mat))  deallocate(e_deriv1_rot_mat)
   if(allocated(xi))            then
      ccl_del_device(xi)
      deallocate(xi)
   endif
-  if(allocated(w_xi))   then
-     ccl_del_device(w_xi)
-     deallocate(w_xi)
-  endif
-  if(allocated(w_exi))   then
-     ccl_del_device(w_exi)
-     deallocate(w_exi)
-  endif
+  if(allocated(w_xi))          deallocate(w_xi)
+  if(allocated(w_exi))         deallocate(w_exi)
   if(allocated(xi_lor_mat))    deallocate(xi_lor_mat)
   if(allocated(xi_deriv_mat))  deallocate(xi_deriv_mat)
   
   if(allocated(theta))          deallocate(theta)
   if(allocated(thetab))         deallocate(thetab)
-  if(allocated(w_theta))   then
-     ccl_del_device(w_theta)
-     deallocate(w_theta)
-  endif
+  if(allocated(w_theta))        deallocate(w_theta)
   if(allocated(g_theta))        deallocate(g_theta)
   if(allocated(g_theta_geo))    deallocate(g_theta_geo)
   if(allocated(bmag))           deallocate(bmag)
@@ -95,10 +83,7 @@ subroutine cgyro_cleanup
   if(allocated(lambda_rot))          deallocate(lambda_rot)
   if(allocated(dlambda_rot))         deallocate(dlambda_rot)
   if(allocated(dens_rot))            deallocate(dens_rot)
-  if(allocated(dens2_rot))   then
-     ccl_del_device(dens2_rot)
-     deallocate(dens2_rot)
-  endif
+  if(allocated(dens2_rot))           deallocate(dens2_rot)
   if(allocated(dens_ele_rot))        deallocate(dens_ele_rot)
   if(allocated(dens_avg_rot))        deallocate(dens_avg_rot)
   if(allocated(dlnndr_avg_rot))      deallocate(dlnndr_avg_rot)
@@ -152,18 +137,6 @@ subroutine cgyro_cleanup
       ccl_del_device(source)      
      deallocate(source)
   endif
-  if(allocated(triad)) then
-      ccl_del_device(triad)      
-     deallocate(triad)
-  endif
-  if(allocated(triad_loc)) then
-      ccl_del_device(triad_loc)      
-     deallocate(triad_loc)
-  endif
-  if(allocated(triad_loc_old)) then
-      ccl_del_device(triad_loc_old)      
-     deallocate(triad_loc_old)
-  endif
   if(allocated(thfac_itor)) then
       ccl_del_device(thfac_itor)
      deallocate(thfac_itor)
@@ -228,10 +201,6 @@ subroutine cgyro_cleanup
      ccl_del_device(omega_sbeta)        
      deallocate(omega_sbeta)
   endif
-  if(allocated(diss_r)) then
-     ccl_del_device(diss_r)        
-     deallocate(diss_r)
-  endif
   if(allocated(jvec_c))  then
      ccl_del_device(jvec_c)     
      deallocate(jvec_c)
@@ -292,14 +261,6 @@ subroutine cgyro_cleanup
      ccl_del_device(g_nl)
      deallocate(g_nl)
   endif
-  if(allocated(eA_nl))   then
-     ccl_del_device(eA_nl)
-     deallocate(eA_nl)
-  endif
-  if(allocated(eB_nl))   then
-     ccl_del_device(eB_nl)
-     deallocate(eB_nl)
-  endif
   if(allocated(fpackA))   then
      ccl_del_device(fpackA)
      deallocate(fpackA)
@@ -312,14 +273,6 @@ subroutine cgyro_cleanup
      ccl_del_device(gpack)
      deallocate(gpack)
   endif
-  if(allocated(epackA))   then
-     ccl_del_device(epackA)
-     deallocate(epackA)
-  endif
-  if(allocated(epackB))   then
-     ccl_del_device(epackB)
-     deallocate(epackB)
-  endif
   if (allocated(cmat)) then
      ccl_del_bigdevice(cmat)
      deallocate(cmat)
diff --git a/cgyro/src/cgyro_equilibrium.F90 b/cgyro/src/cgyro_equilibrium.F90
index a4edcfd6a..453ea1681 100644
--- a/cgyro/src/cgyro_equilibrium.F90
+++ b/cgyro/src/cgyro_equilibrium.F90
@@ -277,9 +277,9 @@ subroutine cgyro_equilibrium
   enddo
 
 #if defined(OMPGPU)
-!$omp target enter data map(to:xi,omega_stream,w_theta,dens2_rot)
+!$omp target enter data map(to:xi,omega_stream)
 #elif defined(_OPENACC)
-!$acc enter data copyin(xi,omega_stream,w_theta,dens2_rot)
+!$acc enter data copyin(xi,omega_stream)
 #endif
 
 #if defined(OMPGPU) || defined(_OPENACC)
diff --git a/cgyro/src/cgyro_flux.f90 b/cgyro/src/cgyro_flux.f90
index b9622a165..bfdc2f728 100644
--- a/cgyro/src/cgyro_flux.f90
+++ b/cgyro/src/cgyro_flux.f90
@@ -194,9 +194,8 @@ subroutine cgyro_flux
   ! 2-1. Compute Triad energy transfer
   !-------------------------------------------------------------
 
-     if (triad_print_flag == 1) then
-      kx = 2*pi*rho/length
-      do is=1,n_species
+     kx = 2*pi*rho/length
+     do is=1,n_species
         ! Triad energy transfer   :   T_k
         triad_loc_old(is,:,itor,1) = triad_loc(is,:,itor,1) *temp(is)/dlntdr(is_ele)
         ! From Nonzonal Triad energy transfer   :   T_k [NZ(k',k")->k]
@@ -215,8 +214,7 @@ subroutine cgyro_flux
         triad_loc_old(is,:,itor,7) = triad_loc(is,:,itor,6) *temp(is)/dlntdr(is_ele)
         ! Diss. (Coll. )
         triad_loc_old(is,:,itor,8) = triad_loc(is,:,itor,7) *temp(is)/dlntdr(is_ele)
-      enddo
-     endif
+     enddo
 
 
      !-----------------------------------------------------
@@ -281,16 +279,14 @@ subroutine cgyro_flux
        NEW_COMM_1, &
        i_err)
 
-  if (triad_print_flag == 1) then
-   ! Reduced complex triad(ns,kx), below, is still distributed over n 
-   call MPI_ALLREDUCE(triad_loc_old(:,:,:,:), &
+  ! Reduced complex triad(ns,kx), below, is still distributed over n 
+  call MPI_ALLREDUCE(triad_loc_old(:,:,:,:), &
        triad, &
        size(triad), &
        MPI_DOUBLE_COMPLEX, &
        MPI_SUM, &
        NEW_COMM_1, &
        i_err)
-  endif
 
 
   tave_step = tave_step + 1
diff --git a/cgyro/src/cgyro_init_arrays.F90 b/cgyro/src/cgyro_init_arrays.F90
index 09670ca24..7b0b63b46 100644
--- a/cgyro/src/cgyro_init_arrays.F90
+++ b/cgyro/src/cgyro_init_arrays.F90
@@ -448,14 +448,12 @@ subroutine cgyro_init_arrays
                 + abs(omega_rot_drift_r(it,is)) &
                 + abs(omega_rot_edrift_r(it)))          
 
-            if (triad_print_flag == 1) then
-              ! (d/dr) upwind dissipation for triad energy transfer diagnostics
-              diss_r(ic,iv_loc,itor) = - (n_radial/length)*spectraldiss(u,nup_radial)*up_radial &
-               * (abs(omega_rdrift(it,is))*energy(ie)*(1.0+xi(ix)**2) &
-               + abs(omega_cdrift_r(it,is)*xi(ix))*vel(ie) &
-               + abs(omega_rot_drift_r(it,is)) &
-               + abs(omega_rot_edrift_r(it))) 
-            endif
+            ! (d/dr) upwind dissipation for triad energy transfer diagnostics
+            diss_r(ic,iv_loc,itor) = - (n_radial/length)*spectraldiss(u,nup_radial)*up_radial &
+             * (abs(omega_rdrift(it,is))*energy(ie)*(1.0+xi(ix)**2) &
+             + abs(omega_cdrift_r(it,is)*xi(ix))*vel(ie) &
+             + abs(omega_rot_drift_r(it,is)) &
+             + abs(omega_rot_edrift_r(it))) 
 
            ! omega_star 
            carg = &
@@ -489,13 +487,6 @@ subroutine cgyro_init_arrays
 #elif defined(_OPENACC)
 !$acc enter data copyin(omega_cap_h,omega_h,omega_s,omega_ss,omega_sbeta)
 #endif
-  if (triad_print_flag == 1) then
-#if defined(OMPGPU)
-!$omp target enter data map(to:diss_r)
-#elif defined(_OPENACC)
-!$acc enter data copyin(diss_r)
-#endif
-  endif
   !-------------------------------------------------------------------------
 
   deallocate(gdlnndr,gdlntdr)
diff --git a/cgyro/src/cgyro_init_manager.F90 b/cgyro/src/cgyro_init_manager.F90
index 7316e6951..8a2644e75 100644
--- a/cgyro/src/cgyro_init_manager.F90
+++ b/cgyro/src/cgyro_init_manager.F90
@@ -111,12 +111,6 @@ subroutine cgyro_init_manager
      enddo
   enddo
 
-#if defined(OMPGPU)
-!$omp target enter data map(to:w_exi,w_e,w_xi)
-#elif defined(_OPENACC)
-!$acc enter data copyin(w_exi,w_e,w_xi)
-#endif
-
   allocate(theta(n_theta))
   allocate(thetab(n_theta,n_radial/box_size))
   allocate(w_theta(n_theta))
@@ -185,6 +179,10 @@ subroutine cgyro_init_manager
      allocate(    gflux(0:n_global,n_species,4,n_field,nt1:nt2))
      allocate(gflux_loc(0:n_global,n_species,4,n_field,nt1:nt2))
 
+     allocate(    triad(n_species,n_radial,nt1:nt2,8))
+     allocate(triad_loc(n_species,n_radial,nt1:nt2,7))
+     allocate(triad_loc_old(n_species,n_radial,nt1:nt2,8))
+
      allocate(cflux_tave(n_species,4))
      allocate(gflux_tave(n_species,4))
 
@@ -197,16 +195,6 @@ subroutine cgyro_init_manager
 #elif defined(_OPENACC)
 !$acc enter data create(fcoef,gcoef,field,field_loc,source)
 #endif
-     if (triad_print_flag == 1) then
-        allocate(    triad(n_species,n_radial,nt1:nt2,8))
-        allocate(triad_loc(n_species,n_radial,nt1:nt2,7))
-        allocate(triad_loc_old(n_species,n_radial,nt1:nt2,8))
-#if defined(OMPGPU)
-!$omp target enter data map(alloc:triad,triad_loc,triad_loc_old)
-#elif defined(_OPENACC)
-!$acc enter data create(triad,triad_loc,triad_loc_old)
-#endif
-     endif
 
      if ((collision_model /= 5) .AND. (collision_field_model == 1)) then
        ! nc and nc_loc must be last, since it will be collated     
@@ -274,9 +262,7 @@ subroutine cgyro_init_manager
      allocate(cap_h_v(nc_loc,nt1:nt2,nv))
      allocate(omega_cap_h(nc,nv_loc,nt1:nt2))
      allocate(omega_h(nc,nv_loc,nt1:nt2))
-     if (triad_print_flag == 1) then
-        allocate(diss_r(nc,nv_loc,nt1:nt2))
-     endif
+     allocate(diss_r(nc,nv_loc,nt1:nt2))
      allocate(omega_s(n_field,nc,nv_loc,nt1:nt2))
      allocate(omega_ss(n_field,nc,nv_loc,nt1:nt2))
      allocate(omega_sbeta(nc,nv_loc,nt1:nt2))
@@ -316,8 +302,10 @@ subroutine cgyro_init_manager
      ! Nonlinear arrays
      if (nonlinear_flag == 1) then
         allocate(fA_nl(n_radial,nt_loc,nsplitA,n_toroidal_procs))
+        allocate(eA_nl(n_radial,nt_loc,nsplitA,n_toroidal_procs))
         allocate(g_nl(n_field,n_radial,n_jtheta,n_toroidal))
         allocate(fpackA(n_radial,nt_loc,nsplitA*n_toroidal_procs))
+        allocate(epackA(n_radial,nt_loc,nsplitA*n_toroidal_procs))
         allocate(gpack(n_field,n_radial,n_jtheta,n_toroidal))
         allocate(jvec_c_nl(n_field,n_radial,n_jtheta,nv_loc,n_toroidal))
 #if defined(OMPGPU)
@@ -325,32 +313,16 @@ subroutine cgyro_init_manager
 #elif defined(_OPENACC)
 !$acc enter data create(fpackA,gpack,fA_nl,g_nl,jvec_c_nl)
 #endif
-        if (triad_print_flag == 1) then
-          allocate(eA_nl(n_radial,nt_loc,nsplitA,n_toroidal_procs))
-          allocate(epackA(n_radial,nt_loc,nsplitA*n_toroidal_procs))
-#if defined(OMPGPU)
-!$omp target enter data map(alloc:epackA,eA_nl)
-#elif defined(_OPENACC)
-!$acc enter data create(epackA,eA_nl)
-#endif
-        endif
         if (nsplitB > 0) then ! nsplitB can be zero at large MPI
           allocate(fB_nl(n_radial,nt_loc,nsplitB,n_toroidal_procs))
           allocate(fpackB(n_radial,nt_loc,nsplitB*n_toroidal_procs))
+          allocate(eB_nl(n_radial,nt_loc,nsplitB,n_toroidal_procs))
+          allocate(epackB(n_radial,nt_loc,nsplitB*n_toroidal_procs))
 #if defined(OMPGPU)
 !$omp target enter data map(alloc:fpackB,fB_nl)
 #elif defined(_OPENACC)
 !$acc enter data create(fpackB,fB_nl)
 #endif
-          if (triad_print_flag == 1) then
-            allocate(epackB(n_radial,nt_loc,nsplitB*n_toroidal_procs))
-            allocate(eB_nl(n_radial,nt_loc,nsplitB,n_toroidal_procs))
-#if defined(OMPGPU)
-!$omp target enter data map(alloc:epackB,eB_nl)
-#elif defined(_OPENACC)
-!$acc enter data create(epackB,eB_nl)
-#endif
-          endif
         endif
      endif
 
diff --git a/cgyro/src/cgyro_kernel.F90 b/cgyro/src/cgyro_kernel.F90
index ee3bc3c24..95c08273f 100644
--- a/cgyro/src/cgyro_kernel.F90
+++ b/cgyro/src/cgyro_kernel.F90
@@ -135,13 +135,6 @@ subroutine cgyro_kernel
         ! wait for cap_h_c to be synched into system memory, used by cgyro_write_timedata
 !$acc wait(4)
 #endif
-       if (triad_print_flag == 1) then
-#if defined(OMPGPU)
-!$omp target update from(triad_loc)
-#elif defined(_OPENACC)
-!$acc update host(triad_loc)
-#endif
-       endif
        call timer_lib_out('coll_mem')
 
        call timer_lib_in('io')
diff --git a/cgyro/src/cgyro_nl_comm.F90 b/cgyro/src/cgyro_nl_comm.F90
index a4cd1d28f..4e0e74fae 100644
--- a/cgyro/src/cgyro_nl_comm.F90
+++ b/cgyro/src/cgyro_nl_comm.F90
@@ -326,24 +326,9 @@ subroutine cgyro_nl_fftw_comm1_r_triad(ij)
   real :: dv,dvr,dvp,rval,rval2
   complex :: cprod,cprod2,thfac
 
-#if defined(OMPGPU)
-!$omp target teams distribute parallel do simd collapse(3)
-#elif defined(_OPENACC)
-!$acc parallel loop collapse(3) gang vector independent &
-!$acc&         present(triad_loc_old,triad_loc) &
-!$acc&         present(nt1,nt2,n_radial,n_species) default(none)
-#else
-!$omp parallel do private(ir,is)
-#endif
-  do itor=nt1,nt2
-      do ir=1,n_radial
-        do is=1,n_species
-          triad_loc_old(is,ir,itor,3) = triad_loc(is,ir,itor,3)
-          triad_loc_old(is,ir,itor,4) = triad_loc(is,ir,itor,4)
-          triad_loc(is,ir,itor,:) = 0.0
-        enddo
-      enddo
-  enddo
+  triad_loc_old(:,:,nt1:nt2,3) = triad_loc(:,:,nt1:nt2,3)
+  triad_loc_old(:,:,nt1:nt2,4) = triad_loc(:,:,nt1:nt2,4)
+  triad_loc(:,:,nt1:nt2,:) = 0.0
 
   call timer_lib_in('nl_comm')
   call parallel_slib_r_nc_wait(nsplitA,fA_nl,fpackA,fA_req)
@@ -366,16 +351,11 @@ subroutine cgyro_nl_fftw_comm1_r_triad(ij)
 #if defined(OMPGPU)
 !$omp target teams distribute parallel do simd collapse(4) &
 !$omp&         private(iexch0,itor0,isplit0,iexch_base) &
-!$omp&         private(id,itorbox,jr0,jc,itd,itd_class,thfac) &
 !$omp&         private(ic_loc_m,my_psi)
 #elif defined(_OPENACC)
 !$acc parallel loop collapse(4) gang vector independent private(ic_loc_m,my_psi) &
 !$acc&         private(iexch0,itor0,isplit0,iexch_base) &
-!$acc&         private(id,itorbox,jr0,jc,itd,itd_class,thfac) &
-!$acc&         present(h_x,g_x,cap_h_c,cap_h_ct,field,dvjvec_c,jvec_c) &
-!$acc&         present(ic_c,is_v,ix_v,ie_v,w_exi,w_theta,dens2_rot,z,temp) &
-!$acc&         present(omega_stream,vel,xi,thfac_itor,cderiv,uderiv) &
-!$acc&         present(px,rhs,fpackA,fpackB,epackA,epackB,diss_r,triad_loc) copyin(psi_mul,zf_scale) &
+!$acc&         present(ic_c,px,rhs,fpackA,fpackB) copyin(psi_mul,zf_scale) &
 !$acc&         present(nt1,nt2,nv_loc,n_theta,n_radial,nsplit,nsplitA,nsplitB) copyin(ij) default(none)
 #else
 !$omp parallel do collapse(2) private(ic_loc_m,my_psi) &
@@ -385,12 +365,12 @@ subroutine cgyro_nl_fftw_comm1_r_triad(ij)
   do itor=nt1,nt2
     do iv_loc_m=1,nv_loc
       do ir=1,n_radial
-        do it=1,n_theta
-           itorbox = itor*box_size*sign_qs
-           jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial)
-           jr0(1) = n_theta*(ir-1)
-           jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial)
+        itorbox = itor*box_size*sign_qs
+        jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial)
+        jr0(1) = n_theta*(ir-1)
+        jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial)
 
+        do it=1,n_theta
            ic_loc_m = ic_c(ir,it)
 
            is = is_v(iv_loc_m +nv1 -1 )
@@ -510,16 +490,11 @@ subroutine cgyro_nl_fftw_comm1_r_triad(ij)
 #if defined(OMPGPU)
 !$omp target teams distribute parallel do simd collapse(4) &
 !$omp&         private(iexch0,itor0,isplit0,iexch_base) &
-!$omp&         private(id,itorbox,jr0,jc,itd,itd_class,thfac) &
 !$omp&         private(ic_loc_m,my_psi)
 #elif defined(_OPENACC)
 !$acc parallel loop collapse(4) gang vector independent private(ic_loc_m,my_psi) &
 !$acc&         private(iexch0,itor0,isplit0,iexch_base) &
-!$acc&         private(id,itorbox,jr0,jc,itd,itd_class,thfac) &
-!$acc&         present(h_x,g_x,cap_h_c,cap_h_ct,field,dvjvec_c,jvec_c) &
-!$acc&         present(ic_c,is_v,ix_v,ie_v,w_exi,w_theta,dens2_rot,z,temp) &
-!$acc&         present(omega_stream,vel,xi,thfac_itor,cderiv,uderiv) &
-!$acc&         present(px,rhs,fpackA,epackA,diss_r,triad_loc) copyin(psi_mul,zf_scale) &
+!$acc&         present(ic_c,px,rhs,fpackA) copyin(psi_mul,zf_scale) &
 !$acc&         present(nt1,nt2,nv_loc,n_theta,n_radial,nsplit,nsplitA) copyin(ij) default(none)
 #else
 !$omp parallel do collapse(2) private(ic_loc_m,my_psi) &
@@ -529,12 +504,12 @@ subroutine cgyro_nl_fftw_comm1_r_triad(ij)
   do itor=nt1,nt2
     do iv_loc_m=1,nv_loc
       do ir=1,n_radial
-        do it=1,n_theta
-           itorbox = itor*box_size*sign_qs
-           jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial)
-           jr0(1) = n_theta*(ir-1)
-           jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial)
+        itorbox = itor*box_size*sign_qs
+        jr0(0) = n_theta*modulo(ir-itorbox-1,n_radial)
+        jr0(1) = n_theta*(ir-1)
+        jr0(2) = n_theta*modulo(ir+itorbox-1,n_radial)
 
+        do it=1,n_theta
            ic_loc_m = ic_c(ir,it)
 
            is = is_v(iv_loc_m +nv1 -1 )
diff --git a/cgyro/src/cgyro_nl_fftw.F90 b/cgyro/src/cgyro_nl_fftw.F90
index a4c58206a..a2e065fde 100644
--- a/cgyro/src/cgyro_nl_fftw.F90
+++ b/cgyro/src/cgyro_nl_fftw.F90
@@ -378,63 +378,7 @@ subroutine cgyro_nl_fftw_mul(sz,uvm,uxm,vym,uym,vxm,inv_nxny)
 
 end subroutine
 
-subroutine cgyro_nl_fftw_mul_sub_mean(ny,nx,nt,uvm,uxm,vym,uym,vxm,inv_nxny)
-  implicit none
-
-  integer, intent(in) :: ny,nx,nt
-  real, dimension(*),intent(out) :: uvm
-  real, dimension(*),intent(in) :: uxm,vym,uym,vxm
-  real, intent(in) :: inv_nxny
-
-  integer :: iy,ix,it
-  integer :: sz,sz_t
-  integer :: i,i_xt
-  real :: y_mean_ux, y_mean_uy, y_mean_vx, y_mean_vy
-  real :: r_ux, r_uy, r_vx, r_vy
-  real :: inv_ny
-
-  sz = ny*nx*nt
-  sz_t = ny*nx
-  inv_ny = 1.0/ny
-
-#if defined(OMPGPU)
-!$omp target teams distribute parallel do collapse(2) &
-!$omp&  private(iy,i_xt,y_mean_ux,y_mean_uy,y_mean_vx,y_mean_vy) &
-!$omp&  private(i,r_ux,r_uy,r_vx,r_vy) &
-!$omp&  map(to:uxm(1:sz),vym(1:sz),uym(1:sz),vxm(1:sz)) &
-!$omp&  map(from:uvm(1:sz))
-#else
-!$acc parallel loop independent gang collapse(2) &
-!$acc&         private(iy,i_xt,y_mean_ux,y_mean_uy,y_mean_vx,y_mean_vy) &
-!$acc&         present(uvm,uxm,vym,uym,vxm)
-#endif
-  do it=0,nt-1
-   do ix=0,nx-1
-    i_xt = (it*sz_t) + ix*ny;
-    y_mean_ux = sum(uxm(i_xt+1:i_xt+ny) ) * inv_ny
-    y_mean_uy = sum(uym(i_xt+1:i_xt+ny) ) * inv_ny
-    y_mean_vx = sum(vxm(i_xt+1:i_xt+ny) ) * inv_ny
-    y_mean_vy = sum(vym(i_xt+1:i_xt+ny) ) * inv_ny
-#if (!defined(OMPGPU)) && defined(_OPENACC)
-!$acc loop vector private(i,r_ux,r_uy,r_vx,r_vy)
-#endif
-    do iy=1,ny
-      i = i_xt+iy
-      ! remove ky=0
-      r_ux = uxm(i) - y_mean_ux
-      r_uy = uym(i) - y_mean_uy
-      r_vx = vxm(i) - y_mean_vx
-      r_vy = vym(i) - y_mean_vy
-
-      ! compute and save uv
-      uvm(i) = (uxm(i)*vym(i)-uym(i)*vxm(i))*inv_nxny
-    enddo
-   enddo
-  enddo
-
-end subroutine
-
-subroutine cgyro_nl_fftw(i_triad)
+subroutine cgyro_nl_fftw(ij)
 
 #if defined(HIPGPU)
   use hipfort
@@ -454,7 +398,7 @@ subroutine cgyro_nl_fftw(i_triad)
   include 'fftw/fftw3.f'
 #endif
   !-----------------------------------
-  integer, intent(in) :: i_triad
+  integer, intent(in) :: ij
   !-----------------------------------
   integer :: j,p,iexch
   integer :: it,ir,itm,itl,ix,iy
@@ -533,7 +477,7 @@ subroutine cgyro_nl_fftw(i_triad)
 
   call timer_lib_in('nl')
 #if !defined(OMPGPU)
-!$acc  data present(fA_nl,eA_nl)  &
+!$acc  data present(fA_nl)  &
 !$acc&      present(fxmany,fymany,gxmany,gymany) &
 !$acc&      present(uxmany,uymany,vxmany,vymany) &
 !$acc&      present(uvmany)
@@ -985,99 +929,8 @@ subroutine cgyro_nl_fftw(i_triad)
    enddo
   enddo
 
-  if (i_triad == 1) then
-    call timer_lib_out('nl_mem')
-    call timer_lib_in('nl')
-    ! 2. Poisson bracket in real space with Non_Zonal pairs
-
-    call cgyro_nl_fftw_mul_sub_mean(size(uvmany,1),size(uvmany,2),nsplitA, &
-                         uvmany, &
-                         uxmany,vymany(:,:,1:nsplitA), &
-                         uymany,vxmany(:,:,1:nsplitA), &
-                         inv_nxny)
-
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-
-    ! ------------------
-    ! Transform uv to fx
-    ! ------------------
-
-#if defined(OMPGPU)
-
-#if defined(MKLGPU)
-!$omp target data map(tofrom: uvmany,fxmany)
-#else
-!$omp target data use_device_ptr(uvmany,fxmany)
-#endif
-
-#else
-!$acc wait
-!$acc  host_data use_device(uvmany,fxmany)
-#endif
-
-#if defined(HIPGPU)
-    rc = hipfftExecD2Z(hip_plan_r2c_manyA,c_loc(uvmany),c_loc(fxmany))
-#elif defined(MKLGPU)
-  !$omp dispatch
-    call dfftw_execute_dft_r2c(dfftw_plan_r2c_manyA,uvmany,fxmany)
-    rc = 0
-#else
-    rc = cufftExecD2Z(cu_plan_r2c_manyA,uvmany,fxmany)
-#endif
-
-#ifdef HIPGPU
-    ! hipfftExec is asynchronous, will need the results below
-    rc = hipDeviceSynchronize()
-#endif
-
-#if defined(OMPGPU)
-!$omp end target data
-#else
-!$acc wait
-!$acc end host_data
-#endif
-
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-
-    call timer_lib_out('nl')
-    call timer_lib_in('nl_mem')
-
-    ! tile for performance, since this is effectively a transpose
-#if defined(OMPGPU)
-!$omp target teams distribute parallel do collapse(5) &
-!$omp&   private(iy,ir,itm,itl,ix)
-#else
-!$acc parallel loop independent collapse(5) gang &
-!$acc&         private(iy,ir,itm,itl,ix) present(eA_nl,fxmany)
-#endif
-    do j=1,nsplitA
-     do iy0=0,n_toroidal+(R_TORTILE-1)-1,R_TORTILE  ! round up
-      do ir0=0,n_radial+(R_RADTILE-1)-1,R_RADTILE  ! round up
-       do iy1=0,(R_TORTILE-1)   ! tile
-        do ir1=0,(R_RADTILE-1)  ! tile
-         iy = iy0 + iy1
-         ir = 1 + ir0 + ir1
-         if ((iy < n_toroidal) .and. (ir <= n_radial)) then
-           ! itor = iy+1
-           itm = 1 + iy/nt_loc
-           itl = 1 + modulo(iy,nt_loc)
-           ix = ir-1-nx0/2
-           if (ix < 0) ix = ix+nx
-
-           eA_nl(ir,itl,j,itm) = fxmany(iy,ix,j)
-          endif
-        enddo
-       enddo
-      enddo
-     enddo
-    enddo
-
-  endif ! i_triad==1
-
 #if !defined(OMPGPU)
-  ! end data fA_nl,eA_nl
+  ! end data fA_nl
 !$acc end data
 #endif
 
@@ -1125,9 +978,6 @@ subroutine cgyro_nl_fftw(i_triad)
   ! start the async reverse comm
   ! can reuse the same req, no overlap with forward fA_req
   call parallel_slib_r_nc_async(nsplitA,fA_nl,fpackA,fA_req)
-  if (i_triad == 1) then
-     call parallel_slib_r_nc_async(nsplitA,eA_nl,epackA,eA_req)
-  end if
   fA_req_valid = .TRUE.
 
   if (nsplitB > 0) then
@@ -1143,7 +993,7 @@ subroutine cgyro_nl_fftw(i_triad)
 
   call timer_lib_in('nl')
 #if !defined(OMPGPU)
-!$acc  data present(fB_nl,eB_nl)  &
+!$acc  data present(fB_nl)  &
 !$acc&      present(fxmany,fymany,gxmany,gymany) &
 !$acc&      present(uxmany,uymany,vxmany,vymany) &
 !$acc&      present(uvmany)
@@ -1392,105 +1242,8 @@ subroutine cgyro_nl_fftw(i_triad)
    enddo
   enddo
 
-  if (i_triad == 1) then
-    call timer_lib_out('nl_mem')
-    call timer_lib_in('nl')
-    ! 2. Poisson bracket in real space with Non_Zonal pairs
-    call cgyro_nl_fftw_mul(size(uvmany,1)*size(uvmany,2)*nsplitB, &
-                         uvmany, &
-                         uxmany,vymany(:,:,(nsplitA+1):nsplit), &
-                         uymany,vxmany(:,:,(nsplitA+1):nsplit), &
-                         inv_nxny)
-
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-
-    ! ------------------
-    ! Transform uv to fx
-    ! ------------------
-
-#if defined(OMPGPU)
-
-#if defined(MKLGPU)
-!$omp target data map(tofrom: uvmany,fxmany)
-#else
-!$omp target data use_device_ptr(uvmany,fxmany)
-#endif
-
-#else
-!$acc wait
-!$acc  host_data use_device(uvmany,fxmany)
-#endif
-
-#if defined(HIPGPU)
-    rc = hipfftExecD2Z(hip_plan_r2c_manyB,c_loc(uvmany),c_loc(fxmany))
-#elif defined(MKLGPU)
-    !$omp dispatch
-    call dfftw_execute_dft_r2c(dfftw_plan_r2c_manyB,uvmany,fxmany)
-    rc = 0
-#else
-    rc = cufftExecD2Z(cu_plan_r2c_manyB,uvmany,fxmany)
-#endif
-
-#ifdef HIPGPU
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-    ! hipfftExec is asynchronous, will need the results below
-    rc = hipDeviceSynchronize()
-#endif
-
-#if defined(OMPGPU)
-!$omp end target data
-#else
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-!$acc wait
-!$acc end host_data
-#endif
-
-    ! make sure reqs progress
-    call cgyro_nl_fftw_comm_test()
-
-    call timer_lib_out('nl')
-    call timer_lib_in('nl_mem')
-
-    ! NOTE: The FFT will generate an unwanted n=0,p=-nr/2 component
-    ! that will be filtered in the main time-stepping loop
-
-    ! tile for performance, since this is effectively a transpose
-#if defined(OMPGPU)
-!$omp target teams distribute parallel do collapse(5) &
-!$omp&   private(iy,ir,itm,itl,ix)
-#else
-!$acc parallel loop independent collapse(5) gang &
-!$acc&         private(iy,ir,itm,itl,ix) present(eB_nl,fxmany)
-#endif
-    do j=1,nsplitB
-     do iy0=0,n_toroidal+(R_TORTILE-1)-1,R_TORTILE  ! round up
-      do ir0=0,n_radial+(R_RADTILE-1)-1,R_RADTILE  ! round up
-       do iy1=0,(R_TORTILE-1)   ! tile
-        do ir1=0,(R_RADTILE-1)  ! tile
-         iy = iy0 + iy1
-         ir = 1 + ir0 + ir1
-         if ((iy < n_toroidal) .and. (ir <= n_radial)) then
-           ! itor = iy+1
-           itm = 1 + iy/nt_loc
-           itl = 1 + modulo(iy,nt_loc)
-           ix = ir-1-nx0/2
-           if (ix < 0) ix = ix+nx
-
-           eB_nl(ir,itl,j,itm) = fxmany(iy,ix,j)
-         endif
-        enddo
-       enddo
-      enddo
-     enddo
-    enddo
-
-  endif ! if i_triad
-
 #if !defined(OMPGPU)
-  ! end data fB_nl,eB_nl
+  ! end data fB_nl
 !$acc end data
 #endif
 
@@ -1500,9 +1253,6 @@ subroutine cgyro_nl_fftw(i_triad)
   ! start the async reverse comm
   ! can reuse the same req, no overlap with forward fB_req
   call parallel_slib_r_nc_async(nsplitB,fB_nl,fpackB,fB_req)
-  if (i_triad == 1) then
-     call parallel_slib_r_nc_async(nsplitB,eB_nl,epackB,eB_req)
-  end if
   fB_req_valid = .TRUE.
   ! make sure reqs progress
   call cgyro_nl_fftw_comm_test()
@@ -1522,24 +1272,16 @@ subroutine cgyro_nl_fftw_stepr(g_j, f_j, nl_idx, i_omp)
 
   implicit none
 
-  !-----------------------------------
   integer, intent(in) :: g_j, f_j
   integer,intent(in) :: nl_idx ! 1=>A, 2=>B
   integer,intent(in) :: i_omp
-  !-----------------------------------
   integer :: ix,iy
   integer :: ir,itm,itl,itor
-  real :: inv_nxny
 
   include 'fftw3.f03'
 
-  inv_nxny = 1.0/(nx*ny)
   ! Poisson bracket in real space
-  do ix=0,nx-1
-    do iy=0,ny-1
-      uv(iy,ix,i_omp) = (uxmany(iy,ix,f_j)*vymany(iy,ix,g_j)-uymany(iy,ix,f_j)*vxmany(iy,ix,g_j)) * inv_nxny
-    enddo
-  enddo
+  uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny)
 
   call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp))
 
@@ -1565,7 +1307,6 @@ subroutine cgyro_nl_fftw_stepr(g_j, f_j, nl_idx, i_omp)
 
 end subroutine cgyro_nl_fftw_stepr
 
-  ! assumes cgyro_nl_fftw_stepr has already been called
 subroutine cgyro_nl_fftw_stepr_triad(g_j, f_j, nl_idx, i_omp)
 
   use timer_lib
@@ -1574,47 +1315,73 @@ subroutine cgyro_nl_fftw_stepr_triad(g_j, f_j, nl_idx, i_omp)
 
   implicit none
 
-  !-----------------------------------
   integer, intent(in) :: g_j, f_j
   integer,intent(in) :: nl_idx ! 1=>A, 2=>B
   integer,intent(in) :: i_omp
-  !-----------------------------------
-  real :: y_mean_ux, y_mean_uy, y_mean_vx, y_mean_vy
-  real :: r_ux, r_uy, r_vx, r_vy
+  real :: y_mean(nx)
   integer :: ix,iy
   integer :: ir,itm,itl,itor
-  real :: inv_nxny,inv_ny
 
   include 'fftw3.f03'
 
-  inv_ny = 1.0/ny
-  inv_nxny = 1.0/(nx*ny)
-  ! 1. Poisson bracket in real space was done in cgyro_nl_fftw_stepr
+  ! 1. Poisson bracket in real space
+  uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny)
 
-  ! 2. Poisson bracket in real space with Non_Zonal pairs
+  call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp))
 
-  do ix=0,nx-1
-    y_mean_ux = sum(uxmany(:,ix,f_j) ,dim=1 ) * inv_ny
-    y_mean_uy = sum(uymany(:,ix,f_j) ,dim=1 ) * inv_ny
-    y_mean_vx = sum(vxmany(:,ix,f_j) ,dim=1 ) * inv_ny
-    y_mean_vy = sum(vymany(:,ix,f_j) ,dim=1 ) * inv_ny
-    do iy=0,ny-1
-      ! remove ky=0
-      r_ux = uxmany(iy,ix,f_j) - y_mean_ux
-      r_uy = uymany(iy,ix,f_j) - y_mean_uy
-      r_vx = vxmany(iy,ix,f_j) - y_mean_vx
-      r_vy = vymany(iy,ix,f_j) - y_mean_vy
-      ! we could save, but we do not really need to (not used outside from this function)
-      !uxmany(iy,ix,f_j) = r_ux
-      !uymany(iy,ix,f_j) = r_y
-      !vxmany(iy,ix,f_j) = r_vx
-      !vymany(iy,ix,f_j) = r_vy
-
-      ! compute and save uv
-      uv(iy,ix,i_omp) = (r_ux*r_vy-r_uy*r_vx) * inv_nxny
+  ! NOTE: The FFT will generate an unwanted n=0,p=-nr/2 component
+  ! that will be filtered in the main time-stepping loop
+
+  ! this should really be accounted against nl_mem, but hard to do with OMP
+  do itm=1,n_toroidal_procs
+   do itl=1,nt_loc
+    itor=itl + (itm-1)*nt_loc
+    do ir=1,n_radial
+     ix = ir-1-nx0/2
+     if (ix < 0) ix = ix+nx
+     iy = itor-1
+     if (nl_idx==1) then
+       fA_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp)
+     else
+       fB_nl(ir,itl,f_j,itm) = fx(iy,ix,i_omp)
+     endif
     enddo
+   enddo
   enddo
 
+
+  ! 2. Poisson bracket in real space with Non_Zonal pairs
+
+  ! remove ky=0 in uxmany
+  y_mean = sum(uxmany(:,:,f_j) ,dim=1 ) / ny
+  do iy=0,ny-1
+    uxmany(iy,:,f_j) = uxmany(iy,:,f_j) -y_mean
+
+  end do
+
+  ! remove ky=0 in uymany
+  y_mean = sum(uymany(:,:,f_j) ,dim=1 ) / ny ! =0
+  do iy=0,ny-1
+    uymany(iy,:,f_j) = uymany(iy,:,f_j) -y_mean
+
+  end do
+
+  ! remove ky=0 in vx
+  y_mean = sum(vxmany(:,:,g_j) ,dim=1 ) / ny
+  do iy=0,ny-1
+    vxmany(iy,:,g_j) = vxmany(iy,:,g_j) -y_mean
+
+  end do
+
+  ! remove ky=0 in vy
+  y_mean = sum(vymany(:,:,g_j) ,dim=1 ) / ny ! =0
+  do iy=0,ny-1
+    vymany(iy,:,g_j) = vymany(iy,:,g_j) -y_mean
+
+  end do
+
+  uv(:,:,i_omp) = (uxmany(:,:,f_j)*vymany(:,:,g_j)-uymany(:,:,f_j)*vxmany(:,:,g_j))/(nx*ny)
+
   call fftw_execute_dft_r2c(plan_r2c,uv(:,:,i_omp),fx(:,:,i_omp))
 
   do itm=1,n_toroidal_procs
@@ -1636,7 +1403,7 @@ subroutine cgyro_nl_fftw_stepr_triad(g_j, f_j, nl_idx, i_omp)
 end subroutine cgyro_nl_fftw_stepr_triad
 
 ! NOTE: call cgyro_nl_fftw_comm1 before cgyro_nl_fftw
-subroutine cgyro_nl_fftw(i_triad)
+subroutine cgyro_nl_fftw(ij)
 
   use timer_lib
   use parallel_lib
@@ -1647,9 +1414,9 @@ subroutine cgyro_nl_fftw(i_triad)
 
 
   !-----------------------------------
-  integer, intent(in) :: i_triad
+  integer, intent(in) :: ij
   !-----------------------------------
-  integer :: ix,iy
+  integer :: ix,iy,i_triad=0
   integer :: ir,it,itm,itl,it_loc
   integer :: itor,mytm
   integer :: j,p
@@ -1662,6 +1429,10 @@ subroutine cgyro_nl_fftw(i_triad)
 
   include 'fftw3.f03'
 
+  if (triad_print_flag == 1 .and. ij == 3) then
+   i_triad=1
+  endif
+
   call cgyro_nl_fftw_comm_test()
 
   ! time to wait for the FA_nl to become avaialble
@@ -1803,14 +1574,10 @@ subroutine cgyro_nl_fftw(i_triad)
         endif
 
         if (j<=nsplitA) then
-           call cgyro_nl_fftw_stepr(j, j, 1, i_omp)
-           if (i_omp==1) then
-             ! use the main thread to progress the async MPI
-             call cgyro_nl_fftw_comm_test()
-           endif
-
            if (i_triad == 1) then
               call cgyro_nl_fftw_stepr_triad(j, j, 1, i_omp)
+           else
+              call cgyro_nl_fftw_stepr(j, j, 1, i_omp)
            endif
         endif
         ! else we will do it in the next loop
@@ -1893,14 +1660,10 @@ subroutine cgyro_nl_fftw(i_triad)
           call cgyro_nl_fftw_comm_test()
         endif
 
-        call cgyro_nl_fftw_stepr(nsplitA+j, j, 2, i_omp)
-        if (i_omp==1) then
-         ! use the main thread to progress the async MPI
-          call cgyro_nl_fftw_comm_test()
-        endif
-
         if (i_triad == 1) then
           call cgyro_nl_fftw_stepr_triad(nsplitA+j, j, 2, i_omp)
+        else
+          call cgyro_nl_fftw_stepr(nsplitA+j, j, 2, i_omp)
         endif
    enddo ! j
 
diff --git a/cgyro/src/cgyro_rhs.F90 b/cgyro/src/cgyro_rhs.F90
index b566b4328..b83770355 100644
--- a/cgyro/src/cgyro_rhs.F90
+++ b/cgyro/src/cgyro_rhs.F90
@@ -393,7 +393,6 @@ subroutine cgyro_rhs(ij,update_cap)
   integer, intent(in) :: ij
   logical, intent(in) :: update_cap
   !--------------------------------
-  integer :: i_triad
 
   ! fields is ready by now
   call cgyro_rhs_comm_async_fd
@@ -413,13 +412,9 @@ subroutine cgyro_rhs(ij,update_cap)
 
   ! Nonlinear evaluation [f,g]
   if (nonlinear_flag == 1) then
-     i_triad=0
-     if (triad_print_flag == 1 .and. ij == 3) then
-       i_triad=1
-     endif
      ! assumes someone already started the input comm
      ! and will finish the output comm
-     call cgyro_nl_fftw(i_triad)
+     call cgyro_nl_fftw(ij)
   endif
 
   call cgyro_upwind_complete
diff --git a/cgyro/src/cgyro_step_collision.F90 b/cgyro/src/cgyro_step_collision.F90
index eb97d6207..c2297993c 100644
--- a/cgyro/src/cgyro_step_collision.F90
+++ b/cgyro/src/cgyro_step_collision.F90
@@ -1049,38 +1049,6 @@ subroutine cgyro_step_collision_gpu(use_simple)
   call timer_lib_in('coll')
 
   ! Compute H given h and [phi(h), apar(h)]
- if (triad_print_flag == 1 ) then
-
-#if defined(OMPGPU)
-!$omp target teams distribute parallel do simd collapse(3) &
-!$omp&         private(iv_loc,is,my_psi,my_ch)
-#else
-!$acc parallel loop collapse(3) gang vector private(iv_loc,is,my_psi,my_ch) &
-!$acc&         present(is_v,cap_h_c,cap_h_ct,cap_h_c,jvec_c,field,z,temp,h_x) &
-!$acc&         present(nt1,nt2,nv1,nv2,nc) default(none)
-#endif
-  do itor=nt1,nt2
-   do iv=nv1,nv2
-     do ic=1,nc
-        iv_loc = iv-nv1+1
-        is = is_v(iv)
-
-        ! Save collisional diss. 
-        my_ch = cap_h_ct(iv_loc,itor,ic)
-        my_psi = sum(jvec_c(:,ic,iv_loc,itor)*field(:,ic,itor))
-           
-        my_psi = my_ch-my_psi*(z(is)/temp(is))
-        cap_h_ct(iv_loc,itor,ic) = (cap_h_c(ic,iv_loc,itor) + my_ch) / 2.0
-        cap_h_ct(iv_loc,itor,ic) = conjg(cap_h_ct(iv_loc,itor,ic)) &
-          * ( my_psi - h_x(ic,iv_loc,itor) )
-
-        h_x(ic,iv_loc,itor) = my_psi
-        cap_h_c(ic,iv_loc,itor) = my_ch
-     enddo
-   enddo
-  enddo
-
- else
 
 #if defined(OMPGPU)
 !$omp target teams distribute parallel do simd collapse(3) &
@@ -1103,8 +1071,6 @@ subroutine cgyro_step_collision_gpu(use_simple)
    enddo
   enddo
 
- end if ! (triad_print_flag == 1 )
-
   call timer_lib_out('coll')
 
 end subroutine cgyro_step_collision_gpu