
Commit

Merge pull request #1644 from danielpeter/devel
updates array allocations and seismogram outputs; cleans up code comments
danielpeter authored Nov 13, 2023
2 parents 1297505 + 5530249 commit ee9914a
Showing 16 changed files with 96 additions and 97 deletions.
1 change: 1 addition & 0 deletions src/decompose_mesh/rules.mk
@@ -154,6 +154,7 @@ xdecompose_mesh_mpi_OBJECTS = \
$O/module_database.dec.o \
$O/module_mesh.dec.o \
$O/module_partition.dec.o \
$O/part_decompose_mesh.dec_module.o \
$O/program_decompose_mesh_mpi.mpidec.o \
$(EMPTY_MACRO)

16 changes: 11 additions & 5 deletions src/generate_databases/save_arrays_solver_hdf5.F90
@@ -90,7 +90,7 @@ subroutine save_arrays_solver_mesh_hdf5()

! element node connectivity for movie output
! the node ids are stored after dividing one NGLL*-th order spectral element into (NGLLX-1)*(NGLLY-1)*(NGLLZ-1) sub-elements
integer, dimension(9,nspec_ab*(NGLLX-1)*(NGLLY-1)*(NGLLZ-1)) :: spec_elm_conn_xdmf
integer, dimension(:,:),allocatable :: spec_elm_conn_xdmf

! dummy arrays
integer, dimension(1,1), parameter :: i2d_dummy = reshape((/0/),(/1,1/))
@@ -153,9 +153,17 @@ subroutine save_arrays_solver_mesh_hdf5()
allocate(ibool_interfaces_ext_mesh_dummy(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
if (ier /= 0) call exit_MPI_without_rank('error allocating array 650')
if (ier /= 0) stop 'error allocating array'
ibool_interfaces_ext_mesh_dummy(:,:) = 0
do i = 1, num_interfaces_ext_mesh
ibool_interfaces_ext_mesh_dummy(:,i) = ibool_interfaces_ext_mesh(1:max_nibool_interfaces_ext_mesh,i)
enddo

! connectivity
allocate(spec_elm_conn_xdmf(9,NSPEC_AB*(NGLLX-1)*(NGLLY-1)*(NGLLZ-1)),stat=ier)
if (ier /= 0) call exit_MPI_without_rank('error allocating array 651')
if (ier /= 0) stop 'error allocating spec_elm_conn_xdmf array'
spec_elm_conn_xdmf(:,:) = 0

call synchronize_all()

! get MPI parameters
@@ -1433,10 +1441,8 @@ subroutine save_arrays_solver_mesh_hdf5()
call synchronize_all()

! cleanup
if (allocated(ibool_interfaces_ext_mesh_dummy)) then
deallocate(ibool_interfaces_ext_mesh_dummy,stat=ier)
if (ier /= 0) stop 'error deallocating array ibool_interfaces_ext_mesh_dummy'
endif
deallocate(ibool_interfaces_ext_mesh_dummy)
deallocate(spec_elm_conn_xdmf)

! user output
if (myrank == 0) then
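For reference, a minimal standalone sketch of the allocate/initialize/deallocate pattern used above for spec_elm_conn_xdmf; the parameter values and the plain stop (in place of exit_MPI_without_rank) are illustrative only:

  program demo_connectivity_alloc
    implicit none
    ! illustrative stand-ins for the solver's constants and mesh size
    integer, parameter :: NGLLX = 5, NGLLY = 5, NGLLZ = 5
    integer, parameter :: nspec_ab = 10
    integer :: ier
    ! heap-allocated connectivity array instead of a large automatic array
    integer, dimension(:,:), allocatable :: spec_elm_conn_xdmf

    allocate(spec_elm_conn_xdmf(9,nspec_ab*(NGLLX-1)*(NGLLY-1)*(NGLLZ-1)),stat=ier)
    if (ier /= 0) stop 'error allocating spec_elm_conn_xdmf array'
    spec_elm_conn_xdmf(:,:) = 0

    ! ... fill connectivity and write it to the HDF5 file here ...

    deallocate(spec_elm_conn_xdmf)
  end program demo_connectivity_alloc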
11 changes: 5 additions & 6 deletions src/gpu/compute_forces_acoustic_cuda.cu
@@ -67,10 +67,9 @@ void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase,
// to combine forward and backward wavefield in the same kernel call

// kernel timing
gpu_event start, stop;
if (CUDA_TIMING ){
start_timing_gpu(&start,&stop);
}
gpu_event start,stop;
if (GPU_TIMING){ start_timing_gpu(&start,&stop); }

int nb_field = mp->simulation_type == 3 ? 2 : 1 ;

// sets gpu arrays
@@ -202,8 +201,8 @@ void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase,

}

// Cuda timing
if (CUDA_TIMING ){
// kernel timing
if (GPU_TIMING){
realw flops,time;
stop_timing_gpu(&start,&stop,"Kernel_2_acoustic_impl",&time);
// time in seconds
6 changes: 3 additions & 3 deletions src/gpu/compute_forces_viscoelastic_cuda.cu
@@ -91,7 +91,7 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,

// kernel timing
gpu_event start,stop;
if (CUDA_TIMING){ start_timing_gpu(&start,&stop); }
if (GPU_TIMING){ start_timing_gpu(&start,&stop); }

// defines local parameters for forward/adjoint function calls
realw *displ,*veloc,*accel;
@@ -1076,8 +1076,8 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
} // ANISOTROPY
} // ATTENUATION

// Cuda timing
if (CUDA_TIMING){
// kernel timing
if (GPU_TIMING){
if (ATTENUATION){
stop_timing_gpu(&start,&stop,"Kernel_2_att_impl");
}else{
4 changes: 2 additions & 2 deletions src/gpu/helper_functions.cu
@@ -227,7 +227,7 @@ void stop_timing_gpu(gpu_event* start,gpu_event* stop, const char* info_str){
#endif

// user output
printf("%s: Execution Time = %f ms\n",info_str,time);
printf("GPU_timing %s: Execution Time = %f ms\n",info_str,time);
}

/* ----------------------------------------------------------------------------------------------- */
@@ -253,7 +253,7 @@ void stop_timing_gpu(gpu_event* start,gpu_event* stop, const char* info_str, realw* t){
#endif

// user output
printf("%s: Execution Time = %f ms\n",info_str,time);
printf("GPU_timing %s: Execution Time = %f ms\n",info_str,time);

// returns time
*t = time;
2 changes: 1 addition & 1 deletion src/gpu/kernels/Kernel_2_acoustic_impl.cu
@@ -462,7 +462,7 @@ Kernel_2_acoustic_impl(const int nb_blocks_to_compute,
// i.e. 11% x theoretical peak performance ~ 440 GFlop/s.
// 11% x "pratical" peak performance ~ 320 GFlop/s.
//
// CUDA_TIMING: we achieve about 224 GFlop/s (1 mpi process, 20736 elements)
// GPU_TIMING: we achieve about 224 GFlop/s (1 mpi process, 20736 elements)
// -> that is about 8% of the "practical" peak. (or 70% of the theoretical arithmetic intensity)
//
// this might be due to the first compute code block (before first syncthreads), where
25 changes: 0 additions & 25 deletions src/gpu/kernels/Kernel_2_viscoelastic_impl.cu
@@ -2412,8 +2412,6 @@ Kernel_2_att_impl(int nb_blocks_to_compute,
#endif
}

// JC JC here we will need to add GPU support for the new C-PML routines

// synchronize all the threads (one thread for each of the NGLL grid points of the
// current spectral element) because we need the whole element to be ready in order
// to be able to compute the matrix products along cut planes of the 3D element below
@@ -2547,8 +2545,6 @@ Kernel_2_att_impl(int nb_blocks_to_compute,
&rho_s_H1,&rho_s_H2,&rho_s_H3);
}

// JC JC here we will need to add GPU support for the new C-PML routines

// form dot product with test vector, symmetric form
// 1. cut-plane xi
__syncthreads();
@@ -2583,8 +2579,6 @@ Kernel_2_att_impl(int nb_blocks_to_compute,
sum_terms3 += rho_s_H3;
}

// JC JC here we will need to add GPU support for the new C-PML routines

// assembles acceleration array
if (threadIdx.x < NGLL3) {

@@ -2662,8 +2656,6 @@ Kernel_2_att_impl(int nb_blocks_to_compute,
epsilondev_trace[tx + working_element*NGLL3] = epsilondev_trace_loc;
} // threadIdx.x

// JC JC here we will need to add GPU support for the new C-PML routines

} // kernel_2_att_impl()


@@ -2814,8 +2806,6 @@ Kernel_2_att_org_impl(int nb_blocks_to_compute,
// copy displacement from global memory to shared memory
load_shared_memory_displ<FORWARD_OR_ADJOINT>(&tx,&iglob,d_displ,s_dummyx_loc,s_dummyy_loc,s_dummyz_loc);
// JC JC here we will need to add GPU support for the new C-PML routines
// attenuation
// use first order Taylor expansion of displacement for local storage of stresses
// at this current time step, to fix attenuation in a consistent way
@@ -2875,8 +2865,6 @@ Kernel_2_att_org_impl(int nb_blocks_to_compute,
tempz3l += s_dummyz_loc[l*NGLL2+J*NGLLX+I]*fac3;
}
// JC JC here we will need to add GPU support for the new C-PML routines
// attenuation
// temporary variables used for fixing attenuation in a consistent way
tempx1l_att = 0.f;
@@ -2963,8 +2951,6 @@ Kernel_2_att_org_impl(int nb_blocks_to_compute,
+ s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
// JC JC here we will need to add GPU support for the new C-PML routines
// attenuation
// temporary variables used for fixing attenuation in a consistent way
tempx1l_att = s_dummyx_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
@@ -3049,8 +3035,6 @@ Kernel_2_att_org_impl(int nb_blocks_to_compute,
duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
// JC JC here we will need to add GPU support for the new C-PML routines
// precompute some sums to save CPU time
duxdxl_plus_duydyl = duxdxl + duydyl;
duxdxl_plus_duzdzl = duxdxl + duzdzl;
@@ -3059,8 +3043,6 @@ Kernel_2_att_org_impl(int nb_blocks_to_compute,
duzdxl_plus_duxdzl = duzdxl + duxdzl;
duzdyl_plus_duydzl = duzdyl + duydzl;
// JC JC here we will need to add GPU support for the new C-PML routines
// attenuation
// temporary variables used for fixing attenuation in a consistent way
duxdxl_att = xixl*tempx1l_att + etaxl*tempx2l_att + gammaxl*tempx3l_att;
@@ -3207,8 +3189,6 @@ Kernel_2_att_org_impl(int nb_blocks_to_compute,
// to be able to compute the matrix products along cut planes of the 3D element below
__syncthreads();
// JC JC here we will need to add GPU support for the new C-PML routines
if (active ){
#ifndef MANUALLY_UNROLLED_LOOPS
@@ -3326,10 +3306,7 @@ Kernel_2_att_org_impl(int nb_blocks_to_compute,
d_accel[iglob*3 + 2] += sum_terms3;
#endif // USE_TEXTURES_FIELDS
// JC JC here we will need to add GPU support for the new C-PML routines
#else // MESH_COLORING
//mesh coloring
if (use_mesh_coloring_gpu ){
@@ -3371,8 +3348,6 @@ Kernel_2_att_org_impl(int nb_blocks_to_compute,
epsilondev_yz[tx + working_element*NGLL3] = epsilondev_yz_loc;
} // if (active)
// JC JC here we will need to add GPU support for the new C-PML routines
} // kernel_2_att_impl()
*/
3 changes: 1 addition & 2 deletions src/gpu/mesh_constants_gpu.h
@@ -112,8 +112,7 @@ typedef double realw;
#endif

// performance timers
#define CUDA_TIMING 0
#define CUDA_TIMING_UPDATE 0
#define GPU_TIMING 0

// error checking after cuda function calls
// (note: this synchronizes many calls, thus e.g. no asynchronuous memcpy possible)
2 changes: 0 additions & 2 deletions src/gpu/prepare_mesh_constants_cuda.cu
@@ -420,8 +420,6 @@ void FC_FUNC_(prepare_constants_device,
// Kelvin_voigt initialization
mp->use_Kelvin_Voigt_damping = 0;

// JC JC here we will need to add GPU support for the new C-PML routines

GPU_ERROR_CHECKING("prepare_constants_device");
}

12 changes: 6 additions & 6 deletions src/gpu/update_displacement_cuda.cu
@@ -77,7 +77,7 @@ void FC_FUNC_(update_displacement_cuda,

// kernel timing
gpu_event start,stop;
if (CUDA_TIMING_UPDATE ) start_timing_gpu(&start,&stop);
if (GPU_TIMING) start_timing_gpu(&start,&stop);

// debug
//realw max_d,max_v,max_a;
@@ -166,8 +166,8 @@ void FC_FUNC_(update_displacement_cuda,
}
}

// Cuda timing
if (CUDA_TIMING_UPDATE ){
// kernel timing
if (GPU_TIMING){
realw flops,time;
stop_timing_gpu(&start,&stop,"UpdateDispVeloc_kernel",&time);
// time in seconds
@@ -235,7 +235,7 @@ void FC_FUNC_(update_displacement_ac_cuda,

// kernel timing
gpu_event start,stop;
if (CUDA_TIMING_UPDATE ) start_timing_gpu(&start,&stop);
if (GPU_TIMING) start_timing_gpu(&start,&stop);

#ifdef USE_CUDA
if (run_cuda){
@@ -255,8 +255,8 @@ void FC_FUNC_(update_displacement_ac_cuda,
}
#endif

// Cuda timing
if (CUDA_TIMING_UPDATE ){
// kernel timing
if (GPU_TIMING){
realw flops,time;
stop_timing_gpu(&start,&stop,"UpdatePotential_kernel",&time);
// time in seconds
2 changes: 1 addition & 1 deletion src/specfem3D/compute_seismograms.f90
@@ -634,7 +634,7 @@ subroutine compute_seismograms_moment_adjoint()
hprime_xx,hprime_yy,hprime_zz)

! source time function value
time_source_dble = dble(NSTEP-it)*DT - t0 - tshift_src(irec)
time_source_dble = dble(NSTEP-it) * DT - t0 - tshift_src(irec)

! determines source time function value
stf = get_stf_viscoelastic(time_source_dble,irec,NSTEP-it+1)
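A small standalone sketch of how the backward source time in this hunk behaves; NSTEP, DT, t0 and the zero tshift_src value are made up for illustration:

  program demo_backward_time
    implicit none
    integer, parameter :: NSTEP = 1000
    double precision, parameter :: DT = 0.01d0, t0 = 2.d0
    double precision :: tshift_src, time_source_dble
    integer :: it

    tshift_src = 0.d0
    do it = 1, NSTEP, 250
      ! same expression as in compute_seismograms_moment_adjoint():
      ! the adjoint run reconstructs the wavefield backward, so the source time
      ! decreases from (NSTEP-1)*DT - t0 down to -t0 as it runs from 1 to NSTEP
      time_source_dble = dble(NSTEP-it) * DT - t0 - tshift_src
      print *, 'it =', it, '  backward source time =', time_source_dble
    enddo
  end program demo_backward_time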
20 changes: 17 additions & 3 deletions src/specfem3D/setup_sources_receivers.f90
@@ -1265,13 +1265,27 @@ subroutine setup_receivers_precompute_intp()
! nadj_rec_local - determines the number of adjoint sources, i.e., number of station locations (STATIONS_ADJOINT), which
! act as sources to drive the adjoint wavefield

! check if we need to save seismos
if (SIMULATION_TYPE == 3 .and. (.not. SAVE_SEISMOGRAMS_IN_ADJOINT_RUN)) then
do_save_seismograms = .false.
else
do_save_seismograms = .true.
endif

! sets local receivers to zero if no seismogram needs to be saved
if (.not. do_save_seismograms) nrec_local = 0

! user output
if (myrank == 0) then
write(IMAIN,*) 'seismograms:'
if (WRITE_SEISMOGRAMS_BY_MAIN) then
write(IMAIN,*) ' seismograms written by main process only'
if (do_save_seismograms) then
if (WRITE_SEISMOGRAMS_BY_MAIN) then
write(IMAIN,*) ' seismograms written by main process only'
else
write(IMAIN,*) ' seismograms written by all processes'
endif
else
write(IMAIN,*) ' seismograms written by all processes'
write(IMAIN,*) ' seismograms will not be saved'
endif
write(IMAIN,*)
write(IMAIN,*) ' Total number of simulation steps (NSTEP) = ',NSTEP
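The do_save_seismograms logic added above can be exercised on its own; a minimal sketch, with SIMULATION_TYPE, SAVE_SEISMOGRAMS_IN_ADJOINT_RUN and nrec_local set to made-up values instead of being read from the Par_file:

  program demo_save_seismos
    implicit none
    integer :: SIMULATION_TYPE
    logical :: SAVE_SEISMOGRAMS_IN_ADJOINT_RUN
    logical :: do_save_seismograms
    integer :: nrec_local

    ! illustrative values; in the solver these come from the simulation setup
    SIMULATION_TYPE = 3
    SAVE_SEISMOGRAMS_IN_ADJOINT_RUN = .false.
    nrec_local = 4

    ! same logic as in setup_receivers_precompute_intp():
    ! kernel simulations skip seismogram output unless explicitly requested
    if (SIMULATION_TYPE == 3 .and. (.not. SAVE_SEISMOGRAMS_IN_ADJOINT_RUN)) then
      do_save_seismograms = .false.
    else
      do_save_seismograms = .true.
    endif

    ! with no seismograms to save, the local receiver count is zeroed so that
    ! the seismogram loops later on become no-ops
    if (.not. do_save_seismograms) nrec_local = 0

    print *, 'do_save_seismograms =', do_save_seismograms, '  nrec_local =', nrec_local
  end program demo_save_seismos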
1 change: 1 addition & 0 deletions src/specfem3D/specfem3D_par.F90
@@ -193,6 +193,7 @@ module specfem_par

integer :: nlength_seismogram
integer :: seismo_offset,seismo_current
logical :: do_save_seismograms

! for ASDF/SAC headers time
integer :: yr_PDE,jda_PDE,ho_PDE,mi_PDE
14 changes: 7 additions & 7 deletions src/specfem3D/write_output_ASCII_or_binary.f90
@@ -37,14 +37,17 @@ subroutine write_output_ASCII_or_binary(one_seismogram, &

use constants

use specfem_par, only: myrank,USE_BINARY_FOR_SEISMOGRAMS,SAVE_ALL_SEISMOS_IN_ONE_FILE,NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
seismo_offset,seismo_current,NTSTEP_BETWEEN_OUTPUT_SAMPLE
use specfem_par, only: myrank, &
USE_BINARY_FOR_SEISMOGRAMS,SAVE_ALL_SEISMOS_IN_ONE_FILE, &
nlength_seismogram, &
NTSTEP_BETWEEN_OUTPUT_SAMPLE, &
seismo_offset,seismo_current

implicit none

integer,intent(in) :: NSTEP,it,SIMULATION_TYPE

real(kind=CUSTOM_REAL), dimension(NDIM,NTSTEP_BETWEEN_OUTPUT_SEISMOS/NTSTEP_BETWEEN_OUTPUT_SAMPLE),intent(in) :: one_seismogram
real(kind=CUSTOM_REAL), dimension(NDIM,nlength_seismogram),intent(in) :: one_seismogram

double precision,intent(in) :: t0,DT

@@ -56,7 +59,6 @@ subroutine write_output_ASCII_or_binary(one_seismogram, &
integer :: isample,ier,it_current
double precision :: time_t_db
real(kind=CUSTOM_REAL) :: value,time_t

real, dimension(1:seismo_current) :: tr

! opens seismogram file
@@ -112,11 +114,9 @@ subroutine write_output_ASCII_or_binary(one_seismogram, &
! time
if (SIMULATION_TYPE == 1) then
! forward simulation
! distinguish between single and double precision for reals
time_t_db = dble( (it_current-1) * NTSTEP_BETWEEN_OUTPUT_SAMPLE) * DT - t0
time_t_db = dble((it_current-1) * NTSTEP_BETWEEN_OUTPUT_SAMPLE) * DT - t0
else if (SIMULATION_TYPE == 3) then
! adjoint simulation: backward/reconstructed wavefields
! distinguish between single and double precision for reals
! note: compare time_t with time used for source term
time_t_db = dble(NSTEP - it_current * NTSTEP_BETWEEN_OUTPUT_SAMPLE) * DT - t0
endif
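A sketch of how the output time axis in this routine is computed for forward versus adjoint runs; all values are illustrative, and nlength_seismogram is assumed here to be the subsampled trace length (in the solver it is set elsewhere from the output window and NTSTEP_BETWEEN_OUTPUT_SAMPLE):

  program demo_seismo_time_axis
    implicit none
    integer, parameter :: NSTEP = 1000, NTSTEP_BETWEEN_OUTPUT_SAMPLE = 2
    double precision, parameter :: DT = 0.01d0, t0 = 2.d0
    integer :: nlength_seismogram, it_current, SIMULATION_TYPE
    double precision :: time_t_db

    ! assumed: one stored sample every NTSTEP_BETWEEN_OUTPUT_SAMPLE time steps
    nlength_seismogram = NSTEP / NTSTEP_BETWEEN_OUTPUT_SAMPLE

    SIMULATION_TYPE = 1
    do it_current = 1, nlength_seismogram, 100
      if (SIMULATION_TYPE == 1) then
        ! forward run: time grows upward from -t0
        time_t_db = dble((it_current-1) * NTSTEP_BETWEEN_OUTPUT_SAMPLE) * DT - t0
      else if (SIMULATION_TYPE == 3) then
        ! adjoint run: backward/reconstructed wavefield, time counts down toward -t0
        time_t_db = dble(NSTEP - it_current * NTSTEP_BETWEEN_OUTPUT_SAMPLE) * DT - t0
      endif
      print *, 'sample', it_current, '  time =', time_t_db
    enddo
  end program demo_seismo_time_axis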