
Commit

Merge pull request #1644 from danielpeter/devel
updates array allocations and seismogram outputs; cleans up code comments
danielpeter authored Nov 13, 2023
2 parents 1297505 + 5530249 commit ee9914a
Showing 16 changed files with 96 additions and 97 deletions.
1 change: 1 addition & 0 deletions src/decompose_mesh/rules.mk
@@ -154,6 +154,7 @@ xdecompose_mesh_mpi_OBJECTS = \
$O/module_database.dec.o \
$O/module_mesh.dec.o \
$O/module_partition.dec.o \
$O/part_decompose_mesh.dec_module.o \
$O/program_decompose_mesh_mpi.mpidec.o \
$(EMPTY_MACRO)

16 changes: 11 additions & 5 deletions src/generate_databases/save_arrays_solver_hdf5.F90
@@ -90,7 +90,7 @@ subroutine save_arrays_solver_mesh_hdf5()

! element node connectivity for movie output
! the node ids are stored after dividing one NGLL*-th order spectral element into (NGLLX-1)*(NGLLY-1)*(NGLLZ-1) sub-elements
integer, dimension(9,nspec_ab*(NGLLX-1)*(NGLLY-1)*(NGLLZ-1)) :: spec_elm_conn_xdmf
integer, dimension(:,:),allocatable :: spec_elm_conn_xdmf

! dummy arrays
integer, dimension(1,1), parameter :: i2d_dummy = reshape((/0/),(/1,1/))
@@ -153,9 +153,17 @@ subroutine save_arrays_solver_mesh_hdf5()
allocate(ibool_interfaces_ext_mesh_dummy(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
if (ier /= 0) call exit_MPI_without_rank('error allocating array 650')
if (ier /= 0) stop 'error allocating array'
ibool_interfaces_ext_mesh_dummy(:,:) = 0
do i = 1, num_interfaces_ext_mesh
ibool_interfaces_ext_mesh_dummy(:,i) = ibool_interfaces_ext_mesh(1:max_nibool_interfaces_ext_mesh,i)
enddo

! connectivity
allocate(spec_elm_conn_xdmf(9,NSPEC_AB*(NGLLX-1)*(NGLLY-1)*(NGLLZ-1)),stat=ier)
if (ier /= 0) call exit_MPI_without_rank('error allocating array 651')
if (ier /= 0) stop 'error allocating spec_elm_conn_xdmf array'
spec_elm_conn_xdmf(:,:) = 0

call synchronize_all()

! get MPI parameters
@@ -1433,10 +1441,8 @@ subroutine save_arrays_solver_mesh_hdf5()
call synchronize_all()

! cleanup
if (allocated(ibool_interfaces_ext_mesh_dummy)) then
deallocate(ibool_interfaces_ext_mesh_dummy,stat=ier)
if (ier /= 0) stop 'error deallocating array ibool_interfaces_ext_mesh_dummy'
endif
deallocate(ibool_interfaces_ext_mesh_dummy)
deallocate(spec_elm_conn_xdmf)

! user output
if (myrank == 0) then
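For reference, a minimal standalone sketch of the allocate/initialize/deallocate pattern used above for spec_elm_conn_xdmf; the parameter values and the plain stop (in place of exit_MPI_without_rank) are illustrative only:

  program demo_connectivity_alloc
    implicit none
    ! illustrative stand-ins for the solver's constants and mesh size
    integer, parameter :: NGLLX = 5, NGLLY = 5, NGLLZ = 5
    integer, parameter :: nspec_ab = 10
    integer :: ier
    ! heap-allocated connectivity array instead of a large automatic array
    integer, dimension(:,:), allocatable :: spec_elm_conn_xdmf

    allocate(spec_elm_conn_xdmf(9,nspec_ab*(NGLLX-1)*(NGLLY-1)*(NGLLZ-1)),stat=ier)
    if (ier /= 0) stop 'error allocating spec_elm_conn_xdmf array'
    spec_elm_conn_xdmf(:,:) = 0

    ! ... fill connectivity and write it to the HDF5 file here ...

    deallocate(spec_elm_conn_xdmf)
  end program demo_connectivity_alloc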
11 changes: 5 additions & 6 deletions src/gpu/compute_forces_acoustic_cuda.cu
@@ -67,10 +67,9 @@ void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase,
// to combine forward and backward wavefield in the same kernel call

// kernel timing
gpu_event start, stop;
if (CUDA_TIMING ){
start_timing_gpu(&start,&stop);
}
gpu_event start,stop;
if (GPU_TIMING){ start_timing_gpu(&start,&stop); }

int nb_field = mp->simulation_type == 3 ? 2 : 1 ;

// sets gpu arrays
@@ -202,8 +201,8 @@ void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase,

}

// Cuda timing
if (CUDA_TIMING ){
// kernel timing
if (GPU_TIMING){
realw flops,time;
stop_timing_gpu(&start,&stop,"Kernel_2_acoustic_impl",&time);
// time in seconds
6 changes: 3 additions & 3 deletions src/gpu/compute_forces_viscoelastic_cuda.cu
@@ -91,7 +91,7 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,

// kernel timing
gpu_event start,stop;
if (CUDA_TIMING){ start_timing_gpu(&start,&stop); }
if (GPU_TIMING){ start_timing_gpu(&start,&stop); }

// defines local parameters for forward/adjoint function calls
realw *displ,*veloc,*accel;
@@ -1076,8 +1076,8 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
} // ANISOTROPY
} // ATTENUATION

// Cuda timing
if (CUDA_TIMING){
// kernel timing
if (GPU_TIMING){
if (ATTENUATION){
stop_timing_gpu(&start,&stop,"Kernel_2_att_impl");
}else{
4 changes: 2 additions & 2 deletions src/gpu/helper_functions.cu
@@ -227,7 +227,7 @@ void stop_timing_gpu(gpu_event* start,gpu_event* stop, const char* info_str){
#endif

// user output
printf("%s: Execution Time = %f ms\n",info_str,time);
printf("GPU_timing %s: Execution Time = %f ms\n",info_str,time);
}

/* ----------------------------------------------------------------------------------------------- */
@@ -253,7 +253,7 @@ void stop_timing_gpu(gpu_event* start,gpu_event* stop, const char* info_str, realw* t){
#endif

// user output
printf("%s: Execution Time = %f ms\n",info_str,time);
printf("GPU_timing %s: Execution Time = %f ms\n",info_str,time);

// returns time
*t = time;
2 changes: 1 addition & 1 deletion src/gpu/kernels/Kernel_2_acoustic_impl.cu
@@ -462,7 +462,7 @@ Kernel_2_acoustic_impl(const int nb_blocks_to_compute,
// i.e. 11% x theoretical peak performance ~ 440 GFlop/s.
// 11% x "pratical" peak performance ~ 320 GFlop/s.
//
// CUDA_TIMING: we achieve about 224 GFlop/s (1 mpi process, 20736 elements)
// GPU_TIMING: we achieve about 224 GFlop/s (1 mpi process, 20736 elements)
// -> that is about 8% of the "practical" peak. (or 70% of the theoretical arithmetic intensity)
//
// this might be due to the first compute code block (before first syncthreads), where
25 changes: 0 additions & 25 deletions src/gpu/kernels/Kernel_2_viscoelastic_impl.cu
@@ -2412,8 +2412,6 @@ Kernel_2_att_impl(int nb_blocks_to_compute,
#endif
}

// JC JC here we will need to add GPU support for the new C-PML routines

// synchronize all the threads (one thread for each of the NGLL grid points of the
// current spectral element) because we need the whole element to be ready in order
// to be able to compute the matrix products along cut planes of the 3D element below
@@ -2547,8 +2545,6 @@ Kernel_2_att_impl(int nb_blocks_to_compute,
&rho_s_H1,&rho_s_H2,&rho_s_H3);
}

// JC JC here we will need to add GPU support for the new C-PML routines

// form dot product with test vector, symmetric form
// 1. cut-plane xi
__syncthreads();
@@ -2583,8 +2579,6 @@ Kernel_2_att_impl(int nb_blocks_to_compute,
sum_terms3 += rho_s_H3;
}

// JC JC here we will need to add GPU support for the new C-PML routines

// assembles acceleration array
if (threadIdx.x < NGLL3) {

@@ -2662,8 +2656,6 @@ Kernel_2_att_impl(int nb_blocks_to_compute,
epsilondev_trace[tx + working_element*NGLL3] = epsilondev_trace_loc;
} // threadIdx.x

// JC JC here we will need to add GPU support for the new C-PML routines

} // kernel_2_att_impl()


@@ -2814,8 +2806,6 @@ Kernel_2_att_org_impl(int nb_blocks_to_compute,
// copy displacement from global memory to shared memory
load_shared_memory_displ<FORWARD_OR_ADJOINT>(&tx,&iglob,d_displ,s_dummyx_loc,s_dummyy_loc,s_dummyz_loc);
// JC JC here we will need to add GPU support for the new C-PML routines
// attenuation
// use first order Taylor expansion of displacement for local storage of stresses
// at this current time step, to fix attenuation in a consistent way
@@ -2875,8 +2865,6 @@ Kernel_2_att_org_impl(int nb_blocks_to_compute,
tempz3l += s_dummyz_loc[l*NGLL2+J*NGLLX+I]*fac3;
}
// JC JC here we will need to add GPU support for the new C-PML routines
// attenuation
// temporary variables used for fixing attenuation in a consistent way
tempx1l_att = 0.f;
@@ -2963,8 +2951,6 @@ Kernel_2_att_org_impl(int nb_blocks_to_compute,
+ s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
// JC JC here we will need to add GPU support for the new C-PML routines
// attenuation
// temporary variables used for fixing attenuation in a consistent way
tempx1l_att = s_dummyx_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
@@ -3049,8 +3035,6 @@ Kernel_2_att_org_impl(int nb_blocks_to_compute,
duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
// JC JC here we will need to add GPU support for the new C-PML routines
// precompute some sums to save CPU time
duxdxl_plus_duydyl = duxdxl + duydyl;
duxdxl_plus_duzdzl = duxdxl + duzdzl;
@@ -3059,8 +3043,6 @@ Kernel_2_att_org_impl(int nb_blocks_to_compute,
duzdxl_plus_duxdzl = duzdxl + duxdzl;
duzdyl_plus_duydzl = duzdyl + duydzl;
// JC JC here we will need to add GPU support for the new C-PML routines
// attenuation
// temporary variables used for fixing attenuation in a consistent way
duxdxl_att = xixl*tempx1l_att + etaxl*tempx2l_att + gammaxl*tempx3l_att;
@@ -3207,8 +3189,6 @@ Kernel_2_att_org_impl(int nb_blocks_to_compute,
// to be able to compute the matrix products along cut planes of the 3D element below
__syncthreads();
// JC JC here we will need to add GPU support for the new C-PML routines
if (active ){
#ifndef MANUALLY_UNROLLED_LOOPS
@@ -3326,10 +3306,7 @@ Kernel_2_att_org_impl(int nb_blocks_to_compute,
d_accel[iglob*3 + 2] += sum_terms3;
#endif // USE_TEXTURES_FIELDS
// JC JC here we will need to add GPU support for the new C-PML routines
#else // MESH_COLORING
//mesh coloring
if (use_mesh_coloring_gpu ){
@@ -3371,8 +3348,6 @@ Kernel_2_att_org_impl(int nb_blocks_to_compute,
epsilondev_yz[tx + working_element*NGLL3] = epsilondev_yz_loc;
} // if (active)
// JC JC here we will need to add GPU support for the new C-PML routines
} // kernel_2_att_impl()
*/
3 changes: 1 addition & 2 deletions src/gpu/mesh_constants_gpu.h
@@ -112,8 +112,7 @@ typedef double realw;
#endif

// performance timers
#define CUDA_TIMING 0
#define CUDA_TIMING_UPDATE 0
#define GPU_TIMING 0

// error checking after cuda function calls
// (note: this synchronizes many calls, thus e.g. no asynchronuous memcpy possible)
2 changes: 0 additions & 2 deletions src/gpu/prepare_mesh_constants_cuda.cu
@@ -420,8 +420,6 @@ void FC_FUNC_(prepare_constants_device,
// Kelvin_voigt initialization
mp->use_Kelvin_Voigt_damping = 0;

// JC JC here we will need to add GPU support for the new C-PML routines

GPU_ERROR_CHECKING("prepare_constants_device");
}

12 changes: 6 additions & 6 deletions src/gpu/update_displacement_cuda.cu
@@ -77,7 +77,7 @@ void FC_FUNC_(update_displacement_cuda,

// kernel timing
gpu_event start,stop;
if (CUDA_TIMING_UPDATE ) start_timing_gpu(&start,&stop);
if (GPU_TIMING) start_timing_gpu(&start,&stop);

// debug
//realw max_d,max_v,max_a;
@@ -166,8 +166,8 @@ void FC_FUNC_(update_displacement_cuda,
}
}

// Cuda timing
if (CUDA_TIMING_UPDATE ){
// kernel timing
if (GPU_TIMING){
realw flops,time;
stop_timing_gpu(&start,&stop,"UpdateDispVeloc_kernel",&time);
// time in seconds
@@ -235,7 +235,7 @@ void FC_FUNC_(update_displacement_ac_cuda,

// kernel timing
gpu_event start,stop;
if (CUDA_TIMING_UPDATE ) start_timing_gpu(&start,&stop);
if (GPU_TIMING) start_timing_gpu(&start,&stop);

#ifdef USE_CUDA
if (run_cuda){
@@ -255,8 +255,8 @@ void FC_FUNC_(update_displacement_ac_cuda,
}
#endif

// Cuda timing
if (CUDA_TIMING_UPDATE ){
// kernel timing
if (GPU_TIMING){
realw flops,time;
stop_timing_gpu(&start,&stop,"UpdatePotential_kernel",&time);
// time in seconds
2 changes: 1 addition & 1 deletion src/specfem3D/compute_seismograms.f90
@@ -634,7 +634,7 @@ subroutine compute_seismograms_moment_adjoint()
hprime_xx,hprime_yy,hprime_zz)

! source time function value
time_source_dble = dble(NSTEP-it)*DT - t0 - tshift_src(irec)
time_source_dble = dble(NSTEP-it) * DT - t0 - tshift_src(irec)

! determines source time function value
stf = get_stf_viscoelastic(time_source_dble,irec,NSTEP-it+1)
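A small standalone sketch of how the backward source time in this hunk behaves; NSTEP, DT, t0 and the zero tshift_src value are made up for illustration:

  program demo_backward_time
    implicit none
    integer, parameter :: NSTEP = 1000
    double precision, parameter :: DT = 0.01d0, t0 = 2.d0
    double precision :: tshift_src, time_source_dble
    integer :: it

    tshift_src = 0.d0
    do it = 1, NSTEP, 250
      ! same expression as in compute_seismograms_moment_adjoint():
      ! the adjoint run reconstructs the wavefield backward, so the source time
      ! decreases from (NSTEP-1)*DT - t0 down to -t0 as it runs from 1 to NSTEP
      time_source_dble = dble(NSTEP-it) * DT - t0 - tshift_src
      print *, 'it =', it, '  backward source time =', time_source_dble
    enddo
  end program demo_backward_time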
20 changes: 17 additions & 3 deletions src/specfem3D/setup_sources_receivers.f90
@@ -1265,13 +1265,27 @@ subroutine setup_receivers_precompute_intp()
! nadj_rec_local - determines the number of adjoint sources, i.e., number of station locations (STATIONS_ADJOINT), which
! act as sources to drive the adjoint wavefield

! check if we need to save seismos
if (SIMULATION_TYPE == 3 .and. (.not. SAVE_SEISMOGRAMS_IN_ADJOINT_RUN)) then
do_save_seismograms = .false.
else
do_save_seismograms = .true.
endif

! sets local receivers to zero if no seismogram needs to be saved
if (.not. do_save_seismograms) nrec_local = 0

! user output
if (myrank == 0) then
write(IMAIN,*) 'seismograms:'
if (WRITE_SEISMOGRAMS_BY_MAIN) then
write(IMAIN,*) ' seismograms written by main process only'
if (do_save_seismograms) then
if (WRITE_SEISMOGRAMS_BY_MAIN) then
write(IMAIN,*) ' seismograms written by main process only'
else
write(IMAIN,*) ' seismograms written by all processes'
endif
else
write(IMAIN,*) ' seismograms written by all processes'
write(IMAIN,*) ' seismograms will not be saved'
endif
write(IMAIN,*)
write(IMAIN,*) ' Total number of simulation steps (NSTEP) = ',NSTEP
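The do_save_seismograms logic added above can be exercised on its own; a minimal sketch, with SIMULATION_TYPE, SAVE_SEISMOGRAMS_IN_ADJOINT_RUN and nrec_local set to made-up values instead of being read from the Par_file:

  program demo_save_seismos
    implicit none
    integer :: SIMULATION_TYPE
    logical :: SAVE_SEISMOGRAMS_IN_ADJOINT_RUN
    logical :: do_save_seismograms
    integer :: nrec_local

    ! illustrative values; in the solver these come from the simulation setup
    SIMULATION_TYPE = 3
    SAVE_SEISMOGRAMS_IN_ADJOINT_RUN = .false.
    nrec_local = 4

    ! same logic as in setup_receivers_precompute_intp():
    ! kernel simulations skip seismogram output unless explicitly requested
    if (SIMULATION_TYPE == 3 .and. (.not. SAVE_SEISMOGRAMS_IN_ADJOINT_RUN)) then
      do_save_seismograms = .false.
    else
      do_save_seismograms = .true.
    endif

    ! with no seismograms to save, the local receiver count is zeroed so that
    ! the seismogram loops later on become no-ops
    if (.not. do_save_seismograms) nrec_local = 0

    print *, 'do_save_seismograms =', do_save_seismograms, '  nrec_local =', nrec_local
  end program demo_save_seismos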
1 change: 1 addition & 0 deletions src/specfem3D/specfem3D_par.F90
@@ -193,6 +193,7 @@ module specfem_par

integer :: nlength_seismogram
integer :: seismo_offset,seismo_current
logical :: do_save_seismograms

! for ASDF/SAC headers time
integer :: yr_PDE,jda_PDE,ho_PDE,mi_PDE
14 changes: 7 additions & 7 deletions src/specfem3D/write_output_ASCII_or_binary.f90
@@ -37,14 +37,17 @@ subroutine write_output_ASCII_or_binary(one_seismogram, &

use constants

use specfem_par, only: myrank,USE_BINARY_FOR_SEISMOGRAMS,SAVE_ALL_SEISMOS_IN_ONE_FILE,NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
seismo_offset,seismo_current,NTSTEP_BETWEEN_OUTPUT_SAMPLE
use specfem_par, only: myrank, &
USE_BINARY_FOR_SEISMOGRAMS,SAVE_ALL_SEISMOS_IN_ONE_FILE, &
nlength_seismogram, &
NTSTEP_BETWEEN_OUTPUT_SAMPLE, &
seismo_offset,seismo_current

implicit none

integer,intent(in) :: NSTEP,it,SIMULATION_TYPE

real(kind=CUSTOM_REAL), dimension(NDIM,NTSTEP_BETWEEN_OUTPUT_SEISMOS/NTSTEP_BETWEEN_OUTPUT_SAMPLE),intent(in) :: one_seismogram
real(kind=CUSTOM_REAL), dimension(NDIM,nlength_seismogram),intent(in) :: one_seismogram

double precision,intent(in) :: t0,DT

@@ -56,7 +59,6 @@ subroutine write_output_ASCII_or_binary(one_seismogram, &
integer :: isample,ier,it_current
double precision :: time_t_db
real(kind=CUSTOM_REAL) :: value,time_t

real, dimension(1:seismo_current) :: tr

! opens seismogram file
@@ -112,11 +114,9 @@ subroutine write_output_ASCII_or_binary(one_seismogram, &
! time
if (SIMULATION_TYPE == 1) then
! forward simulation
! distinguish between single and double precision for reals
time_t_db = dble( (it_current-1) * NTSTEP_BETWEEN_OUTPUT_SAMPLE) * DT - t0
time_t_db = dble((it_current-1) * NTSTEP_BETWEEN_OUTPUT_SAMPLE) * DT - t0
else if (SIMULATION_TYPE == 3) then
! adjoint simulation: backward/reconstructed wavefields
! distinguish between single and double precision for reals
! note: compare time_t with time used for source term
time_t_db = dble(NSTEP - it_current * NTSTEP_BETWEEN_OUTPUT_SAMPLE) * DT - t0
endif
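A sketch of how the output time axis in this routine is computed for forward versus adjoint runs; all values are illustrative, and nlength_seismogram is assumed here to be the subsampled trace length (in the solver it is set elsewhere from the output window and NTSTEP_BETWEEN_OUTPUT_SAMPLE):

  program demo_seismo_time_axis
    implicit none
    integer, parameter :: NSTEP = 1000, NTSTEP_BETWEEN_OUTPUT_SAMPLE = 2
    double precision, parameter :: DT = 0.01d0, t0 = 2.d0
    integer :: nlength_seismogram, it_current, SIMULATION_TYPE
    double precision :: time_t_db

    ! assumed: one stored sample every NTSTEP_BETWEEN_OUTPUT_SAMPLE time steps
    nlength_seismogram = NSTEP / NTSTEP_BETWEEN_OUTPUT_SAMPLE

    SIMULATION_TYPE = 1
    do it_current = 1, nlength_seismogram, 100
      if (SIMULATION_TYPE == 1) then
        ! forward run: time grows upward from -t0
        time_t_db = dble((it_current-1) * NTSTEP_BETWEEN_OUTPUT_SAMPLE) * DT - t0
      else if (SIMULATION_TYPE == 3) then
        ! adjoint run: backward/reconstructed wavefield, time counts down toward -t0
        time_t_db = dble(NSTEP - it_current * NTSTEP_BETWEEN_OUTPUT_SAMPLE) * DT - t0
      endif
      print *, 'sample', it_current, '  time =', time_t_db
    enddo
  end program demo_seismo_time_axis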