diff --git a/examples/immersed_zones/block.case b/examples/immersed_zones/block.case index 6539de1e7bf..37c548f3e75 100644 --- a/examples/immersed_zones/block.case +++ b/examples/immersed_zones/block.case @@ -6,7 +6,7 @@ "output_at_end": false, "output_boundary": false, "output_checkpoints": false, - "end_time": 0.01, + "end_time": 20.0, "timestep": 5e-5, "numerics": { "time_order": 3, diff --git a/examples/immersed_zones/block_sphere.case b/examples/immersed_zones/block_sphere.case index 83da59c37dc..d9e441910c5 100644 --- a/examples/immersed_zones/block_sphere.case +++ b/examples/immersed_zones/block_sphere.case @@ -6,7 +6,7 @@ "output_at_end": false, "output_boundary": false, "output_checkpoints": false, - "end_time": 0.01, + "end_time": 20.0, "timestep": 5e-5, "numerics": { "time_order": 3, diff --git a/examples/immersed_zones/run.sh b/examples/immersed_zones/run.sh index e32bd6e2320..16399044e37 100755 --- a/examples/immersed_zones/run.sh +++ b/examples/immersed_zones/run.sh @@ -26,7 +26,7 @@ function help() { echo -e " See Readme for additional details." exit 0 } -if [ $# -eq 0 ]; then help; fi +if [ $# -eq 0 ]; then ALL=1; fi # Handle options Nx=32 && Ny=8 && Nz=8 diff --git a/src/.depends b/src/.depends index 61007d7f3ef..918c4ba7d34 100644 --- a/src/.depends +++ b/src/.depends @@ -18,7 +18,7 @@ math/mxm_wrapper.o : math/mxm_wrapper.F90 common/utils.o config/num_types.o sem/speclib.o : sem/speclib.f90 common/utils.o config/num_types.o qoi/drag_torque.o : qoi/drag_torque.f90 math/bcknd/device/device_math.o device/device.o common/utils.o config/num_types.o sem/space.o math/math.o comm/comm.o mesh/facet_zone.o mesh/mesh.o sem/coef.o field/field.o sem/local_interpolation.o : sem/local_interpolation.f90 config/neko_config.o math/bcknd/device/device_math.o device/device.o field/field_list.o field/field.o common/utils.o math/fast3d.o math/math.o mesh/point.o config/num_types.o sem/space.o math/tensor.o -math/math.o : math/math.f90 comm/comm.o config/num_types.o +math/math.o : math/math.f90 comm/comm.o common/utils.o config/num_types.o math/mathops.o : math/mathops.f90 config/num_types.o math/fast3d.o : math/fast3d.f90 math/math.o sem/speclib.o config/num_types.o sem/space.o : sem/space.f90 math/mxm_wrapper.o math/tensor.o math/math.o math/fast3d.o common/utils.o device/device.o sem/speclib.o config/num_types.o config/neko_config.o @@ -106,7 +106,7 @@ math/bcknd/xsmm/fdm_xsmm.o : math/bcknd/xsmm/fdm_xsmm.f90 math/bcknd/xsmm/tensor math/schwarz.o : math/schwarz.f90 config/neko_config.o device/device.o math/fdm.o math/bcknd/device/device_math.o math/bcknd/device/device_schwarz.o gs/gather_scatter.o bc/bc.o sem/dofmap.o sem/space.o mesh/mesh.o math/math.o config/num_types.o math/vector.o : math/vector.f90 common/utils.o math/bcknd/device/device_math.o device/device.o config/num_types.o math/math.o config/neko_config.o math/matrix.o : math/matrix.f90 common/utils.o math/bcknd/device/device_math.o device/device.o config/num_types.o math/math.o config/neko_config.o -math/signed_distance.o : math/signed_distance.f90 adt/stack.o mesh/search_tree/aabb.o mesh/point.o common/utils.o mesh/aabb_tree.o mesh/tri_mesh.o mesh/tri.o field/field.o config/num_types.o +math/signed_distance.o : math/signed_distance.f90 adt/stack.o mesh/search_tree/aabb.o mesh/point.o common/utils.o config/neko_config.o device/device.o mesh/aabb_tree.o mesh/tri_mesh.o mesh/tri.o field/field.o config/num_types.o common/checkpoint.o : common/checkpoint.f90 mesh/mesh.o common/utils.o field/field.o device/device.o sem/space.o field/field_series.o config/num_types.o config/neko_config.o io/generic_file.o : io/generic_file.f90 comm/comm.o common/utils.o config/num_types.o io/map_file.o : io/map_file.f90 io/format/map.o comm/comm.o common/utils.o io/generic_file.o @@ -293,9 +293,13 @@ source_terms/boussinesq_source_term.o : source_terms/boussinesq_source_term.f90 source_terms/bcknd/cpu/boussinesq_source_term_cpu.o : source_terms/bcknd/cpu/boussinesq_source_term_cpu.f90 math/math.o field/field.o field/field_list.o config/num_types.o source_terms/bcknd/device/boussinesq_source_term_device.o : source_terms/bcknd/device/boussinesq_source_term_device.f90 math/bcknd/device/device_math.o field/field.o field/field_list.o config/num_types.o source_terms/source_term_fctry.o : source_terms/source_term_fctry.f90 common/utils.o common/json_utils.o source_terms/coriolis_source_term.o source_terms/brinkman_source_term.o source_terms/boussinesq_source_term.o source_terms/const_source_term.o source_terms/source_term.o -source_terms/brinkman_source_term.o : source_terms/brinkman_source_term.f90 mesh/point_zone_registry.o mesh/point_zone.o mesh/search_tree/aabb.o common/profiler.o math/signed_distance.o source_terms/brinkman/filters.o device/device.o mesh/tri_mesh.o io/file.o math/field_math.o common/utils.o config/neko_config.o sem/coef.o source_terms/source_term.o field/field_registry.o common/json_utils.o field/field_list.o field/field.o config/num_types.o -source_terms/brinkman/filters.o : source_terms/brinkman/filters.f90 source_terms/bcknd/cpu/filters_cpu.o common/utils.o config/num_types.o config/neko_config.o field/field.o +source_terms/brinkman_source_term.o : source_terms/brinkman_source_term.f90 mesh/point_zone_registry.o mesh/point_zone.o mesh/search_tree/aabb.o common/profiler.o math/signed_distance.o source_terms/brinkman/filters.o device/device.o mesh/tri_mesh.o io/file.o math/bcknd/device/device_math.o math/math.o math/field_math.o common/utils.o config/neko_config.o sem/coef.o source_terms/source_term.o field/field_registry.o common/json_utils.o field/field_list.o field/field.o config/num_types.o +source_terms/brinkman/filters.o : source_terms/brinkman/filters.f90 source_terms/bcknd/device/filters_device.o source_terms/bcknd/cpu/filters_cpu.o common/utils.o config/num_types.o config/neko_config.o field/field.o source_terms/bcknd/cpu/filters_cpu.o : source_terms/bcknd/cpu/filters_cpu.f90 config/num_types.o +source_terms/bcknd/device/filters_device.o : source_terms/bcknd/device/filters_device.F90 source_terms/bcknd/device/opencl/opencl_filters.o source_terms/bcknd/device/hip/hip_filters.o source_terms/bcknd/device/cuda/cuda_filters.o common/utils.o math/bcknd/device/device_math.o config/num_types.o +source_terms/bcknd/device/cuda/cuda_filters.o : source_terms/bcknd/device/cuda/cuda_filters.f90 config/num_types.o +source_terms/bcknd/device/hip/hip_filters.o : source_terms/bcknd/device/hip/hip_filters.f90 config/num_types.o +source_terms/bcknd/device/opencl/opencl_filters.o : source_terms/bcknd/device/opencl/opencl_filters.f90 config/num_types.o les/les_model.o : les/les_model.f90 math/bcknd/device/device_math.o math/math.o device/device.o config/neko_config.o gs/gs_ops.o sem/coef.o sem/dofmap.o field/field_registry.o field/field.o config/num_types.o les/les_model_fctry.o : les/les_model_fctry.f90 common/utils.o les/sigma.o les/dynamic_smagorinsky.o les/smagorinsky.o les/vreman.o les/les_model.o les/vreman.o : les/vreman.f90 sem/coef.o les/bcknd/cpu/vreman_cpu.o config/neko_config.o common/utils.o common/json_utils.o sem/dofmap.o les/les_model.o field/field.o config/num_types.o diff --git a/src/.depends_device b/src/.depends_device index 4eeaecb8f8e..4a0a9a3153b 100644 --- a/src/.depends_device +++ b/src/.depends_device @@ -26,7 +26,7 @@ bc/bcknd/device/hip/dong_outflow.o : bc/bcknd/device/hip/dong_outflow.hip bc/bck common/bcknd/device/hip/rhs_maker.o : common/bcknd/device/hip/rhs_maker.hip common/bcknd/device/hip/sumab_kernel.h common/bcknd/device/hip/makeext_kernel.h common/bcknd/device/hip/makebdf_kernel.h common/bcknd/device/hip/projection.o : common/bcknd/device/hip/projection.hip common/bcknd/device/hip/projection_kernel.h common/bcknd/device/hip/gradient_jump_penalty.o : common/bcknd/device/hip/gradient_jump_penalty.hip common/bcknd/device/hip/gradient_jump_penalty_kernel.h -fluid/bcknd/device/hip/pnpn_res.o : fluid/bcknd/device/hip/pnpn_res.hip fluid/bcknd/device/hip/vel_res_update_kernel.h fluid/bcknd/device/hip/prs_res_kernel.h +fluid/bcknd/device/hip/pnpn_res.o : fluid/bcknd/device/hip/pnpn_res.hip fluid/bcknd/device/hip/vel_res_update_kernel.h fluid/bcknd/device/hip/prs_res_kernel.h scalar/bcknd/device/hip/scalar_residual.o : scalar/bcknd/device/hip/scalar_residual.hip scalar/bcknd/device/hip/scalar_residual_update_kernel.h sem/bcknd/device/hip/coef.o : sem/bcknd/device/hip/coef.hip sem/bcknd/device/hip/coef_kernel.h device/cuda/check.o : device/cuda/check.cu device/cuda/check.h @@ -47,7 +47,7 @@ krylov/bcknd/device/cuda/pipecg_aux.o : krylov/bcknd/device/cuda/pipecg_aux.cu k krylov/bcknd/device/cuda/fusedcg_aux.o : krylov/bcknd/device/cuda/fusedcg_aux.cu krylov/bcknd/device/cuda/fusedcg_kernel.h krylov/bcknd/device/cuda/fusedcg_cpld_aux.o : krylov/bcknd/device/cuda/fusedcg_cpld_aux.cu krylov/bcknd/device/cuda/fusedcg_cpld_kernel.h krylov/bcknd/device/cuda/gmres_aux.o : krylov/bcknd/device/cuda/gmres_aux.cu krylov/bcknd/device/cuda/gmres_kernel.h -gs/bcknd/device/cuda/gs.o : gs/bcknd/device/cuda/gs.cu gs/bcknd/device/cuda/gs_kernels.h +gs/bcknd/device/cuda/gs.o : gs/bcknd/device/cuda/gs.cu gs/bcknd/device/cuda/gs_kernels.h bc/bcknd/device/cuda/dirichlet.o : bc/bcknd/device/cuda/dirichlet.cu bc/bcknd/device/cuda/dirichlet_kernel.h bc/bcknd/device/cuda/inflow.o : bc/bcknd/device/cuda/inflow.cu bc/bcknd/device/cuda/inflow_kernel.h bc/bcknd/device/cuda/no_slip_wall.o : bc/bcknd/device/cuda/no_slip_wall.cu bc/bcknd/device/cuda/no_slip_wall_kernel.h @@ -58,13 +58,13 @@ bc/bcknd/device/cuda/dong_outflow.o : bc/bcknd/device/cuda/dong_outflow.cu bc/bc common/bcknd/device/cuda/rhs_maker.o : common/bcknd/device/cuda/rhs_maker.cu common/bcknd/device/cuda/sumab_kernel.h common/bcknd/device/cuda/makeext_kernel.h common/bcknd/device/cuda/makebdf_kernel.h common/bcknd/device/cuda/projection.o : common/bcknd/device/cuda/projection.cu common/bcknd/device/cuda/projection_kernel.h fluid/bcknd/device/cuda/pnpn_res.o : fluid/bcknd/device/cuda/pnpn_res.cu fluid/bcknd/device/cuda/vel_res_update_kernel.h fluid/bcknd/device/cuda/prs_res_kernel.h -fluid/stress_formulation/bcknd/device/cuda/pnpn_stress_res.o : fluid/stress_formulation/bcknd/device/cuda/pnpn_stress_res.cu fluid/stress_formulation/bcknd/device/cuda/prs_stress_res_kernel.h +fluid/stress_formulation/bcknd/device/cuda/pnpn_stress_res.o : fluid/stress_formulation/bcknd/device/cuda/pnpn_stress_res.cu fluid/stress_formulation/bcknd/device/cuda/prs_stress_res_kernel.h common/bcknd/device/cuda/gradient_jump_penalty.o : common/bcknd/device/cuda/gradient_jump_penalty.cu common/bcknd/device/cuda/gradient_jump_penalty_kernel.h scalar/bcknd/device/cuda/scalar_residual.o : scalar/bcknd/device/cuda/scalar_residual.cu scalar/bcknd/device/cuda/scalar_residual_update_kernel.h sem/bcknd/device/cuda/coef.o : sem/bcknd/device/cuda/coef.cu sem/bcknd/device/cuda/coef_kernel.h device/opencl/check.o : device/opencl/check.c device/opencl/check.h device/opencl/jit.o : device/opencl/jit.c device/opencl/jit.h -math/bcknd/device/opencl/math.o : math/bcknd/device/opencl/math.c math/bcknd/device/opencl/math_kernel.cl.h +math/bcknd/device/opencl/math.o : math/bcknd/device/opencl/math.c math/bcknd/device/opencl/math_kernel.cl.h math/bcknd/device/opencl/schwarz.o : math/bcknd/device/opencl/schwarz.c math/bcknd/device/opencl/schwarz_kernel.cl.h math/bcknd/device/opencl/tensor.o : math/bcknd/device/opencl/tensor.c math/bcknd/device/opencl/tensor_kernel.cl.h math/bcknd/device/opencl/fdm.o : math/bcknd/device/opencl/fdm.c math/bcknd/device/opencl/fdm_kernel.cl.h @@ -77,7 +77,7 @@ math/bcknd/device/opencl/opr_lambda2.o : math/bcknd/device/opencl/opr_lambda2.c math/bcknd/device/opencl/opr_cfl.o : math/bcknd/device/opencl/opr_cfl.c math/bcknd/device/opencl/cfl_kernel.cl.h math/bcknd/device/opencl/ax_helm.o : math/bcknd/device/opencl/ax_helm.c math/bcknd/device/opencl/ax_helm_kernel.cl.h krylov/bcknd/device/opencl/pc_jacobi.o : krylov/bcknd/device/opencl/pc_jacobi.c krylov/bcknd/device/opencl/jacobi_kernel.cl.h -gs/bcknd/device/opencl/gs.o : gs/bcknd/device/opencl/gs.c gs/bcknd/device/opencl/gs_kernels.cl.h +gs/bcknd/device/opencl/gs.o : gs/bcknd/device/opencl/gs.c gs/bcknd/device/opencl/gs_kernels.cl.h bc/bcknd/device/opencl/dirichlet.o : bc/bcknd/device/opencl/dirichlet.c bc/bcknd/device/opencl/dirichlet_kernel.cl.h bc/bcknd/device/opencl/inflow.o : bc/bcknd/device/opencl/inflow.c bc/bcknd/device/opencl/inflow_kernel.cl.h bc/bcknd/device/opencl/no_slip_wall.o : bc/bcknd/device/opencl/no_slip_wall.c bc/bcknd/device/opencl/no_slip_wall_kernel.cl.h @@ -89,3 +89,6 @@ common/bcknd/device/opencl/rhs_maker.o : common/bcknd/device/opencl/rhs_maker.c fluid/bcknd/device/opencl/pnpn_res.o : fluid/bcknd/device/opencl/pnpn_res.c fluid/bcknd/device/opencl/pnpn_res_kernel.cl.h scalar/bcknd/device/opencl/scalar_residual.o : scalar/bcknd/device/opencl/scalar_residual.c scalar/bcknd/device/opencl/scalar_residual_kernel.cl.h sem/bcknd/device/opencl/coef.o : sem/bcknd/device/opencl/coef.c sem/bcknd/device/opencl/coef_kernel.cl.h +source_terms/bcknd/device/cuda/filters.o : source_terms/bcknd/device/cuda/filters.cu source_terms/bcknd/device/cuda/filter_kernels.h +source_terms/bcknd/device/hip/filters.o : source_terms/bcknd/device/hip/filters.hip source_terms/bcknd/device/hip/filter_kernels.h +source_terms/bcknd/device/opencl/filters.o : source_terms/bcknd/device/opencl/filters.c source_terms/bcknd/device/opencl/filter_kernels.cl.h diff --git a/src/Makefile.am b/src/Makefile.am index 313d034c958..0693ba408a3 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -299,6 +299,10 @@ neko_fortran_SOURCES = \ source_terms/brinkman_source_term.f90\ source_terms/brinkman/filters.f90\ source_terms/bcknd/cpu/filters_cpu.f90\ + source_terms/bcknd/device/filters_device.F90\ + source_terms/bcknd/device/cuda/cuda_filters.f90\ + source_terms/bcknd/device/hip/hip_filters.f90\ + source_terms/bcknd/device/opencl/opencl_filters.f90\ les/les_model.f90\ les/les_model_fctry.f90\ les/vreman.f90\ @@ -364,7 +368,8 @@ libneko_a_SOURCES += \ common/bcknd/device/hip/gradient_jump_penalty.hip\ fluid/bcknd/device/hip/pnpn_res.hip\ scalar/bcknd/device/hip/scalar_residual.hip\ - sem/bcknd/device/hip/coef.hip + sem/bcknd/device/hip/coef.hip\ + source_terms/bcknd/device/hip/filters.hip endif if ENABLE_CUDA @@ -401,7 +406,8 @@ libneko_a_SOURCES += \ fluid/bcknd/device/cuda/pnpn_res.cu\ fluid/stress_formulation/bcknd/device/cuda/pnpn_stress_res.cu\ scalar/bcknd/device/cuda/scalar_residual.cu\ - sem/bcknd/device/cuda/coef.cu + sem/bcknd/device/cuda/coef.cu\ + source_terms/bcknd/device/cuda/filters.cu endif if ENABLE_OPENCL @@ -432,7 +438,8 @@ libneko_a_SOURCES += \ common/bcknd/device/opencl/rhs_maker.c\ fluid/bcknd/device/opencl/pnpn_res.c\ scalar/bcknd/device/opencl/scalar_residual.c\ - sem/bcknd/device/opencl/coef.c + sem/bcknd/device/opencl/coef.c\ + source_terms/bcknd/device/opencl/filters.c %.cl.h: %.cl bash ./scripts/cltostring.sh $< @@ -501,7 +508,8 @@ CLEANFILES += \ krylov/bcknd/device/opencl/*.cl.h\ fluid/bcknd/device/opencl/*.cl.h\ scalar/bcknd/device/opencl/*.cl.h\ - sem/bcknd/device/opencl/*.cl.h + sem/bcknd/device/opencl/*.cl.h\ + source_terms/bcknd/device/opencl/*.cl.h endif if ENABLE_MAKEDEPF90 @@ -604,6 +612,9 @@ EXTRA_DIST = \ sem/bcknd/device/opencl/coef_kernel.cl\ math/bcknd/device/device_mpi_reduce.h\ math/bcknd/device/device_mpi_op.h\ + source_terms/bcknd/device/cuda/filter_kernels.h\ + source_terms/bcknd/device/hip/filter_kernels.h\ + source_terms/bcknd/device/opencl/filter_kernels.cl\ device/opencl/jit.h\ device/opencl/prgm_lib.h\ device/opencl/check.h\ diff --git a/src/device/opencl/prgm_lib.F90 b/src/device/opencl/prgm_lib.F90 index be4fe2953fa..3e97be87591 100644 --- a/src/device/opencl/prgm_lib.F90 +++ b/src/device/opencl/prgm_lib.F90 @@ -82,6 +82,9 @@ module opencl_prgm_lib !> Device lambda2 kernels type(c_ptr), bind(c) :: lambda2_program = C_NULL_PTR + !> Device filter kernels + type(c_ptr), bind(c) :: filter_program = C_NULL_PTR + contains subroutine opencl_prgm_lib_release @@ -261,6 +264,13 @@ subroutine opencl_prgm_lib_release lambda2_program = C_NULL_PTR end if + if (c_associated(filter_program)) then + if(clReleaseProgram(filter_program) .ne. CL_SUCCESS) then + call neko_error('Failed to release program') + end if + filter_program = C_NULL_PTR + end if + end subroutine opencl_prgm_lib_release #endif diff --git a/src/device/opencl/prgm_lib.h b/src/device/opencl/prgm_lib.h index 1cea5643890..5a68c55b6ea 100644 --- a/src/device/opencl/prgm_lib.h +++ b/src/device/opencl/prgm_lib.h @@ -80,6 +80,8 @@ extern void *scalar_residual_program; /** Device lambda2 kernel */ extern void *lambda2_program; +/** Device filter kernel */ +extern void *filter_program; #endif diff --git a/src/math/bcknd/device/cuda/cuda_math.f90 b/src/math/bcknd/device/cuda/cuda_math.f90 index b79b402c858..073509dbef8 100644 --- a/src/math/bcknd/device/cuda/cuda_math.f90 +++ b/src/math/bcknd/device/cuda/cuda_math.f90 @@ -317,4 +317,74 @@ subroutine cuda_absval(a_d, n) & integer(c_int) :: n end subroutine cuda_absval end interface + + ! ========================================================================== ! + ! Interfaces for the pointwise operations. + + interface + subroutine cuda_pwmax_vec2(a_d, b_d, n) & + bind(c, name = 'cuda_pwmax_vec2') + use, intrinsic :: iso_c_binding, only: c_int, c_ptr + type(c_ptr), value :: a_d, b_d + integer(c_int) :: n + end subroutine cuda_pwmax_vec2 + + subroutine cuda_pwmax_vec3(a_d, b_d, c_d, n) & + bind(c, name = 'cuda_pwmax_vec3') + use, intrinsic :: iso_c_binding, only: c_int, c_ptr + type(c_ptr), value :: a_d, b_d, c_d + integer(c_int) :: n + end subroutine cuda_pwmax_vec3 + + subroutine cuda_pwmax_sca2(a_d, c_d, n) & + bind(c, name = 'cuda_pwmax_sca2') + use, intrinsic :: iso_c_binding, only: c_int, c_ptr + import c_rp + type(c_ptr), value :: a_d + real(c_rp) :: c_d + integer(c_int) :: n + end subroutine cuda_pwmax_sca2 + + subroutine cuda_pwmax_sca3(a_d, b_d, c_d, n) & + bind(c, name = 'cuda_pwmax_sca3') + use, intrinsic :: iso_c_binding, only: c_int, c_ptr + import c_rp + type(c_ptr), value :: a_d, b_d + real(c_rp) :: c_d + integer(c_int) :: n + end subroutine cuda_pwmax_sca3 + + subroutine cuda_pwmin_vec2(a_d, b_d, n) & + bind(c, name = 'cuda_pwmin_vec2') + use, intrinsic :: iso_c_binding, only: c_int, c_ptr + type(c_ptr), value :: a_d, b_d + integer(c_int) :: n + end subroutine cuda_pwmin_vec2 + + subroutine cuda_pwmin_vec3(a_d, b_d, c_d, n) & + bind(c, name = 'cuda_pwmin_vec3') + use, intrinsic :: iso_c_binding, only: c_int, c_ptr + type(c_ptr), value :: a_d, b_d, c_d + integer(c_int) :: n + end subroutine cuda_pwmin_vec3 + + subroutine cuda_pwmin_sca2(a_d, c_d, n) & + bind(c, name = 'cuda_pwmin_sca2') + use, intrinsic :: iso_c_binding, only: c_int, c_ptr + import c_rp + type(c_ptr), value :: a_d + real(c_rp) :: c_d + integer(c_int) :: n + end subroutine cuda_pwmin_sca2 + + subroutine cuda_pwmin_sca3(a_d, b_d, c_d, n) & + bind(c, name = 'cuda_pwmin_sca3') + use, intrinsic :: iso_c_binding, only: c_int, c_ptr + import c_rp + type(c_ptr), value :: a_d, b_d + real(c_rp) :: c_d + integer(c_int) :: n + end subroutine cuda_pwmin_sca3 + + end interface end module cuda_math diff --git a/src/math/bcknd/device/cuda/math.cu b/src/math/bcknd/device/cuda/math.cu index 931c8f7a11b..783d0a47fe8 100644 --- a/src/math/bcknd/device/cuda/math.cu +++ b/src/math/bcknd/device/cuda/math.cu @@ -42,7 +42,7 @@ extern "C" { #include #include - + /** Fortran wrapper for copy * Copy a vector \f$ a = b \f$ */ @@ -128,7 +128,7 @@ extern "C" { CUDA_CHECK(cudaGetLastError()); } - + /** Fortran wrapper for cadd * Add a scalar to vector \f$ a_i = a_i + c \f$ */ @@ -171,7 +171,7 @@ extern "C" { (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *n); CUDA_CHECK(cudaGetLastError()); } - + } /** @@ -179,16 +179,16 @@ extern "C" { * Vector addition \f$ a = a + b \f$ */ void cuda_add2(void *a, void *b, int *n) { - + const dim3 nthrds(1024, 1, 1); const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1); add2_kernel<<>>((real *) a, (real *) b, *n); CUDA_CHECK(cudaGetLastError()); - + } - + /** * Fortran wrapper for add3 * Vector addition \f$ a = b + c \f$ @@ -221,23 +221,23 @@ extern "C" { /** * Fortran wrapper for add2s1 * Vector addition with scalar multiplication \f$ a = c_1 a + b \f$ - * (multiplication on first argument) + * (multiplication on first argument) */ void cuda_add2s1(void *a, void *b, real *c1, int *n) { - + const dim3 nthrds(1024, 1, 1); const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1); add2s1_kernel<<>>((real *) a, (real *) b, *c1, *n); CUDA_CHECK(cudaGetLastError()); - + } /** * Fortran wrapper for add2s2 * Vector addition with scalar multiplication \f$ a = a + c_1 b \f$ - * (multiplication on second argument) + * (multiplication on second argument) */ void cuda_add2s2(void *a, void *b, real *c1, int *n) { @@ -252,26 +252,26 @@ extern "C" { /** * Fortran wrapper for add2s2 - * Vector addition with scalar multiplication + * Vector addition with scalar multiplication * \f$ x = x + c_1 p1 + c_2p2 + ... + c_jpj \f$ - * (multiplication on second argument) + * (multiplication on second argument) */ void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n) { - + const dim3 nthrds(1024, 1, 1); const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1); - + add2s2_many_kernel<<>>((real *) x, (const real **) p, (real *) alpha, *j, *n); CUDA_CHECK(cudaGetLastError()); } - + /** * Fortran wrapper for addsqr2s2 * Vector addition with scalar multiplication \f$ a = a + c_1 (b * b) \f$ - * (multiplication on second argument) + * (multiplication on second argument) */ void cuda_addsqr2s2(void *a, void *b, real *c1, int *n) { @@ -288,7 +288,7 @@ extern "C" { /** * Fortran wrapper for add3s2 * Vector addition with scalar multiplication \f$ a = c_1 b + c_2 c \f$ - * (multiplication on second argument) + * (multiplication on second argument) */ void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n) { @@ -315,7 +315,7 @@ extern "C" { (cudaStream_t) glb_cmd_queue>>>((real *) a, *n); CUDA_CHECK(cudaGetLastError()); } - + /** * Fortran wrapper for invcol2 * Vector division \f$ a = a / b \f$ @@ -329,7 +329,7 @@ extern "C" { (real *) b, *n); CUDA_CHECK(cudaGetLastError()); } - + /** * Fortran wrapper for col2 * Vector multiplication with 2 vectors \f$ a = a \cdot b \f$ @@ -343,7 +343,7 @@ extern "C" { (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n); CUDA_CHECK(cudaGetLastError()); } - + /** * Fortran wrapper for col3 * Vector multiplication with 3 vectors \f$ a = b \cdot c \f$ @@ -371,7 +371,7 @@ extern "C" { (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c, *n); CUDA_CHECK(cudaGetLastError()); } - + /** * Fortran wrapper for sub2 @@ -397,7 +397,7 @@ extern "C" { const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1); sub3_kernel<<>>((real *) a, (real *) b, + (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c, *n); CUDA_CHECK(cudaGetLastError()); } @@ -441,7 +441,7 @@ extern "C" { const dim3 nthrds(1024, 1, 1); const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1); - + vdot3_kernel<<>>((real *) dot, (real *) u1, (real *) u2, (real *) u3, @@ -460,12 +460,12 @@ extern "C" { const dim3 nthrds(1024, 1, 1); const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1); - + vcross_kernel<<>>((real *) u1, (real *) u2, (real *) u3, (real *) v1, (real *) v2, - (real *) v3, + (real *) v3, (real *) w1, (real *) w2, (real *) w3, *n); CUDA_CHECK(cudaGetLastError()); @@ -484,22 +484,22 @@ extern "C" { * Compute multiplication sum \f$ dot = u \cdot v \cdot w \f$ */ real cuda_vlsc3(void *u, void *v, void *w, int *n) { - + const dim3 nthrds(1024, 1, 1); const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1); const int nb = ((*n) + 1024 - 1)/ 1024; - const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; - + const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; + if ( nb > red_s){ red_s = nb; if (bufred != NULL) { CUDA_CHECK(cudaFreeHost(bufred)); - CUDA_CHECK(cudaFree(bufred_d)); + CUDA_CHECK(cudaFree(bufred_d)); } CUDA_CHECK(cudaMallocHost(&bufred,nb*sizeof(real))); CUDA_CHECK(cudaMalloc(&bufred_d, nb*sizeof(real))); } - + glsc3_kernel<<>> ((real *) u, (real *) v, (real *) w, bufred_d, *n); CUDA_CHECK(cudaGetLastError()); @@ -518,22 +518,22 @@ extern "C" { * Weighted inner product \f$ a^T b c \f$ */ real cuda_glsc3(void *a, void *b, void *c, int *n) { - + const dim3 nthrds(1024, 1, 1); const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1); const int nb = ((*n) + 1024 - 1)/ 1024; - const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; - + const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; + if ( nb > red_s){ red_s = nb; if (bufred != NULL) { CUDA_CHECK(cudaFreeHost(bufred)); - CUDA_CHECK(cudaFree(bufred_d)); + CUDA_CHECK(cudaFree(bufred_d)); } CUDA_CHECK(cudaMallocHost(&bufred,nb*sizeof(real))); CUDA_CHECK(cudaMalloc(&bufred_d, nb*sizeof(real))); } - + glsc3_kernel<<>> ((real *) a, (real *) b, (real *) c, bufred_d, *n); CUDA_CHECK(cudaGetLastError()); @@ -551,22 +551,22 @@ extern "C" { return bufred[0]; } - + /** * Fortran wrapper for doing an reduction to an array * Weighted inner product \f$ w^T v(n,1:j) c \f$ */ - void cuda_glsc3_many(real *h, void * w, void *v,void *mult, int *j, int *n){ + void cuda_glsc3_many(real *h, void * w, void *v,void *mult, int *j, int *n){ int pow2 = 1; while(pow2 < (*j)){ pow2 = 2*pow2; } - const int nt = 1024/pow2; + const int nt = 1024/pow2; const dim3 nthrds(pow2, nt, 1); const dim3 nblcks(((*n)+nt - 1)/nt, 1, 1); const int nb = ((*n) + nt - 1)/nt; const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; - + if((*j)*nb>red_s){ red_s = (*j)*nb; if (bufred != NULL) { @@ -576,7 +576,7 @@ extern "C" { CUDA_CHECK(cudaMallocHost(&bufred,(*j)*nb*sizeof(real))); CUDA_CHECK(cudaMalloc(&bufred_d, (*j)*nb*sizeof(real))); } - + glsc3_many_kernel<<>> ((const real *) w, (const real **) v, (const real *)mult, bufred_d, *j, *n); @@ -587,7 +587,7 @@ extern "C" { #ifdef HAVE_DEVICE_MPI cudaStreamSynchronize(stream); device_mpi_allreduce(bufred_d, h, (*j), sizeof(real), DEVICE_MPI_SUM); -#else +#else CUDA_CHECK(cudaMemcpyAsync(h, bufred_d, (*j) * sizeof(real), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); @@ -599,22 +599,22 @@ extern "C" { * Weighted inner product \f$ a^T b c \f$ */ real cuda_glsc2(void *a, void *b, int *n) { - + const dim3 nthrds(1024, 1, 1); const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1); const int nb = ((*n) + 1024 - 1)/ 1024; - const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; + const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; if ( nb > red_s){ red_s = nb; if (bufred != NULL) { CUDA_CHECK(cudaFreeHost(bufred)); - CUDA_CHECK(cudaFree(bufred_d)); + CUDA_CHECK(cudaFree(bufred_d)); } CUDA_CHECK(cudaMallocHost(&bufred,nb*sizeof(real))); CUDA_CHECK(cudaMalloc(&bufred_d, nb*sizeof(real))); } - + glsc2_kernel <<>>((real *) a, (real *) b, @@ -635,7 +635,7 @@ extern "C" { return bufred[0]; } - /** + /** * Fortran wrapper glsum * Sum a vector of length n */ @@ -643,18 +643,18 @@ extern "C" { const dim3 nthrds(1024, 1, 1); const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1); const int nb = ((*n) + 1024 - 1)/ 1024; - const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; - + const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; + if ( nb > red_s){ red_s = nb; if (bufred != NULL) { CUDA_CHECK(cudaFreeHost(bufred)); - CUDA_CHECK(cudaFree(bufred_d)); + CUDA_CHECK(cudaFree(bufred_d)); } CUDA_CHECK(cudaMallocHost(&bufred,nb*sizeof(real))); CUDA_CHECK(cudaMalloc(&bufred_d, nb*sizeof(real))); } - if ( *n > 0) { + if ( *n > 0) { glsum_kernel <<>>((real *) a, bufred_d, *n); CUDA_CHECK(cudaGetLastError()); @@ -669,11 +669,11 @@ extern "C" { cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); #endif - + return bufred[0]; } - /** + /** * Fortran wrapper absval * Sum a vector of length n */ @@ -681,12 +681,135 @@ extern "C" { const dim3 nthrds(1024, 1, 1); const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1); - const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; + const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; absval_kernel - <<>>((real *) a, * n); + <<>>((real *) a, * n); CUDA_CHECK(cudaGetLastError()); + + } + + // ======================================================================== // + // Point-wise operations. + + /** Fortran wrapper for pwmax_vec2 + * + * Compute the maximum of two vectors \f$ a = \max(a, b) \f$ + */ + void cuda_pwmax_vec2(void *a, void *b, int *n) { + + const dim3 nthrds(1024, 1, 1); + const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1); + const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; + + pwmax_vec2_kernel<<>>( + (real *)a, (real *)b, *n); + CUDA_CHECK(cudaGetLastError()); + } + + /** Fortran wrapper for pwmax_vec3 + * + * Compute the maximum of two vectors \f$ a = \max(b, c) \f$ + */ + void cuda_pwmax_vec3(void *a, void *b, void *c, int *n) { + + const dim3 nthrds(1024, 1, 1); + const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1); + const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; + + pwmax_vec3_kernel<<>>( + (real *)a, (real *)b, (real *)c, *n); + CUDA_CHECK(cudaGetLastError()); + } + + /** Fortran wrapper for pwmax_sca2 + * + * Compute the maximum of vector and scalar \f$ a = \max(a, c) \f$ + */ + void cuda_pwmax_sca2(void *a, real *c, int *n) { + const dim3 nthrds(1024, 1, 1); + const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1); + const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; + + pwmax_sca2_kernel<<>>( + (real *)a, *c, *n); + CUDA_CHECK(cudaGetLastError()); + } + + /** Fortran wrapper for pwmax_sca3 + * + * Compute the maximum of vector and scalar \f$ a = \max(b, c) \f$ + */ + void cuda_pwmax_sca3(void *a, void *b, real *c, int *n) { + + const dim3 nthrds(1024, 1, 1); + const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1); + const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; + + pwmax_sca3_kernel<<>>( + (real *)a, (real *)b, *c, *n); + CUDA_CHECK(cudaGetLastError()); + } + + /** Fortran wrapper for pwmin_vec2 + * + * Compute the minimum of two vectors \f$ a = \min(a, b) \f$ + */ + void cuda_pwmin_vec2(void *a, void *b, int *n) { + + const dim3 nthrds(1024, 1, 1); + const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1); + const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; + + pwmin_vec2_kernel<<>>( + (real *)a, (real *)b, *n); + CUDA_CHECK(cudaGetLastError()); + } + + /** Fortran wrapper for pwmin_vec3 + * + * Compute the minimum of two vectors \f$ a = \min(b, c) \f$ + */ + void cuda_pwmin_vec3(void *a, void *b, void *c, int *n) { + + const dim3 nthrds(1024, 1, 1); + const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1); + const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; + + pwmin_vec3_kernel<<>>( + (real *)a, (real *)b, (real *)c, *n); + CUDA_CHECK(cudaGetLastError()); + } + + /** Fortran wrapper for pwmin_sca2 + * + * Compute the minimum of vector and scalar \f$ a = \min(a, c) \f$ + */ + void cuda_pwmin_sca2(void *a, real *c, int *n) { + + const dim3 nthrds(1024, 1, 1); + const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1); + const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; + + pwmin_sca2_kernel<<>>( + (real *)a, *c, *n); + CUDA_CHECK(cudaGetLastError()); + } + + /** Fortran wrapper for pwmin_sca3 + * + * Compute the minimum of vector and scalar \f$ a = \min(b, c) \f$ + */ + void cuda_pwmin_sca3(void *a, void *b, real *c, int *n) { + + const dim3 nthrds(1024, 1, 1); + const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1); + const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; + + pwmin_sca3_kernel<<>>( + (real *)a, (real *)b, *c, *n); + CUDA_CHECK(cudaGetLastError()); } -} +} /* extern "C" */ diff --git a/src/math/bcknd/device/cuda/math_kernel.h b/src/math/bcknd/device/cuda/math_kernel.h index d633f705351..26eebb8565d 100644 --- a/src/math/bcknd/device/cuda/math_kernel.h +++ b/src/math/bcknd/device/cuda/math_kernel.h @@ -755,4 +755,119 @@ __global__ void absval_kernel(T * __restrict__ a, } } +// ========================================================================== // +// Kernels for the point-wise operations + +/** + * Device kernel for point-wise max of two vectors + * a = max(a, b) + */ +template +__global__ void + pwmax_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) { + + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int str = blockDim.x * gridDim.x; + + for (int i = idx; i < n; i += str) a[i] = max(a[i], b[i]); +} + +/** + * Device kernel for point-wise max of two vectors + * a = max(b, c) + */ +template +__global__ void pwmax_vec3_kernel( + T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c, + const int n) { + + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int str = blockDim.x * gridDim.x; + + for (int i = idx; i < n; i += str) a[i] = max(b[i], c[i]); +} + +/** + * Device kernel for point-wise max of vector and scalar + * a = max(a, c) + */ +template +__global__ void pwmax_sca2_kernel(T* __restrict__ a, const T c, const int n) { + + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int str = blockDim.x * gridDim.x; + + for (int i = idx; i < n; i += str) a[i] = max(a[i], c); +} + +/** + * Device kernel for point-wise max of vector and scalar + * a = max(b, c) + */ +template +__global__ void pwmax_sca3_kernel( + T* __restrict__ a, const T* __restrict b, const T c, const int n) { + + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int str = blockDim.x * gridDim.x; + + for (int i = idx; i < n; i += str) a[i] = max(b[i], c); +} + +/** + * Device kernel for point-wise min of two vectors + * a = min(a, b) + */ +template +__global__ void + pwmin_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) { + + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int str = blockDim.x * gridDim.x; + + for (int i = idx; i < n; i += str) a[i] = min(a[i], b[i]); +} + +/** + * Device kernel for point-wise min of two vectors + * a = min(b, c) + */ +template +__global__ void pwmin_vec3_kernel( + T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c, + const int n) { + + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int str = blockDim.x * gridDim.x; + + for (int i = idx; i < n; i += str) a[i] = min(b[i], c[i]); +} + +/** + * Device kernel for point-wise min of vector and scalar + * a = min(a, c) + */ +template +__global__ void pwmin_sca2_kernel(T* __restrict__ a, const T c, const int n) { + + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int str = blockDim.x * gridDim.x; + + for (int i = idx; i < n; i += str) a[i] = min(a[i], c); +} + +/** + * Device kernel for point-wise min of vector and scalar + * a = min(b, c) + */ +template +__global__ void pwmin_sca3_kernel( + T* __restrict__ a, const T* __restrict b, const T c, const int n) { + + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int str = blockDim.x * gridDim.x; + + for (int i = idx; i < n; i += str) a[i] = min(b[i], c); +} + #endif // __MATH_MATH_KERNEL_H__ diff --git a/src/math/bcknd/device/device_math.F90 b/src/math/bcknd/device/device_math.F90 index 5d3e19e7c44..50935d36a6c 100644 --- a/src/math/bcknd/device/device_math.F90 +++ b/src/math/bcknd/device/device_math.F90 @@ -47,6 +47,17 @@ module device_math implicit none private + interface device_pwmax + module procedure device_pwmax_vec2, device_pwmax_vec3, & + device_pwmax_sca2, device_pwmax_sca3 + end interface device_pwmax + + interface device_pwmin + module procedure device_pwmin_vec2, device_pwmin_vec3, & + device_pwmin_sca2, device_pwmin_sca3 + end interface device_pwmin + + public :: device_copy, device_rzero, device_rone, device_cmult, & device_cmult2, device_cadd, device_cadd2, device_cfill, device_add2, & device_add3, device_add4, device_add2s1, device_add2s2, & @@ -55,7 +66,8 @@ module device_math device_addcol3, device_addcol4, device_vdot3, device_vlsc3, & device_glsc3, device_glsc3_many, device_add2s2_many, device_glsc2, & device_glsum, device_masked_copy, device_cfill_mask, & - device_masked_red_copy, device_vcross, device_absval + device_masked_red_copy, device_vcross, device_absval, & + device_pwmax, device_pwmin contains @@ -650,5 +662,156 @@ subroutine device_absval(a_d, n) end subroutine device_absval + ! ========================================================================== ! + ! Device point-wise max + + !> Compute the point-wise maximum of two vectors + !! \f$ a_i = \max(a_i, b_i) \f$ + subroutine device_pwmax_vec2(a_d, b_d, n) + type(c_ptr) :: a_d, b_d + integer :: n + +#if HAVE_HIP + call neko_error('No HIP backend for device_pwmax_vec2') +#elif HAVE_CUDA + call cuda_pwmax_vec2(a_d, b_d, n) +#elif HAVE_OPENCL + call neko_error('No OpenCL backend for device_pwmax_vec2') +#else + call neko_error('No device backend configured') +#endif + end subroutine device_pwmax_vec2 + + !> Compute the point-wise maximum of two vectors + !! \f$ a_i = \max(b_i, c_i) \f$ + subroutine device_pwmax_vec3(a_d, b_d, c_d, n) + type(c_ptr) :: a_d, b_d, c_d + integer :: n + +#if HAVE_HIP + call neko_error('No HIP backend for device_pwmax_vec3') +#elif HAVE_CUDA + call cuda_pwmax_vec3(a_d, b_d, c_d, n) +#elif HAVE_OPENCL + call neko_error('No OpenCL backend for device_pwmax_vec3') +#else + call neko_error('No device backend configured') +#endif + + end subroutine device_pwmax_vec3 + + !> Compute the point-wise maximum of a vector and a scalar + !! \f$ a_i = \max(a_i, c) \f$ + subroutine device_pwmax_sca2(a_d, c, n) + type(c_ptr) :: a_d + real(kind=rp), intent(in) :: c + integer :: n + +#if HAVE_HIP + call neko_error('No HIP backend for device_pwmax_sca2') +#elif HAVE_CUDA + call cuda_pwmax_sca2(a_d, c, n) +#elif HAVE_OPENCL + call neko_error('No OpenCL backend for device_pwmax_sca2') +#else + call neko_error('No device backend configured') +#endif + + end subroutine device_pwmax_sca2 + + !> Compute the point-wise maximum of a vector and a scalar + !! \f$ a_i = \max(b_i, c) \f$ + subroutine device_pwmax_sca3(a_d, b_d, c, n) + type(c_ptr) :: a_d, b_d + real(kind=rp), intent(in) :: c + integer :: n + +#if HAVE_HIP + call neko_error('No HIP backend for device_pwmax_sca3') +#elif HAVE_CUDA + call cuda_pwmax_sca3(a_d, b_d, c, n) +#elif HAVE_OPENCL + call neko_error('No OpenCL backend for device_pwmax_sca3') +#else + call neko_error('No device backend configured') +#endif + + end subroutine device_pwmax_sca3 + + ! ========================================================================== ! + ! Device point-wise min + + !> Compute the point-wise minimum of two vectors + !! \f$ a_i = \min(a_i, b_i) \f$ + subroutine device_pwmin_vec2(a_d, b_d, n) + type(c_ptr) :: a_d, b_d + integer :: n + +#if HAVE_HIP + call neko_error('No HIP backend for device_pwmin_vec2') +#elif HAVE_CUDA + call cuda_pwmin_vec2(a_d, b_d, n) +#elif HAVE_OPENCL + call neko_error('No OpenCL backend for device_pwmin_vec2') +#else + call neko_error('No device backend configured') +#endif + end subroutine device_pwmin_vec2 + + !> Compute the point-wise minimum of two vectors + !! \f$ a_i = \min(b_i, c_i) \f$ + subroutine device_pwmin_vec3(a_d, b_d, c_d, n) + type(c_ptr) :: a_d, b_d, c_d + integer :: n + +#if HAVE_HIP + call neko_error('No HIP backend for device_pwmin_vec3') +#elif HAVE_CUDA + call cuda_pwmin_vec3(a_d, b_d, c_d, n) +#elif HAVE_OPENCL + call neko_error('No OpenCL backend for device_pwmin_vec3') +#else + call neko_error('No device backend configured') +#endif + + end subroutine device_pwmin_vec3 + + !> Compute the point-wise minimum of a vector and a scalar + !! \f$ a_i = \min(a_i, c) \f$ + subroutine device_pwmin_sca2(a_d, c, n) + type(c_ptr) :: a_d + real(kind=rp), intent(in) :: c + integer :: n + +#if HAVE_HIP + call neko_error('No HIP backend for device_pwmin_sca2') +#elif HAVE_CUDA + call cuda_pwmin_sca2(a_d, c, n) +#elif HAVE_OPENCL + call neko_error('No OpenCL backend for device_pwmin_sca2') +#else + call neko_error('No device backend configured') +#endif + + end subroutine device_pwmin_sca2 + + !> Compute the point-wise minimum of a vector and a scalar + !! \f$ a_i = \min(b_i, c) \f$ + subroutine device_pwmin_sca3(a_d, b_d, c, n) + type(c_ptr) :: a_d, b_d + real(kind=rp), intent(in) :: c + integer :: n + +#if HAVE_HIP + call neko_error('No HIP backend for device_pwmin_sca3') +#elif HAVE_CUDA + call cuda_pwmin_sca3(a_d, b_d, c, n) +#elif HAVE_OPENCL + call neko_error('No OpenCL backend for device_pwmin_sca3') +#else + call neko_error('No device backend configured') +#endif + + end subroutine device_pwmin_sca3 end module device_math diff --git a/src/math/math.f90 b/src/math/math.f90 index d18177b219c..605b5c17a7f 100644 --- a/src/math/math.f90 +++ b/src/math/math.f90 @@ -59,7 +59,10 @@ ! module math use num_types, only : rp, dp, sp, qp, i4, xp - use comm + use utils, only: neko_error + use comm, only: NEKO_COMM, MPI_REAL_PRECISION, MPI_EXTRA_PRECISION + use mpi_f08, only: MPI_MIN, MPI_MAX, MPI_SUM, MPI_IN_PLACE, MPI_INTEGER, & + MPI_Allreduce implicit none private @@ -96,13 +99,21 @@ module math module procedure srelcmp, drelcmp, qrelcmp end interface relcmp + interface pwmax + module procedure pwmax_vec2, pwmax_vec3, pwmax_scal2, pwmax_scal3 + end interface pwmax + + interface pwmin + module procedure pwmin_vec2, pwmin_vec3, pwmin_sca2, pwmin_sca3 + end interface pwmin + public :: abscmp, rzero, izero, row_zero, rone, copy, cmult, cadd, cfill, & glsum, glmax, glmin, chsign, vlmax, vlmin, invcol1, invcol3, invers2, & vcross, vdot2, vdot3, vlsc3, vlsc2, add2, add3, add4, sub2, sub3, & add2s1, add2s2, addsqr2s2, cmult2, invcol2, col2, col3, subcol3, & add3s2, subcol4, addcol3, addcol4, ascol5, p_update, x_update, glsc2, & glsc3, glsc4, sort, masked_copy, cfill_mask, relcmp, glimax, glimin, & - swap, reord, flipv, cadd2, masked_red_copy, absval + swap, reord, flipv, cadd2, masked_red_copy, absval, pwmax, pwmin contains @@ -1169,4 +1180,105 @@ subroutine absval(a, n) end do end subroutine absval + ! ========================================================================== ! + ! Point-wise operations + + !> Point-wise maximum of two vectors \f$ a = \max(a, b) \f$ + subroutine pwmax_vec2(a, b, n) + integer, intent(in) :: n + real(kind=rp), dimension(n), intent(inout) :: a + real(kind=rp), dimension(n), intent(in) :: b + integer :: i + + do i = 1, n + a(i) = max(a(i), b(i)) + end do + end subroutine pwmax_vec2 + + !> Point-wise maximum of two vectors \f$ a = \max(b, c) \f$ + subroutine pwmax_vec3(a, b, c, n) + integer, intent(in) :: n + real(kind=rp), dimension(n), intent(inout) :: a + real(kind=rp), dimension(n), intent(in) :: b, c + integer :: i + + do i = 1, n + a(i) = max(b(i), c(i)) + end do + end subroutine pwmax_vec3 + + !> Point-wise maximum of scalar and vector \f$ a = \max(a, b) \f$ + subroutine pwmax_scal2(a, b, n) + integer, intent(in) :: n + real(kind=rp), dimension(n), intent(inout) :: a + real(kind=rp), intent(in) :: b + integer :: i + + do i = 1, n + a(i) = max(a(i), b) + end do + end subroutine pwmax_scal2 + + !> Point-wise maximum of scalar and vector \f$ a = \max(b, c) \f$ + subroutine pwmax_scal3(a, b, c, n) + integer, intent(in) :: n + real(kind=rp), dimension(n), intent(inout) :: a + real(kind=rp), dimension(n), intent(in) :: b + real(kind=rp), intent(in) :: c + integer :: i + + do i = 1, n + a(i) = max(b(i), c) + end do + end subroutine pwmax_scal3 + + !> Point-wise minimum of two vectors \f$ a = \min(a, b) \f$ + subroutine pwmin_vec2(a, b, n) + integer, intent(in) :: n + real(kind=rp), dimension(n), intent(inout) :: a + real(kind=rp), dimension(n), intent(in) :: b + integer :: i + + do i = 1, n + a(i) = min(a(i), b(i)) + end do + end subroutine pwmin_vec2 + + !> Point-wise minimum of two vectors \f$ a = \min(b, c) \f$ + subroutine pwmin_vec3(a, b, c, n) + integer, intent(in) :: n + real(kind=rp), dimension(n), intent(inout) :: a + real(kind=rp), dimension(n), intent(in) :: b, c + integer :: i + + do i = 1, n + a(i) = min(b(i), c(i)) + end do + end subroutine pwmin_vec3 + + !> Point-wise minimum of scalar and vector \f$ a = \min(a, b) \f$ + subroutine pwmin_sca2(a, b, n) + integer, intent(in) :: n + real(kind=rp), dimension(n), intent(inout) :: a + real(kind=rp), intent(in) :: b + integer :: i + + do i = 1, n + a(i) = min(a(i), b) + end do + end subroutine pwmin_sca2 + + !> Point-wise minimum of scalar and vector \f$ a = \min(b, c) \f$ + subroutine pwmin_sca3(a, b, c, n) + integer, intent(in) :: n + real(kind=rp), dimension(n), intent(inout) :: a + real(kind=rp), dimension(n), intent(in) :: b + real(kind=rp), intent(in) :: c + integer :: i + + do i = 1, n + a(i) = min(b(i), c) + end do + end subroutine pwmin_sca3 + end module math diff --git a/src/math/signed_distance.f90 b/src/math/signed_distance.f90 index f9c18f9dabf..8f60a51dd8f 100644 --- a/src/math/signed_distance.f90 +++ b/src/math/signed_distance.f90 @@ -37,8 +37,13 @@ module signed_distance use tri, only: tri_t use tri_mesh, only: tri_mesh_t use aabb_tree, only: aabb_tree_t + use device, only: device_memcpy, HOST_TO_DEVICE + use neko_config, only: NEKO_BCKND_DEVICE + use utils, only: neko_error, neko_warning implicit none + private + public :: signed_distance_field contains @@ -53,9 +58,6 @@ module signed_distance !! @param[in] object Object !! @param[in,optional] max_distance Maximum distance outside the mesh subroutine signed_distance_field(field_data, object, max_distance) - use utils, only: neko_error - implicit none - type(field_t), intent(inout) :: field_data class(*), intent(in) :: object real(kind=dp), intent(in), optional :: max_distance @@ -89,9 +91,6 @@ end subroutine signed_distance_field !! @param[in] mesh Triangular mesh !! @param[in] max_distance Maximum distance outside the mesh subroutine signed_distance_field_tri_mesh(field_data, mesh, max_distance) - use utils, only: neko_error - implicit none - type(field_t), intent(inout) :: field_data type(tri_mesh_t), intent(in) :: mesh real(kind=dp), intent(in) :: max_distance @@ -111,7 +110,7 @@ subroutine signed_distance_field_tri_mesh(field_data, mesh, max_distance) if (search_tree%get_size() .ne. mesh%nelv) then call neko_error("signed_distance_field_tri_mesh: & - & Error building the search tree.") + & Error building the search tree.") end if do id = 1, total_size @@ -124,6 +123,13 @@ subroutine signed_distance_field_tri_mesh(field_data, mesh, max_distance) field_data%x(id, 1, 1, 1) = real(distance, kind=rp) end do + if (NEKO_BCKND_DEVICE .eq. 1) then + call neko_warning("signed_distance_field_tri_mesh:& + & Device version not implemented.") + call device_memcpy(field_data%x, field_data%x_d, field_data%size(), & + HOST_TO_DEVICE, sync = .false.) + end if + end subroutine signed_distance_field_tri_mesh !> @brief Signed distance function diff --git a/src/source_terms/bcknd/cpu/filters_cpu.f90 b/src/source_terms/bcknd/cpu/filters_cpu.f90 index 3b2dc2a4bab..e4008af8ed2 100644 --- a/src/source_terms/bcknd/cpu/filters_cpu.f90 +++ b/src/source_terms/bcknd/cpu/filters_cpu.f90 @@ -34,51 +34,84 @@ module filters_cpu use num_types, only: rp implicit none + private + public :: smooth_step_cpu, step_function_cpu, permeability_cpu contains + !> @brief Apply a smooth step function to a scalar. + subroutine smooth_step_cpu(x, edge0, edge1, n) + integer, intent(in) :: n + real(kind=rp), dimension(n), intent(inout) :: x + real(kind=rp), intent(in) :: edge0, edge1 + + x = smooth_step_kernel(x, edge0, edge1) + + end subroutine smooth_step_cpu + + !> @brief Apply a step function to a scalar. + subroutine step_function_cpu(x, x_step, value0, value1, n) + integer, intent(in) :: n + real(kind=rp), dimension(n), intent(inout) :: x + real(kind=rp), intent(in) :: x_step, value0, value1 + + x = step_function_kernel(x, x_step, value0, value1) + + end subroutine step_function_cpu + + !> @brief Apply a permeability function to a scalar. + subroutine permeability_cpu(x, k_0, k_1, q, n) + integer, intent(in) :: n + real(kind=rp), dimension(n), intent(inout) :: x + real(kind=rp), intent(in) :: k_0, k_1, q + + x = permeability_kernel(x, k_0, k_1, q) + + end subroutine permeability_cpu + + ! ========================================================================== ! ! Internal functions and subroutines ! ========================================================================== ! !> @brief Apply a smooth step function to a scalar. - elemental function smooth_step_cpu(x, edge0, edge1) result(res) + elemental function smooth_step_kernel(x, edge0, edge1) result(res) real(kind=rp), intent(in) :: x real(kind=rp), intent(in) :: edge0, edge1 real(kind=rp) :: res, t - t = clamp_cpu((x - edge0) / (edge1 - edge0), 0.0_rp, 1.0_rp) + t = clamp_kernel((x - edge0) / (edge1 - edge0), 0.0_rp, 1.0_rp) res = t**3 * (t * (6.0_rp * t - 15.0_rp) + 10.0_rp) - end function smooth_step_cpu + end function smooth_step_kernel !> @brief Clamp a value between two limits. - elemental function clamp_cpu(x, lowerlimit, upperlimit) result(res) + elemental function clamp_kernel(x, lowerlimit, upperlimit) result(res) real(kind=rp), intent(in) :: x real(kind=rp), intent(in) :: lowerlimit, upperlimit real(kind=rp) :: res res = max(lowerlimit, min(upperlimit, x)) - end function clamp_cpu + end function clamp_kernel !> @brief Apply a step function to a scalar. - elemental function step_function_cpu(x, x_step, value0, value1) result(res) + elemental function step_function_kernel(x, x_step, value0, value1) result(res) real(kind=rp), intent(in) :: x, x_step, value0, value1 real(kind=rp) :: res res = merge(value0, value1, x > x_step) - end function step_function_cpu + end function step_function_kernel !> @brief Apply a permeability function to a scalar. - elemental function permeability_cpu(x, k_0, k_1, q) result(perm) + elemental function permeability_kernel(x, k_0, k_1, q) result(perm) real(kind=rp), intent(in) :: x, k_0, k_1, q real(kind=rp) :: perm perm = k_0 + (k_1 - k_0) * x * (q + 1.0_rp) / (q + x) - end function permeability_cpu + end function permeability_kernel end module filters_cpu diff --git a/src/source_terms/bcknd/device/cuda/cuda_filters.f90 b/src/source_terms/bcknd/device/cuda/cuda_filters.f90 new file mode 100644 index 00000000000..73a831845c6 --- /dev/null +++ b/src/source_terms/bcknd/device/cuda/cuda_filters.f90 @@ -0,0 +1,71 @@ +! Copyright (c) 2024, The Neko Authors +! All rights reserved. +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! +! * Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! +! * Redistributions in binary form must reproduce the above +! copyright notice, this list of conditions and the following +! disclaimer in the documentation and/or other materials provided +! with the distribution. +! +! * Neither the name of the authors nor the names of its +! contributors may be used to endorse or promote products derived +! from this software without specific prior written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. + +!> Cuda interface binding for filters +module cuda_filters + use num_types, only: c_rp + implicit none + private + + public :: cuda_smooth_step, cuda_step_function, cuda_permeability + + ! Interfaces for the backend functions + interface + subroutine cuda_smooth_step(x, edge0, edge1, n) & + bind(c, name = "cuda_smooth_step") + use, intrinsic :: iso_c_binding, only: c_ptr, c_int + import c_rp + type(c_ptr), value :: x + real(c_rp) :: edge0, edge1 + integer(c_int) :: n + end subroutine cuda_smooth_step + + subroutine cuda_step_function(x, edge, left, right, n) & + bind(c, name = "cuda_step_function") + use, intrinsic :: iso_c_binding, only: c_ptr, c_int + import c_rp + type(c_ptr), value :: x + real(c_rp) :: edge, left, right + integer(c_int) :: n + end subroutine cuda_step_function + + subroutine cuda_permeability(x, k_0, k_1, q, n) & + bind(c, name = "cuda_permeability") + use, intrinsic :: iso_c_binding, only: c_ptr, c_int + import c_rp + type(c_ptr), value :: x + real(c_rp) :: k_0, k_1, q + integer(c_int) :: n + end subroutine cuda_permeability + end interface + +end module cuda_filters diff --git a/src/source_terms/bcknd/device/cuda/filter_kernels.h b/src/source_terms/bcknd/device/cuda/filter_kernels.h new file mode 100644 index 00000000000..ef7ce93c0f6 --- /dev/null +++ b/src/source_terms/bcknd/device/cuda/filter_kernels.h @@ -0,0 +1,103 @@ +#ifndef __FILTER_KERNELS__ +#define __FILTER_KERNELS__ +/* + Copyright (c) 2024, The Neko Authors + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the authors nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + +/** + * Device kernel for smooth step function. + * + * @param x in/out array + * @param edge0 lower edge + * @param edge1 upper edge + * @param n size of the input array + */ +template +__global__ void smooth_step_kernel( + T* __restrict__ x, const T edge0, const T edge1, const int n) { + + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int str = blockDim.x * gridDim.x; + + for (int i = idx; i < n; i += str) { + T t = (x[i] - edge0) / (edge1 - edge0); + t = min(max(t, 0.0), 1.0); + x[i] = pow(t, 3.0) * (t * (t * 6.0 - 15.0) + 10.0); + } +} + +/** + * Device kernel for step function. + * + * @param x input array + * @param edge step location + * @param left value to the left of the step + * @param right value to the right of the step + * @param n size of the input array + */ +template +__global__ void step_kernel( + T* __restrict__ x, const T edge, const T left, const T right, const int n) { + + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int str = blockDim.x * gridDim.x; + + for (int i = idx; i < n; i += str) { + const T x_i = x[i]; + x[i] = (x_i < edge) ? left : right; + } +} + +/** + * Device kernel for the inverse permeability kernel. + * + * @param x input array + * @param k_0 lower bound of the permeability + * @param k_1 upper bound of the permeability + * @param q parameter + * @param n size of the input array + */ +template +__global__ void permeability_kernel( + T* __restrict__ x, const T k_0, const T k_1, const T q, const int n) { + + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int str = blockDim.x * gridDim.x; + + for (int i = idx; i < n; i += str) { + const T x_i = x[i]; + x[i] = k_0 + (k_1 - k_0) * x_i * (q + 1.0) / (q + x_i); + } +} + +#endif // __FILTER_KERNELS__ diff --git a/src/source_terms/bcknd/device/cuda/filters.cu b/src/source_terms/bcknd/device/cuda/filters.cu new file mode 100644 index 00000000000..fc94648317d --- /dev/null +++ b/src/source_terms/bcknd/device/cuda/filters.cu @@ -0,0 +1,100 @@ +/* + Copyright (c) 2024, The Neko Authors + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the authors nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "filter_kernels.h" +#include +#include +#include +#include + +extern "C" { + + /** Fortran wrapper for 2nd order smooth_step + * + * Compute the smooth step function for a given array. + * t = clamp((x - edge0) / (edge1 - edge0), 0.0, 1.0); + * return t^3 * (t * (6.0 * t - 15.0) + 10.0); + */ + void cuda_smooth_step(void *x, real *edge0, real *edge1, int *n) { + + const dim3 nthrds(1024, 1, 1); + const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1); + const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; + + smooth_step_kernel<<>>( + (real *)x, *edge0, *edge1, *n); + CUDA_CHECK(cudaGetLastError()); + } + + /** Fortran wrapper for step function + * + * Compute the step function for a given array. + * if (x < edge) return left; + * else return right; + * + * @param x array to apply the step function + * @param edge threshold value + * @param left value to return if x < edge + * @param right value to return if x >= edge + * @param n size of the array + */ + void cuda_step_function(void *x, real *edge, real *left, real *right, int *n) { + const dim3 nthrds(1024, 1, 1); + const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1); + const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; + + step_kernel<<>>( + (real *)x, *edge, *left, *right, *n); + CUDA_CHECK(cudaGetLastError()); + } + + /** Fortran wrapper for the permeability filter + * + * @param x array to apply the permeability filter on + * @param k_0 lower bound of the permeability + * @param k_1 upper bound of the permeability + * @param q parameter + * @param n size of the array + */ + void cuda_permeability(void *x, real *k_0, real *k_1, real *q, int *n) { + const dim3 nthrds(1024, 1, 1); + const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1); + const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; + + permeability_kernel<<>>( + (real *)x, *k_0, *k_1, *q, *n); + + CUDA_CHECK(cudaGetLastError()); + } +} /* extern "C" */ diff --git a/src/source_terms/bcknd/device/filters_device.F90 b/src/source_terms/bcknd/device/filters_device.F90 new file mode 100644 index 00000000000..66983b9d4a3 --- /dev/null +++ b/src/source_terms/bcknd/device/filters_device.F90 @@ -0,0 +1,113 @@ +! Copyright (c) 2024, The Neko Authors +! All rights reserved. +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! +! * Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! +! * Redistributions in binary form must reproduce the above +! copyright notice, this list of conditions and the following +! disclaimer in the documentation and/or other materials provided +! with the distribution. +! +! * Neither the name of the authors nor the names of its +! contributors may be used to endorse or promote products derived +! from this software without specific prior written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +!> Device implementations of the filter functions. +module filters_device + use num_types, only: rp, c_rp + use device_math, only: device_pwmax, device_pwmin + use, intrinsic :: iso_c_binding, only: c_ptr + use utils, only: neko_error + + use cuda_filters, only: & + cuda_smooth_step, cuda_step_function, cuda_permeability + use hip_filters, only: & + hip_smooth_step, hip_step_function, hip_permeability + use opencl_filters, only: & + opencl_smooth_step, opencl_step_function, opencl_permeability + + implicit none + +contains + + ! ========================================================================== ! + ! Internal functions and subroutines + ! ========================================================================== ! + + !> @brief Apply a smooth step function to a scalar. + subroutine smooth_step_device(x, edge0, edge1, n) + type(c_ptr), intent(inout) :: x + real(kind=rp), intent(in) :: edge0, edge1 + integer, intent(in) :: n + +#if HAVE_HIP == 1 + call hip_smooth_step(x, edge0, edge1, n) +#elif HAVE_CUDA == 1 + call cuda_smooth_step(x, edge0, edge1, n) +#elif HAVE_OPENCL == 1 + call opencl_smooth_step(x, edge0, edge1, n) +#else + call neko_error(& + "Smooth step function not implemented for the current device.") +#endif + + end subroutine smooth_step_device + + !> @brief Apply a step function to a scalar. + subroutine step_function_device(x, x_step, value0, value1, n) + type(c_ptr), intent(inout) :: x + real(kind=rp), intent(in) :: x_step, value0, value1 + integer, intent(in) :: n + +#if HAVE_HIP == 1 + call hip_step_function(x, x_step, value0, value1, n) +#elif HAVE_CUDA == 1 + call cuda_step_function(x, x_step, value0, value1, n) +#elif HAVE_OPENCL == 1 + call opencl_step_function(x, x_step, value0, value1, n) +#else + call neko_error(& + "Step function not implemented for the current device.") +#endif + + end subroutine step_function_device + + !> @brief Apply a permeability function to a scalar. + subroutine permeability_device(x, k_0, k_1, q, n) + type(c_ptr), intent(inout) :: x + real(kind=rp), intent(in) :: k_0, k_1, q + integer, intent(in) :: n + +#if HAVE_HIP == 1 + call hip_permeability(x, k_0, k_1, q, n) +#elif HAVE_CUDA == 1 + call cuda_permeability(x, k_0, k_1, q, n) +#elif HAVE_OPENCL == 1 + call opencl_permeability(x, k_0, k_1, q, n) +#else + call neko_error(& + "Permeability function not implemented for the current device.") +#endif + + end subroutine permeability_device + + +end module filters_device diff --git a/src/source_terms/bcknd/device/hip/filter_kernels.h b/src/source_terms/bcknd/device/hip/filter_kernels.h new file mode 100644 index 00000000000..ef7ce93c0f6 --- /dev/null +++ b/src/source_terms/bcknd/device/hip/filter_kernels.h @@ -0,0 +1,103 @@ +#ifndef __FILTER_KERNELS__ +#define __FILTER_KERNELS__ +/* + Copyright (c) 2024, The Neko Authors + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the authors nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + +/** + * Device kernel for smooth step function. + * + * @param x in/out array + * @param edge0 lower edge + * @param edge1 upper edge + * @param n size of the input array + */ +template +__global__ void smooth_step_kernel( + T* __restrict__ x, const T edge0, const T edge1, const int n) { + + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int str = blockDim.x * gridDim.x; + + for (int i = idx; i < n; i += str) { + T t = (x[i] - edge0) / (edge1 - edge0); + t = min(max(t, 0.0), 1.0); + x[i] = pow(t, 3.0) * (t * (t * 6.0 - 15.0) + 10.0); + } +} + +/** + * Device kernel for step function. + * + * @param x input array + * @param edge step location + * @param left value to the left of the step + * @param right value to the right of the step + * @param n size of the input array + */ +template +__global__ void step_kernel( + T* __restrict__ x, const T edge, const T left, const T right, const int n) { + + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int str = blockDim.x * gridDim.x; + + for (int i = idx; i < n; i += str) { + const T x_i = x[i]; + x[i] = (x_i < edge) ? left : right; + } +} + +/** + * Device kernel for the inverse permeability kernel. + * + * @param x input array + * @param k_0 lower bound of the permeability + * @param k_1 upper bound of the permeability + * @param q parameter + * @param n size of the input array + */ +template +__global__ void permeability_kernel( + T* __restrict__ x, const T k_0, const T k_1, const T q, const int n) { + + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int str = blockDim.x * gridDim.x; + + for (int i = idx; i < n; i += str) { + const T x_i = x[i]; + x[i] = k_0 + (k_1 - k_0) * x_i * (q + 1.0) / (q + x_i); + } +} + +#endif // __FILTER_KERNELS__ diff --git a/src/source_terms/bcknd/device/hip/filters.hip b/src/source_terms/bcknd/device/hip/filters.hip new file mode 100644 index 00000000000..b2fccdd2361 --- /dev/null +++ b/src/source_terms/bcknd/device/hip/filters.hip @@ -0,0 +1,100 @@ +/* + Copyright (c) 2021-2023, The Neko Authors + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the authors nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include + +#include "filter_kernels.h" + +extern "C" { + +/** Fortran wrapper for 2nd order smooth_step + * + * Compute the smooth step function for a given array. + * t = clamp((x - edge0) / (edge1 - edge0), 0.0, 1.0); + * return t^3 * (t * (6.0 * t - 15.0) + 10.0); + */ +void hip_smooth_step(void* x, real* edge0, real* edge1, int* n) { + + const dim3 nthrds(1024, 1, 1); + const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(smooth_step_kernel), nblcks, nthrds, 0, + (hipStream_t)glb_cmd_queue, (real*)x, *edge0, *edge1, *n); + HIP_CHECK(hipGetLastError()); +} + +/** Fortran wrapper for step function + * + * Compute the step function for a given array. + * if (x < edge) return left; + * else return right; + * + * @param x array to apply the step function + * @param edge threshold value + * @param left value to return if x < edge + * @param right value to return if x >= edge + * @param n size of the array + */ +void hip_step_function(void* x, real* edge, real* left, real* right, int* n) { + const dim3 nthrds(1024, 1, 1); + const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(step_kernel), nblcks, nthrds, 0, + (hipStream_t)glb_cmd_queue, (real*)x, *edge, *left, *right, *n); + HIP_CHECK(hipGetLastError()); +} + +/** Fortran wrapper for the permeability filter + * + * @param x array to apply the permeability filter on + * @param k_0 lower bound of the permeability + * @param k_1 upper bound of the permeability + * @param q parameter + * @param n size of the array + */ +void hip_permeability(void* x, real* k_0, real* k_1, real* q, int* n) { + const dim3 nthrds(1024, 1, 1); + const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1); + + hipLaunchKernelGGL( + HIP_KERNEL_NAME(permeability_kernel), nblcks, nthrds, 0, + (hipStream_t)glb_cmd_queue, (real*)x, *k_0, *k_1, *q, *n); + + HIP_CHECK(hipGetLastError()); +} +} diff --git a/src/source_terms/bcknd/device/hip/hip_filters.f90 b/src/source_terms/bcknd/device/hip/hip_filters.f90 new file mode 100644 index 00000000000..aa614aad393 --- /dev/null +++ b/src/source_terms/bcknd/device/hip/hip_filters.f90 @@ -0,0 +1,71 @@ +! Copyright (c) 2024, The Neko Authors +! All rights reserved. +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! +! * Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! +! * Redistributions in binary form must reproduce the above +! copyright notice, this list of conditions and the following +! disclaimer in the documentation and/or other materials provided +! with the distribution. +! +! * Neither the name of the authors nor the names of its +! contributors may be used to endorse or promote products derived +! from this software without specific prior written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. + +!> Hip interface binding for filters +module hip_filters + use num_types, only: rp, c_rp + implicit none + private + + public :: hip_smooth_step, hip_step_function, hip_permeability + + ! Interfaces for the backend functions + interface + subroutine hip_smooth_step(x, edge0, edge1, n) & + bind(c, name = "hip_smooth_step") + use, intrinsic :: iso_c_binding, only: c_ptr, c_int + import c_rp + type(c_ptr), value :: x + real(c_rp) :: edge0, edge1 + integer(c_int) :: n + end subroutine hip_smooth_step + + subroutine hip_step_function(x, edge, left, right, n) & + bind(c, name = "hip_step_function") + use, intrinsic :: iso_c_binding, only: c_ptr, c_int + import c_rp + type(c_ptr), value :: x + real(c_rp) :: edge, left, right + integer(c_int) :: n + end subroutine hip_step_function + + subroutine hip_permeability(x, k_0, k_1, q, n) & + bind(c, name = "hip_permeability") + use, intrinsic :: iso_c_binding, only: c_ptr, c_int + import c_rp + type(c_ptr), value :: x + real(c_rp) :: k_0, k_1, q + integer(c_int) :: n + end subroutine hip_permeability + end interface + +end module hip_filters diff --git a/src/source_terms/bcknd/device/opencl/filter_kernels.cl b/src/source_terms/bcknd/device/opencl/filter_kernels.cl new file mode 100644 index 00000000000..4b4f9b253f3 --- /dev/null +++ b/src/source_terms/bcknd/device/opencl/filter_kernels.cl @@ -0,0 +1,103 @@ +#ifndef __SOURCE_TERMS_FILTER_KERNELS__ +#define __SOURCE_TERMS_FILTER_KERNELS__ +/* + Copyright (c) 2021-2024, The Neko Authors + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the authors nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + +/** + * Device kernel for smooth step function. + * + * @param x in/out array + * @param edge0 lower edge + * @param edge1 upper edge + * @param n size of the input array + */ +__kernel void smooth_step_kernel( + __global real* __restrict__ x, const real edge0, const real edge1, + const int n) { + + const int idx = get_global_id(0); + const int str = get_global_size(0); + + for (int i = idx; i < n; i += str) { + real t = (x[i] - edge0) / (edge1 - edge0); + t = min(max(t, 0.0), 1.0); + x[i] = pow(t, 3.0) * (t * (t * 6.0 - 15.0) + 10.0); + } +} + +/** + * Device kernel for step function. + * + * @param x input array + * @param edge step location + * @param left value to the left of the step + * @param right value to the right of the step + * @param n size of the input array + */ +__kernel void step_kernel( + __global real* __restrict__ x, const real edge, const real left, + const real right, const int n) { + + const int idx = get_global_id(0); + const int str = get_global_size(0); + + for (int i = idx; i < n; i += str) { + const real x_i = x[i]; + x[i] = (x_i < edge) ? left : right; + } +} + +/** + * Device kernel for the inverse permeability kernel. + * + * @param x input array + * @param k_0 lower bound of the permeability + * @param k_1 upper bound of the permeability + * @param q parameter + * @param n size of the input array + */ +__kernel void permeability_kernel( + __global real* __restrict__ x, const real k_0, const real k_1, const real q, + const int n) { + + const int idx = get_global_id(0); + const int str = get_global_size(0); + + for (int i = idx; i < n; i += str) { + const real x_i = x[i]; + x[i] = k_0 + (k_1 - k_0) * x_i * (q + 1.0) / (q + x_i); + } +} + +#endif /* __SOURCE_TERMS_FILTER_KERNELS__ */ \ No newline at end of file diff --git a/src/source_terms/bcknd/device/opencl/filters.c b/src/source_terms/bcknd/device/opencl/filters.c new file mode 100644 index 00000000000..8af47c00f3c --- /dev/null +++ b/src/source_terms/bcknd/device/opencl/filters.c @@ -0,0 +1,149 @@ +/* + Copyright (c) 2021-2024, The Neko Authors + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the authors nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef __APPLE__ + #include +#else + #include +#endif + +#include +#include +#include +#include +#include +#include + +#include "filter_kernels.cl.h" + +/** Fortran wrapper for 2nd order smooth_step + * + * Compute the smooth step function for a given array. + * t = clamp((x - edge0) / (edge1 - edge0), 0.0, 1.0); + * return t^3 * (t * (6.0 * t - 15.0) + 10.0); + */ +void opencl_smooth_step(void* x, real* edge0, real* edge1, int* n) { + cl_int err; + + if (filter_program == NULL) + opencl_kernel_jit(filter_kernels, (cl_program*)&filter_program); + + cl_kernel kernel = + clCreateKernel(filter_program, "smooth_step_kernel", &err); + CL_CHECK(err); + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&x)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), edge0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), edge1)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n)); + + const int nb = ((*n) + 256 - 1) / 256; + const size_t global_item_size = 256 * nb; + const size_t local_item_size = 256; + + CL_CHECK(clEnqueueNDRangeKernel( + (cl_command_queue)glb_cmd_queue, kernel, 1, NULL, &global_item_size, + &local_item_size, 0, NULL, NULL)); +} + +/** Fortran wrapper for step function + * + * Compute the step function for a given array. + * if (x < edge) return left; + * else return right; + * + * @param x array to apply the step function + * @param edge threshold value + * @param left value to return if x < edge + * @param right value to return if x >= edge + * @param n size of the array + */ +void opencl_step_function( + void* x, real* edge, real* left, real* right, int* n) { + cl_int err; + + if (filter_program == NULL) + opencl_kernel_jit(filter_kernels, (cl_program*)&filter_program); + + cl_kernel kernel = + clCreateKernel(filter_program, "step_function_kernel", &err); + CL_CHECK(err); + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&x)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), edge)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), left)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(real), right)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n)); + + const int nb = ((*n) + 256 - 1) / 256; + const size_t global_item_size = 256 * nb; + const size_t local_item_size = 256; + + CL_CHECK(clEnqueueNDRangeKernel( + (cl_command_queue)glb_cmd_queue, kernel, 1, NULL, &global_item_size, + &local_item_size, 0, NULL, NULL)); +} + +/** Fortran wrapper for the permeability filter + * + * @param x array to apply the permeability filter on + * @param k_0 lower bound of the permeability + * @param k_1 upper bound of the permeability + * @param q parameter + * @param n size of the array + */ +void opencl_permeability(void* x, real* k_0, real* k_1, real* q, int* n) { + cl_int err; + + if (filter_program == NULL) + opencl_kernel_jit(filter_kernels, (cl_program*)&filter_program); + + cl_kernel kernel = + clCreateKernel(filter_program, "permeability_kernel", &err); + CL_CHECK(err); + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&x)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), k_0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), k_1)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(real), q)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n)); + + const int nb = ((*n) + 256 - 1) / 256; + const size_t global_item_size = 256 * nb; + const size_t local_item_size = 256; + + CL_CHECK(clEnqueueNDRangeKernel( + (cl_command_queue)glb_cmd_queue, kernel, 1, NULL, &global_item_size, + &local_item_size, 0, NULL, NULL)); +} \ No newline at end of file diff --git a/src/source_terms/bcknd/device/opencl/opencl_filters.f90 b/src/source_terms/bcknd/device/opencl/opencl_filters.f90 new file mode 100644 index 00000000000..90ee1833053 --- /dev/null +++ b/src/source_terms/bcknd/device/opencl/opencl_filters.f90 @@ -0,0 +1,71 @@ +! Copyright (c) 2024, The Neko Authors +! All rights reserved. +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! +! * Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! +! * Redistributions in binary form must reproduce the above +! copyright notice, this list of conditions and the following +! disclaimer in the documentation and/or other materials provided +! with the distribution. +! +! * Neither the name of the authors nor the names of its +! contributors may be used to endorse or promote products derived +! from this software without specific prior written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. + +!> OpenCL interface binding for filters +module opencl_filters + use num_types, only: rp, c_rp + implicit none + private + + public :: opencl_smooth_step, opencl_step_function, opencl_permeability + + ! Interfaces for the backend functions + interface + subroutine opencl_smooth_step(x, edge0, edge1, n) & + bind(c, name = "opencl_smooth_step") + use, intrinsic :: iso_c_binding, only: c_ptr, c_int + import c_rp + type(c_ptr), value :: x + real(c_rp) :: edge0, edge1 + integer(c_int) :: n + end subroutine opencl_smooth_step + + subroutine opencl_step_function(x, edge, left, right, n) & + bind(c, name = "opencl_step_function") + use, intrinsic :: iso_c_binding, only: c_ptr, c_int + import c_rp + type(c_ptr), value :: x + real(c_rp) :: edge, left, right + integer(c_int) :: n + end subroutine opencl_step_function + + subroutine opencl_permeability(x, k_0, k_1, q, n) & + bind(c, name = "opencl_permeability") + use, intrinsic :: iso_c_binding, only: c_ptr, c_int + import c_rp + type(c_ptr), value :: x + real(c_rp) :: k_0, k_1, q + integer(c_int) :: n + end subroutine opencl_permeability + end interface + +end module opencl_filters diff --git a/src/source_terms/brinkman/filters.f90 b/src/source_terms/brinkman/filters.f90 index d8f7b71ac4b..20a2a19f890 100644 --- a/src/source_terms/brinkman/filters.f90 +++ b/src/source_terms/brinkman/filters.f90 @@ -38,6 +38,12 @@ module filters use neko_config, only: NEKO_BCKND_DEVICE use num_types, only: rp use utils, only: neko_error + + use filters_cpu, only: smooth_step_cpu, permeability_cpu, & + step_function_cpu + use filters_device, only: smooth_step_device, permeability_device, & + step_function_device + implicit none private @@ -61,15 +67,15 @@ module filters !! @param[in] edge0 Edge giving output 0. !! @param[in] edge1 Edge giving output 1. subroutine smooth_step_field(F, edge0, edge1) - use filters_cpu, only: smooth_step_cpu - type(field_t), intent(inout) :: F real(kind=rp), intent(in) :: edge0, edge1 + integer :: n + n = F%size() if (NEKO_BCKND_DEVICE .eq. 1) then - call neko_error('smooth_step_field: not implemented for device') + call smooth_step_device(F%x_d, edge0, edge1, n) else - F%x = smooth_step_cpu(F%x, edge0, edge1) + call smooth_step_cpu(F%x, edge0, edge1, n) end if end subroutine smooth_step_field @@ -80,18 +86,17 @@ end subroutine smooth_step_field !! @param[in] k_0 Permeability at x=0. !! @param[in] k_1 Permeability at x=1. !! @param[in] q Penalty factor. - subroutine permeability_field(F_out, x, k_0, k_1, q) - use filters_cpu, only: permeability_cpu - - type(field_t), intent(inout) :: F_out - type(field_t), intent(in) :: x + subroutine permeability_field(F, k_0, k_1, q) + type(field_t), intent(inout) :: F real(kind=rp), intent(in) :: k_0, k_1 real(kind=rp), intent(in) :: q + integer :: n + n = F%size() if (NEKO_BCKND_DEVICE .eq. 1) then - call neko_error('permeability_field: not implemented for device') + call permeability_device(F%x_d, k_0, k_1, q, n) else - F_out%x = permeability_cpu(x%x, k_0, k_1, q) + call permeability_cpu(F%x, k_0, k_1, q, n) end if end subroutine permeability_field @@ -101,15 +106,15 @@ end subroutine permeability_field !! @param[in] value0 Value of the field before the step. !! @param[in] value1 Value of the field after the step. subroutine step_function_field(F, x0, value0, value1) - use filters_cpu, only: step_function_cpu - type(field_t), intent(inout) :: F real(kind=rp), intent(in) :: x0, value0, value1 + integer :: n + n = F%size() if (NEKO_BCKND_DEVICE .eq. 1) then - call neko_error('step_function_field: not implemented for device') + call step_function_device(F%x_d, x0, value0, value1, n) else - F%x = step_function_cpu(F%x, x0, value0, value1) + call step_function_cpu(F%x, x0, value0, value1, n) end if end subroutine step_function_field diff --git a/src/source_terms/brinkman_source_term.f90 b/src/source_terms/brinkman_source_term.f90 index 8842d681ee1..c4c2fd2027a 100644 --- a/src/source_terms/brinkman_source_term.f90 +++ b/src/source_terms/brinkman_source_term.f90 @@ -43,6 +43,9 @@ module brinkman_source_term use neko_config, only: NEKO_BCKND_DEVICE use utils, only: neko_error use field_math, only: field_subcol3 + + use math, only: pwmax + use device_math, only: device_pwmax implicit none private @@ -174,14 +177,9 @@ subroutine brinkman_source_term_init_from_json(this, json, fields, coef) ! ------------------------------------------------------------------------ ! ! Compute the permeability field - call permeability_field(this%brinkman, this%indicator, & - & brinkman_limits(1), brinkman_limits(2), brinkman_penalty) - - ! Copy the permeability field to the device - if (NEKO_BCKND_DEVICE .eq. 1) then - call device_memcpy(this%brinkman%x, this%brinkman%x_d, & - this%brinkman%dof%size(), HOST_TO_DEVICE, .true.) - end if + this%brinkman = this%indicator + call permeability_field(this%brinkman, & + brinkman_limits(1), brinkman_limits(2), brinkman_penalty) end subroutine brinkman_source_term_init_from_json @@ -325,7 +323,7 @@ subroutine init_boundary_mesh(this, json) ! compute the signed distance function. This should be replaced with a ! more efficient method, such as a tree search. - call temp_field%init(this%indicator%dof) + call temp_field%init(this%coef%dof) ! Select how to transform the distance field to a design field select case (distance_transform) @@ -339,6 +337,7 @@ subroutine init_boundary_mesh(this, json) case ('step') call json_get(json, 'distance_transform.value', scalar_d) + scalar_r = real(scalar_d, kind=rp) call signed_distance_field(temp_field, boundary_mesh, scalar_d) call step_function_field(temp_field, scalar_r, 1.0_rp, 0.0_rp) @@ -359,7 +358,12 @@ subroutine init_boundary_mesh(this, json) end select ! Update the global indicator field by max operator - this%indicator%x = max(this%indicator%x, temp_field%x) + if (NEKO_BCKND_DEVICE .eq. 1) then + call device_pwmax(this%indicator%x_d, temp_field%x_d, & + this%indicator%size()) + else + this%indicator%x = max(this%indicator%x, temp_field%x) + end if end subroutine init_boundary_mesh @@ -392,8 +396,7 @@ subroutine init_point_zone(this, json) call json_get_or_default(json, 'filter.type', filter_type, 'none') ! Compute the indicator field - - call temp_field%init(this%indicator%dof) + call temp_field%init(this%coef%dof) my_point_zone => neko_point_zone_registry%get_point_zone(zone_name) diff --git a/tests/unit/device_math/device_math_parallel.pf b/tests/unit/device_math/device_math_parallel.pf index ab70603954f..ddd8b5e3066 100644 --- a/tests/unit/device_math/device_math_parallel.pf +++ b/tests/unit/device_math/device_math_parallel.pf @@ -3,7 +3,7 @@ module device_math_parallel use math use pfunit use device - use device_math + use device_math use neko_config use num_types use comm, only : NEKO_COMM, pe_rank, pe_size @@ -15,7 +15,7 @@ module device_math_parallel procedure :: setUp procedure :: tearDown end type test_device_math - + contains subroutine setUp(this) @@ -28,8 +28,8 @@ contains subroutine tearDown(this) class(test_device_math), intent(inout) :: this if (NEKO_BCKND_DEVICE .eq. 1) then - call device_finalize - end if + call device_finalize + end if end subroutine tearDown @test @@ -42,33 +42,33 @@ contains if (NEKO_BCKND_DEVICE .eq. 1) then a = 1.0_rp - + call device_map(a, a_d, size(a)) call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) - + call device_rzero(a_d, n) - + call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.) - + do i = 1, n @assertEqual(0.0_rp, a(i), tolerance=NEKO_EPS) end do - + do i = 1, n a(i) = real(i, rp) end do - + call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) - + call device_rzero(a_d, n) - + call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.) - + do i = 1, n @assertEqual(0.0_rp, a(i), tolerance=NEKO_EPS) end do end if - + end subroutine test_device_math_rzero @test @@ -81,33 +81,33 @@ contains if (NEKO_BCKND_DEVICE .eq. 1) then a = 0.0_rp - + call device_map(a, a_d, size(a)) call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) - + call device_rone(a_d, n) - + call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.) - + do i = 1, n @assertEqual(1.0_rp, a(i), tolerance=NEKO_EPS) end do - + do i = 1, n a(i) = real(i, rp) end do - + call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) - + call device_rone(a_d, n) - + call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.) - + do i = 1, n @assertEqual(1.0_rp, a(i), tolerance=NEKO_EPS) end do end if - + end subroutine test_device_math_rone @test @@ -120,7 +120,7 @@ contains integer :: i if (NEKO_BCKND_DEVICE .eq. 1) then - + call rone(a, n) call rzero(b, n) do i = 1, n @@ -129,20 +129,20 @@ contains call device_map(a, a_d, size(a)) call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) - + call device_map(b, b_d, size(b)) call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.) - + call device_copy(b_d, a_d, n) - + call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.false.) call device_memcpy(b, b_d, size(a), DEVICE_TO_HOST, sync=.true.) - + do i = 1, n @assertEqual(a(i), b(i)) end do end if - + end subroutine test_device_math_copy @test @@ -156,23 +156,23 @@ contains if (NEKO_BCKND_DEVICE .eq. 1) then a = 1.0_rp - + call device_map(a, a_d, size(a)) call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) - + call device_cfill(a_d, c, n) - + call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.) - + do i = 1, n @assertEqual(42.0_rp, a(i), tolerance=NEKO_EPS) end do end if - + end subroutine test_device_math_cfill - @test + @test subroutine test_device_math_cadd(this) class(test_device_math), intent(inout) :: this integer, parameter :: n = 4711 @@ -183,20 +183,20 @@ contains if (NEKO_BCKND_DEVICE .eq. 1) then a = 40.0_rp - + call device_map(a, a_d, size(a)) call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) - + call device_cadd(a_d, c, n) - + call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.) - + do i = 1, n @assertEqual(42.0_rp, a(i), tolerance=NEKO_EPS) end do - + end if - + end subroutine test_device_math_cadd @test @@ -213,24 +213,24 @@ contains a(i) = 40.0_rp b(i) = 2.0_rp end do - + call device_map(a, a_d, size(a)) call device_map(b, b_d, size(b)) - + call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.) - + call device_add2(a_d, b_d, n) - + call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.) - + do i = 1, n @assertEqual(42.0_rp, a(i)) end do end if - + end subroutine test_device_math_add2 - + @test subroutine test_device_math_add2s1(this) class(test_device_math), intent(inout) :: this @@ -246,22 +246,22 @@ contains a(i) = 20.0_rp b(i) = 2.0_rp end do - + call device_map(a, a_d, size(a)) call device_map(b, b_d, size(b)) - + call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.) - + call device_add2s1(a_d, b_d, c1, n) - + call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.) - + do i = 1, n @assertEqual(42.0_rp, a(i)) end do end if - + end subroutine test_device_math_add2s1 @test @@ -282,19 +282,19 @@ contains call device_map(a, a_d, size(a)) call device_map(b, b_d, size(b)) - + call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.) - + call device_add2s2(a_d, b_d, c1, n) - + call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.) do i = 1, n @assertEqual(24.0_rp, a(i)) end do end if - + end subroutine test_device_math_add2s2 @test @@ -319,20 +319,20 @@ contains call device_map(a, a_d, size(a)) call device_map(b, b_d, size(b)) call device_map(c, c_d, size(c)) - + call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.) call device_memcpy(c, c_d, size(c), HOST_TO_DEVICE, sync=.false.) - + call device_add3s2(a_d, b_d, c_d, c1, c2, n) - + call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.) do i = 1, n @assertEqual(84.0_rp, a(i)) end do end if - + end subroutine test_device_math_add3s2 @test @@ -347,22 +347,22 @@ contains do i = 1, n a(i) = 2.0_rp end do - + call device_map(a, a_d, size(a)) call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) - + call device_invcol1(a_d, n) - + call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.) - + do i = 1, n @assertEqual(0.5_rp, a(i)) end do end if end subroutine test_device_math_invcol1 - + @test subroutine test_device_math_invcol2(this) class(test_device_math), intent(inout) :: this @@ -377,24 +377,24 @@ contains a(i) = 21.0_rp b(i) = 2.0_rp end do - + call device_map(a, a_d, size(a)) call device_map(b, b_d, size(b)) call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.) - + call device_invcol2(a_d, b_d, n) - + call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.) - + do i = 1, n @assertEqual(10.5_rp, a(i)) end do end if end subroutine test_device_math_invcol2 - + @test subroutine test_device_math_col2(this) class(test_device_math), intent(inout) :: this @@ -409,17 +409,17 @@ contains a(i) = 21.0_rp b(i) = 2.0_rp end do - + call device_map(a, a_d, size(a)) call device_map(b, b_d, size(b)) call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.) - + call device_col2(a_d, b_d, n) - + call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.) - + do i = 1, n @assertEqual(42.0_rp, a(i)) end do @@ -443,7 +443,7 @@ contains b(i) = 2.0_rp c(i) = 21.0_rp end do - + call device_map(a, a_d, size(a)) call device_map(b, b_d, size(b)) call device_map(c, c_d, size(c)) @@ -451,11 +451,11 @@ contains call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.) call device_memcpy(c, c_d, size(c), HOST_TO_DEVICE, sync=.false.) - + call device_col3(a_d, b_d, c_d, n) - + call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.) - + do i = 1, n @assertEqual(42.0_rp, a(i)) end do @@ -477,24 +477,24 @@ contains a(i) = 44.0_rp b(i) = 2.0_rp end do - + call device_map(a, a_d, size(a)) call device_map(b, b_d, size(b)) call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.) - + call device_sub2(a_d, b_d, n) - + call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.) - + do i = 1, n @assertEqual(42.0_rp, a(i)) end do end if end subroutine test_device_math_sub2 - + @test subroutine test_device_math_sub3(this) class(test_device_math), intent(inout) :: this @@ -511,7 +511,7 @@ contains b(i) = 2.0_rp c(i) = 21.0_rp end do - + call device_map(a, a_d, size(a)) call device_map(b, b_d, size(b)) call device_map(c, c_d, size(c)) @@ -519,11 +519,11 @@ contains call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.) call device_memcpy(c, c_d, size(c), HOST_TO_DEVICE, sync=.false.) - + call device_sub3(a_d, b_d, c_d, n) - + call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.) - + do i = 1, n @assertEqual(-19.0_rp, a(i)) end do @@ -547,7 +547,7 @@ contains b(i) = 2.0_rp c(i) = 21.0_rp end do - + call device_map(a, a_d, size(a)) call device_map(b, b_d, size(b)) call device_map(c, c_d, size(c)) @@ -555,11 +555,11 @@ contains call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.) call device_memcpy(c, c_d, size(c), HOST_TO_DEVICE, sync=.false.) - + call device_addcol3(a_d, b_d, c_d, n) - + call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.) - + do i = 1, n @assertEqual(4711.0_rp, a(i)) end do @@ -585,7 +585,7 @@ contains c(i) = 21.0_rp d(i) = 1.0_rp end do - + call device_map(a, a_d, size(a)) call device_map(b, b_d, size(b)) call device_map(c, c_d, size(c)) @@ -595,11 +595,11 @@ contains call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.) call device_memcpy(c, c_d, size(c), HOST_TO_DEVICE, sync=.false.) call device_memcpy(d, d_d, size(d), HOST_TO_DEVICE, sync=.false.) - + call device_addcol4(a_d, b_d, c_d, d_d, n) - + call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.) - + do i = 1, n @assertEqual(4711.0_rp, a(i)) end do @@ -619,29 +619,29 @@ contains integer :: i, ierr if (NEKO_BCKND_DEVICE .eq. 1) then - + call MPI_Comm_dup(this%getMpiCommunicator(), NEKO_COMM%mpi_val, ierr) - + do i = 1, n a(i) = 1.0_rp b(i) = 1.0_rp c(i) = 1.0_rp end do - + call device_map(a, a_d, size(a)) call device_map(b, b_d, size(b)) call device_map(c, c_d, size(c)) - + call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.) call device_memcpy(c, c_d, size(c), HOST_TO_DEVICE, sync=.false.) - - expected = real(this%getNumProcesses(), rp) * real(n, rp) + + expected = real(this%getNumProcesses(), rp) * real(n, rp) res = device_glsc3(a_d, b_d, c_d, n) @assertEqual(expected, res) @MPIassertEqual(expected, res) end if - + end subroutine test_device_math_glsc3 @test(npes=[1]) @@ -655,26 +655,26 @@ contains integer :: i, ierr if (NEKO_BCKND_DEVICE .eq. 1) then - + call MPI_Comm_dup(this%getMpiCommunicator(), NEKO_COMM%mpi_val, ierr) - + do i = 1, n a(i) = 1.0_rp b(i) = 1.0_rp end do - + call device_map(a, a_d, size(a)) call device_map(b, b_d, size(b)) - + call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.) - - expected = real(this%getNumProcesses(), rp) * real(n, rp) + + expected = real(this%getNumProcesses(), rp) * real(n, rp) res = device_glsc2(a_d, b_d, n) @assertEqual(expected, res) @MPIassertEqual(expected, res) end if - + end subroutine test_device_math_glsc2 @test(npes=[1]) @@ -687,23 +687,103 @@ contains integer :: i, ierr if (NEKO_BCKND_DEVICE .eq. 1) then - + call MPI_Comm_dup(this%getMpiCommunicator(), NEKO_COMM%mpi_val, ierr) - + do i = 1, n a(i) = 1.0_rp end do - + call device_map(a, a_d, size(a)) - + call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) - - expected = real(this%getNumProcesses(), rp) * real(n, rp) + + expected = real(this%getNumProcesses(), rp) * real(n, rp) res = device_glsum(a_d, n) @assertEqual(expected, res) @MPIassertEqual(expected, res) end if - + end subroutine test_device_math_glsum + ! ========================================================================== ! + ! Tests for the pointwise operations + + !> Test the point-wise maximum operation + @test(npes=[1]) + subroutine test_device_math_pwmax(this) + class(test_device_math), intent(inout) :: this + + integer, parameter :: n = 4711 + real(kind=rp), dimension(n) :: a, b, c + type(c_ptr) :: a_d = C_NULL_PTR + type(c_ptr) :: b_d = C_NULL_PTR + + integer :: i + + if (NEKO_BCKND_DEVICE .eq. 1) then + do i = 1, n + a(i) = real(i, rp) + b(i) = real(n - i, rp) + end do + + call device_map(a, a_d, size(a)) + call device_map(b, b_d, size(b)) + + call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) + call device_memcpy(b, b_d, size(a), HOST_TO_DEVICE, sync=.false.) + + call device_pwmax(a_d, b_d, n) + + call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.) + + do i = 1, n + c(i) = max(a(i), b(i)) + end do + + do i = 1, n + @assertEqual(c(i), a(i)) + end do + end if + + end subroutine test_device_math_pwmax + + !> Test the point-wise minimum operation + @test(npes=[1]) + subroutine test_device_math_pwmin(this) + class(test_device_math), intent(inout) :: this + + integer, parameter :: n = 4711 + real(kind=rp), dimension(n) :: a, b, c + type(c_ptr) :: a_d = C_NULL_PTR + type(c_ptr) :: b_d = C_NULL_PTR + + integer :: i + + if (NEKO_BCKND_DEVICE .eq. 1) then + do i = 1, n + a(i) = real(i, rp) + b(i) = real(n - i, rp) + end do + + call device_map(a, a_d, size(a)) + call device_map(b, b_d, size(b)) + + call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.) + call device_memcpy(b, b_d, size(a), HOST_TO_DEVICE, sync=.false.) + + call device_pwmin(a_d, b_d, n) + + call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.) + + do i = 1, n + c(i) = min(a(i), b(i)) + end do + + do i = 1, n + @assertEqual(c(i), a(i)) + end do + end if + + end subroutine test_device_math_pwmin end module device_math_parallel diff --git a/tests/unit/math/math_parallel.pf b/tests/unit/math/math_parallel.pf index 1aa2b31cc3d..80708e7e3b5 100644 --- a/tests/unit/math/math_parallel.pf +++ b/tests/unit/math/math_parallel.pf @@ -17,14 +17,14 @@ contains c = 42.0_rp + (NEKO_EPS/2.0_rp) if (epsilon(1.0_rp) .lt. real(1d-12, rp)) then d = 42.0_rp + real(1d-12, rp) - else if (epsilon(1.0_rp) .lt. real(1d-5, rp)) then + else if (epsilon(1.0_rp) .lt. real(1d-5, rp)) then d = 42.0_rp + real(1d-5, rp) end if - + @assertFalse(abscmp(a, b)) @assertTrue(abscmp(b, c)) @assertFalse(abscmp(b, d)) - + end subroutine test_math_abscmp @test @@ -33,7 +33,7 @@ contains real(kind=rp) :: a(n) integer :: i - a = 1.0_rp + a = 1.0_rp call rzero(a, n) do i = 1, n @assertEqual(0.0_rp, a(i)) @@ -46,7 +46,7 @@ contains do i = 1, n @assertEqual(0.0_rp, a(i)) end do - + end subroutine test_math_rzero @test @@ -60,7 +60,7 @@ contains do i = 1, n @assertEqual(0, a(i)) end do - + end subroutine test_math_izero @test @@ -77,7 +77,7 @@ contains do i = 1, n @assertEqual(1.0_rp, a(i)) end do - + end subroutine test_math_rone @test @@ -96,7 +96,7 @@ contains do i = 1, n @assertEqual(a(i), b(i)) end do - + end subroutine test_math_copy @test @@ -317,7 +317,7 @@ contains end do end subroutine test_math_invcol2 - + @test subroutine test_math_col2 integer, parameter :: n = 17 @@ -377,7 +377,7 @@ contains end subroutine test_math_sub3 - @test + @test subroutine test_math_addcol3 integer, parameter :: n = 17 real(kind=rp), dimension(n) :: a, b, c @@ -406,18 +406,73 @@ contains integer :: i, ierr call MPI_Comm_dup(this%getMpiCommunicator(), NEKO_COMM%mpi_val, ierr) - + do i = 1, n a(i) = 1.0_rp b(i) = 1.0_rp c(i) = 1.0_rp end do - expected = real(this%getNumProcesses(), rp) * 17.0_rp + expected = real(this%getNumProcesses(), rp) * 17.0_rp res = glsc3(a, b, c, n) @assertEqual(expected, res) @MPIassertEqual(expected, res) end subroutine test_math_glsc3 + ! ========================================================================== ! + ! Tests for the pointwise operations + + !> Test the point-wise maximum operation + @test + subroutine test_math_pwmax + + integer, parameter :: n = 4711 + real(kind=rp), dimension(n) :: a, b, c + + integer :: i + + do i = 1, n + a(i) = real(i, rp) + b(i) = real(n - i, rp) + end do + + call pwmax(a, b, n) + + do i = 1, n + c(i) = max(a(i), b(i)) + end do + + do i = 1, n + @assertEqual(c(i), a(i)) + end do + + end subroutine test_math_pwmax + + !> Test the point-wise minimum operation + @test + subroutine test_math_pwmin + + integer, parameter :: n = 4711 + real(kind=rp), dimension(n) :: a, b, c + + integer :: i + + do i = 1, n + a(i) = real(i, rp) + b(i) = real(n - i, rp) + end do + + call pwmin(a, b, n) + + do i = 1, n + c(i) = min(a(i), b(i)) + end do + + do i = 1, n + @assertEqual(c(i), a(i)) + end do + + end subroutine test_math_pwmin + end module math_parallel