diff --git a/examples/immersed_zones/block.case b/examples/immersed_zones/block.case
index 6539de1e7bf..37c548f3e75 100644
--- a/examples/immersed_zones/block.case
+++ b/examples/immersed_zones/block.case
@@ -6,7 +6,7 @@
         "output_at_end": false,
         "output_boundary": false,
         "output_checkpoints": false,
-        "end_time": 0.01,
+        "end_time": 20.0,
         "timestep": 5e-5,
         "numerics": {
             "time_order": 3,
diff --git a/examples/immersed_zones/block_sphere.case b/examples/immersed_zones/block_sphere.case
index 83da59c37dc..d9e441910c5 100644
--- a/examples/immersed_zones/block_sphere.case
+++ b/examples/immersed_zones/block_sphere.case
@@ -6,7 +6,7 @@
         "output_at_end": false,
         "output_boundary": false,
         "output_checkpoints": false,
-        "end_time": 0.01,
+        "end_time": 20.0,
         "timestep": 5e-5,
         "numerics": {
             "time_order": 3,
diff --git a/examples/immersed_zones/run.sh b/examples/immersed_zones/run.sh
index e32bd6e2320..16399044e37 100755
--- a/examples/immersed_zones/run.sh
+++ b/examples/immersed_zones/run.sh
@@ -26,7 +26,7 @@ function help() {
     echo -e "  See Readme for additional details."
     exit 0
 }
-if [ $# -eq 0 ]; then help; fi
+if [ $# -eq 0 ]; then ALL=1; fi
 
 # Handle options
 Nx=32 && Ny=8 && Nz=8
diff --git a/src/.depends b/src/.depends
index 61007d7f3ef..918c4ba7d34 100644
--- a/src/.depends
+++ b/src/.depends
@@ -18,7 +18,7 @@ math/mxm_wrapper.o : math/mxm_wrapper.F90 common/utils.o config/num_types.o
 sem/speclib.o : sem/speclib.f90 common/utils.o config/num_types.o 
 qoi/drag_torque.o : qoi/drag_torque.f90 math/bcknd/device/device_math.o device/device.o common/utils.o config/num_types.o sem/space.o math/math.o comm/comm.o mesh/facet_zone.o mesh/mesh.o sem/coef.o field/field.o 
 sem/local_interpolation.o : sem/local_interpolation.f90 config/neko_config.o math/bcknd/device/device_math.o device/device.o field/field_list.o field/field.o common/utils.o math/fast3d.o math/math.o mesh/point.o config/num_types.o sem/space.o math/tensor.o 
-math/math.o : math/math.f90 comm/comm.o config/num_types.o 
+math/math.o : math/math.f90 comm/comm.o common/utils.o config/num_types.o 
 math/mathops.o : math/mathops.f90 config/num_types.o 
 math/fast3d.o : math/fast3d.f90 math/math.o sem/speclib.o config/num_types.o 
 sem/space.o : sem/space.f90 math/mxm_wrapper.o math/tensor.o math/math.o math/fast3d.o common/utils.o device/device.o sem/speclib.o config/num_types.o config/neko_config.o 
@@ -106,7 +106,7 @@ math/bcknd/xsmm/fdm_xsmm.o : math/bcknd/xsmm/fdm_xsmm.f90 math/bcknd/xsmm/tensor
 math/schwarz.o : math/schwarz.f90 config/neko_config.o device/device.o math/fdm.o math/bcknd/device/device_math.o math/bcknd/device/device_schwarz.o gs/gather_scatter.o bc/bc.o sem/dofmap.o sem/space.o mesh/mesh.o math/math.o config/num_types.o 
 math/vector.o : math/vector.f90 common/utils.o math/bcknd/device/device_math.o device/device.o config/num_types.o math/math.o config/neko_config.o 
 math/matrix.o : math/matrix.f90 common/utils.o math/bcknd/device/device_math.o device/device.o config/num_types.o math/math.o config/neko_config.o 
-math/signed_distance.o : math/signed_distance.f90 adt/stack.o mesh/search_tree/aabb.o mesh/point.o common/utils.o mesh/aabb_tree.o mesh/tri_mesh.o mesh/tri.o field/field.o config/num_types.o 
+math/signed_distance.o : math/signed_distance.f90 adt/stack.o mesh/search_tree/aabb.o mesh/point.o common/utils.o config/neko_config.o device/device.o mesh/aabb_tree.o mesh/tri_mesh.o mesh/tri.o field/field.o config/num_types.o 
 common/checkpoint.o : common/checkpoint.f90 mesh/mesh.o common/utils.o field/field.o device/device.o sem/space.o field/field_series.o config/num_types.o config/neko_config.o 
 io/generic_file.o : io/generic_file.f90 comm/comm.o common/utils.o config/num_types.o 
 io/map_file.o : io/map_file.f90 io/format/map.o comm/comm.o common/utils.o io/generic_file.o 
@@ -293,9 +293,13 @@ source_terms/boussinesq_source_term.o : source_terms/boussinesq_source_term.f90
 source_terms/bcknd/cpu/boussinesq_source_term_cpu.o : source_terms/bcknd/cpu/boussinesq_source_term_cpu.f90 math/math.o field/field.o field/field_list.o config/num_types.o 
 source_terms/bcknd/device/boussinesq_source_term_device.o : source_terms/bcknd/device/boussinesq_source_term_device.f90 math/bcknd/device/device_math.o field/field.o field/field_list.o config/num_types.o 
 source_terms/source_term_fctry.o : source_terms/source_term_fctry.f90 common/utils.o common/json_utils.o source_terms/coriolis_source_term.o source_terms/brinkman_source_term.o source_terms/boussinesq_source_term.o source_terms/const_source_term.o source_terms/source_term.o 
-source_terms/brinkman_source_term.o : source_terms/brinkman_source_term.f90 mesh/point_zone_registry.o mesh/point_zone.o mesh/search_tree/aabb.o common/profiler.o math/signed_distance.o source_terms/brinkman/filters.o device/device.o mesh/tri_mesh.o io/file.o math/field_math.o common/utils.o config/neko_config.o sem/coef.o source_terms/source_term.o field/field_registry.o common/json_utils.o field/field_list.o field/field.o config/num_types.o 
-source_terms/brinkman/filters.o : source_terms/brinkman/filters.f90 source_terms/bcknd/cpu/filters_cpu.o common/utils.o config/num_types.o config/neko_config.o field/field.o 
+source_terms/brinkman_source_term.o : source_terms/brinkman_source_term.f90 mesh/point_zone_registry.o mesh/point_zone.o mesh/search_tree/aabb.o common/profiler.o math/signed_distance.o source_terms/brinkman/filters.o device/device.o mesh/tri_mesh.o io/file.o math/bcknd/device/device_math.o math/math.o math/field_math.o common/utils.o config/neko_config.o sem/coef.o source_terms/source_term.o field/field_registry.o common/json_utils.o field/field_list.o field/field.o config/num_types.o 
+source_terms/brinkman/filters.o : source_terms/brinkman/filters.f90 source_terms/bcknd/device/filters_device.o source_terms/bcknd/cpu/filters_cpu.o common/utils.o config/num_types.o config/neko_config.o field/field.o 
 source_terms/bcknd/cpu/filters_cpu.o : source_terms/bcknd/cpu/filters_cpu.f90 config/num_types.o 
+source_terms/bcknd/device/filters_device.o : source_terms/bcknd/device/filters_device.F90 source_terms/bcknd/device/opencl/opencl_filters.o source_terms/bcknd/device/hip/hip_filters.o source_terms/bcknd/device/cuda/cuda_filters.o common/utils.o math/bcknd/device/device_math.o config/num_types.o 
+source_terms/bcknd/device/cuda/cuda_filters.o : source_terms/bcknd/device/cuda/cuda_filters.f90 config/num_types.o 
+source_terms/bcknd/device/hip/hip_filters.o : source_terms/bcknd/device/hip/hip_filters.f90 config/num_types.o 
+source_terms/bcknd/device/opencl/opencl_filters.o : source_terms/bcknd/device/opencl/opencl_filters.f90 config/num_types.o 
 les/les_model.o : les/les_model.f90 math/bcknd/device/device_math.o math/math.o device/device.o config/neko_config.o gs/gs_ops.o sem/coef.o sem/dofmap.o field/field_registry.o field/field.o config/num_types.o 
 les/les_model_fctry.o : les/les_model_fctry.f90 common/utils.o les/sigma.o les/dynamic_smagorinsky.o les/smagorinsky.o les/vreman.o les/les_model.o 
 les/vreman.o : les/vreman.f90 sem/coef.o les/bcknd/cpu/vreman_cpu.o config/neko_config.o common/utils.o common/json_utils.o sem/dofmap.o les/les_model.o field/field.o config/num_types.o 
diff --git a/src/.depends_device b/src/.depends_device
index 4eeaecb8f8e..4a0a9a3153b 100644
--- a/src/.depends_device
+++ b/src/.depends_device
@@ -26,7 +26,7 @@ bc/bcknd/device/hip/dong_outflow.o : bc/bcknd/device/hip/dong_outflow.hip bc/bck
 common/bcknd/device/hip/rhs_maker.o : common/bcknd/device/hip/rhs_maker.hip common/bcknd/device/hip/sumab_kernel.h common/bcknd/device/hip/makeext_kernel.h common/bcknd/device/hip/makebdf_kernel.h
 common/bcknd/device/hip/projection.o : common/bcknd/device/hip/projection.hip common/bcknd/device/hip/projection_kernel.h
 common/bcknd/device/hip/gradient_jump_penalty.o : common/bcknd/device/hip/gradient_jump_penalty.hip common/bcknd/device/hip/gradient_jump_penalty_kernel.h
-fluid/bcknd/device/hip/pnpn_res.o : fluid/bcknd/device/hip/pnpn_res.hip fluid/bcknd/device/hip/vel_res_update_kernel.h fluid/bcknd/device/hip/prs_res_kernel.h 
+fluid/bcknd/device/hip/pnpn_res.o : fluid/bcknd/device/hip/pnpn_res.hip fluid/bcknd/device/hip/vel_res_update_kernel.h fluid/bcknd/device/hip/prs_res_kernel.h
 scalar/bcknd/device/hip/scalar_residual.o : scalar/bcknd/device/hip/scalar_residual.hip scalar/bcknd/device/hip/scalar_residual_update_kernel.h
 sem/bcknd/device/hip/coef.o : sem/bcknd/device/hip/coef.hip sem/bcknd/device/hip/coef_kernel.h
 device/cuda/check.o : device/cuda/check.cu device/cuda/check.h
@@ -47,7 +47,7 @@ krylov/bcknd/device/cuda/pipecg_aux.o : krylov/bcknd/device/cuda/pipecg_aux.cu k
 krylov/bcknd/device/cuda/fusedcg_aux.o : krylov/bcknd/device/cuda/fusedcg_aux.cu krylov/bcknd/device/cuda/fusedcg_kernel.h
 krylov/bcknd/device/cuda/fusedcg_cpld_aux.o : krylov/bcknd/device/cuda/fusedcg_cpld_aux.cu krylov/bcknd/device/cuda/fusedcg_cpld_kernel.h
 krylov/bcknd/device/cuda/gmres_aux.o : krylov/bcknd/device/cuda/gmres_aux.cu krylov/bcknd/device/cuda/gmres_kernel.h
-gs/bcknd/device/cuda/gs.o : gs/bcknd/device/cuda/gs.cu gs/bcknd/device/cuda/gs_kernels.h 
+gs/bcknd/device/cuda/gs.o : gs/bcknd/device/cuda/gs.cu gs/bcknd/device/cuda/gs_kernels.h
 bc/bcknd/device/cuda/dirichlet.o : bc/bcknd/device/cuda/dirichlet.cu bc/bcknd/device/cuda/dirichlet_kernel.h
 bc/bcknd/device/cuda/inflow.o : bc/bcknd/device/cuda/inflow.cu bc/bcknd/device/cuda/inflow_kernel.h
 bc/bcknd/device/cuda/no_slip_wall.o : bc/bcknd/device/cuda/no_slip_wall.cu bc/bcknd/device/cuda/no_slip_wall_kernel.h
@@ -58,13 +58,13 @@ bc/bcknd/device/cuda/dong_outflow.o : bc/bcknd/device/cuda/dong_outflow.cu bc/bc
 common/bcknd/device/cuda/rhs_maker.o : common/bcknd/device/cuda/rhs_maker.cu common/bcknd/device/cuda/sumab_kernel.h common/bcknd/device/cuda/makeext_kernel.h common/bcknd/device/cuda/makebdf_kernel.h
 common/bcknd/device/cuda/projection.o : common/bcknd/device/cuda/projection.cu common/bcknd/device/cuda/projection_kernel.h
 fluid/bcknd/device/cuda/pnpn_res.o : fluid/bcknd/device/cuda/pnpn_res.cu fluid/bcknd/device/cuda/vel_res_update_kernel.h fluid/bcknd/device/cuda/prs_res_kernel.h
-fluid/stress_formulation/bcknd/device/cuda/pnpn_stress_res.o : fluid/stress_formulation/bcknd/device/cuda/pnpn_stress_res.cu fluid/stress_formulation/bcknd/device/cuda/prs_stress_res_kernel.h 
+fluid/stress_formulation/bcknd/device/cuda/pnpn_stress_res.o : fluid/stress_formulation/bcknd/device/cuda/pnpn_stress_res.cu fluid/stress_formulation/bcknd/device/cuda/prs_stress_res_kernel.h
 common/bcknd/device/cuda/gradient_jump_penalty.o : common/bcknd/device/cuda/gradient_jump_penalty.cu common/bcknd/device/cuda/gradient_jump_penalty_kernel.h
 scalar/bcknd/device/cuda/scalar_residual.o : scalar/bcknd/device/cuda/scalar_residual.cu scalar/bcknd/device/cuda/scalar_residual_update_kernel.h
 sem/bcknd/device/cuda/coef.o : sem/bcknd/device/cuda/coef.cu sem/bcknd/device/cuda/coef_kernel.h
 device/opencl/check.o : device/opencl/check.c device/opencl/check.h
 device/opencl/jit.o : device/opencl/jit.c device/opencl/jit.h
-math/bcknd/device/opencl/math.o : math/bcknd/device/opencl/math.c math/bcknd/device/opencl/math_kernel.cl.h 
+math/bcknd/device/opencl/math.o : math/bcknd/device/opencl/math.c math/bcknd/device/opencl/math_kernel.cl.h
 math/bcknd/device/opencl/schwarz.o : math/bcknd/device/opencl/schwarz.c math/bcknd/device/opencl/schwarz_kernel.cl.h
 math/bcknd/device/opencl/tensor.o : math/bcknd/device/opencl/tensor.c math/bcknd/device/opencl/tensor_kernel.cl.h
 math/bcknd/device/opencl/fdm.o : math/bcknd/device/opencl/fdm.c math/bcknd/device/opencl/fdm_kernel.cl.h
@@ -77,7 +77,7 @@ math/bcknd/device/opencl/opr_lambda2.o : math/bcknd/device/opencl/opr_lambda2.c
 math/bcknd/device/opencl/opr_cfl.o : math/bcknd/device/opencl/opr_cfl.c math/bcknd/device/opencl/cfl_kernel.cl.h
 math/bcknd/device/opencl/ax_helm.o : math/bcknd/device/opencl/ax_helm.c math/bcknd/device/opencl/ax_helm_kernel.cl.h
 krylov/bcknd/device/opencl/pc_jacobi.o : krylov/bcknd/device/opencl/pc_jacobi.c krylov/bcknd/device/opencl/jacobi_kernel.cl.h
-gs/bcknd/device/opencl/gs.o : gs/bcknd/device/opencl/gs.c gs/bcknd/device/opencl/gs_kernels.cl.h 
+gs/bcknd/device/opencl/gs.o : gs/bcknd/device/opencl/gs.c gs/bcknd/device/opencl/gs_kernels.cl.h
 bc/bcknd/device/opencl/dirichlet.o : bc/bcknd/device/opencl/dirichlet.c bc/bcknd/device/opencl/dirichlet_kernel.cl.h
 bc/bcknd/device/opencl/inflow.o : bc/bcknd/device/opencl/inflow.c bc/bcknd/device/opencl/inflow_kernel.cl.h
 bc/bcknd/device/opencl/no_slip_wall.o : bc/bcknd/device/opencl/no_slip_wall.c bc/bcknd/device/opencl/no_slip_wall_kernel.cl.h
@@ -89,3 +89,6 @@ common/bcknd/device/opencl/rhs_maker.o : common/bcknd/device/opencl/rhs_maker.c
 fluid/bcknd/device/opencl/pnpn_res.o : fluid/bcknd/device/opencl/pnpn_res.c fluid/bcknd/device/opencl/pnpn_res_kernel.cl.h
 scalar/bcknd/device/opencl/scalar_residual.o : scalar/bcknd/device/opencl/scalar_residual.c scalar/bcknd/device/opencl/scalar_residual_kernel.cl.h
 sem/bcknd/device/opencl/coef.o : sem/bcknd/device/opencl/coef.c sem/bcknd/device/opencl/coef_kernel.cl.h
+source_terms/bcknd/device/cuda/filters.o : source_terms/bcknd/device/cuda/filters.cu source_terms/bcknd/device/cuda/filter_kernels.h
+source_terms/bcknd/device/hip/filters.o : source_terms/bcknd/device/hip/filters.hip source_terms/bcknd/device/hip/filter_kernels.h
+source_terms/bcknd/device/opencl/filters.o : source_terms/bcknd/device/opencl/filters.c source_terms/bcknd/device/opencl/filter_kernels.cl.h
diff --git a/src/Makefile.am b/src/Makefile.am
index 313d034c958..0693ba408a3 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -299,6 +299,10 @@ neko_fortran_SOURCES = \
 	source_terms/brinkman_source_term.f90\
 	source_terms/brinkman/filters.f90\
 	source_terms/bcknd/cpu/filters_cpu.f90\
+	source_terms/bcknd/device/filters_device.F90\
+	source_terms/bcknd/device/cuda/cuda_filters.f90\
+	source_terms/bcknd/device/hip/hip_filters.f90\
+	source_terms/bcknd/device/opencl/opencl_filters.f90\
 	les/les_model.f90\
 	les/les_model_fctry.f90\
 	les/vreman.f90\
@@ -364,7 +368,8 @@ libneko_a_SOURCES += \
 	common/bcknd/device/hip/gradient_jump_penalty.hip\
 	fluid/bcknd/device/hip/pnpn_res.hip\
 	scalar/bcknd/device/hip/scalar_residual.hip\
-	sem/bcknd/device/hip/coef.hip
+	sem/bcknd/device/hip/coef.hip\
+	source_terms/bcknd/device/hip/filters.hip
 endif
 
 if ENABLE_CUDA
@@ -401,7 +406,8 @@ libneko_a_SOURCES += \
 	fluid/bcknd/device/cuda/pnpn_res.cu\
 	fluid/stress_formulation/bcknd/device/cuda/pnpn_stress_res.cu\
 	scalar/bcknd/device/cuda/scalar_residual.cu\
-	sem/bcknd/device/cuda/coef.cu
+	sem/bcknd/device/cuda/coef.cu\
+	source_terms/bcknd/device/cuda/filters.cu
 endif
 
 if ENABLE_OPENCL
@@ -432,7 +438,8 @@ libneko_a_SOURCES += \
 	common/bcknd/device/opencl/rhs_maker.c\
 	fluid/bcknd/device/opencl/pnpn_res.c\
 	scalar/bcknd/device/opencl/scalar_residual.c\
-	sem/bcknd/device/opencl/coef.c
+	sem/bcknd/device/opencl/coef.c\
+	source_terms/bcknd/device/opencl/filters.c
 
 %.cl.h: %.cl
 	bash ./scripts/cltostring.sh $<
@@ -501,7 +508,8 @@ CLEANFILES += \
 	krylov/bcknd/device/opencl/*.cl.h\
 	fluid/bcknd/device/opencl/*.cl.h\
 	scalar/bcknd/device/opencl/*.cl.h\
-	sem/bcknd/device/opencl/*.cl.h
+	sem/bcknd/device/opencl/*.cl.h\
+	source_terms/bcknd/device/opencl/*.cl.h
 endif
 
 if ENABLE_MAKEDEPF90
@@ -604,6 +612,9 @@ EXTRA_DIST = \
 	sem/bcknd/device/opencl/coef_kernel.cl\
 	math/bcknd/device/device_mpi_reduce.h\
 	math/bcknd/device/device_mpi_op.h\
+	source_terms/bcknd/device/cuda/filter_kernels.h\
+	source_terms/bcknd/device/hip/filter_kernels.h\
+	source_terms/bcknd/device/opencl/filter_kernels.cl\
 	device/opencl/jit.h\
 	device/opencl/prgm_lib.h\
 	device/opencl/check.h\
diff --git a/src/device/opencl/prgm_lib.F90 b/src/device/opencl/prgm_lib.F90
index be4fe2953fa..3e97be87591 100644
--- a/src/device/opencl/prgm_lib.F90
+++ b/src/device/opencl/prgm_lib.F90
@@ -82,6 +82,9 @@ module opencl_prgm_lib
   !> Device lambda2 kernels
   type(c_ptr), bind(c) :: lambda2_program = C_NULL_PTR
 
+  !> Device filter kernels
+  type(c_ptr), bind(c) :: filter_program = C_NULL_PTR
+
 contains
 
   subroutine opencl_prgm_lib_release
@@ -261,6 +264,13 @@ subroutine opencl_prgm_lib_release
        lambda2_program = C_NULL_PTR
     end if
 
+    if (c_associated(filter_program)) then
+      if(clReleaseProgram(filter_program) .ne. CL_SUCCESS) then
+         call neko_error('Failed to release program')
+      end if
+      filter_program = C_NULL_PTR
+   end if
+
   end subroutine opencl_prgm_lib_release
 
 #endif
diff --git a/src/device/opencl/prgm_lib.h b/src/device/opencl/prgm_lib.h
index 1cea5643890..5a68c55b6ea 100644
--- a/src/device/opencl/prgm_lib.h
+++ b/src/device/opencl/prgm_lib.h
@@ -80,6 +80,8 @@ extern void *scalar_residual_program;
 /** Device lambda2 kernel */
 extern void *lambda2_program;
 
+/** Device filter kernel */
+extern void *filter_program;
 
 
 #endif
diff --git a/src/math/bcknd/device/cuda/cuda_math.f90 b/src/math/bcknd/device/cuda/cuda_math.f90
index b79b402c858..073509dbef8 100644
--- a/src/math/bcknd/device/cuda/cuda_math.f90
+++ b/src/math/bcknd/device/cuda/cuda_math.f90
@@ -317,4 +317,74 @@ subroutine cuda_absval(a_d, n) &
        integer(c_int) :: n
      end subroutine cuda_absval
   end interface
+
+  ! ========================================================================== !
+  ! Interfaces for the pointwise operations.
+
+  interface
+     subroutine cuda_pwmax_vec2(a_d, b_d, n) &
+          bind(c, name = 'cuda_pwmax_vec2')
+       use, intrinsic :: iso_c_binding, only: c_int, c_ptr
+       type(c_ptr), value :: a_d, b_d
+       integer(c_int) :: n
+     end subroutine cuda_pwmax_vec2
+
+     subroutine cuda_pwmax_vec3(a_d, b_d, c_d, n) &
+          bind(c, name = 'cuda_pwmax_vec3')
+       use, intrinsic :: iso_c_binding, only: c_int, c_ptr
+       type(c_ptr), value :: a_d, b_d, c_d
+       integer(c_int) :: n
+     end subroutine cuda_pwmax_vec3
+
+     subroutine cuda_pwmax_sca2(a_d, c_d, n) &
+          bind(c, name = 'cuda_pwmax_sca2')
+       use, intrinsic :: iso_c_binding, only: c_int, c_ptr
+       import c_rp
+       type(c_ptr), value :: a_d
+       real(c_rp) :: c_d
+       integer(c_int) :: n
+     end subroutine cuda_pwmax_sca2
+
+     subroutine cuda_pwmax_sca3(a_d, b_d, c_d, n) &
+          bind(c, name = 'cuda_pwmax_sca3')
+       use, intrinsic :: iso_c_binding, only: c_int, c_ptr
+       import c_rp
+       type(c_ptr), value :: a_d, b_d
+       real(c_rp) :: c_d
+       integer(c_int) :: n
+     end subroutine cuda_pwmax_sca3
+
+     subroutine cuda_pwmin_vec2(a_d, b_d, n) &
+          bind(c, name = 'cuda_pwmin_vec2')
+       use, intrinsic :: iso_c_binding, only: c_int, c_ptr
+       type(c_ptr), value :: a_d, b_d
+       integer(c_int) :: n
+     end subroutine cuda_pwmin_vec2
+
+     subroutine cuda_pwmin_vec3(a_d, b_d, c_d, n) &
+          bind(c, name = 'cuda_pwmin_vec3')
+       use, intrinsic :: iso_c_binding, only: c_int, c_ptr
+       type(c_ptr), value :: a_d, b_d, c_d
+       integer(c_int) :: n
+     end subroutine cuda_pwmin_vec3
+
+     subroutine cuda_pwmin_sca2(a_d, c_d, n) &
+          bind(c, name = 'cuda_pwmin_sca2')
+       use, intrinsic :: iso_c_binding, only: c_int, c_ptr
+       import c_rp
+       type(c_ptr), value :: a_d
+       real(c_rp) :: c_d
+       integer(c_int) :: n
+     end subroutine cuda_pwmin_sca2
+
+     subroutine cuda_pwmin_sca3(a_d, b_d, c_d, n) &
+          bind(c, name = 'cuda_pwmin_sca3')
+       use, intrinsic :: iso_c_binding, only: c_int, c_ptr
+       import c_rp
+       type(c_ptr), value :: a_d, b_d
+       real(c_rp) :: c_d
+       integer(c_int) :: n
+     end subroutine cuda_pwmin_sca3
+
+  end interface
 end module cuda_math
diff --git a/src/math/bcknd/device/cuda/math.cu b/src/math/bcknd/device/cuda/math.cu
index 931c8f7a11b..783d0a47fe8 100644
--- a/src/math/bcknd/device/cuda/math.cu
+++ b/src/math/bcknd/device/cuda/math.cu
@@ -42,7 +42,7 @@ extern "C" {
 
 #include <math/bcknd/device/device_mpi_reduce.h>
 #include <math/bcknd/device/device_mpi_op.h>
-  
+
   /** Fortran wrapper for copy
    * Copy a vector \f$ a = b \f$
    */
@@ -128,7 +128,7 @@ extern "C" {
     CUDA_CHECK(cudaGetLastError());
 
   }
-  
+
   /** Fortran wrapper for cadd
    * Add a scalar to vector \f$ a_i = a_i + c \f$
    */
@@ -171,7 +171,7 @@ extern "C" {
         (cudaStream_t) glb_cmd_queue>>>((real *) a, *c, *n);
       CUDA_CHECK(cudaGetLastError());
     }
-    
+
   }
 
   /**
@@ -179,16 +179,16 @@ extern "C" {
    * Vector addition \f$ a = a + b \f$
    */
   void cuda_add2(void *a, void *b, int *n) {
-    
+
     const dim3 nthrds(1024, 1, 1);
     const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
     add2_kernel<real><<<nblcks, nthrds, 0,
       (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
     CUDA_CHECK(cudaGetLastError());
-    
+
   }
-  
+
   /**
    * Fortran wrapper for add3
    * Vector addition \f$ a = b + c \f$
@@ -221,23 +221,23 @@ extern "C" {
   /**
    * Fortran wrapper for add2s1
    * Vector addition with scalar multiplication \f$ a = c_1 a + b \f$
-   * (multiplication on first argument) 
+   * (multiplication on first argument)
    */
   void cuda_add2s1(void *a, void *b, real *c1, int *n) {
-    
+
     const dim3 nthrds(1024, 1, 1);
     const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
     add2s1_kernel<real><<<nblcks, nthrds, 0,
       (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *c1, *n);
     CUDA_CHECK(cudaGetLastError());
-    
+
   }
 
   /**
    * Fortran wrapper for add2s2
    * Vector addition with scalar multiplication \f$ a = a + c_1 b \f$
-   * (multiplication on second argument) 
+   * (multiplication on second argument)
    */
   void cuda_add2s2(void *a, void *b, real *c1, int *n) {
 
@@ -252,26 +252,26 @@ extern "C" {
 
   /**
    * Fortran wrapper for add2s2
-   * Vector addition with scalar multiplication 
+   * Vector addition with scalar multiplication
    * \f$ x = x + c_1 p1 + c_2p2 + ... + c_jpj \f$
-   * (multiplication on second argument) 
+   * (multiplication on second argument)
    */
   void cuda_add2s2_many(void *x, void **p, void *alpha, int *j, int *n) {
-        
+
     const dim3 nthrds(1024, 1, 1);
     const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
-    
+
     add2s2_many_kernel<real><<<nblcks, nthrds, 0,
       (cudaStream_t) glb_cmd_queue>>>((real *) x, (const real **) p,
                                       (real *) alpha, *j, *n);
     CUDA_CHECK(cudaGetLastError());
 
   }
-  
+
   /**
    * Fortran wrapper for addsqr2s2
    * Vector addition with scalar multiplication \f$ a = a + c_1 (b * b) \f$
-   * (multiplication on second argument) 
+   * (multiplication on second argument)
    */
   void cuda_addsqr2s2(void *a, void *b, real *c1, int *n) {
 
@@ -288,7 +288,7 @@ extern "C" {
   /**
    * Fortran wrapper for add3s2
    * Vector addition with scalar multiplication \f$ a = c_1 b + c_2 c \f$
-   * (multiplication on second argument) 
+   * (multiplication on second argument)
    */
   void cuda_add3s2(void *a, void *b, void *c, real *c1, real *c2, int *n) {
 
@@ -315,7 +315,7 @@ extern "C" {
       (cudaStream_t) glb_cmd_queue>>>((real *) a, *n);
     CUDA_CHECK(cudaGetLastError());
   }
-  
+
   /**
    * Fortran wrapper for invcol2
    * Vector division \f$ a = a / b \f$
@@ -329,7 +329,7 @@ extern "C" {
                                              (real *) b, *n);
     CUDA_CHECK(cudaGetLastError());
   }
-  
+
   /**
    * Fortran wrapper for col2
    * Vector multiplication with 2 vectors \f$ a = a \cdot b \f$
@@ -343,7 +343,7 @@ extern "C" {
       (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, *n);
     CUDA_CHECK(cudaGetLastError());
   }
-  
+
   /**
    * Fortran wrapper for col3
    * Vector multiplication with 3 vectors \f$ a = b \cdot c \f$
@@ -371,7 +371,7 @@ extern "C" {
       (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, (real *) c, *n);
     CUDA_CHECK(cudaGetLastError());
   }
-  
+
 
   /**
    * Fortran wrapper for sub2
@@ -397,7 +397,7 @@ extern "C" {
     const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
 
     sub3_kernel<real><<<nblcks, nthrds, 0,
-      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b, 
+      (cudaStream_t) glb_cmd_queue>>>((real *) a, (real *) b,
                                             (real *) c, *n);
     CUDA_CHECK(cudaGetLastError());
   }
@@ -441,7 +441,7 @@ extern "C" {
 
     const dim3 nthrds(1024, 1, 1);
     const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
-    
+
     vdot3_kernel<real><<<nblcks, nthrds, 0,
       (cudaStream_t) glb_cmd_queue>>>((real *) dot, (real *) u1,
                                       (real *) u2, (real *) u3,
@@ -460,12 +460,12 @@ extern "C" {
 
     const dim3 nthrds(1024, 1, 1);
     const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
-    
+
     vcross_kernel<real><<<nblcks, nthrds, 0,
       (cudaStream_t) glb_cmd_queue>>>((real *) u1,
                                       (real *) u2, (real *) u3,
                                       (real *) v1, (real *) v2,
-                                      (real *) v3, 
+                                      (real *) v3,
                                       (real *) w1, (real *) w2,
                                       (real *) w3, *n);
     CUDA_CHECK(cudaGetLastError());
@@ -484,22 +484,22 @@ extern "C" {
    * Compute multiplication sum \f$ dot = u \cdot v \cdot w \f$
    */
   real cuda_vlsc3(void *u, void *v, void *w, int *n) {
-        
+
     const dim3 nthrds(1024, 1, 1);
     const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
     const int nb = ((*n) + 1024 - 1)/ 1024;
-    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;      
-    
+    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
+
     if ( nb > red_s){
       red_s = nb;
       if (bufred != NULL) {
         CUDA_CHECK(cudaFreeHost(bufred));
-        CUDA_CHECK(cudaFree(bufred_d));        
+        CUDA_CHECK(cudaFree(bufred_d));
       }
       CUDA_CHECK(cudaMallocHost(&bufred,nb*sizeof(real)));
       CUDA_CHECK(cudaMalloc(&bufred_d, nb*sizeof(real)));
     }
-     
+
     glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
       ((real *) u, (real *) v, (real *) w, bufred_d, *n);
     CUDA_CHECK(cudaGetLastError());
@@ -518,22 +518,22 @@ extern "C" {
    * Weighted inner product \f$ a^T b c \f$
    */
   real cuda_glsc3(void *a, void *b, void *c, int *n) {
-        
+
     const dim3 nthrds(1024, 1, 1);
     const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
     const int nb = ((*n) + 1024 - 1)/ 1024;
-    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;      
-    
+    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
+
     if ( nb > red_s){
       red_s = nb;
       if (bufred != NULL) {
         CUDA_CHECK(cudaFreeHost(bufred));
-        CUDA_CHECK(cudaFree(bufred_d));        
+        CUDA_CHECK(cudaFree(bufred_d));
       }
       CUDA_CHECK(cudaMallocHost(&bufred,nb*sizeof(real)));
       CUDA_CHECK(cudaMalloc(&bufred_d, nb*sizeof(real)));
     }
-     
+
     glsc3_kernel<real><<<nblcks, nthrds, 0, stream>>>
       ((real *) a, (real *) b, (real *) c, bufred_d, *n);
     CUDA_CHECK(cudaGetLastError());
@@ -551,22 +551,22 @@ extern "C" {
 
     return bufred[0];
   }
-  
+
   /**
    * Fortran wrapper for doing an reduction to an array
    * Weighted inner product \f$ w^T v(n,1:j) c \f$
    */
-  void cuda_glsc3_many(real *h, void * w, void *v,void *mult, int *j, int *n){ 
+  void cuda_glsc3_many(real *h, void * w, void *v,void *mult, int *j, int *n){
     int pow2 = 1;
     while(pow2 < (*j)){
       pow2 = 2*pow2;
     }
-    const int nt = 1024/pow2;   
+    const int nt = 1024/pow2;
     const dim3 nthrds(pow2, nt, 1);
     const dim3 nblcks(((*n)+nt - 1)/nt, 1, 1);
     const int nb = ((*n) + nt - 1)/nt;
     const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
-    
+
     if((*j)*nb>red_s){
       red_s = (*j)*nb;
       if (bufred != NULL) {
@@ -576,7 +576,7 @@ extern "C" {
       CUDA_CHECK(cudaMallocHost(&bufred,(*j)*nb*sizeof(real)));
       CUDA_CHECK(cudaMalloc(&bufred_d, (*j)*nb*sizeof(real)));
     }
-    
+
     glsc3_many_kernel<real><<<nblcks, nthrds, 0, stream>>>
       ((const real *) w, (const real **) v,
        (const real *)mult, bufred_d, *j, *n);
@@ -587,7 +587,7 @@ extern "C" {
 #ifdef HAVE_DEVICE_MPI
     cudaStreamSynchronize(stream);
     device_mpi_allreduce(bufred_d, h, (*j), sizeof(real), DEVICE_MPI_SUM);
-#else    
+#else
     CUDA_CHECK(cudaMemcpyAsync(h, bufred_d, (*j) * sizeof(real),
                                cudaMemcpyDeviceToHost, stream));
     cudaStreamSynchronize(stream);
@@ -599,22 +599,22 @@ extern "C" {
    * Weighted inner product \f$ a^T b c \f$
    */
   real cuda_glsc2(void *a, void *b, int *n) {
-        
+
     const dim3 nthrds(1024, 1, 1);
     const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
     const int nb = ((*n) + 1024 - 1)/ 1024;
-    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;      
+    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
 
     if ( nb > red_s){
       red_s = nb;
       if (bufred != NULL) {
         CUDA_CHECK(cudaFreeHost(bufred));
-        CUDA_CHECK(cudaFree(bufred_d));        
+        CUDA_CHECK(cudaFree(bufred_d));
       }
       CUDA_CHECK(cudaMallocHost(&bufred,nb*sizeof(real)));
       CUDA_CHECK(cudaMalloc(&bufred_d, nb*sizeof(real)));
     }
-         
+
     glsc2_kernel<real>
       <<<nblcks, nthrds, 0, stream>>>((real *) a,
                                       (real *) b,
@@ -635,7 +635,7 @@ extern "C" {
     return bufred[0];
   }
 
-  /** 
+  /**
    * Fortran wrapper glsum
    * Sum a vector of length n
    */
@@ -643,18 +643,18 @@ extern "C" {
     const dim3 nthrds(1024, 1, 1);
     const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
     const int nb = ((*n) + 1024 - 1)/ 1024;
-    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;      
-    
+    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
+
     if ( nb > red_s){
       red_s = nb;
       if (bufred != NULL) {
         CUDA_CHECK(cudaFreeHost(bufred));
-        CUDA_CHECK(cudaFree(bufred_d));        
+        CUDA_CHECK(cudaFree(bufred_d));
       }
       CUDA_CHECK(cudaMallocHost(&bufred,nb*sizeof(real)));
       CUDA_CHECK(cudaMalloc(&bufred_d, nb*sizeof(real)));
     }
-    if ( *n > 0) { 
+    if ( *n > 0) {
       glsum_kernel<real>
         <<<nblcks, nthrds, 0, stream>>>((real *) a, bufred_d, *n);
       CUDA_CHECK(cudaGetLastError());
@@ -669,11 +669,11 @@ extern "C" {
                                cudaMemcpyDeviceToHost, stream));
     cudaStreamSynchronize(stream);
 #endif
-    
+
     return bufred[0];
   }
 
-  /** 
+  /**
    * Fortran wrapper absval
    * Sum a vector of length n
    */
@@ -681,12 +681,135 @@ extern "C" {
 
     const dim3 nthrds(1024, 1, 1);
     const dim3 nblcks(((*n)+1024 - 1)/ 1024, 1, 1);
-    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue; 
+    const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
 
     absval_kernel<real>
-    <<<nblcks, nthrds,0, stream>>>((real *) a, * n);  
+    <<<nblcks, nthrds,0, stream>>>((real *) a, * n);
     CUDA_CHECK(cudaGetLastError());
+
+  }
+
+  // ======================================================================== //
+  // Point-wise operations.
+
+  /** Fortran wrapper for pwmax_vec2
+   *
+   * Compute the maximum of two vectors \f$ a = \max(a, b) \f$
+   */
+  void cuda_pwmax_vec2(void *a, void *b, int *n) {
+
+      const dim3 nthrds(1024, 1, 1);
+      const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
+      const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
+
+      pwmax_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>(
+          (real *)a, (real *)b, *n);
+      CUDA_CHECK(cudaGetLastError());
+  }
+  
+  /** Fortran wrapper for pwmax_vec3
+   *
+   * Compute the maximum of two vectors \f$ a = \max(b, c) \f$
+   */
+  void cuda_pwmax_vec3(void *a, void *b, void *c, int *n) {
+    
+      const dim3 nthrds(1024, 1, 1);
+      const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
+      const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
+
+      pwmax_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>(
+          (real *)a, (real *)b, (real *)c, *n);
+      CUDA_CHECK(cudaGetLastError());
+  }
+  
+  /** Fortran wrapper for pwmax_sca2
+   *
+   * Compute the maximum of vector and scalar \f$ a = \max(a, c) \f$
+   */
+  void cuda_pwmax_sca2(void *a, real *c, int *n) {
     
+      const dim3 nthrds(1024, 1, 1);
+      const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
+      const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
+
+      pwmax_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>(
+          (real *)a, *c, *n);
+      CUDA_CHECK(cudaGetLastError());
+  }
+  
+  /** Fortran wrapper for pwmax_sca3
+   *
+   * Compute the maximum of vector and scalar \f$ a = \max(b, c) \f$
+   */
+  void cuda_pwmax_sca3(void *a, void *b, real *c, int *n) {
+    
+      const dim3 nthrds(1024, 1, 1);
+      const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
+      const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
+
+      pwmax_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>(
+          (real *)a, (real *)b, *c, *n);
+      CUDA_CHECK(cudaGetLastError());
+  }
+
+  /** Fortran wrapper for pwmin_vec2
+   *
+   * Compute the minimum of two vectors \f$ a = \min(a, b) \f$
+   */
+  void cuda_pwmin_vec2(void *a, void *b, int *n) {
+
+      const dim3 nthrds(1024, 1, 1);
+      const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
+      const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
+
+      pwmin_vec2_kernel<real><<<nblcks, nthrds, 0, stream>>>(
+          (real *)a, (real *)b, *n);
+      CUDA_CHECK(cudaGetLastError());
+  }
+  
+  /** Fortran wrapper for pwmin_vec3
+   *
+   * Compute the minimum of two vectors \f$ a = \min(b, c) \f$
+   */
+  void cuda_pwmin_vec3(void *a, void *b, void *c, int *n) {
+    
+      const dim3 nthrds(1024, 1, 1);
+      const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
+      const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
+
+      pwmin_vec3_kernel<real><<<nblcks, nthrds, 0, stream>>>(
+          (real *)a, (real *)b, (real *)c, *n);
+      CUDA_CHECK(cudaGetLastError());
+  }
+  
+  /** Fortran wrapper for pwmin_sca2
+   *
+   * Compute the minimum of vector and scalar \f$ a = \min(a, c) \f$
+   */
+  void cuda_pwmin_sca2(void *a, real *c, int *n) {
+    
+      const dim3 nthrds(1024, 1, 1);
+      const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
+      const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
+
+      pwmin_sca2_kernel<real><<<nblcks, nthrds, 0, stream>>>(
+          (real *)a, *c, *n);
+      CUDA_CHECK(cudaGetLastError());
+  }
+  
+  /** Fortran wrapper for pwmin_sca3
+   *
+   * Compute the minimum of vector and scalar \f$ a = \min(b, c) \f$
+   */
+  void cuda_pwmin_sca3(void *a, void *b, real *c, int *n) {
+    
+      const dim3 nthrds(1024, 1, 1);
+      const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
+      const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
+
+      pwmin_sca3_kernel<real><<<nblcks, nthrds, 0, stream>>>(
+          (real *)a, (real *)b, *c, *n);
+      CUDA_CHECK(cudaGetLastError());
   }
 
-}
+} /* extern "C" */
diff --git a/src/math/bcknd/device/cuda/math_kernel.h b/src/math/bcknd/device/cuda/math_kernel.h
index d633f705351..26eebb8565d 100644
--- a/src/math/bcknd/device/cuda/math_kernel.h
+++ b/src/math/bcknd/device/cuda/math_kernel.h
@@ -755,4 +755,119 @@ __global__ void absval_kernel(T * __restrict__ a,
   }
 }
 
+// ========================================================================== //
+// Kernels for the point-wise operations
+
+/**
+ * Device kernel for point-wise max of two vectors
+ * a = max(a, b)
+ */
+template <typename T>
+__global__ void
+    pwmax_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {
+
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int str = blockDim.x * gridDim.x;
+
+    for (int i = idx; i < n; i += str) a[i] = max(a[i], b[i]);
+}
+
+/**
+ * Device kernel for point-wise max of two vectors
+ * a = max(b, c)
+ */
+template <typename T>
+__global__ void pwmax_vec3_kernel(
+    T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
+    const int n) {
+
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int str = blockDim.x * gridDim.x;
+
+    for (int i = idx; i < n; i += str) a[i] = max(b[i], c[i]);
+}
+
+/**
+ * Device kernel for point-wise max of vector and scalar
+ * a = max(a, c)
+ */
+template <typename T>
+__global__ void pwmax_sca2_kernel(T* __restrict__ a, const T c, const int n) {
+
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int str = blockDim.x * gridDim.x;
+
+    for (int i = idx; i < n; i += str) a[i] = max(a[i], c);
+}
+
+/**
+ * Device kernel for point-wise max of vector and scalar
+ * a = max(b, c)
+ */
+template <typename T>
+__global__ void pwmax_sca3_kernel(
+    T* __restrict__ a, const T* __restrict b, const T c, const int n) {
+
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int str = blockDim.x * gridDim.x;
+
+    for (int i = idx; i < n; i += str) a[i] = max(b[i], c);
+}
+
+/**
+ * Device kernel for point-wise min of two vectors
+ * a = min(a, b)
+ */
+template <typename T>
+__global__ void
+    pwmin_vec2_kernel(T* __restrict__ a, const T* __restrict__ b, const int n) {
+
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int str = blockDim.x * gridDim.x;
+
+    for (int i = idx; i < n; i += str) a[i] = min(a[i], b[i]);
+}
+
+/**
+ * Device kernel for point-wise min of two vectors
+ * a = min(b, c)
+ */
+template <typename T>
+__global__ void pwmin_vec3_kernel(
+    T* __restrict__ a, const T* __restrict__ b, const T* __restrict__ c,
+    const int n) {
+
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int str = blockDim.x * gridDim.x;
+
+    for (int i = idx; i < n; i += str) a[i] = min(b[i], c[i]);
+}
+
+/**
+ * Device kernel for point-wise min of vector and scalar
+ * a = min(a, c)
+ */
+template <typename T>
+__global__ void pwmin_sca2_kernel(T* __restrict__ a, const T c, const int n) {
+
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int str = blockDim.x * gridDim.x;
+
+    for (int i = idx; i < n; i += str) a[i] = min(a[i], c);
+}
+
+/**
+ * Device kernel for point-wise min of vector and scalar
+ * a = min(b, c)
+ */
+template <typename T>
+__global__ void pwmin_sca3_kernel(
+    T* __restrict__ a, const T* __restrict b, const T c, const int n) {
+
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int str = blockDim.x * gridDim.x;
+
+    for (int i = idx; i < n; i += str) a[i] = min(b[i], c);
+}
+
 #endif // __MATH_MATH_KERNEL_H__
diff --git a/src/math/bcknd/device/device_math.F90 b/src/math/bcknd/device/device_math.F90
index 5d3e19e7c44..50935d36a6c 100644
--- a/src/math/bcknd/device/device_math.F90
+++ b/src/math/bcknd/device/device_math.F90
@@ -47,6 +47,17 @@ module device_math
   implicit none
   private
 
+  interface device_pwmax
+     module procedure device_pwmax_vec2, device_pwmax_vec3, &
+          device_pwmax_sca2, device_pwmax_sca3
+  end interface device_pwmax
+
+  interface device_pwmin
+     module procedure device_pwmin_vec2, device_pwmin_vec3, &
+          device_pwmin_sca2, device_pwmin_sca3
+  end interface device_pwmin
+
+
   public :: device_copy, device_rzero, device_rone, device_cmult, &
        device_cmult2, device_cadd, device_cadd2, device_cfill, device_add2, &
        device_add3, device_add4, device_add2s1, device_add2s2, &
@@ -55,7 +66,8 @@ module device_math
        device_addcol3, device_addcol4, device_vdot3, device_vlsc3, &
        device_glsc3, device_glsc3_many, device_add2s2_many, device_glsc2, &
        device_glsum, device_masked_copy, device_cfill_mask, &
-       device_masked_red_copy, device_vcross, device_absval
+       device_masked_red_copy, device_vcross, device_absval, &
+       device_pwmax, device_pwmin
 
 contains
 
@@ -650,5 +662,156 @@ subroutine device_absval(a_d, n)
 
   end subroutine device_absval
 
+  ! ========================================================================== !
+  ! Device point-wise max
+
+  !> Compute the point-wise maximum of two vectors
+  !! \f$ a_i = \max(a_i, b_i) \f$
+  subroutine device_pwmax_vec2(a_d, b_d, n)
+    type(c_ptr) :: a_d, b_d
+    integer :: n
+
+#if HAVE_HIP
+    call neko_error('No HIP backend for device_pwmax_vec2')
+#elif HAVE_CUDA
+    call cuda_pwmax_vec2(a_d, b_d, n)
+#elif HAVE_OPENCL
+    call neko_error('No OpenCL backend for device_pwmax_vec2')
+#else
+    call neko_error('No device backend configured')
+#endif
+  end subroutine device_pwmax_vec2
+
+  !> Compute the point-wise maximum of two vectors
+  !! \f$ a_i = \max(b_i, c_i) \f$
+  subroutine device_pwmax_vec3(a_d, b_d, c_d, n)
+    type(c_ptr) :: a_d, b_d, c_d
+    integer :: n
+
+#if HAVE_HIP
+    call neko_error('No HIP backend for device_pwmax_vec3')
+#elif HAVE_CUDA
+    call cuda_pwmax_vec3(a_d, b_d, c_d, n)
+#elif HAVE_OPENCL
+    call neko_error('No OpenCL backend for device_pwmax_vec3')
+#else
+    call neko_error('No device backend configured')
+#endif
+
+  end subroutine device_pwmax_vec3
+
+  !> Compute the point-wise maximum of a vector and a scalar
+  !! \f$ a_i = \max(a_i, c) \f$
+  subroutine device_pwmax_sca2(a_d, c, n)
+    type(c_ptr) :: a_d
+    real(kind=rp), intent(in) :: c
+    integer :: n
+
+#if HAVE_HIP
+    call neko_error('No HIP backend for device_pwmax_sca2')
+#elif HAVE_CUDA
+    call cuda_pwmax_sca2(a_d, c, n)
+#elif HAVE_OPENCL
+    call neko_error('No OpenCL backend for device_pwmax_sca2')
+#else
+    call neko_error('No device backend configured')
+#endif
+
+  end subroutine device_pwmax_sca2
+
+  !> Compute the point-wise maximum of a vector and a scalar
+  !! \f$ a_i = \max(b_i, c) \f$
+  subroutine device_pwmax_sca3(a_d, b_d, c, n)
+    type(c_ptr) :: a_d, b_d
+    real(kind=rp), intent(in) :: c
+    integer :: n
+
+#if HAVE_HIP
+    call neko_error('No HIP backend for device_pwmax_sca3')
+#elif HAVE_CUDA
+    call cuda_pwmax_sca3(a_d, b_d, c, n)
+#elif HAVE_OPENCL
+    call neko_error('No OpenCL backend for device_pwmax_sca3')
+#else
+    call neko_error('No device backend configured')
+#endif
+
+  end subroutine device_pwmax_sca3
+
+  ! ========================================================================== !
+  ! Device point-wise min
+
+  !> Compute the point-wise minimum of two vectors
+  !! \f$ a_i = \min(a_i, b_i) \f$
+  subroutine device_pwmin_vec2(a_d, b_d, n)
+    type(c_ptr) :: a_d, b_d
+    integer :: n
+
+#if HAVE_HIP
+    call neko_error('No HIP backend for device_pwmin_vec2')
+#elif HAVE_CUDA
+    call cuda_pwmin_vec2(a_d, b_d, n)
+#elif HAVE_OPENCL
+    call neko_error('No OpenCL backend for device_pwmin_vec2')
+#else
+    call neko_error('No device backend configured')
+#endif
+  end subroutine device_pwmin_vec2
+
+  !> Compute the point-wise minimum of two vectors
+  !! \f$ a_i = \min(b_i, c_i) \f$
+  subroutine device_pwmin_vec3(a_d, b_d, c_d, n)
+    type(c_ptr) :: a_d, b_d, c_d
+    integer :: n
+
+#if HAVE_HIP
+    call neko_error('No HIP backend for device_pwmin_vec3')
+#elif HAVE_CUDA
+    call cuda_pwmin_vec3(a_d, b_d, c_d, n)
+#elif HAVE_OPENCL
+    call neko_error('No OpenCL backend for device_pwmin_vec3')
+#else
+    call neko_error('No device backend configured')
+#endif
+
+  end subroutine device_pwmin_vec3
+
+  !> Compute the point-wise minimum of a vector and a scalar
+  !! \f$ a_i = \min(a_i, c) \f$
+  subroutine device_pwmin_sca2(a_d, c, n)
+    type(c_ptr) :: a_d
+    real(kind=rp), intent(in) :: c
+    integer :: n
+
+#if HAVE_HIP
+    call neko_error('No HIP backend for device_pwmin_sca2')
+#elif HAVE_CUDA
+    call cuda_pwmin_sca2(a_d, c, n)
+#elif HAVE_OPENCL
+    call neko_error('No OpenCL backend for device_pwmin_sca2')
+#else
+    call neko_error('No device backend configured')
+#endif
+
+  end subroutine device_pwmin_sca2
+
+  !> Compute the point-wise minimum of a vector and a scalar
+  !! \f$ a_i = \min(b_i, c) \f$
+  subroutine device_pwmin_sca3(a_d, b_d, c, n)
+    type(c_ptr) :: a_d, b_d
+    real(kind=rp), intent(in) :: c
+    integer :: n
+
+#if HAVE_HIP
+    call neko_error('No HIP backend for device_pwmin_sca3')
+#elif HAVE_CUDA
+    call cuda_pwmin_sca3(a_d, b_d, c, n)
+#elif HAVE_OPENCL
+    call neko_error('No OpenCL backend for device_pwmin_sca3')
+#else
+    call neko_error('No device backend configured')
+#endif
+
+  end subroutine device_pwmin_sca3
 
 end module device_math
diff --git a/src/math/math.f90 b/src/math/math.f90
index d18177b219c..605b5c17a7f 100644
--- a/src/math/math.f90
+++ b/src/math/math.f90
@@ -59,7 +59,10 @@
 !
 module math
   use num_types, only : rp, dp, sp, qp, i4, xp
-  use comm
+  use utils, only: neko_error
+  use comm, only: NEKO_COMM, MPI_REAL_PRECISION, MPI_EXTRA_PRECISION
+  use mpi_f08, only: MPI_MIN, MPI_MAX, MPI_SUM, MPI_IN_PLACE, MPI_INTEGER, &
+       MPI_Allreduce
   implicit none
   private
 
@@ -96,13 +99,21 @@ module math
      module procedure srelcmp, drelcmp, qrelcmp
   end interface relcmp
 
+  interface pwmax
+     module procedure pwmax_vec2, pwmax_vec3, pwmax_scal2, pwmax_scal3
+  end interface pwmax
+
+  interface pwmin
+     module procedure pwmin_vec2, pwmin_vec3, pwmin_sca2, pwmin_sca3
+  end interface pwmin
+
   public :: abscmp, rzero, izero, row_zero, rone, copy, cmult, cadd, cfill, &
        glsum, glmax, glmin, chsign, vlmax, vlmin, invcol1, invcol3, invers2, &
        vcross, vdot2, vdot3, vlsc3, vlsc2, add2, add3, add4, sub2, sub3, &
        add2s1, add2s2, addsqr2s2, cmult2, invcol2, col2, col3, subcol3, &
        add3s2, subcol4, addcol3, addcol4, ascol5, p_update, x_update, glsc2, &
        glsc3, glsc4, sort, masked_copy, cfill_mask, relcmp, glimax, glimin, &
-       swap, reord, flipv, cadd2, masked_red_copy, absval
+       swap, reord, flipv, cadd2, masked_red_copy, absval, pwmax, pwmin
 
 contains
 
@@ -1169,4 +1180,105 @@ subroutine absval(a, n)
     end do
   end subroutine absval
 
+  ! ========================================================================== !
+  ! Point-wise operations
+
+  !> Point-wise maximum of two vectors \f$ a = \max(a, b) \f$
+  subroutine pwmax_vec2(a, b, n)
+    integer, intent(in) :: n
+    real(kind=rp), dimension(n), intent(inout) :: a
+    real(kind=rp), dimension(n), intent(in) :: b
+    integer :: i
+
+    do i = 1, n
+       a(i) = max(a(i), b(i))
+    end do
+  end subroutine pwmax_vec2
+
+  !> Point-wise maximum of two vectors \f$ a = \max(b, c) \f$
+  subroutine pwmax_vec3(a, b, c, n)
+    integer, intent(in) :: n
+    real(kind=rp), dimension(n), intent(inout) :: a
+    real(kind=rp), dimension(n), intent(in) :: b, c
+    integer :: i
+
+    do i = 1, n
+       a(i) = max(b(i), c(i))
+    end do
+  end subroutine pwmax_vec3
+
+  !> Point-wise maximum of scalar and vector \f$ a = \max(a, b) \f$
+  subroutine pwmax_scal2(a, b, n)
+    integer, intent(in) :: n
+    real(kind=rp), dimension(n), intent(inout) :: a
+    real(kind=rp), intent(in) :: b
+    integer :: i
+
+    do i = 1, n
+       a(i) = max(a(i), b)
+    end do
+  end subroutine pwmax_scal2
+
+  !> Point-wise maximum of scalar and vector \f$ a = \max(b, c) \f$
+  subroutine pwmax_scal3(a, b, c, n)
+    integer, intent(in) :: n
+    real(kind=rp), dimension(n), intent(inout) :: a
+    real(kind=rp), dimension(n), intent(in) :: b
+    real(kind=rp), intent(in) :: c
+    integer :: i
+
+    do i = 1, n
+       a(i) = max(b(i), c)
+    end do
+  end subroutine pwmax_scal3
+
+  !> Point-wise minimum of two vectors \f$ a = \min(a, b) \f$
+  subroutine pwmin_vec2(a, b, n)
+    integer, intent(in) :: n
+    real(kind=rp), dimension(n), intent(inout) :: a
+    real(kind=rp), dimension(n), intent(in) :: b
+    integer :: i
+
+    do i = 1, n
+       a(i) = min(a(i), b(i))
+    end do
+  end subroutine pwmin_vec2
+
+  !> Point-wise minimum of two vectors \f$ a = \min(b, c) \f$
+  subroutine pwmin_vec3(a, b, c, n)
+    integer, intent(in) :: n
+    real(kind=rp), dimension(n), intent(inout) :: a
+    real(kind=rp), dimension(n), intent(in) :: b, c
+    integer :: i
+
+    do i = 1, n
+       a(i) = min(b(i), c(i))
+    end do
+  end subroutine pwmin_vec3
+
+  !> Point-wise minimum of scalar and vector \f$ a = \min(a, b) \f$
+  subroutine pwmin_sca2(a, b, n)
+    integer, intent(in) :: n
+    real(kind=rp), dimension(n), intent(inout) :: a
+    real(kind=rp), intent(in) :: b
+    integer :: i
+
+    do i = 1, n
+       a(i) = min(a(i), b)
+    end do
+  end subroutine pwmin_sca2
+
+  !> Point-wise minimum of scalar and vector \f$ a = \min(b, c) \f$
+  subroutine pwmin_sca3(a, b, c, n)
+    integer, intent(in) :: n
+    real(kind=rp), dimension(n), intent(inout) :: a
+    real(kind=rp), dimension(n), intent(in) :: b
+    real(kind=rp), intent(in) :: c
+    integer :: i
+
+    do i = 1, n
+       a(i) = min(b(i), c)
+    end do
+  end subroutine pwmin_sca3
+
 end module math
diff --git a/src/math/signed_distance.f90 b/src/math/signed_distance.f90
index f9c18f9dabf..8f60a51dd8f 100644
--- a/src/math/signed_distance.f90
+++ b/src/math/signed_distance.f90
@@ -37,8 +37,13 @@ module signed_distance
   use tri, only: tri_t
   use tri_mesh, only: tri_mesh_t
   use aabb_tree, only: aabb_tree_t
+  use device, only: device_memcpy, HOST_TO_DEVICE
+  use neko_config, only: NEKO_BCKND_DEVICE
+  use utils, only: neko_error, neko_warning
 
   implicit none
+  private
+  public :: signed_distance_field
 
 contains
 
@@ -53,9 +58,6 @@ module signed_distance
   !! @param[in] object Object
   !! @param[in,optional] max_distance Maximum distance outside the mesh
   subroutine signed_distance_field(field_data, object, max_distance)
-    use utils, only: neko_error
-    implicit none
-
     type(field_t), intent(inout) :: field_data
     class(*), intent(in) :: object
     real(kind=dp), intent(in), optional :: max_distance
@@ -89,9 +91,6 @@ end subroutine signed_distance_field
   !! @param[in] mesh Triangular mesh
   !! @param[in] max_distance Maximum distance outside the mesh
   subroutine signed_distance_field_tri_mesh(field_data, mesh, max_distance)
-    use utils, only: neko_error
-    implicit none
-
     type(field_t), intent(inout) :: field_data
     type(tri_mesh_t), intent(in) :: mesh
     real(kind=dp), intent(in) :: max_distance
@@ -111,7 +110,7 @@ subroutine signed_distance_field_tri_mesh(field_data, mesh, max_distance)
 
     if (search_tree%get_size() .ne. mesh%nelv) then
        call neko_error("signed_distance_field_tri_mesh: &
-         & Error building the search tree.")
+            & Error building the search tree.")
     end if
 
     do id = 1, total_size
@@ -124,6 +123,13 @@ subroutine signed_distance_field_tri_mesh(field_data, mesh, max_distance)
        field_data%x(id, 1, 1, 1) = real(distance, kind=rp)
     end do
 
+    if (NEKO_BCKND_DEVICE .eq. 1) then
+       call neko_warning("signed_distance_field_tri_mesh:&
+            & Device version not implemented.")
+       call device_memcpy(field_data%x, field_data%x_d, field_data%size(), &
+            HOST_TO_DEVICE, sync = .false.)
+    end if
+
   end subroutine signed_distance_field_tri_mesh
 
   !> @brief Signed distance function
diff --git a/src/source_terms/bcknd/cpu/filters_cpu.f90 b/src/source_terms/bcknd/cpu/filters_cpu.f90
index 3b2dc2a4bab..e4008af8ed2 100644
--- a/src/source_terms/bcknd/cpu/filters_cpu.f90
+++ b/src/source_terms/bcknd/cpu/filters_cpu.f90
@@ -34,51 +34,84 @@
 module filters_cpu
   use num_types, only: rp
   implicit none
+  private
+  public :: smooth_step_cpu, step_function_cpu, permeability_cpu
 
 contains
 
+  !> @brief Apply a smooth step function to a scalar.
+  subroutine smooth_step_cpu(x, edge0, edge1, n)
+    integer, intent(in) :: n
+    real(kind=rp), dimension(n), intent(inout) :: x
+    real(kind=rp), intent(in) :: edge0, edge1
+
+    x = smooth_step_kernel(x, edge0, edge1)
+
+  end subroutine smooth_step_cpu
+
+  !> @brief Apply a step function to a scalar.
+  subroutine step_function_cpu(x, x_step, value0, value1, n)
+    integer, intent(in) :: n
+    real(kind=rp), dimension(n), intent(inout) :: x
+    real(kind=rp), intent(in) :: x_step, value0, value1
+
+    x = step_function_kernel(x, x_step, value0, value1)
+
+  end subroutine step_function_cpu
+
+  !> @brief Apply a permeability function to a scalar.
+  subroutine permeability_cpu(x, k_0, k_1, q, n)
+    integer, intent(in) :: n
+    real(kind=rp), dimension(n), intent(inout) :: x
+    real(kind=rp), intent(in) :: k_0, k_1, q
+
+    x = permeability_kernel(x, k_0, k_1, q)
+
+  end subroutine permeability_cpu
+
+
   ! ========================================================================== !
   ! Internal functions and subroutines
   ! ========================================================================== !
 
   !> @brief Apply a smooth step function to a scalar.
-  elemental function smooth_step_cpu(x, edge0, edge1) result(res)
+  elemental function smooth_step_kernel(x, edge0, edge1) result(res)
     real(kind=rp), intent(in) :: x
     real(kind=rp), intent(in) :: edge0, edge1
     real(kind=rp) :: res, t
 
-    t = clamp_cpu((x - edge0) / (edge1 - edge0), 0.0_rp, 1.0_rp)
+    t = clamp_kernel((x - edge0) / (edge1 - edge0), 0.0_rp, 1.0_rp)
 
     res = t**3 * (t * (6.0_rp * t - 15.0_rp) + 10.0_rp)
 
-  end function smooth_step_cpu
+  end function smooth_step_kernel
 
   !> @brief Clamp a value between two limits.
-  elemental function clamp_cpu(x, lowerlimit, upperlimit) result(res)
+  elemental function clamp_kernel(x, lowerlimit, upperlimit) result(res)
     real(kind=rp), intent(in) :: x
     real(kind=rp), intent(in) :: lowerlimit, upperlimit
     real(kind=rp) :: res
 
     res = max(lowerlimit, min(upperlimit, x))
-  end function clamp_cpu
+  end function clamp_kernel
 
   !> @brief Apply a step function to a scalar.
-  elemental function step_function_cpu(x, x_step, value0, value1) result(res)
+  elemental function step_function_kernel(x, x_step, value0, value1) result(res)
     real(kind=rp), intent(in) :: x, x_step, value0, value1
     real(kind=rp) :: res
 
     res = merge(value0, value1, x > x_step)
 
-  end function step_function_cpu
+  end function step_function_kernel
 
   !> @brief Apply a permeability function to a scalar.
-  elemental function permeability_cpu(x, k_0, k_1, q) result(perm)
+  elemental function permeability_kernel(x, k_0, k_1, q) result(perm)
     real(kind=rp), intent(in) :: x, k_0, k_1, q
     real(kind=rp) :: perm
 
     perm = k_0 + (k_1 - k_0) * x * (q + 1.0_rp) / (q + x)
 
-  end function permeability_cpu
+  end function permeability_kernel
 
 
 end module filters_cpu
diff --git a/src/source_terms/bcknd/device/cuda/cuda_filters.f90 b/src/source_terms/bcknd/device/cuda/cuda_filters.f90
new file mode 100644
index 00000000000..73a831845c6
--- /dev/null
+++ b/src/source_terms/bcknd/device/cuda/cuda_filters.f90
@@ -0,0 +1,71 @@
+! Copyright (c) 2024, The Neko Authors
+! All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!
+!   * Redistributions of source code must retain the above copyright
+!     notice, this list of conditions and the following disclaimer.
+!
+!   * Redistributions in binary form must reproduce the above
+!     copyright notice, this list of conditions and the following
+!     disclaimer in the documentation and/or other materials provided
+!     with the distribution.
+!
+!   * Neither the name of the authors nor the names of its
+!     contributors may be used to endorse or promote products derived
+!     from this software without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+! POSSIBILITY OF SUCH DAMAGE.
+
+!> Cuda interface binding for filters
+module cuda_filters
+  use num_types, only: c_rp
+  implicit none
+  private
+
+  public :: cuda_smooth_step, cuda_step_function, cuda_permeability
+
+  ! Interfaces for the backend functions
+  interface
+     subroutine cuda_smooth_step(x, edge0, edge1, n) &
+          bind(c, name = "cuda_smooth_step")
+       use, intrinsic :: iso_c_binding, only: c_ptr, c_int
+       import c_rp
+       type(c_ptr), value :: x
+       real(c_rp) :: edge0, edge1
+       integer(c_int) :: n
+     end subroutine cuda_smooth_step
+
+     subroutine cuda_step_function(x, edge, left, right, n) &
+          bind(c, name = "cuda_step_function")
+       use, intrinsic :: iso_c_binding, only: c_ptr, c_int
+       import c_rp
+       type(c_ptr), value :: x
+       real(c_rp) :: edge, left, right
+       integer(c_int) :: n
+     end subroutine cuda_step_function
+
+     subroutine cuda_permeability(x, k_0, k_1, q, n) &
+          bind(c, name = "cuda_permeability")
+       use, intrinsic :: iso_c_binding, only: c_ptr, c_int
+       import c_rp
+       type(c_ptr), value :: x
+       real(c_rp) :: k_0, k_1, q
+       integer(c_int) :: n
+     end subroutine cuda_permeability
+  end interface
+
+end module cuda_filters
diff --git a/src/source_terms/bcknd/device/cuda/filter_kernels.h b/src/source_terms/bcknd/device/cuda/filter_kernels.h
new file mode 100644
index 00000000000..ef7ce93c0f6
--- /dev/null
+++ b/src/source_terms/bcknd/device/cuda/filter_kernels.h
@@ -0,0 +1,103 @@
+#ifndef __FILTER_KERNELS__
+#define __FILTER_KERNELS__
+/*
+ Copyright (c) 2024, The Neko Authors
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+   * Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+
+   * Redistributions in binary form must reproduce the above
+     copyright notice, this list of conditions and the following
+     disclaimer in the documentation and/or other materials provided
+     with the distribution.
+
+   * Neither the name of the authors nor the names of its
+     contributors may be used to endorse or promote products derived
+     from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/**
+ * Device kernel for smooth step function.
+ *
+ * @param x  in/out array
+ * @param edge0  lower edge
+ * @param edge1  upper edge
+ * @param n  size of the input array
+ */
+template <typename T>
+__global__ void smooth_step_kernel(
+    T* __restrict__ x, const T edge0, const T edge1, const int n) {
+
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int str = blockDim.x * gridDim.x;
+
+    for (int i = idx; i < n; i += str) {
+        T t  = (x[i] - edge0) / (edge1 - edge0);
+        t    = min(max(t, 0.0), 1.0);
+        x[i] = pow(t, 3.0) * (t * (t * 6.0 - 15.0) + 10.0);
+    }
+}
+
+/**
+ * Device kernel for step function.
+ *
+ * @param x  input array
+ * @param edge  step location
+ * @param left  value to the left of the step
+ * @param right  value to the right of the step
+ * @param n  size of the input array
+ */
+template <typename T>
+__global__ void step_kernel(
+    T* __restrict__ x, const T edge, const T left, const T right, const int n) {
+
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int str = blockDim.x * gridDim.x;
+
+    for (int i = idx; i < n; i += str) {
+        const T x_i = x[i];
+        x[i]        = (x_i < edge) ? left : right;
+    }
+}
+
+/**
+ * Device kernel for the inverse permeability kernel.
+ *
+ * @param x  input array
+ * @param k_0  lower bound of the permeability
+ * @param k_1  upper bound of the permeability
+ * @param q  parameter
+ * @param n  size of the input array
+ */
+template <typename T>
+__global__ void permeability_kernel(
+    T* __restrict__ x, const T k_0, const T k_1, const T q, const int n) {
+
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int str = blockDim.x * gridDim.x;
+
+    for (int i = idx; i < n; i += str) {
+        const T x_i = x[i];
+        x[i]        = k_0 + (k_1 - k_0) * x_i * (q + 1.0) / (q + x_i);
+    }
+}
+
+#endif // __FILTER_KERNELS__
diff --git a/src/source_terms/bcknd/device/cuda/filters.cu b/src/source_terms/bcknd/device/cuda/filters.cu
new file mode 100644
index 00000000000..fc94648317d
--- /dev/null
+++ b/src/source_terms/bcknd/device/cuda/filters.cu
@@ -0,0 +1,100 @@
+/*
+ Copyright (c) 2024, The Neko Authors
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+   * Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+
+   * Redistributions in binary form must reproduce the above
+     copyright notice, this list of conditions and the following
+     disclaimer in the documentation and/or other materials provided
+     with the distribution.
+
+   * Neither the name of the authors nor the names of its
+     contributors may be used to endorse or promote products derived
+     from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "filter_kernels.h"
+#include <device/device_config.h>
+#include <device/cuda/check.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+extern "C" {
+
+  /** Fortran wrapper for 2nd order smooth_step
+   *
+   * Compute the smooth step function for a given array.
+   * t = clamp((x - edge0) / (edge1 - edge0), 0.0, 1.0);
+   * return t^3 * (t * (6.0 * t - 15.0) + 10.0);
+   */
+  void cuda_smooth_step(void *x, real *edge0, real *edge1, int *n) {
+    
+      const dim3 nthrds(1024, 1, 1);
+      const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
+      const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
+
+      smooth_step_kernel<real><<<nblcks, nthrds, 0, stream>>>(
+          (real *)x, *edge0, *edge1, *n);
+      CUDA_CHECK(cudaGetLastError());
+  }
+
+  /** Fortran wrapper for step function
+   *
+   * Compute the step function for a given array.
+   * if (x < edge) return left;
+   * else return right;
+   *
+   * @param x array to apply the step function
+   * @param edge threshold value
+   * @param left value to return if x < edge
+   * @param right value to return if x >= edge
+   * @param n size of the array
+   */
+  void cuda_step_function(void *x, real *edge, real *left, real *right, int *n) {
+      const dim3 nthrds(1024, 1, 1);
+      const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
+      const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
+
+      step_kernel<real><<<nblcks, nthrds, 0, stream>>>(
+          (real *)x, *edge, *left, *right, *n);
+      CUDA_CHECK(cudaGetLastError());
+  }
+
+  /** Fortran wrapper for the permeability filter
+   *
+   * @param x array to apply the permeability filter on
+   * @param k_0 lower bound of the permeability
+   * @param k_1 upper bound of the permeability
+   * @param q parameter
+   * @param n size of the array
+   */
+  void cuda_permeability(void *x, real *k_0, real *k_1, real *q, int *n) {
+      const dim3 nthrds(1024, 1, 1);
+      const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
+      const cudaStream_t stream = (cudaStream_t) glb_cmd_queue;
+
+      permeability_kernel<real><<<nblcks, nthrds, 0, stream>>>(
+          (real *)x, *k_0, *k_1, *q, *n);
+
+      CUDA_CHECK(cudaGetLastError());
+  }
+} /* extern "C" */
diff --git a/src/source_terms/bcknd/device/filters_device.F90 b/src/source_terms/bcknd/device/filters_device.F90
new file mode 100644
index 00000000000..66983b9d4a3
--- /dev/null
+++ b/src/source_terms/bcknd/device/filters_device.F90
@@ -0,0 +1,113 @@
+! Copyright (c) 2024, The Neko Authors
+! All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!
+!   * Redistributions of source code must retain the above copyright
+!     notice, this list of conditions and the following disclaimer.
+!
+!   * Redistributions in binary form must reproduce the above
+!     copyright notice, this list of conditions and the following
+!     disclaimer in the documentation and/or other materials provided
+!     with the distribution.
+!
+!   * Neither the name of the authors nor the names of its
+!     contributors may be used to endorse or promote products derived
+!     from this software without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+! POSSIBILITY OF SUCH DAMAGE.
+!
+!> Device implementations of the filter functions.
+module filters_device
+  use num_types, only: rp, c_rp
+  use device_math, only: device_pwmax, device_pwmin
+  use, intrinsic :: iso_c_binding, only: c_ptr
+  use utils, only: neko_error
+
+  use cuda_filters, only: &
+       cuda_smooth_step, cuda_step_function, cuda_permeability
+  use hip_filters, only: &
+       hip_smooth_step, hip_step_function, hip_permeability
+  use opencl_filters, only: &
+       opencl_smooth_step, opencl_step_function, opencl_permeability
+
+  implicit none
+
+contains
+
+  ! ========================================================================== !
+  ! Internal functions and subroutines
+  ! ========================================================================== !
+
+  !> @brief Apply a smooth step function to a scalar.
+  subroutine smooth_step_device(x, edge0, edge1, n)
+    type(c_ptr), intent(inout) :: x
+    real(kind=rp), intent(in) :: edge0, edge1
+    integer, intent(in) :: n
+
+#if HAVE_HIP == 1
+    call hip_smooth_step(x, edge0, edge1, n)
+#elif HAVE_CUDA == 1
+    call cuda_smooth_step(x, edge0, edge1, n)
+#elif HAVE_OPENCL == 1
+    call opencl_smooth_step(x, edge0, edge1, n)
+#else
+    call neko_error(&
+         "Smooth step function not implemented for the current device.")
+#endif
+
+  end subroutine smooth_step_device
+
+  !> @brief Apply a step function to a scalar.
+  subroutine step_function_device(x, x_step, value0, value1, n)
+    type(c_ptr), intent(inout) :: x
+    real(kind=rp), intent(in) :: x_step, value0, value1
+    integer, intent(in) :: n
+
+#if HAVE_HIP == 1
+    call hip_step_function(x, x_step, value0, value1, n)
+#elif HAVE_CUDA == 1
+    call cuda_step_function(x, x_step, value0, value1, n)
+#elif HAVE_OPENCL == 1
+    call opencl_step_function(x, x_step, value0, value1, n)
+#else
+    call neko_error(&
+         "Step function not implemented for the current device.")
+#endif
+
+  end subroutine step_function_device
+
+  !> @brief Apply a permeability function to a scalar.
+  subroutine permeability_device(x, k_0, k_1, q, n)
+    type(c_ptr), intent(inout) :: x
+    real(kind=rp), intent(in) :: k_0, k_1, q
+    integer, intent(in) :: n
+
+#if HAVE_HIP == 1
+    call hip_permeability(x, k_0, k_1, q, n)
+#elif HAVE_CUDA == 1
+    call cuda_permeability(x, k_0, k_1, q, n)
+#elif HAVE_OPENCL == 1
+    call opencl_permeability(x, k_0, k_1, q, n)
+#else
+    call neko_error(&
+         "Permeability function not implemented for the current device.")
+#endif
+
+  end subroutine permeability_device
+
+
+end module filters_device
diff --git a/src/source_terms/bcknd/device/hip/filter_kernels.h b/src/source_terms/bcknd/device/hip/filter_kernels.h
new file mode 100644
index 00000000000..ef7ce93c0f6
--- /dev/null
+++ b/src/source_terms/bcknd/device/hip/filter_kernels.h
@@ -0,0 +1,103 @@
+#ifndef __FILTER_KERNELS__
+#define __FILTER_KERNELS__
+/*
+ Copyright (c) 2024, The Neko Authors
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+   * Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+
+   * Redistributions in binary form must reproduce the above
+     copyright notice, this list of conditions and the following
+     disclaimer in the documentation and/or other materials provided
+     with the distribution.
+
+   * Neither the name of the authors nor the names of its
+     contributors may be used to endorse or promote products derived
+     from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/**
+ * Device kernel for smooth step function.
+ *
+ * @param x  in/out array
+ * @param edge0  lower edge
+ * @param edge1  upper edge
+ * @param n  size of the input array
+ */
+template <typename T>
+__global__ void smooth_step_kernel(
+    T* __restrict__ x, const T edge0, const T edge1, const int n) {
+
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int str = blockDim.x * gridDim.x;
+
+    for (int i = idx; i < n; i += str) {
+        T t  = (x[i] - edge0) / (edge1 - edge0);
+        t    = min(max(t, 0.0), 1.0);
+        x[i] = pow(t, 3.0) * (t * (t * 6.0 - 15.0) + 10.0);
+    }
+}
+
+/**
+ * Device kernel for step function.
+ *
+ * @param x  input array
+ * @param edge  step location
+ * @param left  value to the left of the step
+ * @param right  value to the right of the step
+ * @param n  size of the input array
+ */
+template <typename T>
+__global__ void step_kernel(
+    T* __restrict__ x, const T edge, const T left, const T right, const int n) {
+
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int str = blockDim.x * gridDim.x;
+
+    for (int i = idx; i < n; i += str) {
+        const T x_i = x[i];
+        x[i]        = (x_i < edge) ? left : right;
+    }
+}
+
+/**
+ * Device kernel for the inverse permeability kernel.
+ *
+ * @param x  input array
+ * @param k_0  lower bound of the permeability
+ * @param k_1  upper bound of the permeability
+ * @param q  parameter
+ * @param n  size of the input array
+ */
+template <typename T>
+__global__ void permeability_kernel(
+    T* __restrict__ x, const T k_0, const T k_1, const T q, const int n) {
+
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int str = blockDim.x * gridDim.x;
+
+    for (int i = idx; i < n; i += str) {
+        const T x_i = x[i];
+        x[i]        = k_0 + (k_1 - k_0) * x_i * (q + 1.0) / (q + x_i);
+    }
+}
+
+#endif // __FILTER_KERNELS__
diff --git a/src/source_terms/bcknd/device/hip/filters.hip b/src/source_terms/bcknd/device/hip/filters.hip
new file mode 100644
index 00000000000..b2fccdd2361
--- /dev/null
+++ b/src/source_terms/bcknd/device/hip/filters.hip
@@ -0,0 +1,100 @@
+/*
+ Copyright (c) 2021-2023, The Neko Authors
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+   * Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+
+   * Redistributions in binary form must reproduce the above
+     copyright notice, this list of conditions and the following
+     disclaimer in the documentation and/or other materials provided
+     with the distribution.
+
+   * Neither the name of the authors nor the names of its
+     contributors may be used to endorse or promote products derived
+     from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <device/device_config.h>
+#include <device/hip/check.h>
+#include <hip/hip_runtime.h>
+
+#include "filter_kernels.h"
+
+extern "C" {
+
+/** Fortran wrapper for 2nd order smooth_step
+ *
+ * Compute the smooth step function for a given array.
+ * t = clamp((x - edge0) / (edge1 - edge0), 0.0, 1.0);
+ * return t^3 * (t * (6.0 * t - 15.0) + 10.0);
+ */
+void hip_smooth_step(void* x, real* edge0, real* edge1, int* n) {
+
+    const dim3 nthrds(1024, 1, 1);
+    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(smooth_step_kernel<real>), nblcks, nthrds, 0,
+        (hipStream_t)glb_cmd_queue, (real*)x, *edge0, *edge1, *n);
+    HIP_CHECK(hipGetLastError());
+}
+
+/** Fortran wrapper for step function
+ *
+ * Compute the step function for a given array.
+ * if (x < edge) return left;
+ * else return right;
+ *
+ * @param x array to apply the step function
+ * @param edge threshold value
+ * @param left value to return if x < edge
+ * @param right value to return if x >= edge
+ * @param n size of the array
+ */
+void hip_step_function(void* x, real* edge, real* left, real* right, int* n) {
+    const dim3 nthrds(1024, 1, 1);
+    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(step_kernel<real>), nblcks, nthrds, 0,
+        (hipStream_t)glb_cmd_queue, (real*)x, *edge, *left, *right, *n);
+    HIP_CHECK(hipGetLastError());
+}
+
+/** Fortran wrapper for the permeability filter
+ *
+ * @param x array to apply the permeability filter on
+ * @param k_0 lower bound of the permeability
+ * @param k_1 upper bound of the permeability
+ * @param q parameter
+ * @param n size of the array
+ */
+void hip_permeability(void* x, real* k_0, real* k_1, real* q, int* n) {
+    const dim3 nthrds(1024, 1, 1);
+    const dim3 nblcks(((*n) + 1024 - 1) / 1024, 1, 1);
+
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(permeability_kernel<real>), nblcks, nthrds, 0,
+        (hipStream_t)glb_cmd_queue, (real*)x, *k_0, *k_1, *q, *n);
+
+    HIP_CHECK(hipGetLastError());
+}
+}
diff --git a/src/source_terms/bcknd/device/hip/hip_filters.f90 b/src/source_terms/bcknd/device/hip/hip_filters.f90
new file mode 100644
index 00000000000..aa614aad393
--- /dev/null
+++ b/src/source_terms/bcknd/device/hip/hip_filters.f90
@@ -0,0 +1,71 @@
+! Copyright (c) 2024, The Neko Authors
+! All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!
+!   * Redistributions of source code must retain the above copyright
+!     notice, this list of conditions and the following disclaimer.
+!
+!   * Redistributions in binary form must reproduce the above
+!     copyright notice, this list of conditions and the following
+!     disclaimer in the documentation and/or other materials provided
+!     with the distribution.
+!
+!   * Neither the name of the authors nor the names of its
+!     contributors may be used to endorse or promote products derived
+!     from this software without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+! POSSIBILITY OF SUCH DAMAGE.
+
+!> Hip interface binding for filters
+module hip_filters
+  use num_types, only: rp, c_rp
+  implicit none
+  private
+
+  public :: hip_smooth_step, hip_step_function, hip_permeability
+
+  ! Interfaces for the backend functions
+  interface
+     subroutine hip_smooth_step(x, edge0, edge1, n) &
+          bind(c, name = "hip_smooth_step")
+       use, intrinsic :: iso_c_binding, only: c_ptr, c_int
+       import c_rp
+       type(c_ptr), value :: x
+       real(c_rp) :: edge0, edge1
+       integer(c_int) :: n
+     end subroutine hip_smooth_step
+
+     subroutine hip_step_function(x, edge, left, right, n) &
+          bind(c, name = "hip_step_function")
+       use, intrinsic :: iso_c_binding, only: c_ptr, c_int
+       import c_rp
+       type(c_ptr), value :: x
+       real(c_rp) :: edge, left, right
+       integer(c_int) :: n
+     end subroutine hip_step_function
+
+     subroutine hip_permeability(x, k_0, k_1, q, n) &
+          bind(c, name = "hip_permeability")
+       use, intrinsic :: iso_c_binding, only: c_ptr, c_int
+       import c_rp
+       type(c_ptr), value :: x
+       real(c_rp) :: k_0, k_1, q
+       integer(c_int) :: n
+     end subroutine hip_permeability
+  end interface
+
+end module hip_filters
diff --git a/src/source_terms/bcknd/device/opencl/filter_kernels.cl b/src/source_terms/bcknd/device/opencl/filter_kernels.cl
new file mode 100644
index 00000000000..4b4f9b253f3
--- /dev/null
+++ b/src/source_terms/bcknd/device/opencl/filter_kernels.cl
@@ -0,0 +1,103 @@
+#ifndef __SOURCE_TERMS_FILTER_KERNELS__
+#define __SOURCE_TERMS_FILTER_KERNELS__
+/*
+ Copyright (c) 2021-2024, The Neko Authors
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+   * Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+
+   * Redistributions in binary form must reproduce the above
+     copyright notice, this list of conditions and the following
+     disclaimer in the documentation and/or other materials provided
+     with the distribution.
+
+   * Neither the name of the authors nor the names of its
+     contributors may be used to endorse or promote products derived
+     from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/**
+ * Device kernel for smooth step function.
+ *
+ * @param x  in/out array
+ * @param edge0  lower edge
+ * @param edge1  upper edge
+ * @param n  size of the input array
+ */
+__kernel void smooth_step_kernel(
+    __global real* __restrict__ x, const real edge0, const real edge1,
+    const int n) {
+
+    const int idx = get_global_id(0);
+    const int str = get_global_size(0);
+
+    for (int i = idx; i < n; i += str) {
+        real t = (x[i] - edge0) / (edge1 - edge0);
+        t      = min(max(t, 0.0), 1.0);
+        x[i]   = pow(t, 3.0) * (t * (t * 6.0 - 15.0) + 10.0);
+    }
+}
+
+/**
+ * Device kernel for step function.
+ *
+ * @param x  input array
+ * @param edge  step location
+ * @param left  value to the left of the step
+ * @param right  value to the right of the step
+ * @param n  size of the input array
+ */
+__kernel void step_kernel(
+    __global real* __restrict__ x, const real edge, const real left,
+    const real right, const int n) {
+
+    const int idx = get_global_id(0);
+    const int str = get_global_size(0);
+
+    for (int i = idx; i < n; i += str) {
+        const real x_i = x[i];
+        x[i]           = (x_i < edge) ? left : right;
+    }
+}
+
+/**
+ * Device kernel for the inverse permeability kernel.
+ *
+ * @param x  input array
+ * @param k_0  lower bound of the permeability
+ * @param k_1  upper bound of the permeability
+ * @param q  parameter
+ * @param n  size of the input array
+ */
+__kernel void permeability_kernel(
+    __global real* __restrict__ x, const real k_0, const real k_1, const real q,
+    const int n) {
+
+    const int idx = get_global_id(0);
+    const int str = get_global_size(0);
+
+    for (int i = idx; i < n; i += str) {
+        const real x_i = x[i];
+        x[i]           = k_0 + (k_1 - k_0) * x_i * (q + 1.0) / (q + x_i);
+    }
+}
+
+#endif /* __SOURCE_TERMS_FILTER_KERNELS__ */
\ No newline at end of file
diff --git a/src/source_terms/bcknd/device/opencl/filters.c b/src/source_terms/bcknd/device/opencl/filters.c
new file mode 100644
index 00000000000..8af47c00f3c
--- /dev/null
+++ b/src/source_terms/bcknd/device/opencl/filters.c
@@ -0,0 +1,149 @@
+/*
+ Copyright (c) 2021-2024, The Neko Authors
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+   * Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+
+   * Redistributions in binary form must reproduce the above
+     copyright notice, this list of conditions and the following
+     disclaimer in the documentation and/or other materials provided
+     with the distribution.
+
+   * Neither the name of the authors nor the names of its
+     contributors may be used to endorse or promote products derived
+     from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef __APPLE__
+    #include <OpenCL/cl.h>
+#else
+    #include <CL/cl.h>
+#endif
+
+#include <device/device_config.h>
+#include <device/opencl/check.h>
+#include <device/opencl/jit.h>
+#include <device/opencl/prgm_lib.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "filter_kernels.cl.h"
+
+/** Fortran wrapper for 2nd order smooth_step
+ *
+ * Compute the smooth step function for a given array.
+ * t = clamp((x - edge0) / (edge1 - edge0), 0.0, 1.0);
+ * return t^3 * (t * (6.0 * t - 15.0) + 10.0);
+ */
+void opencl_smooth_step(void* x, real* edge0, real* edge1, int* n) {
+    cl_int err;
+
+    if (filter_program == NULL)
+        opencl_kernel_jit(filter_kernels, (cl_program*)&filter_program);
+
+    cl_kernel kernel =
+        clCreateKernel(filter_program, "smooth_step_kernel", &err);
+    CL_CHECK(err);
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&x));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), edge0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), edge1));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), n));
+
+    const int    nb               = ((*n) + 256 - 1) / 256;
+    const size_t global_item_size = 256 * nb;
+    const size_t local_item_size  = 256;
+
+    CL_CHECK(clEnqueueNDRangeKernel(
+        (cl_command_queue)glb_cmd_queue, kernel, 1, NULL, &global_item_size,
+        &local_item_size, 0, NULL, NULL));
+}
+
+/** Fortran wrapper for step function
+ *
+ * Compute the step function for a given array.
+ * if (x < edge) return left;
+ * else return right;
+ *
+ * @param x array to apply the step function
+ * @param edge threshold value
+ * @param left value to return if x < edge
+ * @param right value to return if x >= edge
+ * @param n size of the array
+ */
+void opencl_step_function(
+    void* x, real* edge, real* left, real* right, int* n) {
+    cl_int err;
+
+    if (filter_program == NULL)
+        opencl_kernel_jit(filter_kernels, (cl_program*)&filter_program);
+
+    cl_kernel kernel =
+        clCreateKernel(filter_program, "step_function_kernel", &err);
+    CL_CHECK(err);
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&x));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), edge));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), left));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(real), right));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
+
+    const int    nb               = ((*n) + 256 - 1) / 256;
+    const size_t global_item_size = 256 * nb;
+    const size_t local_item_size  = 256;
+
+    CL_CHECK(clEnqueueNDRangeKernel(
+        (cl_command_queue)glb_cmd_queue, kernel, 1, NULL, &global_item_size,
+        &local_item_size, 0, NULL, NULL));
+}
+
+/** Fortran wrapper for the permeability filter
+ *
+ * @param x array to apply the permeability filter on
+ * @param k_0 lower bound of the permeability
+ * @param k_1 upper bound of the permeability
+ * @param q parameter
+ * @param n size of the array
+ */
+void opencl_permeability(void* x, real* k_0, real* k_1, real* q, int* n) {
+    cl_int err;
+
+    if (filter_program == NULL)
+        opencl_kernel_jit(filter_kernels, (cl_program*)&filter_program);
+
+    cl_kernel kernel =
+        clCreateKernel(filter_program, "permeability_kernel", &err);
+    CL_CHECK(err);
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&x));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(real), k_0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(real), k_1));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(real), q));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), n));
+
+    const int    nb               = ((*n) + 256 - 1) / 256;
+    const size_t global_item_size = 256 * nb;
+    const size_t local_item_size  = 256;
+
+    CL_CHECK(clEnqueueNDRangeKernel(
+        (cl_command_queue)glb_cmd_queue, kernel, 1, NULL, &global_item_size,
+        &local_item_size, 0, NULL, NULL));
+}
\ No newline at end of file
diff --git a/src/source_terms/bcknd/device/opencl/opencl_filters.f90 b/src/source_terms/bcknd/device/opencl/opencl_filters.f90
new file mode 100644
index 00000000000..90ee1833053
--- /dev/null
+++ b/src/source_terms/bcknd/device/opencl/opencl_filters.f90
@@ -0,0 +1,71 @@
+! Copyright (c) 2024, The Neko Authors
+! All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!
+!   * Redistributions of source code must retain the above copyright
+!     notice, this list of conditions and the following disclaimer.
+!
+!   * Redistributions in binary form must reproduce the above
+!     copyright notice, this list of conditions and the following
+!     disclaimer in the documentation and/or other materials provided
+!     with the distribution.
+!
+!   * Neither the name of the authors nor the names of its
+!     contributors may be used to endorse or promote products derived
+!     from this software without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+! POSSIBILITY OF SUCH DAMAGE.
+
+!> OpenCL interface binding for filters
+module opencl_filters
+  use num_types, only: rp, c_rp
+  implicit none
+  private
+
+  public :: opencl_smooth_step, opencl_step_function, opencl_permeability
+
+  ! Interfaces for the backend functions
+  interface
+     subroutine opencl_smooth_step(x, edge0, edge1, n) &
+          bind(c, name = "opencl_smooth_step")
+       use, intrinsic :: iso_c_binding, only: c_ptr, c_int
+       import c_rp
+       type(c_ptr), value :: x
+       real(c_rp) :: edge0, edge1
+       integer(c_int) :: n
+     end subroutine opencl_smooth_step
+
+     subroutine opencl_step_function(x, edge, left, right, n) &
+          bind(c, name = "opencl_step_function")
+       use, intrinsic :: iso_c_binding, only: c_ptr, c_int
+       import c_rp
+       type(c_ptr), value :: x
+       real(c_rp) :: edge, left, right
+       integer(c_int) :: n
+     end subroutine opencl_step_function
+
+     subroutine opencl_permeability(x, k_0, k_1, q, n) &
+          bind(c, name = "opencl_permeability")
+       use, intrinsic :: iso_c_binding, only: c_ptr, c_int
+       import c_rp
+       type(c_ptr), value :: x
+       real(c_rp) :: k_0, k_1, q
+       integer(c_int) :: n
+     end subroutine opencl_permeability
+  end interface
+
+end module opencl_filters
diff --git a/src/source_terms/brinkman/filters.f90 b/src/source_terms/brinkman/filters.f90
index d8f7b71ac4b..20a2a19f890 100644
--- a/src/source_terms/brinkman/filters.f90
+++ b/src/source_terms/brinkman/filters.f90
@@ -38,6 +38,12 @@ module filters
   use neko_config, only: NEKO_BCKND_DEVICE
   use num_types, only: rp
   use utils, only: neko_error
+
+  use filters_cpu, only: smooth_step_cpu, permeability_cpu, &
+       step_function_cpu
+  use filters_device, only: smooth_step_device, permeability_device, &
+       step_function_device
+
   implicit none
 
   private
@@ -61,15 +67,15 @@ module filters
   !! @param[in] edge0 Edge giving output 0.
   !! @param[in] edge1 Edge giving output 1.
   subroutine smooth_step_field(F, edge0, edge1)
-    use filters_cpu, only: smooth_step_cpu
-
     type(field_t), intent(inout) :: F
     real(kind=rp), intent(in) :: edge0, edge1
+    integer :: n
 
+    n = F%size()
     if (NEKO_BCKND_DEVICE .eq. 1) then
-       call neko_error('smooth_step_field: not implemented for device')
+       call smooth_step_device(F%x_d, edge0, edge1, n)
     else
-       F%x = smooth_step_cpu(F%x, edge0, edge1)
+       call smooth_step_cpu(F%x, edge0, edge1, n)
     end if
   end subroutine smooth_step_field
 
@@ -80,18 +86,17 @@ end subroutine smooth_step_field
   !! @param[in] k_0 Permeability at x=0.
   !! @param[in] k_1 Permeability at x=1.
   !! @param[in] q Penalty factor.
-  subroutine permeability_field(F_out, x, k_0, k_1, q)
-    use filters_cpu, only: permeability_cpu
-
-    type(field_t), intent(inout) :: F_out
-    type(field_t), intent(in) :: x
+  subroutine permeability_field(F, k_0, k_1, q)
+    type(field_t), intent(inout) :: F
     real(kind=rp), intent(in) :: k_0, k_1
     real(kind=rp), intent(in) :: q
+    integer :: n
 
+    n = F%size()
     if (NEKO_BCKND_DEVICE .eq. 1) then
-       call neko_error('permeability_field: not implemented for device')
+       call permeability_device(F%x_d, k_0, k_1, q, n)
     else
-       F_out%x = permeability_cpu(x%x, k_0, k_1, q)
+       call permeability_cpu(F%x, k_0, k_1, q, n)
     end if
   end subroutine permeability_field
 
@@ -101,15 +106,15 @@ end subroutine permeability_field
   !! @param[in] value0 Value of the field before the step.
   !! @param[in] value1 Value of the field after the step.
   subroutine step_function_field(F, x0, value0, value1)
-    use filters_cpu, only: step_function_cpu
-
     type(field_t), intent(inout) :: F
     real(kind=rp), intent(in) :: x0, value0, value1
+    integer :: n
 
+    n = F%size()
     if (NEKO_BCKND_DEVICE .eq. 1) then
-       call neko_error('step_function_field: not implemented for device')
+       call step_function_device(F%x_d, x0, value0, value1, n)
     else
-       F%x = step_function_cpu(F%x, x0, value0, value1)
+       call step_function_cpu(F%x, x0, value0, value1, n)
     end if
   end subroutine step_function_field
 
diff --git a/src/source_terms/brinkman_source_term.f90 b/src/source_terms/brinkman_source_term.f90
index 8842d681ee1..c4c2fd2027a 100644
--- a/src/source_terms/brinkman_source_term.f90
+++ b/src/source_terms/brinkman_source_term.f90
@@ -43,6 +43,9 @@ module brinkman_source_term
   use neko_config, only: NEKO_BCKND_DEVICE
   use utils, only: neko_error
   use field_math, only: field_subcol3
+
+  use math, only: pwmax
+  use device_math, only: device_pwmax
   implicit none
   private
 
@@ -174,14 +177,9 @@ subroutine brinkman_source_term_init_from_json(this, json, fields, coef)
     ! ------------------------------------------------------------------------ !
     ! Compute the permeability field
 
-    call permeability_field(this%brinkman, this%indicator, &
-         & brinkman_limits(1), brinkman_limits(2), brinkman_penalty)
-
-    ! Copy the permeability field to the device
-    if (NEKO_BCKND_DEVICE .eq. 1) then
-       call device_memcpy(this%brinkman%x, this%brinkman%x_d, &
-            this%brinkman%dof%size(), HOST_TO_DEVICE, .true.)
-    end if
+    this%brinkman = this%indicator
+    call permeability_field(this%brinkman, &
+         brinkman_limits(1), brinkman_limits(2), brinkman_penalty)
 
   end subroutine brinkman_source_term_init_from_json
 
@@ -325,7 +323,7 @@ subroutine init_boundary_mesh(this, json)
     ! compute the signed distance function. This should be replaced with a
     ! more efficient method, such as a tree search.
 
-    call temp_field%init(this%indicator%dof)
+    call temp_field%init(this%coef%dof)
 
     ! Select how to transform the distance field to a design field
     select case (distance_transform)
@@ -339,6 +337,7 @@ subroutine init_boundary_mesh(this, json)
       case ('step')
 
        call json_get(json, 'distance_transform.value', scalar_d)
+       scalar_r = real(scalar_d, kind=rp)
 
        call signed_distance_field(temp_field, boundary_mesh, scalar_d)
        call step_function_field(temp_field, scalar_r, 1.0_rp, 0.0_rp)
@@ -359,7 +358,12 @@ subroutine init_boundary_mesh(this, json)
     end select
 
     ! Update the global indicator field by max operator
-    this%indicator%x = max(this%indicator%x, temp_field%x)
+    if (NEKO_BCKND_DEVICE .eq. 1) then
+       call device_pwmax(this%indicator%x_d, temp_field%x_d, &
+            this%indicator%size())
+    else
+       this%indicator%x = max(this%indicator%x, temp_field%x)
+    end if
 
   end subroutine init_boundary_mesh
 
@@ -392,8 +396,7 @@ subroutine init_point_zone(this, json)
     call json_get_or_default(json, 'filter.type', filter_type, 'none')
 
     ! Compute the indicator field
-
-    call temp_field%init(this%indicator%dof)
+    call temp_field%init(this%coef%dof)
 
     my_point_zone => neko_point_zone_registry%get_point_zone(zone_name)
 
diff --git a/tests/unit/device_math/device_math_parallel.pf b/tests/unit/device_math/device_math_parallel.pf
index ab70603954f..ddd8b5e3066 100644
--- a/tests/unit/device_math/device_math_parallel.pf
+++ b/tests/unit/device_math/device_math_parallel.pf
@@ -3,7 +3,7 @@ module device_math_parallel
   use math
   use pfunit
   use device
-  use device_math  
+  use device_math
   use neko_config
   use num_types
   use comm, only : NEKO_COMM, pe_rank, pe_size
@@ -15,7 +15,7 @@ module device_math_parallel
      procedure :: setUp
      procedure :: tearDown
   end type test_device_math
-  
+
 contains
 
   subroutine setUp(this)
@@ -28,8 +28,8 @@ contains
   subroutine tearDown(this)
     class(test_device_math), intent(inout) :: this
     if (NEKO_BCKND_DEVICE .eq. 1) then
-         call device_finalize
-   end if
+       call device_finalize
+    end if
   end subroutine tearDown
 
   @test
@@ -42,33 +42,33 @@ contains
 
     if (NEKO_BCKND_DEVICE .eq. 1) then
        a = 1.0_rp
-       
+
        call device_map(a, a_d, size(a))
        call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
-       
+
        call device_rzero(a_d, n)
-       
+
        call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.)
-       
+
        do i = 1, n
           @assertEqual(0.0_rp, a(i), tolerance=NEKO_EPS)
        end do
-       
+
        do i = 1, n
           a(i) = real(i, rp)
        end do
-       
+
        call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
-       
+
        call device_rzero(a_d, n)
-       
+
        call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.)
-       
+
        do i = 1, n
           @assertEqual(0.0_rp, a(i), tolerance=NEKO_EPS)
        end do
     end if
-    
+
   end subroutine test_device_math_rzero
 
   @test
@@ -81,33 +81,33 @@ contains
 
     if (NEKO_BCKND_DEVICE .eq. 1) then
        a = 0.0_rp
-       
+
        call device_map(a, a_d, size(a))
        call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
-       
+
        call device_rone(a_d, n)
-       
+
        call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.)
-       
+
        do i = 1, n
           @assertEqual(1.0_rp, a(i), tolerance=NEKO_EPS)
        end do
-       
+
        do i = 1, n
           a(i) = real(i, rp)
        end do
-       
+
        call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
-       
+
        call device_rone(a_d, n)
-       
+
        call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.)
-       
+
        do i = 1, n
           @assertEqual(1.0_rp, a(i), tolerance=NEKO_EPS)
        end do
     end if
-    
+
   end subroutine test_device_math_rone
 
   @test
@@ -120,7 +120,7 @@ contains
     integer :: i
 
     if (NEKO_BCKND_DEVICE .eq. 1) then
-       
+
        call rone(a, n)
        call rzero(b, n)
        do i = 1, n
@@ -129,20 +129,20 @@ contains
 
        call device_map(a, a_d, size(a))
        call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
-    
+
        call device_map(b, b_d, size(b))
        call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.)
-       
+
        call device_copy(b_d, a_d, n)
-       
+
        call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.false.)
        call device_memcpy(b, b_d, size(a), DEVICE_TO_HOST, sync=.true.)
-    
+
        do i = 1, n
           @assertEqual(a(i), b(i))
        end do
     end if
-    
+
   end subroutine test_device_math_copy
 
   @test
@@ -156,23 +156,23 @@ contains
 
     if (NEKO_BCKND_DEVICE .eq. 1) then
        a = 1.0_rp
-       
+
        call device_map(a, a_d, size(a))
        call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
-       
+
        call device_cfill(a_d, c, n)
-       
+
        call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.)
-       
+
        do i = 1, n
           @assertEqual(42.0_rp, a(i), tolerance=NEKO_EPS)
        end do
 
     end if
-    
+
   end subroutine test_device_math_cfill
 
-    @test
+  @test
   subroutine test_device_math_cadd(this)
     class(test_device_math), intent(inout) :: this
     integer, parameter :: n = 4711
@@ -183,20 +183,20 @@ contains
 
     if (NEKO_BCKND_DEVICE .eq. 1) then
        a = 40.0_rp
-       
+
        call device_map(a, a_d, size(a))
        call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
-       
+
        call device_cadd(a_d, c, n)
-       
+
        call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.)
-       
+
        do i = 1, n
           @assertEqual(42.0_rp, a(i), tolerance=NEKO_EPS)
        end do
-       
+
     end if
-    
+
   end subroutine test_device_math_cadd
 
   @test
@@ -213,24 +213,24 @@ contains
           a(i) = 40.0_rp
           b(i) = 2.0_rp
        end do
-       
+
        call device_map(a, a_d, size(a))
        call device_map(b, b_d, size(b))
-       
+
        call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
        call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.)
-       
+
        call device_add2(a_d, b_d, n)
-       
+
        call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.)
-       
+
        do i = 1, n
           @assertEqual(42.0_rp, a(i))
        end do
     end if
-    
+
   end subroutine test_device_math_add2
-  
+
   @test
   subroutine test_device_math_add2s1(this)
     class(test_device_math), intent(inout) :: this
@@ -246,22 +246,22 @@ contains
           a(i) = 20.0_rp
           b(i) = 2.0_rp
        end do
-       
+
        call device_map(a, a_d, size(a))
        call device_map(b, b_d, size(b))
-       
+
        call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
        call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.)
-       
+
        call device_add2s1(a_d, b_d, c1, n)
-       
+
        call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.)
-       
+
        do i = 1, n
           @assertEqual(42.0_rp, a(i))
        end do
     end if
-    
+
   end subroutine test_device_math_add2s1
 
   @test
@@ -282,19 +282,19 @@ contains
 
        call device_map(a, a_d, size(a))
        call device_map(b, b_d, size(b))
-       
+
        call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
        call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.)
-       
+
        call device_add2s2(a_d, b_d, c1, n)
-       
+
        call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.)
 
        do i = 1, n
           @assertEqual(24.0_rp, a(i))
        end do
     end if
-    
+
   end subroutine test_device_math_add2s2
 
   @test
@@ -319,20 +319,20 @@ contains
        call device_map(a, a_d, size(a))
        call device_map(b, b_d, size(b))
        call device_map(c, c_d, size(c))
-       
+
        call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
        call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.)
        call device_memcpy(c, c_d, size(c), HOST_TO_DEVICE, sync=.false.)
-       
+
        call device_add3s2(a_d, b_d, c_d, c1, c2, n)
-       
+
        call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.)
 
        do i = 1, n
           @assertEqual(84.0_rp, a(i))
        end do
     end if
-    
+
   end subroutine test_device_math_add3s2
 
   @test
@@ -347,22 +347,22 @@ contains
        do i = 1, n
           a(i) = 2.0_rp
        end do
-       
+
        call device_map(a, a_d, size(a))
 
        call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
-       
+
        call device_invcol1(a_d, n)
-       
+
        call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.)
-       
+
        do i = 1, n
           @assertEqual(0.5_rp, a(i))
        end do
     end if
 
   end subroutine test_device_math_invcol1
-  
+
   @test
   subroutine test_device_math_invcol2(this)
     class(test_device_math), intent(inout) :: this
@@ -377,24 +377,24 @@ contains
           a(i) = 21.0_rp
           b(i) = 2.0_rp
        end do
-       
+
        call device_map(a, a_d, size(a))
        call device_map(b, b_d, size(b))
 
        call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
        call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.)
-       
+
        call device_invcol2(a_d, b_d, n)
-       
+
        call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.)
-       
+
        do i = 1, n
           @assertEqual(10.5_rp, a(i))
        end do
     end if
 
   end subroutine test_device_math_invcol2
-  
+
   @test
   subroutine test_device_math_col2(this)
     class(test_device_math), intent(inout) :: this
@@ -409,17 +409,17 @@ contains
           a(i) = 21.0_rp
           b(i) = 2.0_rp
        end do
-       
+
        call device_map(a, a_d, size(a))
        call device_map(b, b_d, size(b))
 
        call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
        call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.)
-       
+
        call device_col2(a_d, b_d, n)
-       
+
        call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.)
-       
+
        do i = 1, n
           @assertEqual(42.0_rp, a(i))
        end do
@@ -443,7 +443,7 @@ contains
           b(i) = 2.0_rp
           c(i) = 21.0_rp
        end do
-       
+
        call device_map(a, a_d, size(a))
        call device_map(b, b_d, size(b))
        call device_map(c, c_d, size(c))
@@ -451,11 +451,11 @@ contains
        call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
        call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.)
        call device_memcpy(c, c_d, size(c), HOST_TO_DEVICE, sync=.false.)
-       
+
        call device_col3(a_d, b_d, c_d, n)
-       
+
        call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.)
-       
+
        do i = 1, n
           @assertEqual(42.0_rp, a(i))
        end do
@@ -477,24 +477,24 @@ contains
           a(i) = 44.0_rp
           b(i) = 2.0_rp
        end do
-       
+
        call device_map(a, a_d, size(a))
        call device_map(b, b_d, size(b))
 
        call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
        call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.)
-       
+
        call device_sub2(a_d, b_d, n)
-       
+
        call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.)
-       
+
        do i = 1, n
           @assertEqual(42.0_rp, a(i))
        end do
     end if
 
   end subroutine test_device_math_sub2
-  
+
   @test
   subroutine test_device_math_sub3(this)
     class(test_device_math), intent(inout) :: this
@@ -511,7 +511,7 @@ contains
           b(i) = 2.0_rp
           c(i) = 21.0_rp
        end do
-       
+
        call device_map(a, a_d, size(a))
        call device_map(b, b_d, size(b))
        call device_map(c, c_d, size(c))
@@ -519,11 +519,11 @@ contains
        call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
        call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.)
        call device_memcpy(c, c_d, size(c), HOST_TO_DEVICE, sync=.false.)
-       
+
        call device_sub3(a_d, b_d, c_d, n)
-       
+
        call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.)
-       
+
        do i = 1, n
           @assertEqual(-19.0_rp, a(i))
        end do
@@ -547,7 +547,7 @@ contains
           b(i) = 2.0_rp
           c(i) = 21.0_rp
        end do
-       
+
        call device_map(a, a_d, size(a))
        call device_map(b, b_d, size(b))
        call device_map(c, c_d, size(c))
@@ -555,11 +555,11 @@ contains
        call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
        call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.)
        call device_memcpy(c, c_d, size(c), HOST_TO_DEVICE, sync=.false.)
-       
+
        call device_addcol3(a_d, b_d, c_d, n)
-       
+
        call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.)
-       
+
        do i = 1, n
           @assertEqual(4711.0_rp, a(i))
        end do
@@ -585,7 +585,7 @@ contains
           c(i) = 21.0_rp
           d(i) = 1.0_rp
        end do
-       
+
        call device_map(a, a_d, size(a))
        call device_map(b, b_d, size(b))
        call device_map(c, c_d, size(c))
@@ -595,11 +595,11 @@ contains
        call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.)
        call device_memcpy(c, c_d, size(c), HOST_TO_DEVICE, sync=.false.)
        call device_memcpy(d, d_d, size(d), HOST_TO_DEVICE, sync=.false.)
-       
+
        call device_addcol4(a_d, b_d, c_d, d_d, n)
-       
+
        call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.)
-       
+
        do i = 1, n
           @assertEqual(4711.0_rp, a(i))
        end do
@@ -619,29 +619,29 @@ contains
     integer :: i, ierr
 
     if (NEKO_BCKND_DEVICE .eq. 1) then
-       
+
        call MPI_Comm_dup(this%getMpiCommunicator(), NEKO_COMM%mpi_val, ierr)
-       
+
        do i = 1, n
           a(i) = 1.0_rp
           b(i) = 1.0_rp
           c(i) = 1.0_rp
        end do
-       
+
        call device_map(a, a_d, size(a))
        call device_map(b, b_d, size(b))
        call device_map(c, c_d, size(c))
-       
+
        call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
        call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.)
        call device_memcpy(c, c_d, size(c), HOST_TO_DEVICE, sync=.false.)
-       
-       expected = real(this%getNumProcesses(), rp)  * real(n, rp)
+
+       expected = real(this%getNumProcesses(), rp) * real(n, rp)
        res = device_glsc3(a_d, b_d, c_d, n)
        @assertEqual(expected, res)
        @MPIassertEqual(expected, res)
     end if
-    
+
   end subroutine test_device_math_glsc3
 
   @test(npes=[1])
@@ -655,26 +655,26 @@ contains
     integer :: i, ierr
 
     if (NEKO_BCKND_DEVICE .eq. 1) then
-       
+
        call MPI_Comm_dup(this%getMpiCommunicator(), NEKO_COMM%mpi_val, ierr)
-       
+
        do i = 1, n
           a(i) = 1.0_rp
           b(i) = 1.0_rp
        end do
-       
+
        call device_map(a, a_d, size(a))
        call device_map(b, b_d, size(b))
-       
+
        call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
        call device_memcpy(b, b_d, size(b), HOST_TO_DEVICE, sync=.false.)
-       
-       expected = real(this%getNumProcesses(), rp)  * real(n, rp)
+
+       expected = real(this%getNumProcesses(), rp) * real(n, rp)
        res = device_glsc2(a_d, b_d, n)
        @assertEqual(expected, res)
        @MPIassertEqual(expected, res)
     end if
-    
+
   end subroutine test_device_math_glsc2
 
   @test(npes=[1])
@@ -687,23 +687,103 @@ contains
     integer :: i, ierr
 
     if (NEKO_BCKND_DEVICE .eq. 1) then
-       
+
        call MPI_Comm_dup(this%getMpiCommunicator(), NEKO_COMM%mpi_val, ierr)
-       
+
        do i = 1, n
           a(i) = 1.0_rp
        end do
-       
+
        call device_map(a, a_d, size(a))
-       
+
        call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
-       
-       expected = real(this%getNumProcesses(), rp)  * real(n, rp)
+
+       expected = real(this%getNumProcesses(), rp) * real(n, rp)
        res = device_glsum(a_d, n)
        @assertEqual(expected, res)
        @MPIassertEqual(expected, res)
     end if
-    
+
   end subroutine test_device_math_glsum
 
+  ! ========================================================================== !
+  ! Tests for the pointwise operations
+
+  !> Test the point-wise maximum operation
+  @test(npes=[1])
+  subroutine test_device_math_pwmax(this)
+    class(test_device_math), intent(inout) :: this
+
+    integer, parameter :: n = 4711
+    real(kind=rp), dimension(n) :: a, b, c
+    type(c_ptr) :: a_d = C_NULL_PTR
+    type(c_ptr) :: b_d = C_NULL_PTR
+
+    integer :: i
+
+    if (NEKO_BCKND_DEVICE .eq. 1) then
+       do i = 1, n
+          a(i) = real(i, rp)
+          b(i) = real(n - i, rp)
+       end do
+
+       call device_map(a, a_d, size(a))
+       call device_map(b, b_d, size(b))
+
+       call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
+       call device_memcpy(b, b_d, size(a), HOST_TO_DEVICE, sync=.false.)
+
+       call device_pwmax(a_d, b_d, n)
+
+       call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.)
+
+       do i = 1, n
+          c(i) = max(a(i), b(i))
+       end do
+
+       do i = 1, n
+          @assertEqual(c(i), a(i))
+       end do
+    end if
+
+  end subroutine test_device_math_pwmax
+
+  !> Test the point-wise minimum operation
+  @test(npes=[1])
+  subroutine test_device_math_pwmin(this)
+    class(test_device_math), intent(inout) :: this
+
+    integer, parameter :: n = 4711
+    real(kind=rp), dimension(n) :: a, b, c
+    type(c_ptr) :: a_d = C_NULL_PTR
+    type(c_ptr) :: b_d = C_NULL_PTR
+
+    integer :: i
+
+    if (NEKO_BCKND_DEVICE .eq. 1) then
+       do i = 1, n
+          a(i) = real(i, rp)
+          b(i) = real(n - i, rp)
+       end do
+
+       call device_map(a, a_d, size(a))
+       call device_map(b, b_d, size(b))
+
+       call device_memcpy(a, a_d, size(a), HOST_TO_DEVICE, sync=.false.)
+       call device_memcpy(b, b_d, size(a), HOST_TO_DEVICE, sync=.false.)
+
+       call device_pwmin(a_d, b_d, n)
+
+       call device_memcpy(a, a_d, size(a), DEVICE_TO_HOST, sync=.true.)
+
+       do i = 1, n
+          c(i) = min(a(i), b(i))
+       end do
+
+       do i = 1, n
+          @assertEqual(c(i), a(i))
+       end do
+    end if
+
+  end subroutine test_device_math_pwmin
 end module device_math_parallel
diff --git a/tests/unit/math/math_parallel.pf b/tests/unit/math/math_parallel.pf
index 1aa2b31cc3d..80708e7e3b5 100644
--- a/tests/unit/math/math_parallel.pf
+++ b/tests/unit/math/math_parallel.pf
@@ -17,14 +17,14 @@ contains
     c = 42.0_rp + (NEKO_EPS/2.0_rp)
     if (epsilon(1.0_rp) .lt. real(1d-12, rp)) then
        d = 42.0_rp + real(1d-12, rp)
-    else if (epsilon(1.0_rp) .lt.  real(1d-5, rp)) then
+    else if (epsilon(1.0_rp) .lt. real(1d-5, rp)) then
        d = 42.0_rp + real(1d-5, rp)
     end if
-    
+
     @assertFalse(abscmp(a, b))
     @assertTrue(abscmp(b, c))
     @assertFalse(abscmp(b, d))
-    
+
   end subroutine test_math_abscmp
 
   @test
@@ -33,7 +33,7 @@ contains
     real(kind=rp) :: a(n)
     integer :: i
 
-    a = 1.0_rp  
+    a = 1.0_rp
     call rzero(a, n)
     do i = 1, n
        @assertEqual(0.0_rp, a(i))
@@ -46,7 +46,7 @@ contains
     do i = 1, n
        @assertEqual(0.0_rp, a(i))
     end do
-    
+
   end subroutine test_math_rzero
 
   @test
@@ -60,7 +60,7 @@ contains
     do i = 1, n
        @assertEqual(0, a(i))
     end do
-    
+
   end subroutine test_math_izero
 
   @test
@@ -77,7 +77,7 @@ contains
     do i = 1, n
        @assertEqual(1.0_rp, a(i))
     end do
-    
+
   end subroutine test_math_rone
 
   @test
@@ -96,7 +96,7 @@ contains
     do i = 1, n
        @assertEqual(a(i), b(i))
     end do
-    
+
   end subroutine test_math_copy
 
   @test
@@ -317,7 +317,7 @@ contains
     end do
 
   end subroutine test_math_invcol2
-  
+
   @test
   subroutine test_math_col2
     integer, parameter :: n = 17
@@ -377,7 +377,7 @@ contains
 
   end subroutine test_math_sub3
 
-    @test
+  @test
   subroutine test_math_addcol3
     integer, parameter :: n = 17
     real(kind=rp), dimension(n) :: a, b, c
@@ -406,18 +406,73 @@ contains
     integer :: i, ierr
 
     call MPI_Comm_dup(this%getMpiCommunicator(), NEKO_COMM%mpi_val, ierr)
-    
+
     do i = 1, n
        a(i) = 1.0_rp
        b(i) = 1.0_rp
        c(i) = 1.0_rp
     end do
 
-    expected = real(this%getNumProcesses(), rp)  * 17.0_rp
+    expected = real(this%getNumProcesses(), rp) * 17.0_rp
     res = glsc3(a, b, c, n)
     @assertEqual(expected, res)
     @MPIassertEqual(expected, res)
 
   end subroutine test_math_glsc3
 
+  ! ========================================================================== !
+  ! Tests for the pointwise operations
+
+  !> Test the point-wise maximum operation
+  @test
+  subroutine test_math_pwmax
+
+    integer, parameter :: n = 4711
+    real(kind=rp), dimension(n) :: a, b, c
+
+    integer :: i
+
+    do i = 1, n
+       a(i) = real(i, rp)
+       b(i) = real(n - i, rp)
+    end do
+
+    call pwmax(a, b, n)
+
+    do i = 1, n
+       c(i) = max(a(i), b(i))
+    end do
+
+    do i = 1, n
+       @assertEqual(c(i), a(i))
+    end do
+
+  end subroutine test_math_pwmax
+
+  !> Test the point-wise minimum operation
+  @test
+  subroutine test_math_pwmin
+
+    integer, parameter :: n = 4711
+    real(kind=rp), dimension(n) :: a, b, c
+
+    integer :: i
+
+    do i = 1, n
+       a(i) = real(i, rp)
+       b(i) = real(n - i, rp)
+    end do
+
+    call pwmin(a, b, n)
+
+    do i = 1, n
+       c(i) = min(a(i), b(i))
+    end do
+
+    do i = 1, n
+       @assertEqual(c(i), a(i))
+    end do
+
+  end subroutine test_math_pwmin
+
 end module math_parallel