diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
index 1105b3e..d43763d 100644
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@@ -115,7 +115,7 @@ jobs:
     - name: Archive test results patch
       if: always()
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@v4
       with:
         name: baseline-patch
        path: remhos/baseline.patch
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7ac56b3
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+remhos
+*.dat
+*.mesh
+*.o
+*.gf
+*.svg
+.DS_Store
+/build/
+/.cache/
+/.vscode/
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..7ddec8b
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,85 @@
+cmake_minimum_required(VERSION 3.18)
+
+set(project remhos)
+project(${project} LANGUAGES CXX)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_COMPILER_LAUNCHER ccache)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# CXX flags *******************************************************************
+if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+  add_compile_options(-fsanitize=address -O0)
+  add_link_options(-fsanitize=address)
+endif()
+
+# remove -DNDEBUG from the default RelWithDebInfo flags, but keep -g
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g" CACHE STRING "" FORCE)
+
+# Verbosity options ***********************************************************
+set(CMAKE_VERBOSE_MAKEFILE OFF CACHE BOOL "" FORCE)
+# set(CUDA_VERBOSE_BUILD OFF CACHE BOOL "" FORCE)
+
+# The test drivers define their own main() and call remhos() directly, so
+# remhos.cpp must not compile its main(); this has to hold on every platform.
+add_compile_definitions(MFEM_USE_CMAKE_TESTS)
+
+# Include paths ***************************************************************
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../mfem)
+if (${CMAKE_SYSTEM_NAME} MATCHES "Linux")
+  include_directories(/usr/include/hypre /usr/include)
+elseif(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+  include_directories(/opt/homebrew/opt/fmt/include
+                      /opt/homebrew/opt/openmpi/include
+                      /opt/homebrew/opt/metis/include
+                      /opt/homebrew/opt/hypre/include)
+else()
+  message(FATAL_ERROR "Unsupported system")
+endif()
+
+# Copy mesh files *************************************************************
+file(GLOB SRC_MESH_FILES LIST_DIRECTORIES false data/*.mesh)
+set(BUILD_DATA_DIR ${CMAKE_CURRENT_BINARY_DIR}/data)
+add_custom_command(OUTPUT data_is_copied
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${BUILD_DATA_DIR}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${SRC_MESH_FILES} ${BUILD_DATA_DIR}
+  COMMAND ${CMAKE_COMMAND} -E touch data_is_copied
+  COMMENT "Copying mesh files ...")
+add_custom_target(copy_mesh_files DEPENDS data_is_copied)
+
+# Library source files ********************************************************
+add_library(Remhos STATIC remhos.cpp remhos_fct.cpp
+                          remhos_ho.cpp remhos_lo.cpp
+                          remhos_mono.cpp remhos_sync.cpp
+                          remhos_tools.cpp)
+
+# Testing options *************************************************************
+enable_testing()
+function(add_remhos_test name)
+  set(file ${name}.cpp)
+  set(target ${name})
+  list(LENGTH ARGN ARGN_COUNT)
+  if(ARGN_COUNT GREATER 0)
+    list(GET ARGN 0 extra)
+    string(APPEND name "_" ${extra})
+    string(APPEND target "_" ${extra})
+  endif()
+  message(STATUS "Adding target: ${target}")
+  add_executable(${target} ${file})
+  add_dependencies(${target} copy_mesh_files)
+  target_link_libraries(${target} Remhos -lmpi -lmfem -lhypre -lmetis -lfmt)
+  if (${CMAKE_SYSTEM_NAME} MATCHES "Linux")
+    target_link_directories(${target} PUBLIC /usr/lib/x86_64-linux-gnu)
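+    # NOTE: the hard-coded apt/Homebrew paths in this file assume stock
+    # package layouts. A sketch of making them overridable from the command
+    # line (REMHOS_SYS_LIB_DIR is illustrative, not a variable this project
+    # defines):
+    #   set(REMHOS_SYS_LIB_DIR /usr/lib/x86_64-linux-gnu CACHE PATH "system libs")
+    #   target_link_directories(${target} PUBLIC ${REMHOS_SYS_LIB_DIR})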
+  elseif(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+    target_link_directories(${target} PUBLIC /opt/homebrew/opt/openmpi/lib
+                                             /opt/homebrew/opt/metis/lib
+                                             /opt/homebrew/opt/hypre/lib
+                                             /opt/homebrew/opt/fmt/lib)
+  endif()
+  target_link_directories(${target} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../mfem)
+  message(STATUS "Adding test: ${target}_n1")
+  add_test(NAME ${target}_n1 COMMAND mpirun -n 1 ${target} ${extra})
+  message(STATUS "Adding test: ${target}_n3")
+  add_test(NAME ${target}_n3 COMMAND mpirun -n 3 ${target})
+endfunction()
+
+add_remhos_test(remhos_main)
+add_remhos_test(remhos_tests)
\ No newline at end of file
diff --git a/makefile b/makefile
index 81957b7..1434602 100644
--- a/makefile
+++ b/makefile
@@ -64,6 +64,18 @@ TEST_MK = $(MFEM_DIR)/config/test.mk
 MFEM_DIR1 := $(MFEM_DIR)
 MFEM_DIR2 := $(realpath $(MFEM_DIR))
 
+# Use Caliper annotations
+
+ifdef CALIPER_DIR
+CALIPER_DIR = $(shell spack location --install-dir caliper)
+ADIAK_DIR = $(shell spack location --install-dir adiak)
+CALIPER_FLAGS = -I${CALIPER_DIR}/include -DUSE_CALIPER
+ADIAK_FLAGS = -I${ADIAK_DIR}/include
+ADIAK_LDFLAGS = -L${ADIAK_DIR}/lib -ladiak
+CALIPER_LDFLAGS = -L${CALIPER_DIR}/lib64 -lcaliper
+endif
+
+
 # Use the compiler used by MFEM. Get the compiler and the options for compiling
 # and linking from MFEM's config.mk. (Skip this if the target does not require
 # building.)
@@ -73,7 +85,7 @@ ifeq (,$(filter help clean distclean style,$(MAKECMDGOALS)))
 endif
 
 CXX = $(MFEM_CXX)
-CPPFLAGS = $(MFEM_CPPFLAGS)
+CPPFLAGS = $(MFEM_CPPFLAGS) $(CALIPER_FLAGS) $(ADIAK_FLAGS)
 CXXFLAGS = $(MFEM_CXXFLAGS)
 
 # MFEM config does not define C compiler
@@ -81,7 +93,8 @@ CC = gcc
 CFLAGS = -O3
 
 # Optional link flags
-LDFLAGS =
+LDFLAGS = $(CALIPER_LDFLAGS) $(ADIAK_LDFLAGS)
+
 OPTIM_OPTS = -O3
 DEBUG_OPTS = -g -Wall -std=c++11
@@ -102,7 +115,8 @@ ifeq ($(REMHOS_DEBUG),YES)
 endif
 
 LIBS = $(strip $(REMHOS_LIBS) $(LDFLAGS))
-CCC = $(strip $(CXX) $(REMHOS_FLAGS))
+EXTRA_INC_DIR = $(or $(wildcard $(MFEM_DIR)/include/mfem),$(MFEM_DIR))
+CCC = $(strip $(CXX) $(REMHOS_FLAGS) $(if $(EXTRA_INC_DIR),-I$(EXTRA_INC_DIR)))
 Ccc = $(strip $(CC) $(CFLAGS) $(GL_OPTS))
 
 SOURCE_FILES = remhos.cpp remhos_tools.cpp remhos_lo.cpp remhos_ho.cpp \
diff --git a/remhos.cpp b/remhos.cpp
index 1bda644..de1e514 100644
--- a/remhos.cpp
+++ b/remhos.cpp
@@ -39,14 +39,27 @@
 #include "remhos_tools.hpp"
 #include "remhos_sync.hpp"
 
+#ifdef USE_CALIPER
+#include <caliper/cali.h>
+#include <caliper/cali-manager.h>
+//#include <adiak.hpp>
+#ifdef HAVE_MPI
+#include <caliper/cali-mpi.h>
+#endif
+#endif
+
 using namespace std;
 using namespace mfem;
 
+#define REMHOS_GPU_SETUP
+
 enum class HOSolverType {None, Neumann, CG, LocalInverse};
 enum class FCTSolverType {None, FluxBased, ClipScale,
-                          NonlinearPenalty, FCTProject};
+                          NonlinearPenalty, FCTProject
+                         };
 enum class LOSolverType {None, DiscrUpwind, DiscrUpwindPrec,
-                         ResDist, ResDistSubcell, MassBased};
+                         ResDist, ResDistSubcell, MassBased
+                        };
 enum class MonolithicSolverType {None, ResDistMono, ResDistMonoSubcell};
 
 enum class TimeStepControl {FixedTimeStep, LOBoundsError};
@@ -72,13 +85,16 @@ double inflow_function(const Vector &x);
 // Mesh bounding box
 Vector bb_min, bb_max;
 
+// Caliper setup.
+void setupCaliper();
+
 class AdvectionOperator : public TimeDependentOperator
 {
 private:
    BilinearForm &Mbf, &ml;
    ParBilinearForm &Kbf;
    ParBilinearForm &M_HO, &K_HO;
-   Vector &lumpedM;
+   Vector &lumpedM, ones;
 
    Vector start_mesh_pos, start_submesh_pos;
    GridFunction &mesh_pos, *submesh_pos, &mesh_vel, &submesh_vel;
 
@@ -88,6 +104,8 @@ class AdvectionOperator : public TimeDependentOperator
    double dt;
    TimeStepControl dt_control;
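+   // Stable time-step estimate (see UpdateTimeStepEstimate()); only
+   // refreshed when dt_control != TimeStepControl::FixedTimeStep.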
    mutable double dt_est;
+   mutable TimingData timer;
+
    Assembly &asmbl;
 
    LowOrderMethod &lom;
@@ -132,16 +150,30 @@ class AdvectionOperator : public TimeDependentOperator
       start_submesh_pos = sm_pos;
    }
 
+   void PrintTimingData(int steps) const;
+
    virtual ~AdvectionOperator() { }
 };
 
+#ifndef MFEM_USE_CMAKE_TESTS
+int remhos(int, char *[], double &);
+
 int main(int argc, char *argv[])
+{
+   double final_mass_u = M_PI; // unused
+   return remhos(argc, argv, final_mass_u);
+}
+#endif // MFEM_USE_CMAKE_TESTS
+
+MFEM_EXPORT int remhos(int argc, char *argv[], double &final_mass_u)
 {
    // Initialize MPI.
    mfem::MPI_Session mpi(argc, argv);
    const int myid = mpi.WorldRank();
 
-   const char *mesh_file = "data/periodic-square.mesh";
+   const char *mesh_file = "default";
+   int dim = 3;
+   int elem_per_mpi = 1;
    int rs_levels = 2;
    int rp_levels = 0;
    int order = 3;
@@ -158,7 +190,9 @@ int main(int argc, char *argv[])
    double t_final = 4.0;
    TimeStepControl dt_control = TimeStepControl::FixedTimeStep;
    double dt = 0.005;
+   int max_tsteps = -1;
    bool visualization = true;
+   bool save_meshes_and_solution = false;
    bool visit = false;
    bool verify_bounds = false;
    bool product_sync = false;
@@ -171,6 +205,9 @@ int main(int argc, char *argv[])
    OptionsParser args(argc, argv);
    args.AddOption(&mesh_file, "-m", "--mesh", "Mesh file to use.");
+   args.AddOption(&dim, "-dim", "--dimension", "Dimension of the problem.");
+   args.AddOption(&elem_per_mpi, "-epm", "--elem-per-mpi",
+                  "Number of elements per MPI task.");
    args.AddOption(&problem_num, "-p", "--problem",
                   "Problem setup to use. See options in velocity_function().");
    args.AddOption(&rs_levels, "-rs", "--refine-serial",
@@ -227,9 +264,14 @@ int main(int argc, char *argv[])
                   " 1 - Bounds violation of the LO sltn.");
    args.AddOption(&dt, "-dt", "--time-step",
                   "Initial time step size (dt might change based on -dtc).");
+   args.AddOption(&max_tsteps, "-ms", "--max-steps",
+                  "Maximum number of steps (negative means no restriction).");
    args.AddOption(&visualization, "-vis", "--visualization", "-no-vis",
                   "--no-visualization",
                   "Enable or disable GLVis visualization.");
+   args.AddOption(&save_meshes_and_solution, "-save",
+                  "--save-meshes-and-solution",
+                  "-no-save", "--no-save-meshes-and-solution",
+                  "Print the final meshes and solution.");
    args.AddOption(&visit, "-visit", "--visit-datafiles", "-no-visit",
                   "--no-visit-datafiles",
                   "Save data files for VisIt (visit.llnl.gov) visualization.");
@@ -249,6 +291,25 @@
    }
    if (myid == 0) { args.PrintOptions(cout); }
 
+   // Set up the Caliper config manager.
+#ifdef USE_CALIPER
+   setupCaliper();
+
+   cali::ConfigManager calimgr;
+   if (calimgr.error())
+   {
+      std::cerr << "caliper config error: " << calimgr.error_msg() << std::endl;
+   }
+   calimgr.start();
+   //adiak::init(nullptr);
+   //adiak::cmdline();
+   //adiak::hostname();
+#endif
+
+#ifdef REMHOS_GPU_SETUP
+   MFEM_VERIFY(ho_type == HOSolverType::LocalInverse &&
+               lo_type == LOSolverType::MassBased &&
+               fct_type == FCTSolverType::ClipScale, "Wrong GPU setup.");
+#endif
+
    // Enable hardware devices such as GPUs, and programming models such as
    // CUDA, OCCA, RAJA and OpenMP based on command line options.
    Device device(device_config);
@@ -259,29 +320,47 @@
    else if (problem_num < 20) { exec_mode = 1; }
    else { MFEM_ABORT("Unspecified execution mode."); }
 
-   // Read the serial mesh from the given mesh file on all processors.
-   // Refine the mesh in serial to increase the resolution.
-   Mesh *mesh = new Mesh(Mesh::LoadFromFile(mesh_file, 1, 1));
-   const int dim = mesh->Dimension();
-   for (int lev = 0; lev < rs_levels; lev++) { mesh->UniformRefinement(); }
-   mesh->GetBoundingBox(bb_min, bb_max, max(order, 1));
-
-   // Only standard assembly in 1D (some mfem functions just abort in 1D).
-   if ((pa || next_gen_full) && dim == 1)
+   Mesh *mesh = nullptr;
+   int *mpi_partitioning = nullptr;
+   if (strncmp(mesh_file, "default", 7) != 0)
    {
-      MFEM_WARNING("Disabling PA / FA for 1D.");
-      pa = false;
-      next_gen_full = false;
+      // Read the serial mesh from the given mesh file on all processors.
+      // Refine the mesh in serial to increase the resolution.
+      mesh = new Mesh(Mesh::LoadFromFile(mesh_file, 1, 1));
+      for (int lev = 0; lev < rs_levels; lev++) { mesh->UniformRefinement(); }
+   }
+   else
+   {
+      mesh = CartesianMesh(dim, Mpi::WorldSize(), elem_per_mpi, myid == 0,
+                           rp_levels, &mpi_partitioning);
    }
+   dim = mesh->Dimension();
+   mesh->GetBoundingBox(bb_min, bb_max, max(order, 1));
 
    // Parallel partitioning of the mesh.
    // Refine the mesh further in parallel to increase the resolution.
-   ParMesh pmesh(MPI_COMM_WORLD, *mesh);
+   ParMesh pmesh(MPI_COMM_WORLD, *mesh, mpi_partitioning);
    delete mesh;
+   delete [] mpi_partitioning;
    for (int lev = 0; lev < rp_levels; lev++) { pmesh.UniformRefinement(); }
    MPI_Comm comm = pmesh.GetComm();
    const int NE = pmesh.GetNE();
 
+   if (strncmp(mesh_file, "default", 7) == 0)
+   {
+      MFEM_VERIFY(pmesh.GetGlobalNE() == Mpi::WorldSize() * elem_per_mpi,
+                  "Mesh generation error.");
+      MFEM_VERIFY(NE == elem_per_mpi, "Mesh generation error.");
+   }
+
+   // Only standard assembly in 1D (some mfem functions just abort in 1D).
+   if ((pa || next_gen_full) && dim == 1)
+   {
+      MFEM_WARNING("Disabling PA / FA for 1D.");
+      pa = false;
+      next_gen_full = false;
+   }
+
    // Define the ODE solver used for time integration. Several explicit
    // Runge-Kutta methods are available.
    ODESolver *ode_solver = NULL;
@@ -361,14 +440,24 @@ int main(int argc, char *argv[])
    v.ProjectCoefficient(vcoeff);
 
    double t = 0.0;
+#ifdef USE_CALIPER
+   CALI_CXX_MARK_LOOP_BEGIN(mainloop, "rem.mainloop");
+#endif
    while (t < t_final)
    {
+#ifdef USE_CALIPER
+      CALI_CXX_MARK_LOOP_ITERATION(mainloop, t);
+#endif
       t += dt;
       // Move the mesh nodes.
      x.Add(std::min(dt, t_final-t), v);
      // Update the node velocities.
      v.ProjectCoefficient(vcoeff);
    }
+#ifdef USE_CALIPER
+   CALI_CXX_MARK_LOOP_END(mainloop);
+#endif
 
    // Pseudotime velocity.
    add(x, -1.0, x0, v_gf);
@@ -476,6 +565,9 @@ int main(int argc, char *argv[])
    {
       M_HO.SetAssemblyLevel(AssemblyLevel::PARTIAL);
       K_HO.SetAssemblyLevel(AssemblyLevel::PARTIAL);
+
+      k.SetAssemblyLevel(AssemblyLevel::PARTIAL);
+      m.SetAssemblyLevel(AssemblyLevel::PARTIAL);
    }
 
    if (next_gen_full)
@@ -493,12 +585,13 @@
    }
 
    // Compute the lumped mass matrix.
-   Vector lumpedM;
    ParBilinearForm ml(&pfes);
    ml.AddDomainIntegrator(new LumpedIntegrator(new MassIntegrator));
-   ml.Assemble();
-   ml.Finalize();
-   ml.SpMat().GetDiag(lumpedM);
+   if (!pa)
+   {
+      ml.Assemble();
+      ml.Finalize();
+   }
 
    m.Assemble();
    m.Finalize();
@@ -506,6 +599,16 @@
    k.Assemble(skip_zeros);
    k.Finalize(skip_zeros);
 
+   Vector lumpedM;
+   if (!pa) { ml.SpMat().GetDiag(lumpedM); }
+   else
+   {
+      lumpedM.SetSize(m.Height());
+      Vector ones(m.Height());
+      ones = 1.0;
+      m.Mult(ones, lumpedM);
+   }
+
    // Store topological dof data.
    DofInfo dofs(pfes, bounds_type);
@@ -770,8 +873,7 @@
    {
       MFEM_VERIFY(ho_solver != nullptr,
                   "Mass-Based LO solver requires a choice of a HO solver.");
-      lo_solver = new MassBasedAvg(pfes, *ho_solver,
-                                   (exec_mode == 1) ? &v_gf : nullptr);
+      lo_solver = new MassBasedAvg(pfes, *ho_solver);
    }
 
    // Setup of the monolithic solver (if any).
@@ -793,18 +895,21 @@
    }
 
    // Print the starting meshes and initial condition.
-   ofstream meshHO("meshHO_init.mesh");
-   meshHO.precision(precision);
-   pmesh.PrintAsOne(meshHO);
-   if (subcell_mesh)
+   if (save_meshes_and_solution)
    {
-      ofstream meshLO("meshLO_init.mesh");
-      meshLO.precision(precision);
-      subcell_mesh->PrintAsOne(meshLO);
+      ofstream meshHO("meshHO_init.mesh");
+      meshHO.precision(precision);
+      pmesh.PrintAsOne(meshHO);
+      if (subcell_mesh)
+      {
+         ofstream meshLO("meshLO_init.mesh");
+         meshLO.precision(precision);
+         subcell_mesh->PrintAsOne(meshLO);
+      }
+      ofstream sltn("sltn_init.gf");
+      sltn.precision(precision);
+      u.SaveAsOne(sltn);
    }
-   ofstream sltn("sltn_init.gf");
-   sltn.precision(precision);
-   u.SaveAsOne(sltn);
 
    // Create data collection for solution output: either VisItDataCollection for
    // ASCII data files, or SidreDataCollection for binary data files.
@@ -1062,6 +1167,8 @@ int main(int argc, char *argv[])
          else { res = u; }
       }
 
+      if (ti_total == max_tsteps) { done = true; }
+
      if (done || ti % vis_steps == 0)
      {
         if (myid == 0)
@@ -1096,6 +1203,16 @@
      }
   }
 
+   int steps = ti_total;
+   switch (ode_solver_type)
+   {
+      case 2: steps *= 2; break;
+      case 3: steps *= 3; break;
+      case 4: steps *= 4; break;
+      case 6: steps *= 6; break;
+   }
+   adv.PrintTimingData(steps);
+
    if (dt_control != TimeStepControl::FixedTimeStep && myid == 0)
    {
       cout << "Total time steps: " << ti_total
@@ -1103,6 +1220,7 @@
    }
 
    // Print the final meshes and solution.
+   if (save_meshes_and_solution)
    {
       ofstream meshHO("meshHO_final.mesh");
      meshHO.precision(precision);
@@ -1122,10 +1240,23 @@
    double mass_u_loc = 0.0, mass_us_loc = 0.0;
    if (exec_mode == 1)
    {
-      ml.BilinearForm::operator=(0.0);
-      ml.Assemble();
-      lumpedM.HostRead();
-      ml.SpMat().GetDiag(lumpedM);
+      // Using lumped diagonal for mass conservation.
+      if (!pa)
+      {
+         ml.BilinearForm::operator=(0.0);
+         ml.Assemble();
+         lumpedM.HostRead();
+         ml.SpMat().GetDiag(lumpedM);
+      }
+      else
+      {
+         ParBilinearForm m(&pfes);
+         m.AddDomainIntegrator(new MassIntegrator);
+         m.Assemble();
+         Vector ones(m.Height());
+         ones = 1.0;
+         m.Mult(ones, lumpedM);
+      }
       mass_u_loc = lumpedM * u;
       if (product_sync) { mass_us_loc = lumpedM * us; }
    }
@@ -1136,6 +1267,7 @@
    }
    double mass_u, mass_us = 0.0, s_max = 0.0;
    MPI_Allreduce(&mass_u_loc, &mass_u, 1, MPI_DOUBLE, MPI_SUM, comm);
+   final_mass_u = mass_u;
    const double umax_loc = u.Max();
    MPI_Allreduce(&umax_loc, &umax, 1, MPI_DOUBLE, MPI_MAX, comm);
    if (product_sync)
@@ -1194,7 +1326,7 @@ int main(int argc, char *argv[])
       }
    }
 
-   if (smth_indicator)
+   if (smth_indicator && save_meshes_and_solution)
    {
       // Print the values of the smoothness indicator.
       ParGridFunction si_val;
@@ -1228,7 +1360,9 @@ int main(int argc, char *argv[])
       delete lom.SubFes1;
       delete lom.VolumeTerms;
    }
-
+#ifdef USE_CALIPER
+   calimgr.flush();
+#endif
    return 0;
 }
 
@@ -1245,12 +1379,21 @@ AdvectionOperator::AdvectionOperator(int size, BilinearForm &Mbf_,
    TimeDependentOperator(size), Mbf(Mbf_), ml(_ml), Kbf(Kbf_),
    M_HO(M_HO_), K_HO(K_HO_),
    lumpedM(_lumpedM),
+   ones(lumpedM.Size()),
    start_mesh_pos(pos.Size()), start_submesh_pos(sub_vel.Size()),
    mesh_pos(pos), submesh_pos(sub_pos),
    mesh_vel(vel), submesh_vel(sub_vel),
    x_gf(Kbf.ParFESpace()),
+   timer(M_HO.Size()),
    asmbl(_asmbl), lom(_lom), dofs(_dofs),
-   ho_solver(hos), lo_solver(los), fct_solver(fct), mono_solver(mos) { }
+   ho_solver(hos), lo_solver(los), fct_solver(fct), mono_solver(mos)
+{
+   if (ho_solver) { ho_solver->timer = &timer; }
+   if (lo_solver) { lo_solver->timer = &timer; }
+   if (fct_solver) { fct_solver->timer = &timer; }
+
+   ones = 1.0;
+}
 
 void AdvectionOperator::Mult(const Vector &X, Vector &Y) const
 {
@@ -1266,21 +1409,33 @@ void AdvectionOperator::Mult(const Vector &X, Vector &Y) const
       // Reset precomputed geometric data.
       Mbf.FESpace()->GetMesh()->DeleteGeometricFactors();
 
+#ifndef REMHOS_GPU_SETUP
       // Reassemble on the new mesh. Element contributions.
       // Currently needed to have the sparse matrices used by the LO methods.
       Mbf.BilinearForm::operator=(0.0);
       Mbf.Assemble();
       Kbf.BilinearForm::operator=(0.0);
       Kbf.Assemble(0);
-      ml.BilinearForm::operator=(0.0);
-      ml.Assemble();
-      lumpedM.HostReadWrite();
-      ml.SpMat().GetDiag(lumpedM);
+#endif
 
+      timer.sw_L2inv.Start();
       M_HO.BilinearForm::operator=(0.0);
       M_HO.Assemble();
+      timer.sw_L2inv.Stop();
+
+      if (Mbf.GetAssemblyLevel() != AssemblyLevel::PARTIAL)
+      {
+         ml.BilinearForm::operator=(0.0);
+         ml.Assemble();
+         lumpedM.HostReadWrite();
+         ml.SpMat().GetDiag(lumpedM);
+      }
+      else { M_HO.Mult(ones, lumpedM); }
+
+      timer.sw_rhs.Start();
       K_HO.BilinearForm::operator=(0.0);
       K_HO.Assemble(0);
+      timer.sw_rhs.Stop();
 
      if (lom.pk)
      {
@@ -1288,6 +1443,7 @@ void AdvectionOperator::Mult(const Vector &X, Vector &Y) const
         lom.pk->Assemble();
     }
 
+#ifndef REMHOS_GPU_SETUP
      // Face contributions.
      asmbl.bdrInt = 0.;
      Mesh *mesh = M_HO.FESpace()->GetMesh();
@@ -1317,6 +1473,7 @@ void AdvectionOperator::Mult(const Vector &X, Vector &Y) const
           }
        }
     }
+#endif
  }
 
   const int size = Kbf.ParFESpace()->GetVSize();
@@ -1339,11 +1496,20 @@ void AdvectionOperator::Mult(const Vector &X, Vector &Y) const
   {
      MFEM_VERIFY(ho_solver && lo_solver, "FCT requires HO and LO solvers.");
 
-      lo_solver->CalcLOSolution(u, du_LO);
+      // High-order solution.
       ho_solver->CalcHOSolution(u, du_HO);
 
+      auto mba = dynamic_cast<MassBasedAvg *>(lo_solver);
+      if (mba) { mba->SetHOSolution(du_HO); }
+
+      // Low-order solution.
+      lo_solver->CalcLOSolution(u, du_LO);
+
+      // Computation of bounds and FCT blending.
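+      // NOTE: sw_FCT below brackets only the bounds computation here;
+      // ClipScaleSolver::CalcFCTSolution() restarts the same stopwatch
+      // internally, so both intervals accumulate into one FCT time.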
+      timer.sw_FCT.Start();
       dofs.ComputeElementsMinMax(u, dofs.xe_min, dofs.xe_max, NULL, NULL);
       dofs.ComputeBounds(dofs.xe_min, dofs.xe_max, dofs.xi_min, dofs.xi_max);
+      timer.sw_FCT.Stop();
 
       fct_solver->CalcFCTSolution(x_gf, lumpedM, du_HO, du_LO,
                                   dofs.xi_min, dofs.xi_max, d_u);
@@ -1442,6 +1608,39 @@ void AdvectionOperator::Mult(const Vector &X, Vector &Y) const
    }
 }
 
+void AdvectionOperator::PrintTimingData(int steps) const
+{
+   const MPI_Comm com = M_HO.ParFESpace()->GetComm();
+   const int myid = M_HO.ParFESpace()->GetMyRank();
+
+   double my_rt[5], T[5];
+   my_rt[0] = timer.sw_rhs.RealTime();
+   my_rt[1] = timer.sw_L2inv.RealTime();
+   my_rt[2] = timer.sw_LO.RealTime();
+   my_rt[3] = timer.sw_FCT.RealTime();
+   my_rt[4] = my_rt[0] + my_rt[2] + my_rt[3];
+   MPI_Reduce(my_rt, T, 5, MPI_DOUBLE, MPI_MAX, 0, com);
+
+   const HYPRE_BigInt dofs_steps = M_HO.ParFESpace()->GlobalVSize() * steps;
+
+   if (myid == 0)
+   {
+      std::cout << "---" << std::endl;
+      std::cout << "RHS kernel time: " << T[0] << std::endl
+                << "L2inv kernel time: " << T[1] << std::endl
+                << "LO kernel time: " << T[2] << std::endl
+                << "FCT kernel time: " << T[3] << std::endl
+                << "Total kernel time: " << T[4] << std::endl;
+      std::cout << "---" << std::endl;
+      std::cout << "FOM RHS: " << 1e-6 * dofs_steps / T[0] << std::endl
+                << "FOM INV: " << 1e-6 * dofs_steps / T[1] << std::endl
+                << "FOM LO: " << 1e-6 * dofs_steps / T[2] << std::endl
+                << "FOM FCT: " << 1e-6 * dofs_steps / T[3] << std::endl
+                << "FOM: " << 1e-6 * dofs_steps / T[4] << std::endl;
+      std::cout << "(megadofs x time steps / second)\n---" << std::endl;
+   }
+}
+
 void AdvectionOperator::UpdateTimeStepEstimate(const Vector &x,
                                                const Vector &dx,
                                                const Vector &x_min,
@@ -1453,6 +1652,7 @@ void AdvectionOperator::UpdateTimeStepEstimate(const Vector &x,
    int n = x.Size();
    const double eps = 1e-12;
    double dt = numeric_limits<double>::infinity();
+   x_min.HostRead(), x_max.HostRead(), x.HostRead(), dx.HostRead();
 
    for (int i = 0; i < n; i++)
    {
@@ -1854,3 +2054,19 @@ double inflow_function(const Vector &x)
    }
    else { return 0.0; }
 }
+
+void setupCaliper()
+{
+#ifdef USE_CALIPER
+#ifdef HAVE_MPI
+   cali_mpi_init();
+#endif
+
+   cali_config_preset("CALI_LOG_VERBOSITY", "0");
+   cali_config_preset("CALI_CALIPER_ATTRIBUTE_DEFAULT_SCOPE", "process");
+
+   //cali_set_global_string_byname("rem.git_vers", GIT_VERS);
+   //cali_set_global_string_byname("rem.git_hash", GIT_HASH);
+#endif
+}
+
diff --git a/remhos_fct.cpp b/remhos_fct.cpp
index be2454b..d3cb1cc 100644
--- a/remhos_fct.cpp
+++ b/remhos_fct.cpp
@@ -56,8 +56,8 @@ void FCTSolver::CalcCompatibleLOProduct(const ParGridFunction &us,
          double s_avg = mass_us / mass_u;
 
          // Min and max of s using the full stencil of active dofs.
-         s_min_loc.SetDataAndSize(s_min.GetData() + k*ndofs, ndofs);
-         s_max_loc.SetDataAndSize(s_max.GetData() + k*ndofs, ndofs);
+         s_min_loc.SetDataAndSize(s_min.HostReadWrite() + k*ndofs, ndofs);
+         s_max_loc.SetDataAndSize(s_max.HostReadWrite() + k*ndofs, ndofs);
          double smin = numeric_limits<double>::infinity(),
                 smax = -numeric_limits<double>::infinity();
          for (int j = 0; j < ndofs; j++)
@@ -484,78 +484,93 @@ void ClipScaleSolver::CalcFCTSolution(const ParGridFunction &u, const Vector &m,
                                       const Vector &u_min, const Vector &u_max,
                                       Vector &du) const
 {
+   timer->sw_FCT.Start();
+
    const int NE = pfes.GetMesh()->GetNE();
    const int nd = pfes.GetFE(0)->GetDof();
-   Vector f_clip(nd);
-
-   int dof_id;
-   double sumPos, sumNeg, u_new_ho, u_new_lo, new_mass, f_clip_min, f_clip_max;
-   double umin, umax;
-   const double eps = 1.0e-15;
+   Vector f_clip(nd*NE);
 
    // Smoothness indicator.
    ParGridFunction si_val;
+
    if (smth_indicator)
    {
+      MFEM_ABORT("smth_indicator not supported in kernel!");
       smth_indicator->ComputeSmoothnessIndicator(u, si_val);
    }
 
-   u.HostRead();
-   m.HostRead();
-   du.HostReadWrite();
-   du_lo.HostRead(); du_ho.HostRead();
-   for (int k = 0; k < NE; k++)
+   const auto δt = dt;
+
+   const auto U = mfem::Reshape(u.Read(), NE*nd);
+   const auto M = mfem::Reshape(m.Read(), NE*nd);
+   const auto DU_LO = mfem::Reshape(du_lo.Read(), NE*nd);
+   const auto DU_HO = mfem::Reshape(du_ho.Read(), NE*nd);
+   const auto U_MIN = mfem::Reshape(u_min.Read(), NE*nd);
+   const auto U_MAX = mfem::Reshape(u_max.Read(), NE*nd);
+
+   auto F_CLIP = mfem::Reshape(f_clip.Write(), NE, nd);
+   auto DU = mfem::Reshape(du.Write(), NE*nd);
+
+   const bool update_bounds = smth_indicator != nullptr;
+   if (update_bounds) { MFEM_ABORT("UpdateBounds not ported to device"); }
+
+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int k)
    {
-      sumPos = sumNeg = 0.0;
+      constexpr auto eps = 1.0e-15;
+      double sumPos = 0.0, sumNeg = 0.0;
 
       // Clip.
      for (int j = 0; j < nd; j++)
      {
-         dof_id = k*nd+j;
+         const int dof_id = k*nd+j;
+
+         const auto u_new_lo = U(dof_id) + δt * DU_LO(dof_id);
 
-         u_new_ho = u(dof_id) + dt * du_ho(dof_id);
-         u_new_lo = u(dof_id) + dt * du_lo(dof_id);
+         auto umin = U_MIN(dof_id);
+         auto umax = U_MAX(dof_id);
 
-         umin = u_min(dof_id);
-         umax = u_max(dof_id);
-         if (smth_indicator)
+#if !defined(MFEM_USE_CUDA) && !defined(MFEM_USE_HIP)
+         if (update_bounds)
          {
+            const auto u_new_ho = U(dof_id) + δt * DU_HO(dof_id);
            smth_indicator->UpdateBounds(dof_id, u_new_ho, si_val, umin, umax);
          }
+#endif
 
-         f_clip_min = m(dof_id) / dt * (umin - u_new_lo);
-         f_clip_max = m(dof_id) / dt * (umax - u_new_lo);
+         const auto f_clip_min = M(dof_id) / δt * (umin - u_new_lo);
+         const auto f_clip_max = M(dof_id) / δt * (umax - u_new_lo);
 
-         f_clip(j) = m(dof_id) * (du_ho(dof_id) - du_lo(dof_id));
-         f_clip(j) = min(f_clip_max, max(f_clip_min, f_clip(j)));
+         F_CLIP(k,j) = M(dof_id) * (DU_HO(dof_id) - DU_LO(dof_id));
+         F_CLIP(k,j) = fmin(f_clip_max, fmax(f_clip_min, F_CLIP(k,j)));
 
-         sumNeg += min(f_clip(j), 0.0);
-         sumPos += max(f_clip(j), 0.0);
+         sumNeg += fmin(F_CLIP(k,j), 0.0);
+         sumPos += fmax(F_CLIP(k,j), 0.0);
      }
 
-      new_mass = sumNeg + sumPos;
+      double new_mass = sumNeg + sumPos;
 
       // Rescale.
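+      // If clipping created net mass (new_mass > 0), the positive fluxes are
+      // scaled by |sumNeg|/sumPos so positive and negative parts cancel
+      // exactly; the mirrored scaling handles net mass loss. Either way the
+      // element-local sum of the corrected fluxes returns to zero.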
      for (int j = 0; j < nd; j++)
      {
         if (new_mass > eps)
         {
-            f_clip(j) = min(0.0, f_clip(j)) -
-                        max(0.0, f_clip(j)) * sumNeg / sumPos;
+            F_CLIP(k,j) = fmin(0.0, F_CLIP(k,j)) -
+                          fmax(0.0, F_CLIP(k,j)) * sumNeg / sumPos;
         }
         if (new_mass < -eps)
         {
-            f_clip(j) = max(0.0, f_clip(j)) -
-                        min(0.0, f_clip(j)) * sumPos / sumNeg;
+            F_CLIP(k,j) = fmax(0.0, F_CLIP(k,j)) -
+                          fmin(0.0, F_CLIP(k,j)) * sumPos / sumNeg;
         }
 
         // Set du to the discrete time derivative featuring the high order
         // anti-diffusive reconstruction that leads to a forward Euler
         // updated admissible solution.
-         dof_id = k*nd+j;
-         du(dof_id) = du_lo(dof_id) + f_clip(j) / m(dof_id);
+         const int dof_id = k*nd+j;
+         DU(dof_id) = DU_LO(dof_id) + F_CLIP(k,j) / M(dof_id);
      }
-   }
+   });
+   timer->sw_FCT.Stop();
 }
 
 void ClipScaleSolver::CalcFCTProduct(const ParGridFunction &us, const Vector &m,
@@ -745,12 +760,13 @@ void ElementFCTProjection::CalcFCTSolution(const ParGridFunction &u,
    } // element loop
 }
 
-void ElementFCTProjection::CalcFCTProduct(const ParGridFunction &us, const Vector &m,
-                                          const Vector &d_us_HO, const Vector &d_us_LO,
-                                          Vector &s_min, Vector &s_max,
-                                          const Vector &u_new,
-                                          const Array<bool> &active_el,
-                                          const Array<bool> &active_dofs, Vector &d_us)
+void ElementFCTProjection::CalcFCTProduct(const ParGridFunction &us,
+                                          const Vector &m,
+                                          const Vector &d_us_HO, const Vector &d_us_LO,
+                                          Vector &s_min, Vector &s_max,
+                                          const Vector &u_new,
+                                          const Array<bool> &active_el,
+                                          const Array<bool> &active_dofs, Vector &d_us)
 {
    us.HostRead();
    s_min.HostReadWrite();
diff --git a/remhos_fct.hpp b/remhos_fct.hpp
index e3b9f5e..f5891c9 100644
--- a/remhos_fct.hpp
+++ b/remhos_fct.hpp
@@ -25,6 +25,7 @@ namespace mfem
 {
 
 class SmoothnessIndicator;
+struct TimingData;
 
 // Monotone, High-order, Conservative Solver.
 class FCTSolver
@@ -83,6 +84,8 @@ class FCTSolver
    {
       MFEM_ABORT("Product remap is not implemented for the chosen solver");
    }
+
+   TimingData *timer = nullptr;
 };
 
 class FluxBasedFCT : public FCTSolver
diff --git a/remhos_ho.cpp b/remhos_ho.cpp
index 0125acc..7edbc25 100644
--- a/remhos_ho.cpp
+++ b/remhos_ho.cpp
@@ -72,38 +72,61 @@ void CGHOSolver::CalcHOSolution(const Vector &u, Vector &du) const
 
 LocalInverseHOSolver::LocalInverseHOSolver(ParFiniteElementSpace &space,
                                            ParBilinearForm &Mbf,
                                            ParBilinearForm &Kbf)
-   : HOSolver(space), M(Mbf), K(Kbf) { }
+   : HOSolver(space), M(Mbf), K(Kbf)
+{
+   if (M.GetAssemblyLevel() == AssemblyLevel::PARTIAL)
+   {
+      M_inv = new DGMassInverse(space, BasisType::GaussLegendre);
+      M_inv->SetAbsTol(1e-8), M_inv->SetRelTol(0.0);
+   }
+}
 
 void LocalInverseHOSolver::CalcHOSolution(const Vector &u, Vector &du) const
 {
-   MFEM_VERIFY(M.GetAssemblyLevel() != AssemblyLevel::PARTIAL,
-               "PA for DG is not supported for Local Inverse.");
+   MFEM_VERIFY(timer, "Timer not set.");
 
    Vector rhs(u.Size());
-   K.SpMat().HostReadWriteI();
-   K.SpMat().HostReadWriteJ();
-   K.SpMat().HostReadWriteData();
-   HypreParMatrix *K_mat = K.ParallelAssemble(&K.SpMat());
-   K_mat->Mult(u, rhs);
-
-   const int ne = pfes.GetMesh()->GetNE();
-   const int nd = pfes.GetFE(0)->GetDof();
-   DenseMatrix M_loc(nd);
-   DenseMatrixInverse M_loc_inv(&M_loc);
-   Vector rhs_loc(nd), du_loc(nd);
-   Array<int> dofs;
-   for (int i = 0; i < ne; i++)
+   if (M.GetAssemblyLevel() != AssemblyLevel::PARTIAL)
    {
-      pfes.GetElementDofs(i, dofs);
-      rhs.GetSubVector(dofs, rhs_loc);
-      M.SpMat().GetSubMatrix(dofs, dofs, M_loc);
-      M_loc_inv.Factor();
-      M_loc_inv.Mult(rhs_loc, du_loc);
-      du.SetSubVector(dofs, du_loc);
+      timer->sw_rhs.Start();
+      K.SpMat().HostReadWriteI();
+      K.SpMat().HostReadWriteJ();
+      K.SpMat().HostReadWriteData();
+      HypreParMatrix *K_mat = K.ParallelAssemble(&K.SpMat());
+      K_mat->Mult(u, rhs);
+      timer->sw_rhs.Stop();
+
+      const int ne = pfes.GetMesh()->GetNE();
+      const int nd = pfes.GetFE(0)->GetDof();
+      DenseMatrix M_loc(nd);
+      DenseMatrixInverse M_loc_inv(&M_loc);
+      Vector rhs_loc(nd), du_loc(nd);
+      Array<int> dofs;
+      for (int i = 0; i < ne; i++)
+      {
+         pfes.GetElementDofs(i, dofs);
+         rhs.GetSubVector(dofs, rhs_loc);
+         timer->sw_L2inv.Start();
+         M.SpMat().GetSubMatrix(dofs, dofs, M_loc);
+         M_loc_inv.Factor();
+         M_loc_inv.Mult(rhs_loc, du_loc);
+         timer->sw_L2inv.Stop();
+         du.SetSubVector(dofs, du_loc);
+      }
+      delete K_mat;
+   }
+   else
+   {
+      timer->sw_rhs.Start();
+      K.Mult(u, rhs);
+      timer->sw_rhs.Stop();
+
+      timer->sw_L2inv.Start();
+      M_inv->Update(), M_inv->Mult(rhs, du);
+      timer->sw_L2inv.Stop();
    }
-   delete K_mat;
 }
 
 NeumannHOSolver::NeumannHOSolver(ParFiniteElementSpace &space,
diff --git a/remhos_ho.hpp b/remhos_ho.hpp
index 11b940c..8f498d2 100644
--- a/remhos_ho.hpp
+++ b/remhos_ho.hpp
@@ -22,6 +22,8 @@
 namespace mfem
 {
 
+struct TimingData;
+
 // High-Order Solver.
 // Conserve mass / provide high-order convergence / may violate the bounds.
 class HOSolver
@@ -35,6 +37,8 @@ class HOSolver
    virtual ~HOSolver() { }
 
    virtual void CalcHOSolution(const Vector &u, Vector &du) const = 0;
+
+   TimingData *timer = nullptr;
 };
 
 class CGHOSolver : public HOSolver
@@ -53,12 +57,14 @@ class LocalInverseHOSolver : public HOSolver
 {
 protected:
    ParBilinearForm &M, &K;
+   mutable DGMassInverse *M_inv = nullptr;
 
 public:
    LocalInverseHOSolver(ParFiniteElementSpace &space,
                         ParBilinearForm &Mbf, ParBilinearForm &Kbf);
 
-   virtual void CalcHOSolution(const Vector &u, Vector &du) const;
+   void CalcHOSolution(const Vector &u, Vector &du) const override;
+   ~LocalInverseHOSolver() { delete M_inv; }
 };
 
 class Assembly;
diff --git a/remhos_lo.cpp b/remhos_lo.cpp
index efff529..b169531 100644
--- a/remhos_lo.cpp
+++ b/remhos_lo.cpp
@@ -246,36 +246,45 @@ void ResidualDistribution::CalcLOSolution(const Vector &u, Vector &du) const
 
 void MassBasedAvg::CalcLOSolution(const Vector &u, Vector &du) const
 {
+   timer->sw_LO.Start();
+
    // Compute the new HO solution.
-   Vector du_HO(u.Size());
    ParGridFunction u_HO_new(&pfes);
-   ho_solver.CalcHOSolution(u, du_HO);
-   add(1.0, u, dt, du_HO, u_HO_new);
-
-   // Mesh positions for the new HO solution.
-   ParMesh *pmesh = pfes.GetParMesh();
-   GridFunction x_new(pmesh->GetNodes()->FESpace());
-   // Copy the current nodes into x.
-   pmesh->GetNodes(x_new);
-   if (mesh_v)
+   if (du_HO)
    {
-      // Remap mode - get the positions of the mesh at time [t + dt].
-      x_new.Add(dt, *mesh_v);
+      add(1.0, u, dt, *du_HO, u_HO_new);
+      du_HO = nullptr;
+   }
+   else
+   {
+      Vector du_HO(u.Size());
+      ho_solver.CalcHOSolution(u, du_HO);
+      add(1.0, u, dt, du_HO, u_HO_new);
    }
 
+   // Mesh positions for the new HO solution.
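+   // NOTE: by this point the caller has already moved the mesh nodes to
+   // their end-of-step positions, so pmesh->GetNodes() is used directly
+   // below; this is what made the old mesh velocity (mesh_v) argument
+   // unnecessary.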
+   ParMesh *pmesh = pfes.GetParMesh();
    const int NE = pfes.GetNE();
    Vector el_mass(NE), el_vol(NE);
-   MassesAndVolumesAtPosition(u_HO_new, x_new, el_mass, el_vol);
+   MassesAndVolumesAtPosition(u_HO_new, *pmesh->GetNodes(), el_mass, el_vol);
 
    const int ndofs = u.Size() / NE;
-   for (int k = 0; k < NE; k++)
+
+   const auto mass = el_mass.Read(), vol = el_vol.Read();
+   const auto U = mfem::Reshape(u.Read(), ndofs, NE);
+   auto DU = mfem::Reshape(du.Write(), ndofs, NE);
+   const auto δt = dt;
+
+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int k)
    {
-      double u_LO_new = el_mass(k) / el_vol(k);
+      const double u_LO_new = mass[k] / vol[k];
       for (int i = 0; i < ndofs; i++)
       {
-         du(k*ndofs + i) = (u_LO_new - u(k*ndofs + i)) / dt;
+         DU(i,k) = (u_LO_new - U(i, k)) / δt;
       }
-   }
+   });
+
+   timer->sw_LO.Stop();
 }
 
 void MassBasedAvg::MassesAndVolumesAtPosition(const ParGridFunction &u,
@@ -296,17 +305,22 @@ void MassBasedAvg::MassesAndVolumesAtPosition(const ParGridFunction &u,
    // As an L2 function, u has the correct EVector lexicographic ordering.
    qi_u->Values(u, u_qvals);
 
-   for (int k = 0; k < NE; k++)
+   const auto detJ = mfem::Reshape(geom.detJ.Read(), nqp, NE);
+   const auto u_q = mfem::Reshape(u_qvals.Read(), nqp, NE);
+   const auto weights = ir.GetWeights().Read();
+   assert(NE == el_mass.Size() && NE == el_vol.Size());
+   auto mass = el_mass.Write();
+   auto vol = el_vol.Write();
+
+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
    {
-      el_mass(k) = 0.0;
-      el_vol(k) = 0.0;
+      mass[e] = 0.0, vol[e] = 0.0;
       for (int q = 0; q < nqp; q++)
       {
-         const IntegrationPoint &ip = ir.IntPoint(q);
-         el_mass(k) += ip.weight * geom.detJ(k*nqp + q) * u_qvals(k*nqp + q);
-         el_vol(k) += ip.weight * geom.detJ(k*nqp + q);
+         const auto w_detJ = weights[q] * detJ(q, e);
+         mass[e] += w_detJ * u_q(q, e), vol[e] += w_detJ;
      }
-   }
+   });
 }
 
 const DofToQuad *get_maps(ParFiniteElementSpace &pfes, Assembly &asmbly)
diff --git a/remhos_lo.hpp b/remhos_lo.hpp
index 29d59a7..b5fecec 100644
--- a/remhos_lo.hpp
+++ b/remhos_lo.hpp
@@ -22,6 +22,8 @@
 namespace mfem
 {
 
+struct TimingData;
+
 // Low-Order Solver.
 class LOSolver
 {
@@ -37,6 +39,8 @@ class LOSolver
    virtual void UpdateTimeStep(double dt_new) { dt = dt_new; }
 
    virtual void CalcLOSolution(const Vector &u, Vector &du) const = 0;
+
+   TimingData *timer = nullptr;
 };
 
 class Assembly;
@@ -84,18 +88,22 @@ class MassBasedAvg : public LOSolver
 {
 protected:
    HOSolver &ho_solver;
-   const GridFunction *mesh_v;
 
-   void MassesAndVolumesAtPosition(const ParGridFunction &u,
-                                   const GridFunction &x,
-                                   Vector &el_mass, Vector &el_vol) const;
+   // Temporary HO solution, used only in the next call to CalcLOSolution().
+   mutable const Vector *du_HO = nullptr;
 
 public:
-   MassBasedAvg(ParFiniteElementSpace &space, HOSolver &hos,
-                const GridFunction *mesh_vel)
-      : LOSolver(space), ho_solver(hos), mesh_v(mesh_vel) { }
+   MassBasedAvg(ParFiniteElementSpace &space, HOSolver &hos)
+      : LOSolver(space), ho_solver(hos) { }
+
+   // Temporary HO solution, used only in the next call to CalcLOSolution().
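+   // The pointer is consumed (reset to nullptr) inside CalcLOSolution(), so
+   // it must be set again before every step; see the FCT branch of
+   // AdvectionOperator::Mult(), which calls SetHOSolution() right after the
+   // HO solve.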
+   void SetHOSolution(Vector &du) { du_HO = &du; }
 
-   virtual void CalcLOSolution(const Vector &u, Vector &du) const;
+   virtual void CalcLOSolution(const Vector &u, Vector &du) const;
+
+   void MassesAndVolumesAtPosition(const ParGridFunction &u,
+                                   const GridFunction &x,
+                                   Vector &el_mass, Vector &el_vol) const;
 };
 
 //PA based Residual Distribution
diff --git a/remhos_main.cpp b/remhos_main.cpp
new file mode 100644
index 0000000..aabb219
--- /dev/null
+++ b/remhos_main.cpp
@@ -0,0 +1,22 @@
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+
+// #define DBG_COLOR ::debug::kCyan
+// #include "debug.hpp"
+
+///////////////////////////////////////////////////////////////////////////////
+int remhos(int argc, char *argv[], double &final_mass_u);
+
+///////////////////////////////////////////////////////////////////////////////
+int main(int argc, char* argv[]) try
+{
+   // dbgClearScreen(), dbg();
+   double result;
+   return remhos(argc, argv, result);
+}
+catch (std::exception& e)
+{
+   std::cerr << "\033[31m..xxxXXX[ERROR]XXXxxx.." << std::endl;
+   std::cerr << "\033[31m" << e.what() << std::endl;
+   return EXIT_FAILURE;
+}
\ No newline at end of file
diff --git a/remhos_tests.cpp b/remhos_tests.cpp
new file mode 100644
index 0000000..1aa1af5
--- /dev/null
+++ b/remhos_tests.cpp
@@ -0,0 +1,181 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <unistd.h>
+#include <vector>
+
+// #define DBG_COLOR ::debug::kCyan
+// #include "debug.hpp"
+
+int remhos(int, char *[], double &);
+
+///////////////////////////////////////////////////////////////////////////////
+template <typename T>
+std::enable_if_t<!std::numeric_limits<T>::is_integer, bool>
+AlmostEq(T x, T y, T tolerance = 10.0*std::numeric_limits<T>::epsilon())
+{
+   const T neg = std::abs(x - y);
+   constexpr T min = std::numeric_limits<T>::min();
+   constexpr T eps = std::numeric_limits<T>::epsilon();
+   const T min_abs = std::min(std::abs(x), std::abs(y));
+   if (std::abs(min_abs) == 0.0) { return neg < eps; }
+   return (neg / (1.0 + std::max(min, min_abs))) < tolerance;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+struct Test
+{
+   static constexpr const char *binary = "remhos ";
+   static constexpr const char *common = "-no-vis -vs 1";
+   std::string mesh, options;
+   const double result = 0.0;
+   Test(const char * msh, const char * extra, double result)
+      : mesh(msh), options(std::string(extra) + common), result(result) {}
+   std::string Command() const { return binary + mesh + options; }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+const Test runs[] =
+{
+   {
+      // #0
+      "-m ./data/inline-quad.mesh ",
+      "-p 14 -rs 1 -o 2 -dt -1.0 -tf 0.5 -ho 3 -lo 5 -fct 2 -ms 5 ",
+      0.09711395400387984
+   },
+   {
+      // #1, 65536
+      "-m ./data/inline-quad.mesh ",
+      "-p 14 -rs 4 -o 3 -dt -1.0 -tf 0.5 -ho 3 -lo 5 -fct 2 -ms 5 ",
+      0.0930984399257905
+   },
+   {
+      // #2, 102400
+      // RHS kernel time: 5.4988707
+      // L2inv kernel time: 0.94180588
+      // LO kernel time: 0.028575583
+      // FCT kernel time: 0.0095909583
+      // Total kernel time: 5.5370372
+      "-m ./data/inline-quad.mesh ",
+      "-p 14 -rs 4 -o 4 -dt -1.0 -tf 0.5 -ho 3 -lo 5 -fct 2 -ms 5 ",
+      0.09237630484178257
+   },
+   {
+      // #3
+      "-m ./data/cube01_hex.mesh ",
+      "-p 10 -rs 1 -o 2 -dt -1.0 -tf 0.5 -ho 3 -lo 5 -fct 2 -ms 5 ",
+      0.11972857593296446
+   },
+   {
+      // #4 Partial assembly
+      "-m ./data/inline-quad.mesh ",
+      "-pa -p 14 -rs 1 -o 2 -dt -1.0 -tf 0.5 -ho 3 -lo 5 -fct 2 -ms 5 ",
+      0.09711395400387984
+   },
+   {
+      // #5 Partial assembly
+      "-m ./data/inline-quad.mesh ",
+      "-pa -p 14 -rs 4 -o 2 -dt -1.0 -tf 0.5 -ho 3 -lo 5 -fct 2 -ms 5 ",
+      0.09185717760402806
+   },
+   {
+      // #7: 3D PA
+      "-m ./data/cube01_hex.mesh ",
+      "-pa -p 10 -rs 3 -o 3 -dt -1.0 -tf 0.5 -ho 3 -lo 5 -fct 2 -ms 1 ",
+      0.11601536511552431
+   },
+   {
+      // #8
+      "-m ./data/star-q2.mesh ",
+      "-pa -p 14 -rs 1 -o 3 -dt -1.0 -tf 0.5 -ho 3 -lo 5 -fct 2 -ms 5 ",
+      0.8069675186775516
+   },
+   {
+      // #9, 🔥 debug device 🔥 (last or !dup)
+      "-m ./data/inline-quad.mesh ",
+      "-d debug -pa -p 14 -rs 1 -o 2 -dt -1.0 -tf 0.5 -ho 3 -lo 5 -fct 2 -ms 5 ",
+      0.09711395400387984
+   },
+#ifdef MFEM_USE_CUDA
+   {
+      // #10
+      "-m ./data/inline-quad.mesh ",
+      "-d cuda -pa -p 14 -rs 1 -o 2 -dt -1.0 -tf 0.5 -ho 3 -lo 5 -fct 2 -ms 5 ",
+      0.09711395400387984
+   },
+#endif
+};
+constexpr int N_TESTS = sizeof(runs) / sizeof(Test);
+
+///////////////////////////////////////////////////////////////////////////////
+using args_ptr_t = std::vector<std::unique_ptr<char[]>>;
+using args_t = std::vector<char *>;
+
+///////////////////////////////////////////////////////////////////////////////
+int RemhosTest(const Test & test)
+{
+   static args_ptr_t args_ptr;
+   args_t args;
+
+   std::istringstream iss(test.Command());
+
+   std::string token;
+   while (iss >> token)
+   {
+      auto arg_ptr = std::make_unique<char[]>(token.size() + 1);
+      std::memcpy(arg_ptr.get(), token.c_str(), token.size() + 1);
+      arg_ptr[token.size()] = '\0';
+      args.push_back(arg_ptr.get());
+      args_ptr.emplace_back(std::move(arg_ptr));
+   }
+   args.push_back(nullptr);
+
+   double final_mass_u{};
+   remhos(args.size()-1, args.data(), final_mass_u);
+
+   // dbg("final_mass_u: {} vs. {}", final_mass_u, test.result);
+
+   if (AlmostEq(final_mass_u, test.result))
+   { return std::cout << "✅" << std::endl, EXIT_SUCCESS; }
+
+   return std::cout << "❌" << std::endl, EXIT_FAILURE;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+int main(int argc, char* argv[]) try
+{
+   // dbgClearScreen(), dbg();
+
+   int opt;
+   int test = -1;
+   auto show_usage = [](const int ret = EXIT_FAILURE)
+   {
+      printf("Usage: program [-t <num>] [-h]\n");
+      printf("  -t <num>  Run only the given test number\n");
+      printf("  -h        Show this help message\n");
+      exit(ret);
+   };
+
+   while ((opt = getopt(argc, argv, "t:h")) != -1)
+   {
+      switch (opt)
+      {
+         case 't': test = std::atoi(optarg); break;
+         case 'h': show_usage(EXIT_SUCCESS);
+         default: show_usage(EXIT_FAILURE);
+      }
+   }
+
+   if (test >= 0 && test < N_TESTS) { return RemhosTest(runs[test]); }
+
+   for (auto & run : runs)
+   {
+      if (RemhosTest(run) != EXIT_SUCCESS) { return EXIT_FAILURE; }
+   }
+   return EXIT_SUCCESS;
+}
+catch (std::exception& e)
+{
+   std::cerr << "\033[31m..xxxXXX[ERROR]XXXxxx.."
+             << std::endl;
+   std::cerr << "\033[31m" << e.what() << std::endl;
+   return EXIT_FAILURE;
+}
\ No newline at end of file
diff --git a/remhos_tools.cpp b/remhos_tools.cpp
index ddcc126..4422eb2 100644
--- a/remhos_tools.cpp
+++ b/remhos_tools.cpp
@@ -329,6 +329,9 @@ void SmoothnessIndicator::ComputeFromSparsity(const SparseMatrix &K,
    const int *I = K.GetI(), *J = K.GetJ(), loc_size = K.Size();
    int end;
 
+   x_min.HostReadWrite();
+   x_max.HostReadWrite();
+
    for (int i = 0, k = 0; i < loc_size; i++)
    {
       x_min(i) = numeric_limits<double>::infinity();
@@ -342,8 +345,8 @@ void SmoothnessIndicator::ComputeFromSparsity(const SparseMatrix &K,
    }
 
    GroupCommunicator &gcomm = x.ParFESpace()->GroupComm();
-   Array<double> minvals(x_min.GetData(), x_min.Size()),
-                 maxvals(x_max.GetData(), x_max.Size());
+   Array<double> minvals(x_min.HostReadWrite(), x_min.Size()),
+                 maxvals(x_max.HostReadWrite(), x_max.Size());
    gcomm.Reduce(minvals, GroupCommunicator::Min);
    gcomm.Bcast(minvals);
    gcomm.Reduce(maxvals, GroupCommunicator::Max);
@@ -387,8 +390,8 @@ void DofInfo::ComputeMatrixSparsityBounds(const Vector &el_min,
    const int NE = pmesh->GetNE();
    const int ndofs = dof_min.Size() / NE;
 
-   x_min.HostReadWrite();
-   x_max.HostReadWrite();
+   el_min.HostRead(), el_max.HostRead();
+   x_min.HostReadWrite(), x_max.HostReadWrite();
 
    x_min = el_min;
    x_max = el_max;
@@ -438,13 +441,16 @@ void DofInfo::ComputeOverlapBounds(const Vector &el_min,
    // Form min/max at each CG dof, considering element overlaps.
    x_min =   std::numeric_limits<double>::infinity();
    x_max = - std::numeric_limits<double>::infinity();
+
+   el_min.HostRead(), el_max.HostRead();
+   dof_min.HostReadWrite(), dof_max.HostReadWrite();
+   x_min.HostReadWrite(), x_max.HostReadWrite();
+
    for (int i = 0; i < NE; i++)
    {
       // Inactive elements don't affect the bounds.
       if (active_el && (*active_el)[i] == false) { continue; }
 
-      x_min.HostReadWrite();
-      x_max.HostReadWrite();
       pfes_bounds.GetElementDofs(i, dofsCG);
      for (int j = 0; j < dofsCG.Size(); j++)
      {
@@ -452,8 +458,8 @@ void DofInfo::ComputeOverlapBounds(const Vector &el_min,
          x_max(dofsCG[j]) = std::max(x_max(dofsCG[j]), el_max(i));
       }
    }
-   Array<double> minvals(x_min.GetData(), x_min.Size());
-   Array<double> maxvals(x_max.GetData(), x_max.Size());
+   Array<double> minvals(x_min.HostReadWrite(), x_min.Size());
+   Array<double> maxvals(x_max.HostReadWrite(), x_max.Size());
    gcomm.Reduce(minvals, GroupCommunicator::Min);
    gcomm.Bcast(minvals);
    gcomm.Reduce(maxvals, GroupCommunicator::Max);
@@ -1069,6 +1075,127 @@ void MixedConvectionIntegrator::AssembleElementMatrix2(
    }
 }
 
+Mesh *CartesianMesh(int dim, int mpi_cnt, int elem_per_mpi, bool print,
+                    int &par_ref, int **partitioning)
+{
+   MFEM_VERIFY(dim > 1, "Not implemented for 1D meshes.");
+
+   auto factor = [&](int N)
+   {
+      for (int i = static_cast<int>(sqrt(N)); i > 0; i--)
+      { if (N % i == 0) { return i; } }
+      return 1;
+   };
+
+   par_ref = 0;
+   const int ref_factor = (dim == 2) ? 4 : 8;
+
+   // Elements per task before performing parallel refinements.
+   // This will be used to form the serial mesh.
+   int el0 = elem_per_mpi;
+   while (el0 % ref_factor == 0)
+   {
+      el0 /= ref_factor;
+      par_ref++;
+   }
+
+   // In the serial mesh we have:
+   // The number of MPI blocks is mpi_cnt = mpi_x.mpi_y.mpi_z.
+   // The size of each MPI block is el0 = el0_x.el0_y.el0_z.
+   int mpi_x, mpi_y, mpi_z;
+   int el0_x, el0_y, el0_z;
+   if (dim == 2)
+   {
+      mpi_x = factor(mpi_cnt);
+      mpi_y = mpi_cnt / mpi_x;
+
+      // Switch order for better balance.
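+      // e.g. el0 = 6: el0_y = factor(6) = 2 and el0_x = 3, so the larger
+      // factor goes to x (factor() returns the largest divisor <= sqrt(N)).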
+      el0_y = factor(el0);
+      el0_x = el0 / el0_y;
+   }
+   else
+   {
+      mpi_x = factor(mpi_cnt);
+      mpi_y = factor(mpi_cnt / mpi_x);
+      mpi_z = mpi_cnt / mpi_x / mpi_y;
+
+      // Switch order for better balance.
+      el0_z = factor(el0);
+      el0_y = factor(el0 / el0_z);
+      el0_x = el0 / el0_y / el0_z;
+   }
+
+   if (print && dim == 2)
+   {
+      int elem_par_x = mpi_x * el0_x * pow(2, par_ref),
+          elem_par_y = mpi_y * el0_y * pow(2, par_ref);
+
+      std::cout << "--- Mesh generation: \n";
+      std::cout << "Par mesh: " << elem_par_x << " x " << elem_par_y
+                << " (" << elem_par_x * elem_par_y << " elements)\n"
+                << "Elem / task: "
+                << el0_x * pow(2, par_ref) << " x "
+                << el0_y * pow(2, par_ref)
+                << " (" << el0_x * pow(2, 2*par_ref) * el0_y << " elements)\n"
+                << "MPI blocks: " << mpi_x << " x " << mpi_y
+                << " (" << mpi_x * mpi_y << " mpi tasks)\n" << "-\n"
+                << "Serial mesh: "
+                << mpi_x * el0_x << " x " << mpi_y * el0_y
+                << " (" << mpi_x * el0_x * mpi_y * el0_y << " elements)\n"
+                << "Elem / task: " << el0_x << " x " << el0_y << std::endl
+                << "Par refine: " << par_ref << std::endl;
+      std::cout << "--- \n";
+   }
+
+   if (print && dim == 3)
+   {
+      int elem_par_x = mpi_x * el0_x * pow(2, par_ref),
+          elem_par_y = mpi_y * el0_y * pow(2, par_ref),
+          elem_par_z = mpi_z * el0_z * pow(2, par_ref);
+
+      std::cout << "--- Mesh generation: \n";
+      std::cout << "Par mesh: "
+                << elem_par_x << " x " << elem_par_y << " x " << elem_par_z
+                << " (" << elem_par_x*elem_par_y*elem_par_z << " elements)\n"
+                << "Elem / task: "
+                << el0_x * pow(2, par_ref) << " x "
+                << el0_y * pow(2, par_ref) << " x "
+                << el0_z * pow(2, par_ref)
+                << " (" << el0_x*pow(2, 3*par_ref)*el0_y*el0_z << " elements)\n"
+                << "MPI blocks: " << mpi_x << " x " << mpi_y << " x " << mpi_z
+                << " (" << mpi_x * mpi_y * mpi_z << " mpi tasks)\n" << "-\n"
+                << "Serial mesh: "
+                << mpi_x*el0_x << " x " << mpi_y*el0_y << " x " << mpi_z*el0_z
+                << " (" << mpi_x*el0_x*mpi_y*el0_y*mpi_z*el0_z << " elements)\n"
+                << "Elem / task: "
+                << el0_x << " x " << el0_y << " x " << el0_z << std::endl
+                << "Par refine: " << par_ref << std::endl;
+      std::cout << "--- \n";
+   }
+
+   Mesh *mesh;
+   if (dim == 2)
+   {
+      mesh = new Mesh(Mesh::MakeCartesian2D(mpi_x * el0_x,
+                                            mpi_y * el0_y,
+                                            Element::QUADRILATERAL, true));
+   }
+   else
+   {
+      mesh = new Mesh(Mesh::MakeCartesian3D(mpi_x * el0_x,
+                                            mpi_y * el0_y,
+                                            mpi_z * el0_z,
+                                            Element::HEXAHEDRON, true));
+   }
+
+   // dim is 2 or 3 here; a fixed-size array avoids a non-standard VLA.
+   int nxyz[3];
+   if (dim == 2) { nxyz[0] = mpi_x; nxyz[1] = mpi_y; }
+   else          { nxyz[0] = mpi_x; nxyz[1] = mpi_y; nxyz[2] = mpi_z; }
+   *partitioning = mesh->CartesianPartitioning(nxyz);
+
+   return mesh;
+}
+
 int GetLocalFaceDofIndex3D(int loc_face_id, int face_orient,
                            int face_dof_id, int face_dof1D_cnt)
 {
diff --git a/remhos_tools.hpp b/remhos_tools.hpp
index 624c8d7..6b29897 100644
--- a/remhos_tools.hpp
+++ b/remhos_tools.hpp
@@ -23,6 +23,9 @@
 namespace mfem
 {
 
+Mesh *CartesianMesh(int dim, int mpi_cnt, int elem_per_mpi, bool print,
+                    int &par_ref, int **partitioning);
+
 int GetLocalFaceDofIndex(int dim, int loc_face_id, int face_orient,
                          int face_dof_id, int face_dof1D_cnt);
 
@@ -46,6 +49,20 @@ void VisualizeField(socketstream &sock, const char *vishost, int visport,
 
 class DofInfo;
 
+struct TimingData
+{
+   // Total times for all major computations.
+   StopWatch sw_rhs, sw_L2inv, sw_LO, sw_FCT;
+
+   // Store the number of local L2 dofs.
+   const HYPRE_Int L2dof;
+
+   // Iterations for the L2 mass inversion.
+   HYPRE_Int L2iter;
+
+   TimingData(const HYPRE_Int l2d) : L2dof(l2d), L2iter(0) { }
+};
+
 class SmoothnessIndicator
 {
 private: