diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
index 1105b3e..d43763d 100644
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@@ -115,7 +115,7 @@ jobs:
     - name: Archive test results patch
       if: always()
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@v4
       with:
         name: baseline-patch
        path: remhos/baseline.patch
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7ac56b3
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+remhos
+*.dat
+*.mesh
+*.o
+*.gf
+*.svg
+.DS_Store
+/build/
+/.cache/
+/.vscode/
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..7ddec8b
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,85 @@
+cmake_minimum_required(VERSION 3.18)
+
+set(project remhos)
+project(${project} LANGUAGES CXX)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_COMPILER_LAUNCHER ccache)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# CXX flags *******************************************************************
+if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+  add_compile_options(-fsanitize=address -O0)
+  add_link_options(-fsanitize=address)
+endif()
+
+# remove -DNDEBUG from the default RelWithDebInfo flags, but keep -g
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g" CACHE STRING "" FORCE)
+
+# Verbosity options ***********************************************************
+set(CMAKE_VERBOSE_MAKEFILE OFF CACHE BOOL "" FORCE)
+# set(CUDA_VERBOSE_BUILD OFF CACHE BOOL "" FORCE)
+
+# The test drivers define their own main() and call remhos() directly, so
+# remhos.cpp must not compile its main(); this has to hold on every platform.
+add_compile_definitions(MFEM_USE_CMAKE_TESTS)
+
+# Include paths ***************************************************************
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../mfem)
+if (${CMAKE_SYSTEM_NAME} MATCHES "Linux")
+  include_directories(/usr/include/hypre /usr/include)
+elseif(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+  include_directories(/opt/homebrew/opt/fmt/include
+                      /opt/homebrew/opt/openmpi/include
+                      /opt/homebrew/opt/metis/include
+                      /opt/homebrew/opt/hypre/include)
+else()
+  message(FATAL_ERROR "Unsupported system")
+endif()
+
+# Copy mesh files *************************************************************
+file(GLOB SRC_MESH_FILES LIST_DIRECTORIES false data/*.mesh)
+set(BUILD_DATA_DIR ${CMAKE_CURRENT_BINARY_DIR}/data)
+add_custom_command(OUTPUT data_is_copied
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${BUILD_DATA_DIR}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${SRC_MESH_FILES} ${BUILD_DATA_DIR}
+  COMMAND ${CMAKE_COMMAND} -E touch data_is_copied
+  COMMENT "Copying mesh files ...")
+add_custom_target(copy_mesh_files DEPENDS data_is_copied)
+
+# Library source files ********************************************************
+add_library(Remhos STATIC remhos.cpp remhos_fct.cpp
+                          remhos_ho.cpp remhos_lo.cpp
+                          remhos_mono.cpp remhos_sync.cpp
+                          remhos_tools.cpp)
+
+# Testing options *************************************************************
+enable_testing()
+function(add_remhos_test name)
+  set(file ${name}.cpp)
+  set(target ${name})
+  list(LENGTH ARGN ARGN_COUNT)
+  if(ARGN_COUNT GREATER 0)
+    list(GET ARGN 0 extra)
+    string(APPEND name "_" ${extra})
+    string(APPEND target "_" ${extra})
+  endif()
+  message(STATUS "Adding target: ${target}")
+  add_executable(${target} ${file})
+  add_dependencies(${target} copy_mesh_files)
+  target_link_libraries(${target} Remhos -lmpi -lmfem -lhypre -lmetis -lfmt)
+  if (${CMAKE_SYSTEM_NAME} MATCHES "Linux")
+    target_link_directories(${target} PUBLIC /usr/lib/x86_64-linux-gnu)
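+    # NOTE: the hard-coded apt/Homebrew paths in this file assume stock
+    # package layouts. A sketch of making them overridable from the command
+    # line (REMHOS_SYS_LIB_DIR is illustrative, not a variable this project
+    # defines):
+    #   set(REMHOS_SYS_LIB_DIR /usr/lib/x86_64-linux-gnu CACHE PATH "system libs")
+    #   target_link_directories(${target} PUBLIC ${REMHOS_SYS_LIB_DIR})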
+  elseif(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+    target_link_directories(${target} PUBLIC /opt/homebrew/opt/openmpi/lib
+                                             /opt/homebrew/opt/metis/lib
+                                             /opt/homebrew/opt/hypre/lib
+                                             /opt/homebrew/opt/fmt/lib)
+  endif()
+  target_link_directories(${target} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../mfem)
+  message(STATUS "Adding test: ${target}_n1")
+  add_test(NAME ${target}_n1 COMMAND mpirun -n 1 ${target} ${extra})
+  message(STATUS "Adding test: ${target}_n3")
+  add_test(NAME ${target}_n3 COMMAND mpirun -n 3 ${target})
+endfunction()
+
+add_remhos_test(remhos_main)
+add_remhos_test(remhos_tests)
\ No newline at end of file
diff --git a/makefile b/makefile
index 81957b7..1434602 100644
--- a/makefile
+++ b/makefile
@@ -64,6 +64,18 @@ TEST_MK = $(MFEM_DIR)/config/test.mk
 MFEM_DIR1 := $(MFEM_DIR)
 MFEM_DIR2 := $(realpath $(MFEM_DIR))
 
+# Use Caliper annotations
+
+ifdef CALIPER_DIR
+CALIPER_DIR = $(shell spack location --install-dir caliper)
+ADIAK_DIR = $(shell spack location --install-dir adiak)
+CALIPER_FLAGS = -I${CALIPER_DIR}/include -DUSE_CALIPER
+ADIAK_FLAGS = -I${ADIAK_DIR}/include
+ADIAK_LDFLAGS = -L${ADIAK_DIR}/lib -ladiak
+CALIPER_LDFLAGS = -L${CALIPER_DIR}/lib64 -lcaliper
+endif
+
+
 # Use the compiler used by MFEM. Get the compiler and the options for compiling
 # and linking from MFEM's config.mk. (Skip this if the target does not require
 # building.)
@@ -73,7 +85,7 @@ ifeq (,$(filter help clean distclean style,$(MAKECMDGOALS)))
 endif
 
 CXX = $(MFEM_CXX)
-CPPFLAGS = $(MFEM_CPPFLAGS)
+CPPFLAGS = $(MFEM_CPPFLAGS) $(CALIPER_FLAGS) $(ADIAK_FLAGS)
 CXXFLAGS = $(MFEM_CXXFLAGS)
 
 # MFEM config does not define C compiler
@@ -81,7 +93,8 @@ CC = gcc
 CFLAGS = -O3
 
 # Optional link flags
-LDFLAGS =
+LDFLAGS = $(CALIPER_LDFLAGS) $(ADIAK_LDFLAGS)
+
 OPTIM_OPTS = -O3
 DEBUG_OPTS = -g -Wall -std=c++11
@@ -102,7 +115,8 @@ ifeq ($(REMHOS_DEBUG),YES)
 endif
 
 LIBS = $(strip $(REMHOS_LIBS) $(LDFLAGS))
-CCC = $(strip $(CXX) $(REMHOS_FLAGS))
+EXTRA_INC_DIR = $(or $(wildcard $(MFEM_DIR)/include/mfem),$(MFEM_DIR))
+CCC = $(strip $(CXX) $(REMHOS_FLAGS) $(if $(EXTRA_INC_DIR),-I$(EXTRA_INC_DIR)))
 Ccc = $(strip $(CC) $(CFLAGS) $(GL_OPTS))
 
 SOURCE_FILES = remhos.cpp remhos_tools.cpp remhos_lo.cpp remhos_ho.cpp \
diff --git a/remhos.cpp b/remhos.cpp
index 1bda644..de1e514 100644
--- a/remhos.cpp
+++ b/remhos.cpp
@@ -39,14 +39,27 @@
 #include "remhos_tools.hpp"
 #include "remhos_sync.hpp"
 
+#ifdef USE_CALIPER
+#include <caliper/cali.h>
+#include <caliper/cali-manager.h>
+//#include <adiak.hpp>
+#ifdef HAVE_MPI
+#include <caliper/cali-mpi.h>
+#endif
+#endif
+
 using namespace std;
 using namespace mfem;
 
+#define REMHOS_GPU_SETUP
+
 enum class HOSolverType {None, Neumann, CG, LocalInverse};
 enum class FCTSolverType {None, FluxBased, ClipScale,
-                          NonlinearPenalty, FCTProject};
+                          NonlinearPenalty, FCTProject
+                         };
 enum class LOSolverType {None, DiscrUpwind, DiscrUpwindPrec,
-                         ResDist, ResDistSubcell, MassBased};
+                         ResDist, ResDistSubcell, MassBased
+                        };
 enum class MonolithicSolverType {None, ResDistMono, ResDistMonoSubcell};
 
 enum class TimeStepControl {FixedTimeStep, LOBoundsError};
@@ -72,13 +85,16 @@ double inflow_function(const Vector &x);
 // Mesh bounding box
 Vector bb_min, bb_max;
 
+// Caliper setup.
+void setupCaliper();
+
 class AdvectionOperator : public TimeDependentOperator
 {
 private:
    BilinearForm &Mbf, &ml;
    ParBilinearForm &Kbf;
    ParBilinearForm &M_HO, &K_HO;
-   Vector &lumpedM;
+   Vector &lumpedM, ones;
 
    Vector start_mesh_pos, start_submesh_pos;
    GridFunction &mesh_pos, *submesh_pos, &mesh_vel, &submesh_vel;
 
@@ -88,6 +104,8 @@ class AdvectionOperator : public TimeDependentOperator
    double dt;
    TimeStepControl dt_control;
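+   // Stable time-step estimate (see UpdateTimeStepEstimate()); only
+   // refreshed when dt_control != TimeStepControl::FixedTimeStep.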
    mutable double dt_est;
+   mutable TimingData timer;
+
    Assembly &asmbl;
 
    LowOrderMethod &lom;
@@ -132,16 +150,30 @@ class AdvectionOperator : public TimeDependentOperator
       start_submesh_pos = sm_pos;
    }
 
+   void PrintTimingData(int steps) const;
+
    virtual ~AdvectionOperator() { }
 };
 
+#ifndef MFEM_USE_CMAKE_TESTS
+int remhos(int, char *[], double &);
+
 int main(int argc, char *argv[])
+{
+   double final_mass_u = M_PI; // unused
+   return remhos(argc, argv, final_mass_u);
+}
+#endif // MFEM_USE_CMAKE_TESTS
+
+MFEM_EXPORT int remhos(int argc, char *argv[], double &final_mass_u)
 {
    // Initialize MPI.
    mfem::MPI_Session mpi(argc, argv);
    const int myid = mpi.WorldRank();
 
-   const char *mesh_file = "data/periodic-square.mesh";
+   const char *mesh_file = "default";
+   int dim = 3;
+   int elem_per_mpi = 1;
    int rs_levels = 2;
    int rp_levels = 0;
    int order = 3;
@@ -158,7 +190,9 @@ int main(int argc, char *argv[])
    double t_final = 4.0;
    TimeStepControl dt_control = TimeStepControl::FixedTimeStep;
    double dt = 0.005;
+   int max_tsteps = -1;
    bool visualization = true;
+   bool save_meshes_and_solution = false;
    bool visit = false;
    bool verify_bounds = false;
    bool product_sync = false;
@@ -171,6 +205,9 @@ int main(int argc, char *argv[])
    OptionsParser args(argc, argv);
    args.AddOption(&mesh_file, "-m", "--mesh", "Mesh file to use.");
+   args.AddOption(&dim, "-dim", "--dimension", "Dimension of the problem.");
+   args.AddOption(&elem_per_mpi, "-epm", "--elem-per-mpi",
+                  "Number of elements per MPI task.");
    args.AddOption(&problem_num, "-p", "--problem",
                   "Problem setup to use. See options in velocity_function().");
    args.AddOption(&rs_levels, "-rs", "--refine-serial",
@@ -227,9 +264,14 @@ int main(int argc, char *argv[])
                   " 1 - Bounds violation of the LO sltn.");
    args.AddOption(&dt, "-dt", "--time-step",
                   "Initial time step size (dt might change based on -dtc).");
+   args.AddOption(&max_tsteps, "-ms", "--max-steps",
+                  "Maximum number of steps (negative means no restriction).");
    args.AddOption(&visualization, "-vis", "--visualization", "-no-vis",
                   "--no-visualization",
                   "Enable or disable GLVis visualization.");
+   args.AddOption(&save_meshes_and_solution, "-save",
+                  "--save-meshes-and-solution",
+                  "-no-save", "--no-save-meshes-and-solution",
+                  "Print the final meshes and solution.");
    args.AddOption(&visit, "-visit", "--visit-datafiles", "-no-visit",
                   "--no-visit-datafiles",
                   "Save data files for VisIt (visit.llnl.gov) visualization.");
@@ -249,6 +291,25 @@
    }
    if (myid == 0) { args.PrintOptions(cout); }
 
+   // Set up the Caliper config manager.
+#ifdef USE_CALIPER
+   setupCaliper();
+
+   cali::ConfigManager calimgr;
+   if (calimgr.error())
+   {
+      std::cerr << "caliper config error: " << calimgr.error_msg() << std::endl;
+   }
+   calimgr.start();
+   //adiak::init(nullptr);
+   //adiak::cmdline();
+   //adiak::hostname();
+#endif
+
+#ifdef REMHOS_GPU_SETUP
+   MFEM_VERIFY(ho_type == HOSolverType::LocalInverse &&
+               lo_type == LOSolverType::MassBased &&
+               fct_type == FCTSolverType::ClipScale, "Wrong GPU setup.");
+#endif
+
    // Enable hardware devices such as GPUs, and programming models such as
    // CUDA, OCCA, RAJA and OpenMP based on command line options.
    Device device(device_config);
@@ -259,29 +320,47 @@
    else if (problem_num < 20) { exec_mode = 1; }
    else { MFEM_ABORT("Unspecified execution mode."); }
 
-   // Read the serial mesh from the given mesh file on all processors.
-   // Refine the mesh in serial to increase the resolution.
-   Mesh *mesh = new Mesh(Mesh::LoadFromFile(mesh_file, 1, 1));
-   const int dim = mesh->Dimension();
-   for (int lev = 0; lev < rs_levels; lev++) { mesh->UniformRefinement(); }
-   mesh->GetBoundingBox(bb_min, bb_max, max(order, 1));
-
-   // Only standard assembly in 1D (some mfem functions just abort in 1D).
-   if ((pa || next_gen_full) && dim == 1)
+   Mesh *mesh = nullptr;
+   int *mpi_partitioning = nullptr;
+   if (strncmp(mesh_file, "default", 7) != 0)
    {
-      MFEM_WARNING("Disabling PA / FA for 1D.");
-      pa = false;
-      next_gen_full = false;
+      // Read the serial mesh from the given mesh file on all processors.
+      // Refine the mesh in serial to increase the resolution.
+      mesh = new Mesh(Mesh::LoadFromFile(mesh_file, 1, 1));
+      for (int lev = 0; lev < rs_levels; lev++) { mesh->UniformRefinement(); }
+   }
+   else
+   {
+      mesh = CartesianMesh(dim, Mpi::WorldSize(), elem_per_mpi, myid == 0,
+                           rp_levels, &mpi_partitioning);
    }
+   dim = mesh->Dimension();
+   mesh->GetBoundingBox(bb_min, bb_max, max(order, 1));
 
    // Parallel partitioning of the mesh.
    // Refine the mesh further in parallel to increase the resolution.
-   ParMesh pmesh(MPI_COMM_WORLD, *mesh);
+   ParMesh pmesh(MPI_COMM_WORLD, *mesh, mpi_partitioning);
    delete mesh;
+   delete [] mpi_partitioning;
    for (int lev = 0; lev < rp_levels; lev++) { pmesh.UniformRefinement(); }
    MPI_Comm comm = pmesh.GetComm();
    const int NE = pmesh.GetNE();
 
+   if (strncmp(mesh_file, "default", 7) == 0)
+   {
+      MFEM_VERIFY(pmesh.GetGlobalNE() == Mpi::WorldSize() * elem_per_mpi,
+                  "Mesh generation error.");
+      MFEM_VERIFY(NE == elem_per_mpi, "Mesh generation error.");
+   }
+
+   // Only standard assembly in 1D (some mfem functions just abort in 1D).
+   if ((pa || next_gen_full) && dim == 1)
+   {
+      MFEM_WARNING("Disabling PA / FA for 1D.");
+      pa = false;
+      next_gen_full = false;
+   }
+
    // Define the ODE solver used for time integration. Several explicit
    // Runge-Kutta methods are available.
    ODESolver *ode_solver = NULL;
@@ -361,14 +440,24 @@ int main(int argc, char *argv[])
    v.ProjectCoefficient(vcoeff);
 
    double t = 0.0;
+#ifdef USE_CALIPER
+   CALI_CXX_MARK_LOOP_BEGIN(mainloop, "rem.mainloop");
+#endif
    while (t < t_final)
    {
+#ifdef USE_CALIPER
+      CALI_CXX_MARK_LOOP_ITERATION(mainloop, t);
+#endif
       t += dt;
       // Move the mesh nodes.
      x.Add(std::min(dt, t_final-t), v);
      // Update the node velocities.
      v.ProjectCoefficient(vcoeff);
    }
+#ifdef USE_CALIPER
+   CALI_CXX_MARK_LOOP_END(mainloop);
+#endif
 
    // Pseudotime velocity.
    add(x, -1.0, x0, v_gf);
@@ -476,6 +565,9 @@ int main(int argc, char *argv[])
    {
       M_HO.SetAssemblyLevel(AssemblyLevel::PARTIAL);
       K_HO.SetAssemblyLevel(AssemblyLevel::PARTIAL);
+
+      k.SetAssemblyLevel(AssemblyLevel::PARTIAL);
+      m.SetAssemblyLevel(AssemblyLevel::PARTIAL);
    }
 
    if (next_gen_full)
@@ -493,12 +585,13 @@
    }
 
    // Compute the lumped mass matrix.
-   Vector lumpedM;
    ParBilinearForm ml(&pfes);
    ml.AddDomainIntegrator(new LumpedIntegrator(new MassIntegrator));
-   ml.Assemble();
-   ml.Finalize();
-   ml.SpMat().GetDiag(lumpedM);
+   if (!pa)
+   {
+      ml.Assemble();
+      ml.Finalize();
+   }
 
    m.Assemble();
    m.Finalize();
@@ -506,6 +599,16 @@
    k.Assemble(skip_zeros);
    k.Finalize(skip_zeros);
 
+   Vector lumpedM;
+   if (!pa) { ml.SpMat().GetDiag(lumpedM); }
+   else
+   {
+      lumpedM.SetSize(m.Height());
+      Vector ones(m.Height());
+      ones = 1.0;
+      m.Mult(ones, lumpedM);
+   }
+
    // Store topological dof data.
    DofInfo dofs(pfes, bounds_type);
@@ -770,8 +873,7 @@
    {
       MFEM_VERIFY(ho_solver != nullptr,
                   "Mass-Based LO solver requires a choice of a HO solver.");
-      lo_solver = new MassBasedAvg(pfes, *ho_solver,
-                                   (exec_mode == 1) ? &v_gf : nullptr);
+      lo_solver = new MassBasedAvg(pfes, *ho_solver);
    }
 
    // Setup of the monolithic solver (if any).
@@ -793,18 +895,21 @@
    }
 
    // Print the starting meshes and initial condition.
-   ofstream meshHO("meshHO_init.mesh");
-   meshHO.precision(precision);
-   pmesh.PrintAsOne(meshHO);
-   if (subcell_mesh)
+   if (save_meshes_and_solution)
    {
-      ofstream meshLO("meshLO_init.mesh");
-      meshLO.precision(precision);
-      subcell_mesh->PrintAsOne(meshLO);
+      ofstream meshHO("meshHO_init.mesh");
+      meshHO.precision(precision);
+      pmesh.PrintAsOne(meshHO);
+      if (subcell_mesh)
+      {
+         ofstream meshLO("meshLO_init.mesh");
+         meshLO.precision(precision);
+         subcell_mesh->PrintAsOne(meshLO);
+      }
+      ofstream sltn("sltn_init.gf");
+      sltn.precision(precision);
+      u.SaveAsOne(sltn);
    }
-   ofstream sltn("sltn_init.gf");
-   sltn.precision(precision);
-   u.SaveAsOne(sltn);
 
    // Create data collection for solution output: either VisItDataCollection for
    // ASCII data files, or SidreDataCollection for binary data files.
@@ -1062,6 +1167,8 @@ int main(int argc, char *argv[])
          else { res = u; }
       }
 
+      if (ti_total == max_tsteps) { done = true; }
+
      if (done || ti % vis_steps == 0)
      {
         if (myid == 0)
@@ -1096,6 +1203,16 @@
      }
   }
 
+   int steps = ti_total;
+   switch (ode_solver_type)
+   {
+      case 2: steps *= 2; break;
+      case 3: steps *= 3; break;
+      case 4: steps *= 4; break;
+      case 6: steps *= 6; break;
+   }
+   adv.PrintTimingData(steps);
+
    if (dt_control != TimeStepControl::FixedTimeStep && myid == 0)
    {
       cout << "Total time steps: " << ti_total
@@ -1103,6 +1220,7 @@
    }
 
    // Print the final meshes and solution.
+   if (save_meshes_and_solution)
    {
       ofstream meshHO("meshHO_final.mesh");
      meshHO.precision(precision);
@@ -1122,10 +1240,23 @@
    double mass_u_loc = 0.0, mass_us_loc = 0.0;
    if (exec_mode == 1)
    {
-      ml.BilinearForm::operator=(0.0);
-      ml.Assemble();
-      lumpedM.HostRead();
-      ml.SpMat().GetDiag(lumpedM);
+      // Using lumped diagonal for mass conservation.
+      if (!pa)
+      {
+         ml.BilinearForm::operator=(0.0);
+         ml.Assemble();
+         lumpedM.HostRead();
+         ml.SpMat().GetDiag(lumpedM);
+      }
+      else
+      {
+         ParBilinearForm m(&pfes);
+         m.AddDomainIntegrator(new MassIntegrator);
+         m.Assemble();
+         Vector ones(m.Height());
+         ones = 1.0;
+         m.Mult(ones, lumpedM);
+      }
       mass_u_loc = lumpedM * u;
       if (product_sync) { mass_us_loc = lumpedM * us; }
    }
@@ -1136,6 +1267,7 @@
    }
    double mass_u, mass_us = 0.0, s_max = 0.0;
    MPI_Allreduce(&mass_u_loc, &mass_u, 1, MPI_DOUBLE, MPI_SUM, comm);
+   final_mass_u = mass_u;
    const double umax_loc = u.Max();
    MPI_Allreduce(&umax_loc, &umax, 1, MPI_DOUBLE, MPI_MAX, comm);
    if (product_sync)
@@ -1194,7 +1326,7 @@ int main(int argc, char *argv[])
       }
    }
 
-   if (smth_indicator)
+   if (smth_indicator && save_meshes_and_solution)
    {
       // Print the values of the smoothness indicator.
       ParGridFunction si_val;
@@ -1228,7 +1360,9 @@ int main(int argc, char *argv[])
       delete lom.SubFes1;
       delete lom.VolumeTerms;
    }
-
+#ifdef USE_CALIPER
+   calimgr.flush();
+#endif
    return 0;
 }
 
@@ -1245,12 +1379,21 @@ AdvectionOperator::AdvectionOperator(int size, BilinearForm &Mbf_,
    TimeDependentOperator(size), Mbf(Mbf_), ml(_ml), Kbf(Kbf_),
    M_HO(M_HO_), K_HO(K_HO_),
    lumpedM(_lumpedM),
+   ones(lumpedM.Size()),
    start_mesh_pos(pos.Size()), start_submesh_pos(sub_vel.Size()),
    mesh_pos(pos), submesh_pos(sub_pos),
    mesh_vel(vel), submesh_vel(sub_vel),
    x_gf(Kbf.ParFESpace()),
+   timer(M_HO.Size()),
    asmbl(_asmbl), lom(_lom), dofs(_dofs),
-   ho_solver(hos), lo_solver(los), fct_solver(fct), mono_solver(mos) { }
+   ho_solver(hos), lo_solver(los), fct_solver(fct), mono_solver(mos)
+{
+   if (ho_solver) { ho_solver->timer = &timer; }
+   if (lo_solver) { lo_solver->timer = &timer; }
+   if (fct_solver) { fct_solver->timer = &timer; }
+
+   ones = 1.0;
+}
 
 void AdvectionOperator::Mult(const Vector &X, Vector &Y) const
 {
@@ -1266,21 +1409,33 @@ void AdvectionOperator::Mult(const Vector &X, Vector &Y) const
       // Reset precomputed geometric data.
       Mbf.FESpace()->GetMesh()->DeleteGeometricFactors();
 
+#ifndef REMHOS_GPU_SETUP
       // Reassemble on the new mesh. Element contributions.
       // Currently needed to have the sparse matrices used by the LO methods.
       Mbf.BilinearForm::operator=(0.0);
       Mbf.Assemble();
       Kbf.BilinearForm::operator=(0.0);
       Kbf.Assemble(0);
-      ml.BilinearForm::operator=(0.0);
-      ml.Assemble();
-      lumpedM.HostReadWrite();
-      ml.SpMat().GetDiag(lumpedM);
+#endif
 
+      timer.sw_L2inv.Start();
       M_HO.BilinearForm::operator=(0.0);
       M_HO.Assemble();
+      timer.sw_L2inv.Stop();
+
+      if (Mbf.GetAssemblyLevel() != AssemblyLevel::PARTIAL)
+      {
+         ml.BilinearForm::operator=(0.0);
+         ml.Assemble();
+         lumpedM.HostReadWrite();
+         ml.SpMat().GetDiag(lumpedM);
+      }
+      else { M_HO.Mult(ones, lumpedM); }
+
+      timer.sw_rhs.Start();
       K_HO.BilinearForm::operator=(0.0);
       K_HO.Assemble(0);
+      timer.sw_rhs.Stop();
 
      if (lom.pk)
      {
@@ -1288,6 +1443,7 @@ void AdvectionOperator::Mult(const Vector &X, Vector &Y) const
         lom.pk->Assemble();
     }
 
+#ifndef REMHOS_GPU_SETUP
      // Face contributions.
      asmbl.bdrInt = 0.;
      Mesh *mesh = M_HO.FESpace()->GetMesh();
@@ -1317,6 +1473,7 @@ void AdvectionOperator::Mult(const Vector &X, Vector &Y) const
           }
        }
     }
+#endif
  }
 
   const int size = Kbf.ParFESpace()->GetVSize();
@@ -1339,11 +1496,20 @@ void AdvectionOperator::Mult(const Vector &X, Vector &Y) const
   {
      MFEM_VERIFY(ho_solver && lo_solver, "FCT requires HO and LO solvers.");
 
-      lo_solver->CalcLOSolution(u, du_LO);
+      // High-order solution.
       ho_solver->CalcHOSolution(u, du_HO);
 
+      auto mba = dynamic_cast<MassBasedAvg *>(lo_solver);
+      if (mba) { mba->SetHOSolution(du_HO); }
+
+      // Low-order solution.
+      lo_solver->CalcLOSolution(u, du_LO);
+
+      // Computation of bounds and FCT blending.
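+      // NOTE: sw_FCT below brackets only the bounds computation here;
+      // ClipScaleSolver::CalcFCTSolution() restarts the same stopwatch
+      // internally, so both intervals accumulate into one FCT time.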
+      timer.sw_FCT.Start();
       dofs.ComputeElementsMinMax(u, dofs.xe_min, dofs.xe_max, NULL, NULL);
       dofs.ComputeBounds(dofs.xe_min, dofs.xe_max, dofs.xi_min, dofs.xi_max);
+      timer.sw_FCT.Stop();
 
       fct_solver->CalcFCTSolution(x_gf, lumpedM, du_HO, du_LO,
                                   dofs.xi_min, dofs.xi_max, d_u);
@@ -1442,6 +1608,39 @@ void AdvectionOperator::Mult(const Vector &X, Vector &Y) const
    }
 }
 
+void AdvectionOperator::PrintTimingData(int steps) const
+{
+   const MPI_Comm com = M_HO.ParFESpace()->GetComm();
+   const int myid = M_HO.ParFESpace()->GetMyRank();
+
+   double my_rt[5], T[5];
+   my_rt[0] = timer.sw_rhs.RealTime();
+   my_rt[1] = timer.sw_L2inv.RealTime();
+   my_rt[2] = timer.sw_LO.RealTime();
+   my_rt[3] = timer.sw_FCT.RealTime();
+   my_rt[4] = my_rt[0] + my_rt[2] + my_rt[3];
+   MPI_Reduce(my_rt, T, 5, MPI_DOUBLE, MPI_MAX, 0, com);
+
+   const HYPRE_BigInt dofs_steps = M_HO.ParFESpace()->GlobalVSize() * steps;
+
+   if (myid == 0)
+   {
+      std::cout << "---" << std::endl;
+      std::cout << "RHS kernel time: " << T[0] << std::endl
+                << "L2inv kernel time: " << T[1] << std::endl
+                << "LO kernel time: " << T[2] << std::endl
+                << "FCT kernel time: " << T[3] << std::endl
+                << "Total kernel time: " << T[4] << std::endl;
+      std::cout << "---" << std::endl;
+      std::cout << "FOM RHS: " << 1e-6 * dofs_steps / T[0] << std::endl
+                << "FOM INV: " << 1e-6 * dofs_steps / T[1] << std::endl
+                << "FOM LO: " << 1e-6 * dofs_steps / T[2] << std::endl
+                << "FOM FCT: " << 1e-6 * dofs_steps / T[3] << std::endl
+                << "FOM: " << 1e-6 * dofs_steps / T[4] << std::endl;
+      std::cout << "(megadofs x time steps / second)\n---" << std::endl;
+   }
+}
+
 void AdvectionOperator::UpdateTimeStepEstimate(const Vector &x,
                                                const Vector &dx,
                                                const Vector &x_min,
@@ -1453,6 +1652,7 @@ void AdvectionOperator::UpdateTimeStepEstimate(const Vector &x,
    int n = x.Size();
    const double eps = 1e-12;
    double dt = numeric_limits<double>::infinity();
+   x_min.HostRead(), x_max.HostRead(), x.HostRead(), dx.HostRead();
 
    for (int i = 0; i < n; i++)
    {
@@ -1854,3 +2054,19 @@ double inflow_function(const Vector &x)
    }
    else { return 0.0; }
 }
+
+void setupCaliper()
+{
+#ifdef USE_CALIPER
+#ifdef HAVE_MPI
+   cali_mpi_init();
+#endif
+
+   cali_config_preset("CALI_LOG_VERBOSITY", "0");
+   cali_config_preset("CALI_CALIPER_ATTRIBUTE_DEFAULT_SCOPE", "process");
+
+   //cali_set_global_string_byname("rem.git_vers", GIT_VERS);
+   //cali_set_global_string_byname("rem.git_hash", GIT_HASH);
+#endif
+}
+
diff --git a/remhos_fct.cpp b/remhos_fct.cpp
index be2454b..d3cb1cc 100644
--- a/remhos_fct.cpp
+++ b/remhos_fct.cpp
@@ -56,8 +56,8 @@ void FCTSolver::CalcCompatibleLOProduct(const ParGridFunction &us,
          double s_avg = mass_us / mass_u;
 
          // Min and max of s using the full stencil of active dofs.
-         s_min_loc.SetDataAndSize(s_min.GetData() + k*ndofs, ndofs);
-         s_max_loc.SetDataAndSize(s_max.GetData() + k*ndofs, ndofs);
+         s_min_loc.SetDataAndSize(s_min.HostReadWrite() + k*ndofs, ndofs);
+         s_max_loc.SetDataAndSize(s_max.HostReadWrite() + k*ndofs, ndofs);
          double smin = numeric_limits<double>::infinity(),
                 smax = -numeric_limits<double>::infinity();
          for (int j = 0; j < ndofs; j++)
@@ -484,78 +484,93 @@ void ClipScaleSolver::CalcFCTSolution(const ParGridFunction &u, const Vector &m,
                                       const Vector &u_min, const Vector &u_max,
                                       Vector &du) const
 {
+   timer->sw_FCT.Start();
+
    const int NE = pfes.GetMesh()->GetNE();
    const int nd = pfes.GetFE(0)->GetDof();
-   Vector f_clip(nd);
-
-   int dof_id;
-   double sumPos, sumNeg, u_new_ho, u_new_lo, new_mass, f_clip_min, f_clip_max;
-   double umin, umax;
-   const double eps = 1.0e-15;
+   Vector f_clip(nd*NE);
 
    // Smoothness indicator.
    ParGridFunction si_val;
+
    if (smth_indicator)
    {
+      MFEM_ABORT("smth_indicator not supported in kernel!");
       smth_indicator->ComputeSmoothnessIndicator(u, si_val);
    }
 
-   u.HostRead();
-   m.HostRead();
-   du.HostReadWrite();
-   du_lo.HostRead(); du_ho.HostRead();
-   for (int k = 0; k < NE; k++)
+   const auto δt = dt;
+
+   const auto U = mfem::Reshape(u.Read(), NE*nd);
+   const auto M = mfem::Reshape(m.Read(), NE*nd);
+   const auto DU_LO = mfem::Reshape(du_lo.Read(), NE*nd);
+   const auto DU_HO = mfem::Reshape(du_ho.Read(), NE*nd);
+   const auto U_MIN = mfem::Reshape(u_min.Read(), NE*nd);
+   const auto U_MAX = mfem::Reshape(u_max.Read(), NE*nd);
+
+   auto F_CLIP = mfem::Reshape(f_clip.Write(), NE, nd);
+   auto DU = mfem::Reshape(du.Write(), NE*nd);
+
+   const bool update_bounds = smth_indicator != nullptr;
+   if (update_bounds) { MFEM_ABORT("UpdateBounds not ported to device"); }
+
+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int k)
    {
-      sumPos = sumNeg = 0.0;
+      constexpr auto eps = 1.0e-15;
+      double sumPos = 0.0, sumNeg = 0.0;
 
       // Clip.
      for (int j = 0; j < nd; j++)
      {
-         dof_id = k*nd+j;
+         const int dof_id = k*nd+j;
+
+         const auto u_new_lo = U(dof_id) + δt * DU_LO(dof_id);
 
-         u_new_ho = u(dof_id) + dt * du_ho(dof_id);
-         u_new_lo = u(dof_id) + dt * du_lo(dof_id);
+         auto umin = U_MIN(dof_id);
+         auto umax = U_MAX(dof_id);
 
-         umin = u_min(dof_id);
-         umax = u_max(dof_id);
-         if (smth_indicator)
+#if !defined(MFEM_USE_CUDA) && !defined(MFEM_USE_HIP)
+         if (update_bounds)
          {
+            const auto u_new_ho = U(dof_id) + δt * DU_HO(dof_id);
            smth_indicator->UpdateBounds(dof_id, u_new_ho, si_val, umin, umax);
          }
+#endif
 
-         f_clip_min = m(dof_id) / dt * (umin - u_new_lo);
-         f_clip_max = m(dof_id) / dt * (umax - u_new_lo);
+         const auto f_clip_min = M(dof_id) / δt * (umin - u_new_lo);
+         const auto f_clip_max = M(dof_id) / δt * (umax - u_new_lo);
 
-         f_clip(j) = m(dof_id) * (du_ho(dof_id) - du_lo(dof_id));
-         f_clip(j) = min(f_clip_max, max(f_clip_min, f_clip(j)));
+         F_CLIP(k,j) = M(dof_id) * (DU_HO(dof_id) - DU_LO(dof_id));
+         F_CLIP(k,j) = fmin(f_clip_max, fmax(f_clip_min, F_CLIP(k,j)));
 
-         sumNeg += min(f_clip(j), 0.0);
-         sumPos += max(f_clip(j), 0.0);
+         sumNeg += fmin(F_CLIP(k,j), 0.0);
+         sumPos += fmax(F_CLIP(k,j), 0.0);
      }
 
-      new_mass = sumNeg + sumPos;
+      double new_mass = sumNeg + sumPos;
 
       // Rescale.
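+      // If clipping created net mass (new_mass > 0), the positive fluxes are
+      // scaled by |sumNeg|/sumPos so positive and negative parts cancel
+      // exactly; the mirrored scaling handles net mass loss. Either way the
+      // element-local sum of the corrected fluxes returns to zero.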
      for (int j = 0; j < nd; j++)
      {
         if (new_mass > eps)
         {
-            f_clip(j) = min(0.0, f_clip(j)) -
-                        max(0.0, f_clip(j)) * sumNeg / sumPos;
+            F_CLIP(k,j) = fmin(0.0, F_CLIP(k,j)) -
+                          fmax(0.0, F_CLIP(k,j)) * sumNeg / sumPos;
         }
         if (new_mass < -eps)
         {
-            f_clip(j) = max(0.0, f_clip(j)) -
-                        min(0.0, f_clip(j)) * sumPos / sumNeg;
+            F_CLIP(k,j) = fmax(0.0, F_CLIP(k,j)) -
+                          fmin(0.0, F_CLIP(k,j)) * sumPos / sumNeg;
         }
 
         // Set du to the discrete time derivative featuring the high order
         // anti-diffusive reconstruction that leads to a forward Euler
         // updated admissible solution.
-         dof_id = k*nd+j;
-         du(dof_id) = du_lo(dof_id) + f_clip(j) / m(dof_id);
+         const int dof_id = k*nd+j;
+         DU(dof_id) = DU_LO(dof_id) + F_CLIP(k,j) / M(dof_id);
      }
-   }
+   });
+   timer->sw_FCT.Stop();
 }
 
 void ClipScaleSolver::CalcFCTProduct(const ParGridFunction &us, const Vector &m,
@@ -745,12 +760,13 @@ void ElementFCTProjection::CalcFCTSolution(const ParGridFunction &u,
    } // element loop
 }
 
-void ElementFCTProjection::CalcFCTProduct(const ParGridFunction &us, const Vector &m,
-                                          const Vector &d_us_HO, const Vector &d_us_LO,
-                                          Vector &s_min, Vector &s_max,
-                                          const Vector &u_new,
-                                          const Array<bool> &active_el,
-                                          const Array<bool> &active_dofs, Vector &d_us)
+void ElementFCTProjection::CalcFCTProduct(const ParGridFunction &us,
+                                          const Vector &m,
+                                          const Vector &d_us_HO, const Vector &d_us_LO,
+                                          Vector &s_min, Vector &s_max,
+                                          const Vector &u_new,
+                                          const Array<bool> &active_el,
+                                          const Array<bool> &active_dofs, Vector &d_us)
 {
    us.HostRead();
    s_min.HostReadWrite();
diff --git a/remhos_fct.hpp b/remhos_fct.hpp
index e3b9f5e..f5891c9 100644
--- a/remhos_fct.hpp
+++ b/remhos_fct.hpp
@@ -25,6 +25,7 @@ namespace mfem
 {
 
 class SmoothnessIndicator;
+struct TimingData;
 
 // Monotone, High-order, Conservative Solver.
 class FCTSolver
@@ -83,6 +84,8 @@ class FCTSolver
    {
       MFEM_ABORT("Product remap is not implemented for the chosen solver");
    }
+
+   TimingData *timer = nullptr;
 };
 
 class FluxBasedFCT : public FCTSolver
diff --git a/remhos_ho.cpp b/remhos_ho.cpp
index 0125acc..7edbc25 100644
--- a/remhos_ho.cpp
+++ b/remhos_ho.cpp
@@ -72,38 +72,61 @@ void CGHOSolver::CalcHOSolution(const Vector &u, Vector &du) const
 
 LocalInverseHOSolver::LocalInverseHOSolver(ParFiniteElementSpace &space,
                                            ParBilinearForm &Mbf,
                                            ParBilinearForm &Kbf)
-   : HOSolver(space), M(Mbf), K(Kbf) { }
+   : HOSolver(space), M(Mbf), K(Kbf)
+{
+   if (M.GetAssemblyLevel() == AssemblyLevel::PARTIAL)
+   {
+      M_inv = new DGMassInverse(space, BasisType::GaussLegendre);
+      M_inv->SetAbsTol(1e-8), M_inv->SetRelTol(0.0);
+   }
+}
 
 void LocalInverseHOSolver::CalcHOSolution(const Vector &u, Vector &du) const
 {
-   MFEM_VERIFY(M.GetAssemblyLevel() != AssemblyLevel::PARTIAL,
-               "PA for DG is not supported for Local Inverse.");
+   MFEM_VERIFY(timer, "Timer not set.");
 
    Vector rhs(u.Size());
-   K.SpMat().HostReadWriteI();
-   K.SpMat().HostReadWriteJ();
-   K.SpMat().HostReadWriteData();
-   HypreParMatrix *K_mat = K.ParallelAssemble(&K.SpMat());
-   K_mat->Mult(u, rhs);
-
-   const int ne = pfes.GetMesh()->GetNE();
-   const int nd = pfes.GetFE(0)->GetDof();
-   DenseMatrix M_loc(nd);
-   DenseMatrixInverse M_loc_inv(&M_loc);
-   Vector rhs_loc(nd), du_loc(nd);
-   Array<int> dofs;
-   for (int i = 0; i < ne; i++)
+   if (M.GetAssemblyLevel() != AssemblyLevel::PARTIAL)
    {
-      pfes.GetElementDofs(i, dofs);
-      rhs.GetSubVector(dofs, rhs_loc);
-      M.SpMat().GetSubMatrix(dofs, dofs, M_loc);
-      M_loc_inv.Factor();
-      M_loc_inv.Mult(rhs_loc, du_loc);
-      du.SetSubVector(dofs, du_loc);
+      timer->sw_rhs.Start();
+      K.SpMat().HostReadWriteI();
+      K.SpMat().HostReadWriteJ();
+      K.SpMat().HostReadWriteData();
+      HypreParMatrix *K_mat = K.ParallelAssemble(&K.SpMat());
+      K_mat->Mult(u, rhs);
+      timer->sw_rhs.Stop();
+
+      const int ne = pfes.GetMesh()->GetNE();
+      const int nd = pfes.GetFE(0)->GetDof();
+      DenseMatrix M_loc(nd);
+      DenseMatrixInverse M_loc_inv(&M_loc);
+      Vector rhs_loc(nd), du_loc(nd);
+      Array<int> dofs;
+      for (int i = 0; i < ne; i++)
+      {
+         pfes.GetElementDofs(i, dofs);
+         rhs.GetSubVector(dofs, rhs_loc);
+         timer->sw_L2inv.Start();
+         M.SpMat().GetSubMatrix(dofs, dofs, M_loc);
+         M_loc_inv.Factor();
+         M_loc_inv.Mult(rhs_loc, du_loc);
+         timer->sw_L2inv.Stop();
+         du.SetSubVector(dofs, du_loc);
+      }
+      delete K_mat;
+   }
+   else
+   {
+      timer->sw_rhs.Start();
+      K.Mult(u, rhs);
+      timer->sw_rhs.Stop();
+
+      timer->sw_L2inv.Start();
+      M_inv->Update(), M_inv->Mult(rhs, du);
+      timer->sw_L2inv.Stop();
    }
-   delete K_mat;
 }
 
 NeumannHOSolver::NeumannHOSolver(ParFiniteElementSpace &space,
diff --git a/remhos_ho.hpp b/remhos_ho.hpp
index 11b940c..8f498d2 100644
--- a/remhos_ho.hpp
+++ b/remhos_ho.hpp
@@ -22,6 +22,8 @@
 namespace mfem
 {
 
+struct TimingData;
+
 // High-Order Solver.
 // Conserve mass / provide high-order convergence / may violate the bounds.
 class HOSolver
@@ -35,6 +37,8 @@ class HOSolver
    virtual ~HOSolver() { }
 
    virtual void CalcHOSolution(const Vector &u, Vector &du) const = 0;
+
+   TimingData *timer = nullptr;
 };
 
 class CGHOSolver : public HOSolver
@@ -53,12 +57,14 @@ class LocalInverseHOSolver : public HOSolver
 {
 protected:
    ParBilinearForm &M, &K;
+   mutable DGMassInverse *M_inv = nullptr;
 
 public:
    LocalInverseHOSolver(ParFiniteElementSpace &space,
                         ParBilinearForm &Mbf, ParBilinearForm &Kbf);
 
-   virtual void CalcHOSolution(const Vector &u, Vector &du) const;
+   void CalcHOSolution(const Vector &u, Vector &du) const override;
+   ~LocalInverseHOSolver() { delete M_inv; }
 };
 
 class Assembly;
diff --git a/remhos_lo.cpp b/remhos_lo.cpp
index efff529..b169531 100644
--- a/remhos_lo.cpp
+++ b/remhos_lo.cpp
@@ -246,36 +246,45 @@ void ResidualDistribution::CalcLOSolution(const Vector &u, Vector &du) const
 
 void MassBasedAvg::CalcLOSolution(const Vector &u, Vector &du) const
 {
+   timer->sw_LO.Start();
+
    // Compute the new HO solution.
-   Vector du_HO(u.Size());
    ParGridFunction u_HO_new(&pfes);
-   ho_solver.CalcHOSolution(u, du_HO);
-   add(1.0, u, dt, du_HO, u_HO_new);
-
-   // Mesh positions for the new HO solution.
-   ParMesh *pmesh = pfes.GetParMesh();
-   GridFunction x_new(pmesh->GetNodes()->FESpace());
-   // Copy the current nodes into x.
-   pmesh->GetNodes(x_new);
-   if (mesh_v)
+   if (du_HO)
    {
-      // Remap mode - get the positions of the mesh at time [t + dt].
-      x_new.Add(dt, *mesh_v);
+      add(1.0, u, dt, *du_HO, u_HO_new);
+      du_HO = nullptr;
+   }
+   else
+   {
+      Vector du_HO(u.Size());
+      ho_solver.CalcHOSolution(u, du_HO);
+      add(1.0, u, dt, du_HO, u_HO_new);
    }
 
+   // Mesh positions for the new HO solution.
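+   // NOTE: by this point the caller has already moved the mesh nodes to
+   // their end-of-step positions, so pmesh->GetNodes() is used directly
+   // below; this is what made the old mesh velocity (mesh_v) argument
+   // unnecessary.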
+   ParMesh *pmesh = pfes.GetParMesh();
    const int NE = pfes.GetNE();
    Vector el_mass(NE), el_vol(NE);
-   MassesAndVolumesAtPosition(u_HO_new, x_new, el_mass, el_vol);
+   MassesAndVolumesAtPosition(u_HO_new, *pmesh->GetNodes(), el_mass, el_vol);
 
    const int ndofs = u.Size() / NE;
-   for (int k = 0; k < NE; k++)
+
+   const auto mass = el_mass.Read(), vol = el_vol.Read();
+   const auto U = mfem::Reshape(u.Read(), ndofs, NE);
+   auto DU = mfem::Reshape(du.Write(), ndofs, NE);
+   const auto δt = dt;
+
+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int k)
    {
-      double u_LO_new = el_mass(k) / el_vol(k);
+      const double u_LO_new = mass[k] / vol[k];
       for (int i = 0; i < ndofs; i++)
       {
-         du(k*ndofs + i) = (u_LO_new - u(k*ndofs + i)) / dt;
+         DU(i,k) = (u_LO_new - U(i, k)) / δt;
       }
-   }
+   });
+
+   timer->sw_LO.Stop();
 }
 
 void MassBasedAvg::MassesAndVolumesAtPosition(const ParGridFunction &u,
@@ -296,17 +305,22 @@ void MassBasedAvg::MassesAndVolumesAtPosition(const ParGridFunction &u,
    // As an L2 function, u has the correct EVector lexicographic ordering.
    qi_u->Values(u, u_qvals);
 
-   for (int k = 0; k < NE; k++)
+   const auto detJ = mfem::Reshape(geom.detJ.Read(), nqp, NE);
+   const auto u_q = mfem::Reshape(u_qvals.Read(), nqp, NE);
+   const auto weights = ir.GetWeights().Read();
+   assert(NE == el_mass.Size() && NE == el_vol.Size());
+   auto mass = el_mass.Write();
+   auto vol = el_vol.Write();
+
+   mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
    {
-      el_mass(k) = 0.0;
-      el_vol(k) = 0.0;
+      mass[e] = 0.0, vol[e] = 0.0;
       for (int q = 0; q < nqp; q++)
       {
-         const IntegrationPoint &ip = ir.IntPoint(q);
-         el_mass(k) += ip.weight * geom.detJ(k*nqp + q) * u_qvals(k*nqp + q);
-         el_vol(k) += ip.weight * geom.detJ(k*nqp + q);
+         const auto w_detJ = weights[q] * detJ(q, e);
+         mass[e] += w_detJ * u_q(q, e), vol[e] += w_detJ;
      }
-   }
+   });
 }
 
 const DofToQuad *get_maps(ParFiniteElementSpace &pfes, Assembly &asmbly)
diff --git a/remhos_lo.hpp b/remhos_lo.hpp
index 29d59a7..b5fecec 100644
--- a/remhos_lo.hpp
+++ b/remhos_lo.hpp
@@ -22,6 +22,8 @@
 namespace mfem
 {
 
+struct TimingData;
+
 // Low-Order Solver.
 class LOSolver
 {
@@ -37,6 +39,8 @@ class LOSolver
    virtual void UpdateTimeStep(double dt_new) { dt = dt_new; }
 
    virtual void CalcLOSolution(const Vector &u, Vector &du) const = 0;
+
+   TimingData *timer = nullptr;
 };
 
 class Assembly;
@@ -84,18 +88,22 @@ class MassBasedAvg : public LOSolver
 {
 protected:
    HOSolver &ho_solver;
-   const GridFunction *mesh_v;
 
-   void MassesAndVolumesAtPosition(const ParGridFunction &u,
-                                   const GridFunction &x,
-                                   Vector &el_mass, Vector &el_vol) const;
+   // Temporary HO solution, used only in the next call to CalcLOSolution().
+   mutable const Vector *du_HO = nullptr;
 
 public:
-   MassBasedAvg(ParFiniteElementSpace &space, HOSolver &hos,
-                const GridFunction *mesh_vel)
-      : LOSolver(space), ho_solver(hos), mesh_v(mesh_vel) { }
+   MassBasedAvg(ParFiniteElementSpace &space, HOSolver &hos)
+      : LOSolver(space), ho_solver(hos) { }
+
+   // Temporary HO solution, used only in the next call to CalcLOSolution().
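+   // The pointer is consumed (reset to nullptr) inside CalcLOSolution(), so
+   // it must be set again before every step; see the FCT branch of
+   // AdvectionOperator::Mult(), which calls SetHOSolution() right after the
+   // HO solve.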
+   void SetHOSolution(Vector &du) { du_HO = &du; }
 
-   virtual void CalcLOSolution(const Vector &u, Vector &du) const;
+   virtual void CalcLOSolution(const Vector &u, Vector &du) const;
+
+   void MassesAndVolumesAtPosition(const ParGridFunction &u,
+                                   const GridFunction &x,
+                                   Vector &el_mass, Vector &el_vol) const;
 };
 
 //PA based Residual Distribution
diff --git a/remhos_main.cpp b/remhos_main.cpp
new file mode 100644
index 0000000..aabb219
--- /dev/null
+++ b/remhos_main.cpp
@@ -0,0 +1,22 @@
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+
+// #define DBG_COLOR ::debug::kCyan
+// #include "debug.hpp"
+
+///////////////////////////////////////////////////////////////////////////////
+int remhos(int argc, char *argv[], double &final_mass_u);
+
+///////////////////////////////////////////////////////////////////////////////
+int main(int argc, char* argv[]) try
+{
+   // dbgClearScreen(), dbg();
+   double result;
+   return remhos(argc, argv, result);
+}
+catch (std::exception& e)
+{
+   std::cerr << "\033[31m..xxxXXX[ERROR]XXXxxx.." << std::endl;
+   std::cerr << "\033[31m" << e.what() << std::endl;
+   return EXIT_FAILURE;
+}
\ No newline at end of file
diff --git a/remhos_tests.cpp b/remhos_tests.cpp
new file mode 100644
index 0000000..1aa1af5
--- /dev/null
+++ b/remhos_tests.cpp
@@ -0,0 +1,181 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <unistd.h>
+#include <vector>
+
+// #define DBG_COLOR ::debug::kCyan
+// #include "debug.hpp"
+
+int remhos(int, char *[], double &);
+
+///////////////////////////////////////////////////////////////////////////////
+template <typename T>
+std::enable_if_t<!std::numeric_limits<T>::is_integer, bool>
+AlmostEq(T x, T y, T tolerance = 10.0*std::numeric_limits<T>::epsilon())
+{
+   const T neg = std::abs(x - y);
+   constexpr T min = std::numeric_limits<T>::min();
+   constexpr T eps = std::numeric_limits<T>::epsilon();
+   const T min_abs = std::min(std::abs(x), std::abs(y));
+   if (std::abs(min_abs) == 0.0) { return neg < eps; }
+   return (neg / (1.0 + std::max(min, min_abs))) < tolerance;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+struct Test
+{
+   static constexpr const char *binary = "remhos ";
+   static constexpr const char *common = "-no-vis -vs 1";
+   std::string mesh, options;
+   const double result = 0.0;
+   Test(const char * msh, const char * extra, double result)
+      : mesh(msh), options(std::string(extra) + common), result(result) {}
+   std::string Command() const { return binary + mesh + options; }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+const Test runs[] =
+{
+   {
+      // #0
+      "-m ./data/inline-quad.mesh ",
+      "-p 14 -rs 1 -o 2 -dt -1.0 -tf 0.5 -ho 3 -lo 5 -fct 2 -ms 5 ",
+      0.09711395400387984
+   },
+   {
+      // #1, 65536
+      "-m ./data/inline-quad.mesh ",
+      "-p 14 -rs 4 -o 3 -dt -1.0 -tf 0.5 -ho 3 -lo 5 -fct 2 -ms 5 ",
+      0.0930984399257905
+   },
+   {
+      // #2, 102400
+      // RHS kernel time: 5.4988707
+      // L2inv kernel time: 0.94180588
+      // LO kernel time: 0.028575583
+      // FCT kernel time: 0.0095909583
+      // Total kernel time: 5.5370372
+      "-m ./data/inline-quad.mesh ",
+      "-p 14 -rs 4 -o 4 -dt -1.0 -tf 0.5 -ho 3 -lo 5 -fct 2 -ms 5 ",
+      0.09237630484178257
+   },
+   {
+      // #3
+      "-m ./data/cube01_hex.mesh ",
+      "-p 10 -rs 1 -o 2 -dt -1.0 -tf 0.5 -ho 3 -lo 5 -fct 2 -ms 5 ",
+      0.11972857593296446
+   },
+   {
+      // #4 Partial assembly
+      "-m ./data/inline-quad.mesh ",
+      "-pa -p 14 -rs 1 -o 2 -dt -1.0 -tf 0.5 -ho 3 -lo 5 -fct 2 -ms 5 ",
+      0.09711395400387984
+   },
+   {
+      // #5 Partial assembly
+      "-m ./data/inline-quad.mesh ",
+      "-pa -p 14 -rs 4 -o 2 -dt -1.0 -tf 0.5 -ho 3 -lo 5 -fct 2 -ms 5 ",
+      0.09185717760402806
+   },
+   {
+      // #7: 3D PA
+      "-m ./data/cube01_hex.mesh ",
+      "-pa -p 10 -rs 3 -o 3 -dt -1.0 -tf 0.5 -ho 3 -lo 5 -fct 2 -ms 1 ",
+      0.11601536511552431
+   },
+   {
+      // #8
+      "-m ./data/star-q2.mesh ",
+      "-pa -p 14 -rs 1 -o 3 -dt -1.0 -tf 0.5 -ho 3 -lo 5 -fct 2 -ms 5 ",
+      0.8069675186775516
+   },
+   {
+      // #9, 🔥 debug device 🔥 (last or !dup)
+      "-m ./data/inline-quad.mesh ",
+      "-d debug -pa -p 14 -rs 1 -o 2 -dt -1.0 -tf 0.5 -ho 3 -lo 5 -fct 2 -ms 5 ",
+      0.09711395400387984
+   },
+#ifdef MFEM_USE_CUDA
+   {
+      // #10
+      "-m ./data/inline-quad.mesh ",
+      "-d cuda -pa -p 14 -rs 1 -o 2 -dt -1.0 -tf 0.5 -ho 3 -lo 5 -fct 2 -ms 5 ",
+      0.09711395400387984
+   },
+#endif
+};
+constexpr int N_TESTS = sizeof(runs) / sizeof(Test);
+
+///////////////////////////////////////////////////////////////////////////////
+using args_ptr_t = std::vector<std::unique_ptr<char[]>>;
+using args_t = std::vector<char *>;
+
+///////////////////////////////////////////////////////////////////////////////
+int RemhosTest(const Test & test)
+{
+   static args_ptr_t args_ptr;
+   args_t args;
+
+   std::istringstream iss(test.Command());
+
+   std::string token;
+   while (iss >> token)
+   {
+      auto arg_ptr = std::make_unique<char[]>(token.size() + 1);
+      std::memcpy(arg_ptr.get(), token.c_str(), token.size() + 1);
+      arg_ptr[token.size()] = '\0';
+      args.push_back(arg_ptr.get());
+      args_ptr.emplace_back(std::move(arg_ptr));
+   }
+   args.push_back(nullptr);
+
+   double final_mass_u{};
+   remhos(args.size()-1, args.data(), final_mass_u);
+
+   // dbg("final_mass_u: {} vs. {}", final_mass_u, test.result);
+
+   if (AlmostEq(final_mass_u, test.result))
+   { return std::cout << "✅" << std::endl, EXIT_SUCCESS; }
+
+   return std::cout << "❌" << std::endl, EXIT_FAILURE;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+int main(int argc, char* argv[]) try
+{
+   // dbgClearScreen(), dbg();
+
+   int opt;
+   int test = -1;
+   auto show_usage = [](const int ret = EXIT_FAILURE)
+   {
+      printf("Usage: program [-t <num>] [-h]\n");
+      printf("  -t <num>  Run only the given test number\n");
+      printf("  -h        Show this help message\n");
+      exit(ret);
+   };
+
+   while ((opt = getopt(argc, argv, "t:h")) != -1)
+   {
+      switch (opt)
+      {
+         case 't': test = std::atoi(optarg); break;
+         case 'h': show_usage(EXIT_SUCCESS);
+         default: show_usage(EXIT_FAILURE);
+      }
+   }
+
+   if (test >= 0 && test < N_TESTS) { return RemhosTest(runs[test]); }
+
+   for (auto & run : runs)
+   {
+      if (RemhosTest(run) != EXIT_SUCCESS) { return EXIT_FAILURE; }
+   }
+   return EXIT_SUCCESS;
+}
+catch (std::exception& e)
+{
+   std::cerr << "\033[31m..xxxXXX[ERROR]XXXxxx.."
+             << std::endl;
+   std::cerr << "\033[31m" << e.what() << std::endl;
+   return EXIT_FAILURE;
+}
\ No newline at end of file
diff --git a/remhos_tools.cpp b/remhos_tools.cpp
index ddcc126..4422eb2 100644
--- a/remhos_tools.cpp
+++ b/remhos_tools.cpp
@@ -329,6 +329,9 @@ void SmoothnessIndicator::ComputeFromSparsity(const SparseMatrix &K,
    const int *I = K.GetI(), *J = K.GetJ(), loc_size = K.Size();
    int end;
 
+   x_min.HostReadWrite();
+   x_max.HostReadWrite();
+
    for (int i = 0, k = 0; i < loc_size; i++)
    {
       x_min(i) = numeric_limits<double>::infinity();
@@ -342,8 +345,8 @@ void SmoothnessIndicator::ComputeFromSparsity(const SparseMatrix &K,
    }
 
    GroupCommunicator &gcomm = x.ParFESpace()->GroupComm();
-   Array<double> minvals(x_min.GetData(), x_min.Size()),
-                 maxvals(x_max.GetData(), x_max.Size());
+   Array<double> minvals(x_min.HostReadWrite(), x_min.Size()),
+                 maxvals(x_max.HostReadWrite(), x_max.Size());
    gcomm.Reduce(minvals, GroupCommunicator::Min);
    gcomm.Bcast(minvals);
    gcomm.Reduce(maxvals, GroupCommunicator::Max);
@@ -387,8 +390,8 @@ void DofInfo::ComputeMatrixSparsityBounds(const Vector &el_min,
    const int NE = pmesh->GetNE();
    const int ndofs = dof_min.Size() / NE;
 
-   x_min.HostReadWrite();
-   x_max.HostReadWrite();
+   el_min.HostRead(), el_max.HostRead();
+   x_min.HostReadWrite(), x_max.HostReadWrite();
 
    x_min = el_min;
    x_max = el_max;
@@ -438,13 +441,16 @@ void DofInfo::ComputeOverlapBounds(const Vector &el_min,
    // Form min/max at each CG dof, considering element overlaps.
    x_min =   std::numeric_limits<double>::infinity();
    x_max = - std::numeric_limits<double>::infinity();
+
+   el_min.HostRead(), el_max.HostRead();
+   dof_min.HostReadWrite(), dof_max.HostReadWrite();
+   x_min.HostReadWrite(), x_max.HostReadWrite();
+
    for (int i = 0; i < NE; i++)
    {
       // Inactive elements don't affect the bounds.
       if (active_el && (*active_el)[i] == false) { continue; }
 
-      x_min.HostReadWrite();
-      x_max.HostReadWrite();
       pfes_bounds.GetElementDofs(i, dofsCG);
      for (int j = 0; j < dofsCG.Size(); j++)
      {
@@ -452,8 +458,8 @@ void DofInfo::ComputeOverlapBounds(const Vector &el_min,
          x_max(dofsCG[j]) = std::max(x_max(dofsCG[j]), el_max(i));
       }
    }
-   Array<double> minvals(x_min.GetData(), x_min.Size());
-   Array<double> maxvals(x_max.GetData(), x_max.Size());
+   Array<double> minvals(x_min.HostReadWrite(), x_min.Size());
+   Array<double> maxvals(x_max.HostReadWrite(), x_max.Size());
    gcomm.Reduce(minvals, GroupCommunicator::Min);
    gcomm.Bcast(minvals);
    gcomm.Reduce(maxvals, GroupCommunicator::Max);
@@ -1069,6 +1075,127 @@ void MixedConvectionIntegrator::AssembleElementMatrix2(
    }
 }
 
+Mesh *CartesianMesh(int dim, int mpi_cnt, int elem_per_mpi, bool print,
+                    int &par_ref, int **partitioning)
+{
+   MFEM_VERIFY(dim > 1, "Not implemented for 1D meshes.");
+
+   auto factor = [&](int N)
+   {
+      for (int i = static_cast<int>(sqrt(N)); i > 0; i--)
+      { if (N % i == 0) { return i; } }
+      return 1;
+   };
+
+   par_ref = 0;
+   const int ref_factor = (dim == 2) ? 4 : 8;
+
+   // Elements per task before performing parallel refinements.
+   // This will be used to form the serial mesh.
+   int el0 = elem_per_mpi;
+   while (el0 % ref_factor == 0)
+   {
+      el0 /= ref_factor;
+      par_ref++;
+   }
+
+   // In the serial mesh we have:
+   // The number of MPI blocks is mpi_cnt = mpi_x.mpi_y.mpi_z.
+   // The size of each MPI block is el0 = el0_x.el0_y.el0_z.
+   int mpi_x, mpi_y, mpi_z;
+   int el0_x, el0_y, el0_z;
+   if (dim == 2)
+   {
+      mpi_x = factor(mpi_cnt);
+      mpi_y = mpi_cnt / mpi_x;
+
+      // Switch order for better balance.
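+      // e.g. el0 = 6: el0_y = factor(6) = 2 and el0_x = 3, so the larger
+      // factor goes to x (factor() returns the largest divisor <= sqrt(N)).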
+      el0_y = factor(el0);
+      el0_x = el0 / el0_y;
+   }
+   else
+   {
+      mpi_x = factor(mpi_cnt);
+      mpi_y = factor(mpi_cnt / mpi_x);
+      mpi_z = mpi_cnt / mpi_x / mpi_y;
+
+      // Switch order for better balance.
+      el0_z = factor(el0);
+      el0_y = factor(el0 / el0_z);
+      el0_x = el0 / el0_y / el0_z;
+   }
+
+   if (print && dim == 2)
+   {
+      int elem_par_x = mpi_x * el0_x * pow(2, par_ref),
+          elem_par_y = mpi_y * el0_y * pow(2, par_ref);
+
+      std::cout << "--- Mesh generation: \n";
+      std::cout << "Par mesh: " << elem_par_x << " x " << elem_par_y
+                << " (" << elem_par_x * elem_par_y << " elements)\n"
+                << "Elem / task: "
+                << el0_x * pow(2, par_ref) << " x "
+                << el0_y * pow(2, par_ref)
+                << " (" << el0_x * pow(2, 2*par_ref) * el0_y << " elements)\n"
+                << "MPI blocks: " << mpi_x << " x " << mpi_y
+                << " (" << mpi_x * mpi_y << " mpi tasks)\n" << "-\n"
+                << "Serial mesh: "
+                << mpi_x * el0_x << " x " << mpi_y * el0_y
+                << " (" << mpi_x * el0_x * mpi_y * el0_y << " elements)\n"
+                << "Elem / task: " << el0_x << " x " << el0_y << std::endl
+                << "Par refine: " << par_ref << std::endl;
+      std::cout << "--- \n";
+   }
+
+   if (print && dim == 3)
+   {
+      int elem_par_x = mpi_x * el0_x * pow(2, par_ref),
+          elem_par_y = mpi_y * el0_y * pow(2, par_ref),
+          elem_par_z = mpi_z * el0_z * pow(2, par_ref);
+
+      std::cout << "--- Mesh generation: \n";
+      std::cout << "Par mesh: "
+                << elem_par_x << " x " << elem_par_y << " x " << elem_par_z
+                << " (" << elem_par_x*elem_par_y*elem_par_z << " elements)\n"
+                << "Elem / task: "
+                << el0_x * pow(2, par_ref) << " x "
+                << el0_y * pow(2, par_ref) << " x "
+                << el0_z * pow(2, par_ref)
+                << " (" << el0_x*pow(2, 3*par_ref)*el0_y*el0_z << " elements)\n"
+                << "MPI blocks: " << mpi_x << " x " << mpi_y << " x " << mpi_z
+                << " (" << mpi_x * mpi_y * mpi_z << " mpi tasks)\n" << "-\n"
+                << "Serial mesh: "
+                << mpi_x*el0_x << " x " << mpi_y*el0_y << " x " << mpi_z*el0_z
+                << " (" << mpi_x*el0_x*mpi_y*el0_y*mpi_z*el0_z << " elements)\n"
+                << "Elem / task: "
+                << el0_x << " x " << el0_y << " x " << el0_z << std::endl
+                << "Par refine: " << par_ref << std::endl;
+      std::cout << "--- \n";
+   }
+
+   Mesh *mesh;
+   if (dim == 2)
+   {
+      mesh = new Mesh(Mesh::MakeCartesian2D(mpi_x * el0_x,
+                                            mpi_y * el0_y,
+                                            Element::QUADRILATERAL, true));
+   }
+   else
+   {
+      mesh = new Mesh(Mesh::MakeCartesian3D(mpi_x * el0_x,
+                                            mpi_y * el0_y,
+                                            mpi_z * el0_z,
+                                            Element::HEXAHEDRON, true));
+   }
+
+   // dim is 2 or 3 here; a fixed-size array avoids a non-standard VLA.
+   int nxyz[3];
+   if (dim == 2) { nxyz[0] = mpi_x; nxyz[1] = mpi_y; }
+   else          { nxyz[0] = mpi_x; nxyz[1] = mpi_y; nxyz[2] = mpi_z; }
+   *partitioning = mesh->CartesianPartitioning(nxyz);
+
+   return mesh;
+}
+
 int GetLocalFaceDofIndex3D(int loc_face_id, int face_orient,
                            int face_dof_id, int face_dof1D_cnt)
 {
diff --git a/remhos_tools.hpp b/remhos_tools.hpp
index 624c8d7..6b29897 100644
--- a/remhos_tools.hpp
+++ b/remhos_tools.hpp
@@ -23,6 +23,9 @@
 namespace mfem
 {
 
+Mesh *CartesianMesh(int dim, int mpi_cnt, int elem_per_mpi, bool print,
+                    int &par_ref, int **partitioning);
+
 int GetLocalFaceDofIndex(int dim, int loc_face_id, int face_orient,
                          int face_dof_id, int face_dof1D_cnt);
 
@@ -46,6 +49,20 @@ void VisualizeField(socketstream &sock, const char *vishost, int visport,
 
 class DofInfo;
 
+struct TimingData
+{
+   // Total times for all major computations.
+   StopWatch sw_rhs, sw_L2inv, sw_LO, sw_FCT;
+
+   // Store the number of local L2 dofs.
+   const HYPRE_Int L2dof;
+
+   // Iterations for the L2 mass inversion.
+   HYPRE_Int L2iter;
+
+   TimingData(const HYPRE_Int l2d) : L2dof(l2d), L2iter(0) { }
+};
+
 class SmoothnessIndicator
 {
 private: