From f9df9b44cad57ac6098d9ef974029b20c6a99ac5 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Mon, 22 Nov 2021 20:47:11 -0600
Subject: [PATCH 01/26] Performance prototype of a per-zone implicit solve,
 emulating evolving an EGRMHD viscous model.

---
 .gitmodules                    |   3 +
 CMakeLists.txt                 |   4 +
 external/kokkos-kernels        |   1 +
 kharma/CMakeLists.txt          |   4 +
 kharma/kharma.cpp              |  16 ++-
 kharma/viscosity/viscosity.cpp | 180 +++++++++++++++++++++++++++++++++
 kharma/viscosity/viscosity.hpp | 116 +++++++++++++++++++++
 pars/orszag_tang_viscous.par   |  63 ++++++++++++
 8 files changed, 382 insertions(+), 5 deletions(-)
 create mode 160000 external/kokkos-kernels
 create mode 100644 kharma/viscosity/viscosity.cpp
 create mode 100644 kharma/viscosity/viscosity.hpp
 create mode 100644 pars/orszag_tang_viscous.par
diff --git a/.gitmodules b/.gitmodules
index 8642247d..7c447280 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -5,3 +5,6 @@
 [submodule "external/variant"]
 	path = external/variant
 	url = https://github.com/mpark/variant.git
+[submodule "external/kokkos-kernels"]
+	path = external/kokkos-kernels
+	url = https://github.com/kokkos/kokkos-kernels
diff --git a/CMakeLists.txt b/CMakeLists.txt
index dbdc1356..56556e40 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,6 +47,10 @@ add_subdirectory(external/parthenon)
 include_directories(external/parthenon/src)
 # mpark::variant is header only, don't build anything
 include_directories(external/variant/include)
+# Kokkos kernels
+add_subdirectory(external/kokkos-kernels)
+include_directories(external/kokkos-kernels/src)
+include_directories(external/kokkos-kernels/src/batched)
 
 # KHARMA folder
 add_subdirectory(kharma)
diff --git a/external/kokkos-kernels b/external/kokkos-kernels
new file mode 160000
index 00000000..8381db04
--- /dev/null
+++ b/external/kokkos-kernels
@@ -0,0 +1 @@
+Subproject commit 8381db0486674c1be943de23974821ddfb9e6c29
diff --git a/kharma/CMakeLists.txt b/kharma/CMakeLists.txt
index 75f83c8c..6f872b1c 100644
--- a/kharma/CMakeLists.txt
+++ b/kharma/CMakeLists.txt
@@ -19,6 +19,7 @@ AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/current EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/electrons EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/grmhd EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/reductions EXE_NAME_SRC)
+AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/viscosity EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/wind EXE_NAME_SRC)
 
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
@@ -31,10 +32,13 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/current)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/electrons)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/grmhd)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/reductions)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/viscosity)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/wind)
 
 add_executable(${EXE_NAME} ${EXE_NAME_SRC})
 
+target_link_libraries(${EXE_NAME} PUBLIC kokkos)
+target_link_libraries(${EXE_NAME} PUBLIC kokkoskernels)
 target_link_libraries(${EXE_NAME} PUBLIC parthenon)
 
 # OPTIONS
diff --git a/kharma/kharma.cpp b/kharma/kharma.cpp
index 3c1d60b1..8ee714d6 100644
--- a/kharma/kharma.cpp
+++ b/kharma/kharma.cpp
@@ -46,6 +46,7 @@
 #include "electrons.hpp"
 #include "grmhd.hpp"
 #include "reductions.hpp"
+#include "viscosity.hpp"
 #include "wind.hpp"
 
 #include "bondi.hpp"
@@ -159,10 +160,11 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput>& pin)
     // Read all options first so we can set their defaults here,
     // before any packages are initialized.
     std::string b_field_solver = pin->GetOrAddString("b_field", "solver", "flux_ct");
-    // TODO if jcon in outputs then...
+    // TODO enable this iff jcon is in the list of outputs
     bool add_jcon = pin->GetOrAddBoolean("GRMHD", "add_jcon", true);
     bool do_electrons = pin->GetOrAddBoolean("electrons", "on", false);
     bool do_reductions = pin->GetOrAddBoolean("reductions", "on", true);
+    bool do_viscosity = pin->GetOrAddBoolean("viscosity", "on", false);
     bool do_wind = pin->GetOrAddBoolean("wind", "on", false);
 
     // Global variables "package."  Anything that just, really oughta be a global
@@ -186,10 +188,6 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput>& pin)
         packages.Add(B_FluxCT::Initialize(pin.get(), packages));
     }
 
-    if (do_wind) {
-        packages.Add(Wind::Initialize(pin.get()));
-    }
-
     if (add_jcon) {
         packages.Add(Current::Initialize(pin.get()));
     }
@@ -202,6 +200,14 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput>& pin)
         packages.Add(Reductions::Initialize(pin.get()));
     }
 
+    if (do_viscosity) {
+        packages.Add(Viscosity::Initialize(pin.get(), packages));
+    }
+
+    if (do_wind) {
+        packages.Add(Wind::Initialize(pin.get()));
+    }
+
     return std::move(packages);
 }
 
diff --git a/kharma/viscosity/viscosity.cpp b/kharma/viscosity/viscosity.cpp
new file mode 100644
index 00000000..3cd8b70a
--- /dev/null
+++ b/kharma/viscosity/viscosity.cpp
@@ -0,0 +1,180 @@
+/* 
+ *  File: viscosity.cpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "viscosity.hpp"
+
+#include "decs.hpp"
+#include "grmhd.hpp"
+#include "kharma.hpp"
+
+#include <parthenon/parthenon.hpp>
+
+#include <batched/dense/KokkosBatched_LU_Decl.hpp>
+#include <batched/dense/impl/KokkosBatched_LU_Serial_Impl.hpp>
+#include <batched/dense/KokkosBatched_Trsv_Decl.hpp>
+using namespace KokkosBatched;
+
+using namespace parthenon;
+
+// Used only in Howes model
+#define ME (9.1093826e-28  ) // Electron mass
+#define MP (1.67262171e-24 ) // Proton mass
+
+// Do I really want to reintroduce this?
+#define SMALL 1.e-20
+
+namespace Viscosity
+{
+
+std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t packages)
+{
+    auto pkg = std::make_shared<StateDescriptor>("Viscosity");
+    Params &params = pkg->AllParams();
+
+    // Diagnostic data
+    int verbose = pin->GetOrAddInteger("debug", "verbose", 0);
+    params.Add("verbose", verbose);
+    int flag_verbose = pin->GetOrAddInteger("debug", "flag_verbose", 0);
+    params.Add("flag_verbose", flag_verbose);
+    int extra_checks = pin->GetOrAddInteger("debug", "extra_checks", 0);
+    params.Add("extra_checks", extra_checks);
+
+    // Placeholder for physical parameters (e.g. floors, fluid gamma),
+    // to be read and added like the options above
+
+    MetadataFlag isPrimitive = packages.Get("GRMHD")->Param<MetadataFlag>("PrimitiveFlag");
+    MetadataFlag isNonideal = Metadata::AllocateNewFlag("Nonideal");
+    params.Add("NonidealFlag", isNonideal);
+
+    // General options for primitive and conserved scalar variables in KHARMA
+    Metadata m_con  = Metadata({Metadata::Real, Metadata::Cell, Metadata::Independent, Metadata::FillGhost,
+                 Metadata::Restart, Metadata::Conserved, Metadata::WithFluxes, isNonideal});
+    Metadata m_prim = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived,
+                  isPrimitive, isNonideal});
+
+    // Heat conduction
+    pkg->AddField("cons.q", m_con);
+    pkg->AddField("prims.q", m_prim);
+    // Pressure anisotropy
+    pkg->AddField("cons.dP", m_con);
+    pkg->AddField("prims.dP", m_prim);
+
+    // This ensures that UtoP is called (by way of viscosity.hpp definitions)
+    pkg->FillDerivedBlock = Viscosity::FillDerived;
+    pkg->PostFillDerivedBlock = Viscosity::PostFillDerived;
+    return pkg;
+}
+
+void UtoP(MeshBlockData<Real> *rc, IndexDomain domain, bool coarse)
+{
+    FLAG("UtoP viscosity");
+    auto pmb = rc->GetBlockPointer();
+
+    MetadataFlag isNonideal = pmb->packages.Get("Viscosity")->Param<MetadataFlag>("NonidealFlag");
+    MetadataFlag isPrimitive = pmb->packages.Get("GRMHD")->Param<MetadataFlag>("PrimitiveFlag");
+    // No need for a "map" here, we just want everything that fits these
+    auto& e_P = rc->PackVariables({isNonideal, isPrimitive});
+    auto& e_U = rc->PackVariables({isNonideal, Metadata::Conserved});
+    // And then the local density
+    GridScalar rho_U = rc->Get("cons.rho").data;
+
+    const auto& G = pmb->coords;
+
+    // Get array bounds from Parthenon
+    auto bounds = coarse ? pmb->c_cellbounds : pmb->cellbounds;
+    IndexRange ib = bounds.GetBoundsI(domain);
+    IndexRange jb = bounds.GetBoundsJ(domain);
+    IndexRange kb = bounds.GetBoundsK(domain);
+
+    // TODO: we will need to copy & reorder indices before running this
+
+    // Begin the funky kokkos bit
+    // Let's do a batched LU and Trsv!
+    const Real alpha = 1, tiny = 0;
+    const int ni = bounds.ncellsi(domain), nj = bounds.ncellsj(domain), nk = bounds.ncellsk(domain);
+    ParArray5D<Real> AA("AA", nk, nj, ni, 7, 7);
+    ParArray4D<Real> B("B", nk, nj, ni, 7);
+
+    // Simulating some iterations
+    for (int iter=0; iter < 5; iter++) {
+        // Normally, when doing multiple batched operations,
+        // we would need either a general solve function,
+        // or two reads through the full array. Not so in Kokkos!
+        // This could be faster I think -- there are versions of the inner portion
+        // that cover rows at a time, by taking member objects on a Team
+        // see e.g. fluxes.hpp for usage of teams
+        pmb->par_for("implicit_solve", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+            KOKKOS_LAMBDA_3D {
+                // Adapted from Kokkos Kernels batched usage (source reference was left blank -- TODO confirm)
+                auto A = Kokkos::subview(AA, k, j, i, Kokkos::ALL(), Kokkos::ALL());
+                auto b = Kokkos::subview(B, k, j, i, Kokkos::ALL());
+                /// [template]AlgoType: Unblocked, Blocked, CompatMKL
+                /// [in/out]A: 2d view
+                /// [in]tiny: a magnitude scalar value to avoid div/0
+                KokkosBatched::SerialLU<Algo::LU::Blocked>::invoke(A, tiny);
+                /// [template]UploType: indicates either upper triangular or lower triangular; Uplo::Upper, Uplo::Lower
+                /// [template]TransType: transpose of A; Trans::NoTranspose, Trans::Transpose
+                /// [template]DiagType: diagonals; Diag::Unit or Diag::NonUnit
+                /// [template]AlgoType: Unblocked, Blocked, CompatMKL
+                /// [in]alpha: scalar value
+                /// [in]A: 2d view
+                /// [in]b: 1d view
+                KokkosBatched::SerialTrsv<Uplo::Upper,Trans::NoTranspose,Diag::NonUnit,Algo::Trsv::Unblocked>::invoke(alpha, A, b);
+            }
+        );
+    }
+
+}
+
+void PostUtoP(MeshBlockData<Real> *rc, IndexDomain domain, bool coarse)
+{
+    // Placeholder: any fixes needed after the implicit solve would go here
+}
+
+TaskStatus PostStepDiagnostics(const SimTime& tm, MeshData<Real> *rc)
+{
+    FLAG("Printing viscosity diagnostics");
+
+    // Output any diagnostics after a step completes (none are implemented yet)
+
+    FLAG("Printed");
+    return TaskStatus::complete;
+}
+
+void FillOutput(MeshBlock *pmb, ParameterInput *pin)
+{
+    // Any variables or diagnostics that should be computed especially for output to a file,
+    // but which are not otherwise updated.
+}
+
+} // namespace Viscosity
diff --git a/kharma/viscosity/viscosity.hpp b/kharma/viscosity/viscosity.hpp
new file mode 100644
index 00000000..23626f89
--- /dev/null
+++ b/kharma/viscosity/viscosity.hpp
@@ -0,0 +1,116 @@
+/* 
+ *  File: viscosity.hpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+
+#include <parthenon/parthenon.hpp>
+
+#include "mhd_functions.hpp"
+
+using namespace parthenon;
+
+/**
+ * This physics package may someday implement viscosity.  It doesn't yet!
+ */
+namespace Viscosity {
+/**
+ * Initialization: declare any fields this package will evolve, initialize any parameters
+ */
+std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t packages);
+
+/**
+ * In addition to the standard functions, packages can include extras, called manually elsewhere.
+ * NOTE(review): name looks copied from the electrons package; viscosity.cpp defines no such function
+ */
+TaskStatus InitElectrons(MeshBlockData<Real> *rc, ParameterInput *pin);
+
+/**
+ * Determine the primitive variable values, given conserved forms
+ * This is where the implicit kernel will likely be placed, as each solve is per-cell after fluxes
+ * and boundaries.
+ * 
+ * TODO make this replace GRMHD::UtoP or make it step out of the way
+ */
+void UtoP(MeshBlockData<Real> *rc, IndexDomain domain=IndexDomain::entire, bool coarse=false);
+inline void FillDerived(MeshBlockData<Real> *rc) { UtoP(rc); }
+
+/**
+ * Floors, fixes, or other cleaning up after determining primitives.
+ */
+void PostUtoP(MeshBlockData<Real> *rc, IndexDomain domain=IndexDomain::entire, bool coarse=false);
+inline void PostFillDerived(MeshBlockData<Real> *rc) { PostUtoP(rc); }
+
+/**
+ * Diagnostics printed/computed after each step, called from kharma.cpp
+ * 
+ * Function in this package: Currently nothing
+ */
+TaskStatus PostStepDiagnostics(const SimTime& tm, MeshData<Real> *rc);
+
+/**
+ * Fill fields which are calculated only for output to dump files
+ * 
+ * Function in this package: Currently nothing
+ */
+void FillOutput(MeshBlock *pmb, ParameterInput *pin);
+
+/**
+ * KHARMA requires two forms of the function for obtaining conserved variables from primitives.
+ * However, these are very different from UtoP/FillDerived in that they are called exclusively on the
+ * device side, operating on a single zone rather than the whole fluid state.
+ * 
+ * Each should have roughly the signature used here, accepting scratchpads of size NVARxN1, and index
+ * maps (see types.hpp) indicating which index corresponds to which variable in the packed array, as well
+ * as indications of the desired zone location and flux direction (dir==0 for just the conserved variable forms).
+ * As used extensively here, any variables not present in a pack will have index -1 in the map.
+ *  
+ * The two functions differ in two ways:
+ * 1. The caller precalculates the four-vectors (u^mu, b^mu) and passes them in the struct D to prim_to_flux (see fluxes.hpp for call)
+ * 2. p_to_u will only ever be called to obtain the conserved variables U, not fluxes (i.e. dir == 0 in calls)
+ * 
+ * Function in this package: primitive to flux/conserved transformation of conduction term q, pressure anisotropy dP
+ */
+KOKKOS_INLINE_FUNCTION void prim_to_flux(const GRCoordinates& G, const ScratchPad2D<Real>& P, const VarMap& m_p, const FourVectors D,
+                                         const int& k, const int& j, const int& i, const int dir,
+                                         ScratchPad2D<Real>& flux, const VarMap m_u, const Loci loc=Loci::center)
+{
+    // Calculate flux through a face from primitives (stub: body not yet implemented)
+}
+KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m_p,
+                                         const int& k, const int& j, const int& i,
+                                         const VariablePack<Real>& flux, const VarMap m_u, const Loci loc=Loci::center)
+{
+    // Calculate conserved variables from primitives (stub: body not yet implemented)
+}
+
+}
diff --git a/pars/orszag_tang_viscous.par b/pars/orszag_tang_viscous.par
new file mode 100644
index 00000000..d566e5ed
--- /dev/null
+++ b/pars/orszag_tang_viscous.par
@@ -0,0 +1,63 @@
+# Orszag-Tang vortex problem, with the prototype viscosity package enabled
+# 2D periodic Cartesian/Minkowski setup used to exercise the per-zone implicit solve
+
+<parthenon/job>
+problem_id = orszag_tang
+
+<parthenon/mesh>
+refinement = none
+numlevel = 1
+
+nx1 = 512
+x1min = -3.141592653589793
+x1max = 3.141592653589793
+ix1_bc = periodic
+ox1_bc = periodic
+
+nx2 = 512
+x2min = -3.141592653589793
+x2max = 3.141592653589793
+ix2_bc = periodic
+ox2_bc = periodic
+
+nx3 = 1
+x3min = -0.01
+x3max = 0.01
+ix3_bc = periodic
+ox3_bc = periodic
+
+<parthenon/meshblock>
+nx1 = 128
+nx2 = 128
+nx3 = 1
+
+<coordinates>
+base = cartesian_minkowski
+transform = null
+
+<parthenon/time>
+tlim = 100.0
+integrator = rk2
+
+<GRMHD>
+cfl = 0.9
+gamma = 1.666667
+reconstruction = weno5
+
+<viscosity>
+on = true
+
+<debug>
+verbose = 0
+flag_verbose = 0
+extra_checks = 0
+
+<parthenon/output0>
+file_type = hdf5
+dt = 1.0
+single_precision_output = true
+variables = prims.rho, prims.u, prims.uvec, prims.B, jcon
+
+<parthenon/output1>
+file_type = hst
+dt = 0.1

From 22faac1a877917a76a196fb6d011707242e44487 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Wed, 16 Feb 2022 18:32:49 -0500
Subject: [PATCH 02/26] Load everything needed to compile over & above default
 Summit env.

---
 machines/incite.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/machines/incite.sh b/machines/incite.sh
index 974e1aa7..6c88bc04 100644
--- a/machines/incite.sh
+++ b/machines/incite.sh
@@ -31,7 +31,8 @@ if [[ $HOST == *".summit.olcf.ornl.gov" ]]; then
   else
     # Use nvc++ compiler in NVHPC
     module unload cuda
-    module load nvhpc/21.11
+    module load cmake cuda/nvhpc nvhpc/21.11 spectrum-mpi hdf5/1.10.7
+
     C_NATIVE="nvc"
     CXX_NATIVE="nvc++"
     export CXXFLAGS="-mp"

From 5792812caa4c687f60ecaf3eceebc74a5dbdc916 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Wed, 23 Feb 2022 10:09:42 -0600
Subject: [PATCH 03/26] Initial work for GRIM stepper: add HARM step ordering
 driver, options. Boundaries could be cleaner.

---
 kharma/boundaries.cpp          | 106 +++++++----
 kharma/fluxes.hpp              |   2 +
 kharma/grim_driver.cpp         | 324 +++++++++++++++++++++++++++++++++
 kharma/grim_driver.hpp         |  65 +++++++
 kharma/grmhd/grmhd.cpp         |  47 +++--
 kharma/harm_driver.cpp         |   2 +-
 kharma/harm_driver.hpp         |  34 +++-
 kharma/kharma.cpp              |   3 +
 kharma/main.cpp                |  22 ++-
 kharma/viscosity/viscosity.cpp |  13 +-
 machines/bp.sh                 |  16 +-
 pars/bondi.par                 |   4 +
 pars/mhdmodes.par              |   4 +
 pars/orszag_tang.par           |   4 +-
 tests/bondi/check.py           |   4 +-
 tests/bondi/check.sh           |   4 +-
 tests/bondi/run.sh             |   6 +
 tests/mhdmodes/check.sh        |   4 +
 tests/mhdmodes/run.sh          |   4 +
 19 files changed, 591 insertions(+), 77 deletions(-)
 create mode 100644 kharma/grim_driver.cpp
 create mode 100644 kharma/grim_driver.hpp

diff --git a/kharma/boundaries.cpp b/kharma/boundaries.cpp
index b8f2119d..28f5ddae 100644
--- a/kharma/boundaries.cpp
+++ b/kharma/boundaries.cpp
@@ -72,6 +72,8 @@ void OutflowX1(std::shared_ptr<MeshBlockData<Real>> &rc, IndexDomain domain, boo
     auto P = GRMHD::PackMHDPrims(rc.get(), prims_map, coarse);
     auto q = rc->PackVariables({Metadata::FillGhost}, cons_map, coarse);
     const VarMap m_u(cons_map, true), m_p(prims_map, false);
+    // If we're running classic/GRIM, q is the primitive variables
+    bool prim_ghosts = pmb->packages.Get("GRMHD")->Param<std::string>("driver_type") == "grim";
 
     // KHARMA is very particular about corner boundaries.
     // In particular, we apply the outflow boundary over ALL X2, X3,
@@ -113,25 +115,34 @@ void OutflowX1(std::shared_ptr<MeshBlockData<Real>> &rc, IndexDomain domain, boo
             q(p, k, j, i) = q(p, k, j, ref);
         }
     );
-    // Apply KHARMA boundary to the primitive values
-    // TODO currently this includes B, which we then replace.
-    pmb->par_for("OutflowX1_prims", 0, P.GetDim(4) - 1, ks_e, ke_e, js_e, je_e, ibs, ibe,
-        KOKKOS_LAMBDA_VARS {
-            P(p, k, j, i) = P(p, k, j, ref);
-        }
-    );
-    // Zone-by-zone recovery of U from P
-    pmb->par_for("OutflowX1_PtoU", ks_e, ke_e, js_e, je_e, ibs, ibe,
+    if (!prim_ghosts) {
+        // Apply KHARMA boundary to the primitive values
+        // TODO currently this includes B, which we then replace.
+        pmb->par_for("OutflowX1_prims", 0, P.GetDim(4) - 1, ks_e, ke_e, js_e, je_e, ibs, ibe,
+            KOKKOS_LAMBDA_VARS {
+                P(p, k, j, i) = P(p, k, j, ref);
+            }
+        );
+    }
+    // Inflow check only; U is recovered in a separate kernel below when needed
+    pmb->par_for("OutflowX1_check", ks_e, ke_e, js_e, je_e, ibs, ibe,
         KOKKOS_LAMBDA_3D {
             // Inflow check
-            if (check_inflow) KBoundaries::check_inflow(G, P, m_p.U1, k, j, i, dir);
-            // TODO move these steps into FillDerivedDomain, make a GRMHD::PrimToFlux call the last in that series
-            // Correct primitive B
-            VLOOP P(m_p.B1 + v, k, j, i) = q(m_u.B1 + v, k, j, i) / G.gdet(Loci::center, j, i);
-            // Recover conserved vars
-            GRMHD::p_to_u(G, P, m_p, gam, k, j, i, q, m_u);
+            if (check_inflow) KBoundaries::check_inflow(G, P , m_p.U1, k, j, i, dir);
         }
     );
+    if (!prim_ghosts) {
+        // Recover U
+        pmb->par_for("OutflowX1_PtoU", ks_e, ke_e, js_e, je_e, ibs, ibe,
+            KOKKOS_LAMBDA_3D {
+                // TODO move these steps into FillDerivedDomain, make a GRMHD::PrimToFlux call the last in that series
+                // Correct primitive B
+                VLOOP P(m_p.B1 + v, k, j, i) = q(m_u.B1 + v, k, j, i) / G.gdet(Loci::center, j, i);
+                // Recover conserved vars
+                GRMHD::p_to_u(G, P, m_p, gam, k, j, i, q, m_u);
+            }
+        );
+    }
 
     Flag(rc.get(), "Applied");
 }
@@ -149,6 +160,8 @@ void ReflectX2(std::shared_ptr<MeshBlockData<Real>> &rc, IndexDomain domain, boo
     auto P = GRMHD::PackMHDPrims(rc.get(), prims_map, coarse);
     auto q = rc->PackVariables({Metadata::FillGhost}, cons_map, coarse);
     const VarMap m_u(cons_map, true), m_p(prims_map, false);
+    // If we're running classic/GRIM, q is the primitive variables
+    bool prim_ghosts = pmb->packages.Get("GRMHD")->Param<std::string>("driver_type") == "grim";
 
     // KHARMA is very particular about corner boundaries, see above
     IndexDomain ldomain = IndexDomain::interior;
@@ -192,18 +205,24 @@ void ReflectX2(std::shared_ptr<MeshBlockData<Real>> &rc, IndexDomain domain, boo
             q(p, k, j, i) = reflect * q(p, k, (ref + add) + (ref - j), i);
         }
     );
-    pmb->par_for("ReflectX2_prims", 0, P.GetDim(4) - 1, ks_e, ke_e, jbs, jbe, ics, ice,
-        KOKKOS_LAMBDA_VARS {
-            Real reflect = P.VectorComponent(p) == X2DIR ? -1.0 : 1.0;
-            P(p, k, j, i) = reflect * P(p, k, (ref + add) + (ref - j), i);
-        }
-    );
-    pmb->par_for("ReflectX2_PtoU", ks_e, ke_e, jbs, jbe, ics, ice,
-        KOKKOS_LAMBDA_3D {
-            VLOOP P(m_p.B1 + v, k, j, i) = q(m_u.B1 + v, k, j, i) / G.gdet(Loci::center, j, i);
-            GRMHD::p_to_u(G, P, m_p, gam, k, j, i, q, m_u);
-        }
-    );
+    // If we're using the classic/GRIM algo, the above is all we need.
+    if (!prim_ghosts) {
+        // If we're using the HARM/KHARMA driver, we need to do the primitives
+        // separately after the conserved vars
+        pmb->par_for("ReflectX2_prims", 0, P.GetDim(4) - 1, ks_e, ke_e, jbs, jbe, ics, ice,
+            KOKKOS_LAMBDA_VARS {
+                Real reflect = P.VectorComponent(p) == X2DIR ? -1.0 : 1.0;
+                P(p, k, j, i) = reflect * P(p, k, (ref + add) + (ref - j), i);
+            }
+        );
+        // And we need to fill the corresponding conserved vars
+        pmb->par_for("ReflectX2_PtoU", ks_e, ke_e, jbs, jbe, ics, ice,
+            KOKKOS_LAMBDA_3D {
+                VLOOP P(m_p.B1 + v, k, j, i) = q(m_u.B1 + v, k, j, i) / G.gdet(Loci::center, j, i);
+                GRMHD::p_to_u(G, P, m_p, gam, k, j, i, q, m_u);
+            }
+        );
+    }
 }
 
 // Interface calls into the preceding functions
@@ -212,36 +231,45 @@ void KBoundaries::InnerX1(std::shared_ptr<MeshBlockData<Real>> &rc, bool coarse)
     // TODO implement as named callback, give combo start/bound problems their own "packages"
     auto pmb = rc->GetBlockPointer();
     std::string prob = pmb->packages.Get("GRMHD")->Param<std::string>("problem");
-    //if (prob == "hubble") {
-    //    SetHubble(rc.get(), IndexDomain::inner_x1, coarse);
-    //} else {
+    if (prob == "hubble") {
+       //SetHubble(rc.get(), IndexDomain::inner_x1, coarse);
+    } else {
         OutflowX1(rc, IndexDomain::inner_x1, coarse);
-    //}
-    KHARMA::FillDerivedDomain(rc, IndexDomain::inner_x1, coarse);
+    }
+    // If we're in KHARMA/HARM driver, we need primitive versions of all the
+    // non-GRMHD vars
+    bool prim_ghosts = pmb->packages.Get("GRMHD")->Param<std::string>("driver_type") == "grim";
+    if (!prim_ghosts) KHARMA::FillDerivedDomain(rc, IndexDomain::inner_x1, coarse);
 }
 void KBoundaries::OuterX1(std::shared_ptr<MeshBlockData<Real>> &rc, bool coarse)
 {
     auto pmb = rc->GetBlockPointer();
     std::string prob = pmb->packages.Get("GRMHD")->Param<std::string>("problem");
-    //if (prob == "hubble") {
-    //    SetHubble(rc.get(), IndexDomain::outer_x1, coarse);
-    //} else
-    if (prob == "bondi") {
+    if (prob == "hubble") {
+       //SetHubble(rc.get(), IndexDomain::outer_x1, coarse);
+    } else if (prob == "bondi") {
         SetBondi(rc.get(), IndexDomain::outer_x1, coarse);
     } else {
         OutflowX1(rc, IndexDomain::outer_x1, coarse);
     }
-    KHARMA::FillDerivedDomain(rc, IndexDomain::outer_x1, coarse);
+    // If we're in KHARMA/HARM driver, we need primitive versions of all the
+    // non-GRMHD vars
+    bool prim_ghosts = pmb->packages.Get("GRMHD")->Param<std::string>("driver_type") == "grim";
+    if (!prim_ghosts) KHARMA::FillDerivedDomain(rc, IndexDomain::outer_x1, coarse);
 }
 void KBoundaries::InnerX2(std::shared_ptr<MeshBlockData<Real>> &rc, bool coarse)
 {
+    auto pmb = rc->GetBlockPointer();
     ReflectX2(rc, IndexDomain::inner_x2, coarse);
-    KHARMA::FillDerivedDomain(rc, IndexDomain::inner_x2, coarse);
+    bool prim_ghosts = pmb->packages.Get("GRMHD")->Param<std::string>("driver_type") == "grim";
+    if (!prim_ghosts) KHARMA::FillDerivedDomain(rc, IndexDomain::inner_x2, coarse);
 }
 void KBoundaries::OuterX2(std::shared_ptr<MeshBlockData<Real>> &rc, bool coarse)
 {
+    auto pmb = rc->GetBlockPointer();
     ReflectX2(rc, IndexDomain::outer_x2, coarse);
-    KHARMA::FillDerivedDomain(rc, IndexDomain::outer_x2, coarse);
+    bool prim_ghosts = pmb->packages.Get("GRMHD")->Param<std::string>("driver_type") == "grim";
+    if (!prim_ghosts) KHARMA::FillDerivedDomain(rc, IndexDomain::outer_x2, coarse);
 }
 
 /**
diff --git a/kharma/fluxes.hpp b/kharma/fluxes.hpp
index b29339b7..80ff4fd5 100644
--- a/kharma/fluxes.hpp
+++ b/kharma/fluxes.hpp
@@ -62,8 +62,10 @@ TaskStatus ApplyFluxes(MeshData<Real> *md, MeshData<Real> *mdudt);
 
 /**
  * Fill all conserved variables (U) from primitive variables (P), over the whole grid.
+ * Second declaration is for Parthenon's benefit, similar to UtoP vs FillDerived in GRMHD::
  */
 TaskStatus PrimToFlux(MeshBlockData<Real> *rc, IndexDomain domain=IndexDomain::entire);
+inline TaskStatus PrimToFluxTask(MeshBlockData<Real> *rc) { return PrimToFlux(rc); }
 
 // Fluxes a.k.a. "Approximate Riemann Solvers"
 // More complex solvers require speed estimates not calculable completely from
diff --git a/kharma/grim_driver.cpp b/kharma/grim_driver.cpp
new file mode 100644
index 00000000..c46f5ba7
--- /dev/null
+++ b/kharma/grim_driver.cpp
@@ -0,0 +1,324 @@
+/* 
+ *  File: grim_driver.cpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "grim_driver.hpp"
+
+#include <iostream>
+
+#include <parthenon/parthenon.hpp>
+#include <interface/update.hpp>
+#include <refinement/refinement.hpp>
+
+#include "decs.hpp"
+
+#include "b_flux_ct.hpp"
+#include "b_cd.hpp"
+#include "electrons.hpp"
+#include "grmhd.hpp"
+#include "wind.hpp"
+
+#include "boundaries.hpp"
+#include "debug.hpp"
+#include "fixup.hpp"
+#include "fluxes.hpp"
+#include "iharm_restart.hpp"
+#include "source.hpp"
+
+TaskCollection GRIMDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
+{
+    // Builds the TaskCollection for one integrator stage: flux calculation & fixes, then either the
+    // explicit update + UtoP or the (not yet implemented) GRIM step, a boundary sync, and post-sync tasks.
+    // Reminder: code here only *assembles* the list; per-step work happens solely via tl.AddTask().
+    // No prints or direct function calls here will do what you want.
+    // This is *not* likely the task list you are looking for, and is not well commented yet.
+    // See harm_driver.cpp for KHARMA's main driver.
+    TaskCollection tc;
+    TaskID t_none(0);
+
+    Real beta = integrator->beta[stage - 1];
+    const Real dt = integrator->dt;
+    auto stage_name = integrator->stage_name;
+
+    // Which packages we load affects which tasks we'll add to the list
+    auto& pkgs = blocks[0]->packages.AllPackages();
+    bool use_b_cd = pkgs.count("B_CD");
+    bool use_b_flux_ct = pkgs.count("B_FluxCT");
+    bool use_electrons = pkgs.count("Electrons");
+    bool use_wind = pkgs.count("Wind");
+
+    // Allocate the fields ("containers") we need block by block
+    for (int i = 0; i < blocks.size(); i++) {
+        auto &pmb = blocks[i];
+        // first make other useful containers
+        auto &base = pmb->meshblock_data.Get();
+        if (stage == 1) {
+            pmb->meshblock_data.Add("dUdt", base);
+            for (int s = 1; s < integrator->nstages; s++) // distinct index: don't shadow the block index
+                pmb->meshblock_data.Add(stage_name[s], base);
+            // At the end of the step, updating "sc1" updates the base
+            // So we have to keep a copy at the beginning to calculate jcon
+            pmb->meshblock_data.Add("preserve", base);
+        }
+    }
+
+    // Big synchronous region: get & apply fluxes to advance the fluid state
+    // num_partitions is usually 1
+    const int num_partitions = pmesh->DefaultNumPartitions();
+    TaskRegion &single_tasklist_per_pack_region = tc.AddRegion(num_partitions);
+    for (int i = 0; i < num_partitions; i++) {
+        auto &tl = single_tasklist_per_pack_region[i];
+        auto &mbase = pmesh->mesh_data.GetOrAdd("base", i);
+        auto &mc0 = pmesh->mesh_data.GetOrAdd(stage_name[stage - 1], i);
+        auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
+        auto &mdudt = pmesh->mesh_data.GetOrAdd("dUdt", i);
+
+        auto t_start_recv = tl.AddTask(t_none, &MeshData<Real>::StartReceiving, mc1.get(),
+                                    BoundaryCommSubset::all);
+
+        // Calculate the HLL fluxes in each direction
+        // This reconstructs the primitives (P) at faces and uses them to calculate fluxes
+        // of the conserved variables (U)
+        const ReconstructionType& recon = blocks[0]->packages.Get("GRMHD")->Param<ReconstructionType>("recon");
+        TaskID t_calculate_flux1, t_calculate_flux2, t_calculate_flux3;
+        switch (recon) {
+        case ReconstructionType::donor_cell:
+            t_calculate_flux1 = tl.AddTask(t_start_recv, Flux::GetFlux<ReconstructionType::donor_cell, X1DIR>, mc0.get());
+            t_calculate_flux2 = tl.AddTask(t_start_recv, Flux::GetFlux<ReconstructionType::donor_cell, X2DIR>, mc0.get());
+            t_calculate_flux3 = tl.AddTask(t_start_recv, Flux::GetFlux<ReconstructionType::donor_cell, X3DIR>, mc0.get());
+            break;
+        case ReconstructionType::linear_mc:
+            t_calculate_flux1 = tl.AddTask(t_start_recv, Flux::GetFlux<ReconstructionType::linear_mc, X1DIR>, mc0.get());
+            t_calculate_flux2 = tl.AddTask(t_start_recv, Flux::GetFlux<ReconstructionType::linear_mc, X2DIR>, mc0.get());
+            t_calculate_flux3 = tl.AddTask(t_start_recv, Flux::GetFlux<ReconstructionType::linear_mc, X3DIR>, mc0.get());
+            break;
+        case ReconstructionType::linear_vl:
+            t_calculate_flux1 = tl.AddTask(t_start_recv, Flux::GetFlux<ReconstructionType::linear_vl, X1DIR>, mc0.get());
+            t_calculate_flux2 = tl.AddTask(t_start_recv, Flux::GetFlux<ReconstructionType::linear_vl, X2DIR>, mc0.get());
+            t_calculate_flux3 = tl.AddTask(t_start_recv, Flux::GetFlux<ReconstructionType::linear_vl, X3DIR>, mc0.get());
+            break;
+        case ReconstructionType::weno5:
+            t_calculate_flux1 = tl.AddTask(t_start_recv, Flux::GetFlux<ReconstructionType::weno5, X1DIR>, mc0.get());
+            t_calculate_flux2 = tl.AddTask(t_start_recv, Flux::GetFlux<ReconstructionType::weno5, X2DIR>, mc0.get());
+            t_calculate_flux3 = tl.AddTask(t_start_recv, Flux::GetFlux<ReconstructionType::weno5, X3DIR>, mc0.get());
+            break;
+        case ReconstructionType::ppm:
+        case ReconstructionType::mp5:
+        case ReconstructionType::weno5_lower_poles:
+            std::cerr << "Reconstruction type not supported!  Supported reconstructions:" << std::endl;
+            std::cerr << "donor_cell, linear_mc, linear_vl, weno5" << std::endl;
+            throw std::invalid_argument("Unsupported reconstruction algorithm!");
+        }
+        auto t_calculate_flux = t_calculate_flux1 | t_calculate_flux2 | t_calculate_flux3;
+
+        auto t_recv_flux = t_calculate_flux;
+        // TODO this appears to be implemented *only* block-wise, split it into its own region if so
+        if (pmesh->multilevel) {
+            // Get flux corrections from AMR neighbors
+            for (auto &pmb : pmesh->block_list) {
+                auto& rc = pmb->meshblock_data.Get();
+                auto t_send_flux =
+                    tl.AddTask(t_calculate_flux, &MeshBlockData<Real>::SendFluxCorrection, rc.get());
+                t_recv_flux =
+                    tl.AddTask(t_calculate_flux, &MeshBlockData<Real>::ReceiveFluxCorrection, rc.get());
+            }
+        }
+
+        // FIX FLUXES
+        // Zero any fluxes through the pole or inflow from outflow boundaries
+        auto t_fix_flux = tl.AddTask(t_recv_flux, KBoundaries::FixFlux, mc0.get());
+
+        auto t_flux_fixed = t_fix_flux;
+        if (use_b_flux_ct) {
+            // Fix the conserved fluxes (exclusively B1/2/3) so that they obey divB==0,
+            // and there is no B field flux through the pole
+            auto t_flux_ct = tl.AddTask(t_fix_flux, B_FluxCT::TransportB, mc0.get());
+            t_flux_fixed = t_flux_ct;
+        }
+    }
+
+    // This region is where GRIM and classic HARM split.
+    // Classic HARM applies the fluxes to calculate a new state of conserved variables,
+    // then solves for the primitive variables with UtoP (here "FillDerived")
+    const auto &driver_step =
+        blocks[0]->packages.Get("GRMHD")->Param<std::string>("driver_step");
+    if (driver_step == "explicit") { // This is the general HARM step, with flux divergence & UtoP
+        // Apply fluxes and update conserved state
+        const int num_partitions = pmesh->DefaultNumPartitions();
+        TaskRegion &single_tasklist_per_pack_region = tc.AddRegion(num_partitions);
+        for (int i = 0; i < num_partitions; i++) {
+            auto &tl = single_tasklist_per_pack_region[i];
+            auto &mbase = pmesh->mesh_data.GetOrAdd("base", i);
+            auto &mc0 = pmesh->mesh_data.GetOrAdd(stage_name[stage - 1], i);
+            auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
+            auto &mdudt = pmesh->mesh_data.GetOrAdd("dUdt", i);
+            // APPLY FLUXES
+            auto t_flux_div = tl.AddTask(t_none, Update::FluxDivergence<MeshData<Real>>, mc0.get(), mdudt.get());
+
+            // ADD SOURCES TO CONSERVED VARIABLES
+            // Source term for GRMHD, \Gamma * T
+            // TODO take this out in Minkowski space
+            auto t_flux_apply = tl.AddTask(t_flux_div, GRMHD::AddSource, mc0.get(), mdudt.get());
+            // Source term for constraint-damping.  Applied only to B
+            auto t_b_cd_source = t_flux_apply;
+            if (use_b_cd) {
+                t_b_cd_source = tl.AddTask(t_flux_apply, B_CD::AddSource, mc0.get(), mdudt.get());
+            }
+            // Wind source.  Applied to conserved variables similar to GR source term
+            auto t_wind_source = t_b_cd_source;
+            if (use_wind) {
+                t_wind_source = tl.AddTask(t_b_cd_source, Wind::AddSource, mdudt.get());
+            }
+            // Done with source terms
+            auto t_sources = t_wind_source;
+
+            // UPDATE BASE CONTAINER
+            auto t_avg_data = tl.AddTask(t_sources, Update::AverageIndependentData<MeshData<Real>>,
+                                    mc0.get(), mbase.get(), beta);
+            // apply du/dt to all independent fields in the container
+            auto t_update = tl.AddTask(t_avg_data, Update::UpdateIndependentData<MeshData<Real>>, mc0.get(),
+                                    mdudt.get(), beta * dt, mc1.get());
+        }
+
+        // Then solve for new primitives in the fluid interior, with the primitives at step start as a guess,
+        // using UtoP.  Note that since no ghost zones are updated here, and thus FixUtoP cannot use
+        // ghost zones, KHARMA behavior in this mode will depend on the breakdown of meshblocks & possibly
+        // erratic for many fixups.  Full algo should boundary sync -> FixUtoP -> boundary sync
+        TaskRegion &async_region = tc.AddRegion(blocks.size());
+        for (int i = 0; i < blocks.size(); i++) {
+            auto &pmb = blocks[i];
+            auto &tl = async_region[i];
+            auto &sc0 = pmb->meshblock_data.Get(stage_name[stage-1]);
+            auto &sc1 = pmb->meshblock_data.Get(stage_name[stage]);
+
+            auto t_copy_prims = tl.AddTask(t_none,
+                [](MeshBlockData<Real> *rc0, MeshBlockData<Real> *rc1)
+                {
+                    Flag(rc1, "Copying prims");
+                    rc1->Get("prims.rho").data.DeepCopy(rc0->Get("prims.rho").data);
+                    rc1->Get("prims.u").data.DeepCopy(rc0->Get("prims.u").data);
+                    rc1->Get("prims.uvec").data.DeepCopy(rc0->Get("prims.uvec").data);
+                    Flag(rc1, "Copied");
+                    return TaskStatus::complete;
+                }, sc0.get(), sc1.get()
+            );
+
+
+            auto t_fill_derived = tl.AddTask(t_copy_prims, Update::FillDerived<MeshBlockData<Real>>, sc1.get());
+            // See note about syncing boundary here
+            auto t_fix_derived = tl.AddTask(t_fill_derived, GRMHD::FixUtoP, sc1.get());
+            auto t_heat_electrons = t_fix_derived;
+            if (use_electrons) {
+                t_heat_electrons = tl.AddTask(t_fix_derived, Electrons::ApplyElectronHeating, sc0.get(), sc1.get());
+            }
+        }
+    } else { // This is the GRIM step
+        // GRIM ALGO HERE
+    }
+
+    // MPI/MeshBlock boundary exchange.
+    // Optionally "packed" to send all data in one call (num_partitions defaults to 1)
+    // Note that in GRIM driver this block syncs *primitive* variables, not conserved
+    const auto &pack_comms =
+        blocks[0]->packages.Get("GRMHD")->Param<bool>("pack_comms");
+    if (pack_comms) {
+        TaskRegion &tr1 = tc.AddRegion(num_partitions);
+        for (int i = 0; i < num_partitions; i++) {
+            auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
+            tr1[i].AddTask(t_none, cell_centered_bvars::SendBoundaryBuffers, mc1);
+        }
+        TaskRegion &tr2 = tc.AddRegion(num_partitions);
+        for (int i = 0; i < num_partitions; i++) {
+            auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
+            tr2[i].AddTask(t_none, cell_centered_bvars::ReceiveBoundaryBuffers, mc1);
+        }
+        TaskRegion &tr3 = tc.AddRegion(num_partitions);
+        for (int i = 0; i < num_partitions; i++) {
+            auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
+            tr3[i].AddTask(t_none, cell_centered_bvars::SetBoundaries, mc1);
+        }
+    } else {
+        TaskRegion &tr1 = tc.AddRegion(blocks.size());
+        for (int i = 0; i < blocks.size(); i++) {
+            auto &sc1 = blocks[i]->meshblock_data.Get(stage_name[stage]);
+            tr1[i].AddTask(t_none, &MeshBlockData<Real>::SendBoundaryBuffers, sc1.get());
+        }
+        TaskRegion &tr2 = tc.AddRegion(blocks.size());
+        for (int i = 0; i < blocks.size(); i++) {
+            auto &sc1 = blocks[i]->meshblock_data.Get(stage_name[stage]);
+            tr2[i].AddTask(t_none, &MeshBlockData<Real>::ReceiveBoundaryBuffers, sc1.get());
+        }
+        TaskRegion &tr3 = tc.AddRegion(blocks.size());
+        for (int i = 0; i < blocks.size(); i++) {
+            auto &sc1 = blocks[i]->meshblock_data.Get(stage_name[stage]);
+            tr3[i].AddTask(t_none, &MeshBlockData<Real>::SetBoundaries, sc1.get());
+        }
+    }
+
+    // Async Region: Any post-sync tasks.  Timestep & AMR things.
+    TaskRegion &async_region = tc.AddRegion(blocks.size());
+    for (int i = 0; i < blocks.size(); i++) {
+        auto &pmb = blocks[i];
+        auto &tl = async_region[i];
+        auto &sc0 = pmb->meshblock_data.Get(stage_name[stage-1]);
+        auto &sc1 = pmb->meshblock_data.Get(stage_name[stage]);
+
+        auto t_clear_comm_flags = tl.AddTask(t_none, &MeshBlockData<Real>::ClearBoundary,
+                                        sc1.get(), BoundaryCommSubset::all);
+
+        auto t_prolongBound = t_clear_comm_flags;
+        if (pmesh->multilevel) {
+            t_prolongBound = tl.AddTask(t_clear_comm_flags, ProlongateBoundaries, sc1);
+        }
+
+
+        auto t_set_bc = tl.AddTask(t_prolongBound, parthenon::ApplyBoundaryConditions, sc1);
+
+        auto t_ptou = tl.AddTask(t_set_bc, Flux::PrimToFluxTask, sc1.get());
+
+        auto t_step_done = t_ptou;
+
+        // Estimate next time step based on ctop
+        if (stage == integrator->nstages) {
+            auto t_new_dt =
+                tl.AddTask(t_step_done, Update::EstimateTimestep<MeshBlockData<Real>>, sc1.get());
+
+            // Update refinement
+            if (pmesh->adaptive) {
+                auto tag_refine = tl.AddTask(
+                    t_step_done, parthenon::Refinement::Tag<MeshBlockData<Real>>, sc1.get());
+            }
+        }
+    }
+
+    return tc;
+}
diff --git a/kharma/grim_driver.hpp b/kharma/grim_driver.hpp
new file mode 100644
index 00000000..402109b4
--- /dev/null
+++ b/kharma/grim_driver.hpp
@@ -0,0 +1,65 @@
+/* 
+ *  File: grim_driver.hpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+
+#include <memory>
+
+#include <parthenon/parthenon.hpp>
+
+using namespace parthenon;
+
+/**
+ * A Driver object orchestrates everything that has to be done to a mesh to constitute a step.
+ * Nominally GRIM is very much like HARM, but in KHARMA the two drivers have one key difference:
+ * GRIMDriver syncs primitive variables, whereas HARM/KHARMA syncs conserved variables
+ */
+class GRIMDriver : public MultiStageDriver {
+    public:
+        /**
+         * Construct the driver; all three arguments are forwarded unchanged to MultiStageDriver
+         */
+        GRIMDriver(ParameterInput *pin, ApplicationInput *papp, Mesh *pm) : MultiStageDriver(pin, papp, pm) {}
+
+        /**
+         * All the tasks which constitute advancing the fluid in a mesh by one stage.
+         * This includes calculation of the primitives and reconstruction of their face values,
+         * calculation of conserved values and fluxes thereof at faces,
+         * application of fluxes and a source term in order to update zone values,
+         * and finally calculation of the next timestep based on the CFL condition.
+         * Returns the assembled TaskCollection for the given blocks and stage.
+         * Be careful when reordering tasks: order changes can introduce subtle bugs,
+         * usually w.r.t. fluid "state" being spread across the primitive and conserved quantities
+         */
+        TaskCollection MakeTaskCollection(BlockList_t &blocks, int stage) override;
+};
diff --git a/kharma/grmhd/grmhd.cpp b/kharma/grmhd/grmhd.cpp
index ba2072d7..174c125e 100644
--- a/kharma/grmhd/grmhd.cpp
+++ b/kharma/grmhd/grmhd.cpp
@@ -144,6 +144,11 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
     bool fix_flux_pole = pin->GetOrAddBoolean("bounds", "fix_flux_pole", true);
     params.Add("fix_flux_pole", fix_flux_pole);
 
+    // Driver options
+    auto driver_type = pin->GetString("driver", "type"); // This is set in kharma.cpp
+    params.Add("driver_type", driver_type);
+    auto driver_step = pin->GetOrAddString("driver", "step", "explicit");
+    params.Add("driver_step", driver_step);
 
     // Performance options
     // Packed communications kernels, exchanging all boundary buffers of an MPI process
@@ -179,13 +184,33 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
     // generally inherit the size of the MeshBlock (for "Cell" fields) or some
     // closely-related size (for "Face" and "Edge" fields)
 
-    // As mentioned elsewhere, KHARMA treats the conserved variables as the independent ones,
-    // and the primitives as "Derived"
-    // Primitives are still used for reconstruction, physical boundaries, and output, and are
-    // generally the easier to understand quantities
     std::vector<int> s_vector({3});
-    auto flags_prim = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Derived,
-                                                      Metadata::Restart, isPrimitive, isHD, isMHD});
+    std::vector<MetadataFlag> flags_prim, flags_cons;
+    auto grim_driver = pin->GetString("driver", "type") == "grim";
+    if (!grim_driver) {
+        // As mentioned elsewhere, KHARMA treats the conserved variables as the independent ones,
+        // and the primitives as "Derived"
+        // Primitives are still used for reconstruction, physical boundaries, and output, and are
+        // generally the easier to understand quantities
+        flags_prim = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Derived,
+                                                Metadata::Restart, isPrimitive, isHD, isMHD});
+        // Conserved variables are actually rho*u^0 & T^0_mu, but are named after the prims for consistency
+        // We will rarely need the conserved variables by name, we will mostly be treating them as a group
+        flags_cons = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Independent,
+                                                Metadata::WithFluxes, Metadata::FillGhost, Metadata::Restart,
+                                                Metadata::Conserved, isHD, isMHD});
+    } else {
+        // For GRIM/classic HARM, however, the primitive variables are independent, and boundary syncs are performed
+        // with them.
+        flags_prim = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Derived,
+                                                Metadata::FillGhost, Metadata::Restart, isPrimitive, isHD, isMHD});
+        // Conserved variables are actually rho*u^0 & T^0_mu, but are named after the prims for consistency
+        // We will rarely need the conserved variables by name, we will mostly be treating them as a group
+        flags_cons = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Independent,
+                                                Metadata::WithFluxes, Metadata::Conserved, isHD, isMHD});
+    }
+
+    // With the flags sorted & explained, actually declaring fields is easy.
     auto m = Metadata(flags_prim);
     pkg->AddField("prims.rho", m);
     pkg->AddField("prims.u", m);
@@ -194,11 +219,6 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
     m = Metadata(flags_prim_vec, s_vector);
     pkg->AddField("prims.uvec", m);
 
-    // Conserved variables are actualy rho*u^0 & T^0_mu, but are named after the prims for consistency
-    // We will rarely need the conserved variables by name, we will mostly be treating them as a group
-    auto flags_cons = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Independent,
-                                                      Metadata::WithFluxes, Metadata::FillGhost, Metadata::Restart,
-                                                      Metadata::Conserved, isHD, isMHD});
     m = Metadata(flags_cons);
     pkg->AddField("cons.rho", m);
     pkg->AddField("cons.u", m);
@@ -217,8 +237,9 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
 
         // Remove the "HD" flag from B, since it is not that
         flags_prim_vec.erase(std::remove(flags_prim_vec.begin(), flags_prim_vec.end(), isHD), flags_prim_vec.end());
-        // Remove the "Restart" flag, since unlike the fluid prims, prims.B is fully redundant
-        flags_prim_vec.erase(std::remove(flags_prim_vec.begin(), flags_prim_vec.end(), Metadata::Restart), flags_prim_vec.end());
+        // If prims are derived, remove the "Restart" flag, since unlike the fluid prims, prims.B is fully redundant
+        if (!grim_driver)
+            flags_prim_vec.erase(std::remove(flags_prim_vec.begin(), flags_prim_vec.end(), Metadata::Restart), flags_prim_vec.end());
         flags_prim_vec.push_back(Metadata::Overridable);
         m = Metadata(flags_prim_vec, s_vector);
         pkg->AddField("prims.B", m);
diff --git a/kharma/harm_driver.cpp b/kharma/harm_driver.cpp
index 19bde14f..fcfe824f 100644
--- a/kharma/harm_driver.cpp
+++ b/kharma/harm_driver.cpp
@@ -1,5 +1,5 @@
 /* 
- *  File: harm.cpp
+ *  File: harm_driver.cpp
  *  
  *  BSD 3-Clause License
  *  
diff --git a/kharma/harm_driver.hpp b/kharma/harm_driver.hpp
index b9ce4560..ae445164 100644
--- a/kharma/harm_driver.hpp
+++ b/kharma/harm_driver.hpp
@@ -1,4 +1,36 @@
-// HARM Driver: implement the HARM scheme for GRMHD as described in Gammie et al 2003, 2004
+/* 
+ *  File: harm_driver.hpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
 #pragma once
 
 #include <memory>
diff --git a/kharma/kharma.cpp b/kharma/kharma.cpp
index 9eee975c..da4fbb5f 100644
--- a/kharma/kharma.cpp
+++ b/kharma/kharma.cpp
@@ -193,6 +193,9 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput>& pin)
     bool do_viscosity = pin->GetOrAddBoolean("viscosity", "on", false);
     bool do_wind = pin->GetOrAddBoolean("wind", "on", false);
 
+    // Set the default driver way up here.  TODO check for incompatibilities, etc
+    auto driver_type = pin->GetOrAddString("driver", "type", "harm");
+
     // Global variables "package."  Anything that just, really oughta be a global
     packages.Add(KHARMA::InitializeGlobals(pin.get()));
 
diff --git a/kharma/main.cpp b/kharma/main.cpp
index d34e6738..1f407d77 100644
--- a/kharma/main.cpp
+++ b/kharma/main.cpp
@@ -36,6 +36,7 @@
 #include "decs.hpp"
 
 #include "boundaries.hpp"
+#include "grim_driver.hpp"
 #include "harm_driver.hpp"
 #include "kharma.hpp"
 #include "mpi.hpp"
@@ -145,8 +146,15 @@ int main(int argc, char *argv[])
     KHARMA::PostInitialize(pin, pmesh, pman.IsRestart());
     Flag("Post-initialization completed");
 
-    // Then construct & run the driver
-    HARMDriver driver(pin, papp, pmesh);
+    // Construct a temporary driver purely for parameter parsing
+    auto driver_type = pin->GetString("driver", "type");
+    if (driver_type == "harm") {
+        HARMDriver driver(pin, papp, pmesh);
+    } else if (driver_type == "grim") {
+        GRIMDriver driver(pin, papp, pmesh);
+    } else {
+        throw std::invalid_argument("Expected driver type to be harm or grim!");
+    }
 
     // We could still have set parameters during driver initialization
     // Note the order here is *extremely important* as the first statement has a
@@ -174,7 +182,15 @@ int main(int argc, char *argv[])
     // of each step until a stop criterion is reached.
     Flag("Executing Driver");
 
-    auto driver_status = driver.Execute();
+    if (driver_type == "harm") {
+        cout << "Initializing and running KHARMA driver." << endl;
+        HARMDriver driver(pin, papp, pmesh);
+        auto driver_status = driver.Execute();
+    } else if (driver_type == "grim") {
+        cout << "Initializing and running GRIM driver." << endl;
+        GRIMDriver driver(pin, papp, pmesh);
+        auto driver_status = driver.Execute();
+    }
 
     // Parthenon cleanup includes Kokkos, MPI
     Flag("Finalizing");
diff --git a/kharma/viscosity/viscosity.cpp b/kharma/viscosity/viscosity.cpp
index 3cd8b70a..effa92cf 100644
--- a/kharma/viscosity/viscosity.cpp
+++ b/kharma/viscosity/viscosity.cpp
@@ -46,13 +46,6 @@ using namespace KokkosBatched;
 
 using namespace parthenon;
 
-// Used only in Howes model
-#define ME (9.1093826e-28  ) // Electron mass
-#define MP (1.67262171e-24 ) // Proton mass
-
-// Do I really want to reintroduce this?
-#define SMALL 1.e-20
-
 namespace Viscosity
 {
 
@@ -97,7 +90,7 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
 
 void UtoP(MeshBlockData<Real> *rc, IndexDomain domain, bool coarse)
 {
-    FLAG("UtoP electrons");
+    Flag(rc, "UtoP electrons");
     auto pmb = rc->GetBlockPointer();
 
     MetadataFlag isNonideal = pmb->packages.Get("Viscosity")->Param<MetadataFlag>("NonidealFlag");
@@ -163,11 +156,11 @@ void PostUtoP(MeshBlockData<Real> *rc, IndexDomain domain, bool coarse)
 
 TaskStatus PostStepDiagnostics(const SimTime& tm, MeshData<Real> *rc)
 {
-    FLAG("Printing electron diagnostics");
+    Flag("Printing electron diagnostics");
 
     // Output any diagnostics after a step completes
 
-    FLAG("Printed")
+    Flag("Printed");
     return TaskStatus::complete;
 }
 
diff --git a/machines/bp.sh b/machines/bp.sh
index 90020736..8cb9d7f2 100644
--- a/machines/bp.sh
+++ b/machines/bp.sh
@@ -3,7 +3,7 @@
 
 # TODO toolbox break to discover enclosing hostname
 if [[ "$HOST" == "toolbox"* ]]; then
-  HOST=fermium
+  HOST=ferrum
 fi
 if [[ "$HOST" == "e4s"* ]]; then
   HOST=fermium
@@ -61,15 +61,21 @@ if [[ $HOST == "fermium" ]]; then
 fi
 
 if [[ $HOST == "ferrum" ]]; then
-  # Intel SYCL implementation "DPC++"
-  module purge
-  module load compiler mpi
+  if [[ "$ARGS" == *"gcc"* ]]; then
+    module load mpi/mpich-x86_64
+    C_NATIVE="gcc"
+    CXX_NATIVE="g++"
+  else
+    # Intel SYCL implementation "DPC++"
+    module purge
+    module load compiler mpi
+    PREFIX_PATH="$HOME/libs/hdf5-oneapi"
+  fi
 
   NPROC=6 # My kingdom for a RAM!
 
   HOST_ARCH="HSW"
   DEVICE_ARCH="INTEL_GEN"
-  PREFIX_PATH="$HOME/libs/hdf5-oneapi"
 
   EXTRA_FLAGS="-DFUSE_FLUX_KERNELS=OFF -DFUSE_EMF_KERNELS=OFF -DFUSE_FLOOR_KERNELS=OFF $EXTRA_FLAGS"
 fi
diff --git a/pars/bondi.par b/pars/bondi.par
index 761a64be..b913f96f 100644
--- a/pars/bondi.par
+++ b/pars/bondi.par
@@ -55,6 +55,10 @@ solver = none
 <debug>
 verbose = 0
 
+<driver>
+type = harm
+step = explicit
+
 <parthenon/output0>
 file_type = hdf5
 dt = 5.0
diff --git a/pars/mhdmodes.par b/pars/mhdmodes.par
index 00441052..d25151f6 100644
--- a/pars/mhdmodes.par
+++ b/pars/mhdmodes.par
@@ -56,6 +56,10 @@ disable_floors = true
 <debug>
 verbose = 0
 
+<driver>
+type = harm
+step = explicit
+
 <parthenon/output0>
 file_type = hdf5
 # This is so as to output only the final state
diff --git a/pars/orszag_tang.par b/pars/orszag_tang.par
index c3702167..72a23c76 100644
--- a/pars/orszag_tang.par
+++ b/pars/orszag_tang.par
@@ -1,5 +1,5 @@
-# GRMHD Modes problem
-# Try to propagate several analytically-amenable linear modes of the MHD equations
+# Orszag-Tang Vortex problem:
+# Generate current sheets on short timescales
 
 <parthenon/job>
 problem_id = orszag_tang
diff --git a/tests/bondi/check.py b/tests/bondi/check.py
index adb5c7a1..8ef6a1fb 100644
--- a/tests/bondi/check.py
+++ b/tests/bondi/check.py
@@ -31,8 +31,8 @@
 
     r = r[imin:]
 
-    rho0 = np.mean(start['RHO'][imin:,:,0], axis=1)
-    rho1 = np.mean(end['RHO'][imin:,:,0], axis=1)
+    rho0 = np.mean(start['RHO'][imin:,:], axis=1)
+    rho1 = np.mean(end['RHO'][imin:,:], axis=1)
 
     fig = plt.figure(figsize=(5,5))
     ax = fig.add_subplot(1,1,1)
diff --git a/tests/bondi/check.sh b/tests/bondi/check.sh
index 51d8a187..ebf7e8c0 100755
--- a/tests/bondi/check.sh
+++ b/tests/bondi/check.sh
@@ -8,7 +8,9 @@ conda activate pyHARM
 res="32,48,64,96,128"
 python check.py $res "in 2D, FMKS coordinates" fmks || fail=1
 python check.py $res "in 2D, MKS coordinates" mks || fail=1
+python check.py $res "in 2D, EKS coordinates" eks || fail=1
 python check.py $res "in 2D, linear recon with MC limiter" linear_mc || fail=1
 python check.py $res "in 2D, linear recon with VL limiter" linear_vl || fail=1
+python check.py $res "in 2D, with classic algo/boundaries" classic || fail=1
 
-exit $fail
\ No newline at end of file
+exit $fail
diff --git a/tests/bondi/run.sh b/tests/bondi/run.sh
index a0419227..b808b7fd 100755
--- a/tests/bondi/run.sh
+++ b/tests/bondi/run.sh
@@ -17,7 +17,13 @@ conv_2d() {
     done
 }
 
+# Test coordinates (raw ks?)
 conv_2d fmks coordinates/transform=fmks
 conv_2d mks coordinates/transform=mks
+conv_2d eks coordinates/transform=eks
+# Recon
 conv_2d linear_mc GRMHD/reconstruction=linear_mc
 conv_2d linear_vl GRMHD/reconstruction=linear_vl
+# And the GRIM/classic driver
+conv_2d classic driver/type=grim
+#conv_2d grim driver/type=grim driver/step=implicit 
diff --git a/tests/mhdmodes/check.sh b/tests/mhdmodes/check.sh
index f1870117..8a8119c2 100755
--- a/tests/mhdmodes/check.sh
+++ b/tests/mhdmodes/check.sh
@@ -17,6 +17,10 @@ python3 check.py $RES3D "fast mode in 3D" fast || fail=1
 python3 check.py $RES3D "entropy mode in 3D, linear/MC reconstruction" entropy_mc || fail=1
 python3 check.py $RES3D "entropy mode in 3D, linear/VL reconstruction" entropy_vl || fail=1
 
+python3 check.py $RES3D "slow mode in 3D, classic algo" slow_grim || fail=1
+python3 check.py $RES3D "Alfven mode in 3D, classic algo" alfven_grim || fail=1
+python3 check.py $RES3D "fast mode in 3D, classic algo" fast_grim || fail=1
+
 #python3 check.py $RES2D "fast mode in 2D, WENO5" fast2d 2d || fail=1
 #python3 check.py $RES2D "fast mode in 2D, linear/MC reconstruction" fast_mc 2d || fail=1
 #python3 check.py $RES2D "fast mode in 2D, linear/VL reconstruction" fast_vl 2d || fail=1
diff --git a/tests/mhdmodes/run.sh b/tests/mhdmodes/run.sh
index 7d992639..c7e8913a 100755
--- a/tests/mhdmodes/run.sh
+++ b/tests/mhdmodes/run.sh
@@ -54,6 +54,10 @@ conv_3d entropy_vl "mhdmodes/nmode=0 GRMHD/reconstruction=linear_vl"
 conv_3d slow mhdmodes/nmode=1
 conv_3d alfven mhdmodes/nmode=2
 conv_3d fast mhdmodes/nmode=3
+# And we've got to test classic/GRIM stepping
+conv_3d slow_grim   "mhdmodes/nmode=1 driver/type=grim"
+conv_3d alfven_grim "mhdmodes/nmode=2 driver/type=grim"
+conv_3d fast_grim   "mhdmodes/nmode=3 driver/type=grim"
 
 # 2D modes use small blocks, could pick up some problems at MPI ranks >> 1
 # Currently very slow, plus modes are incorrect

From 15b0a369250b19da206cdded74ba343d9700a402 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Wed, 23 Feb 2022 10:35:13 -0600
Subject: [PATCH 04/26] Make the tests less verbose to avoid overflowing GitLab
 logs

---
 tests/bondi/run.sh    | 2 +-
 tests/mhdmodes/run.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/bondi/run.sh b/tests/bondi/run.sh
index b808b7fd..7e76e752 100755
--- a/tests/bondi/run.sh
+++ b/tests/bondi/run.sh
@@ -11,7 +11,7 @@ conv_2d() {
       $BASE/run.sh -i $BASE/pars/bondi.par parthenon/output0/dt=1000 debug/verbose=1 \
                                            parthenon/mesh/nx1=$res parthenon/mesh/nx2=$res parthenon/mesh/nx3=1 \
                                            parthenon/meshblock/nx1=$half parthenon/meshblock/nx2=$half parthenon/meshblock/nx3=1 \
-                                           $2
+                                           $2 >log_${1}_${res}.txt 2>&1
         mv bondi.out0.00000.phdf bondi_2d_${res}_start_${1}.phdf
         mv bondi.out0.final.phdf bondi_2d_${res}_end_${1}.phdf
     done
diff --git a/tests/mhdmodes/run.sh b/tests/mhdmodes/run.sh
index c7e8913a..cd6904d6 100755
--- a/tests/mhdmodes/run.sh
+++ b/tests/mhdmodes/run.sh
@@ -14,7 +14,7 @@ conv_3d() {
       $BASE/run.sh -i $BASE/pars/mhdmodes.par debug/verbose=1 \
                       parthenon/mesh/nx1=$res parthenon/mesh/nx2=$res parthenon/mesh/nx3=$res \
                       parthenon/meshblock/nx1=$half parthenon/meshblock/nx2=$half parthenon/meshblock/nx3=$half \
-                      $2
+                      $2 >log_${1}_${res}.txt 2>&1
         mv mhdmodes.out0.00000.phdf mhd_3d_${res}_start_${1}.phdf
         mv mhdmodes.out0.final.phdf mhd_3d_${res}_end_${1}.phdf
     done

From cd8515b1efba8fb8e147bbfd371767245e048297 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Wed, 23 Feb 2022 19:40:09 -0600
Subject: [PATCH 05/26] Fix prims-fundamental evolution by labelling B field
 correctly for Parthenon. Some viscosity/implicit stuff

---
 .gitignore                          |   3 +-
 CMakeLists.txt                      |  21 +-
 kharma/CMakeLists.txt               |   2 +
 kharma/b_flux_ct/b_flux_ct.cpp      |  22 +-
 kharma/implicit/implicit.cpp        | 140 +++++++++++
 kharma/implicit/implicit.hpp        |  49 ++++
 kharma/kharma.cpp                   |   3 +-
 kharma/main.cpp                     |   5 +-
 kharma/prob/iharm_restart.cpp       |   9 +-
 kharma/prob/post_initialize.cpp     |  11 +-
 kharma/types.hpp                    |  15 +-
 kharma/viscosity/emhd_functions.hpp | 370 ++++++++++++++++++++++++++++
 kharma/viscosity/viscosity.cpp      |  97 +-------
 kharma/viscosity/viscosity.hpp      |  53 +---
 pars/mhdmodes.par                   |   2 +-
 pars/mhdmodes_implicit.par          |  79 ++++++
 tests/bondi/run.sh                  |   2 +-
 17 files changed, 720 insertions(+), 163 deletions(-)
 create mode 100644 kharma/implicit/implicit.cpp
 create mode 100644 kharma/implicit/implicit.hpp
 create mode 100644 kharma/viscosity/emhd_functions.hpp
 create mode 100644 pars/mhdmodes_implicit.par

diff --git a/.gitignore b/.gitignore
index 74b2024c..233fae26 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,8 +12,9 @@ frames_*/
 *.rhdf
 *.xdmf
 *.hst
-# Archival parsed parameters file
+# Archival files
 kharma_parsed_*.par
+log_*.txt
 
 # Editor documents
 .project
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 56556e40..96e7d7ff 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,18 +26,25 @@ set(PARTHENON_DISABLE_HDF5_COMPRESSION ON CACHE BOOL "KHARMA Override")
 # Parthenon internal build options
 set(BUILD_TESTING OFF CACHE BOOL "KHARMA Override")
 set(ENABLE_COMPILER_WARNINGS OFF CACHE BOOL "KHARMA Override")
+# TODO set this here when I upstream or otherwise unfork
+#set(COORDINATE_TYPE GRCoordinates)
 
 # Kokkos options
-set(Kokkos_ENABLE_OPENMP ON)
-set(Kokkos_ENABLE_CUDA_LAMBDA ON)
-set(Kokkos_ENABLE_CUDA_CONSTEXPR ON)
-set(Kokkos_ENABLE_HWLOC OFF) # Possible speed improvement?
-set(Kokkos_ENABLE_AGGRESSIVE_VECTORIZATION ON)
+set(Kokkos_ENABLE_OPENMP ON CACHE BOOL "KHARMA Override")
+set(Kokkos_ENABLE_CUDA_LAMBDA ON CACHE BOOL "KHARMA Override")
+set(Kokkos_ENABLE_CUDA_CONSTEXPR ON CACHE BOOL "KHARMA Override")
+set(Kokkos_ENABLE_HWLOC OFF CACHE BOOL "KHARMA Override") # Possible speed improvement?
+set(Kokkos_ENABLE_AGGRESSIVE_VECTORIZATION ON CACHE BOOL "KHARMA Override")
+
+# Build only what we need of KokkosKernels
+set(KokkosKernels_ENABLE_TPL_CUSPARSE OFF CACHE BOOL "KHARMA Override")
+set(KokkosKernels_ENABLE_TPL_CUBLAS OFF CACHE BOOL "KHARMA Override")
 
-# TODO set this here when I upstream or otherwise unfork
-#set(COORDINATE_TYPE GRCoordinates)
 
 # Parthenon says it doesn't need MPI.  It just *strongly prefers* it, and so do we.
+# Builds without MPI have pretty limited support, you can usually find distribution 
+# packages or other ways to install it on personal machines without too much work.
+# Check out oneAPI or NVHPC for software distributions that include easily-usable MPI modules
 find_package(MPI REQUIRED)
 include_directories(SYSTEM ${MPI_INCLUDE_PATH})
 find_package(OpenMP REQUIRED)
diff --git a/kharma/CMakeLists.txt b/kharma/CMakeLists.txt
index dcf60274..87bebdf3 100644
--- a/kharma/CMakeLists.txt
+++ b/kharma/CMakeLists.txt
@@ -19,6 +19,7 @@ AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/current EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/electrons EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/floors EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/grmhd EXE_NAME_SRC)
+AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/implicit EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/reductions EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/viscosity EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/wind EXE_NAME_SRC)
@@ -33,6 +34,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/current)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/electrons)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/floors)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/grmhd)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/implicit)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/reductions)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/viscosity)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/wind)
diff --git a/kharma/b_flux_ct/b_flux_ct.cpp b/kharma/b_flux_ct/b_flux_ct.cpp
index 23469c55..13cf97eb 100644
--- a/kharma/b_flux_ct/b_flux_ct.cpp
+++ b/kharma/b_flux_ct/b_flux_ct.cpp
@@ -74,12 +74,24 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
 
     // B fields.  "Primitive" form is field, "conserved" is flux
     // Note: when changing metadata, keep these in lockstep with grmhd.cpp
-    Metadata m = Metadata({Metadata::Real, Metadata::Cell, Metadata::Independent, Metadata::FillGhost,
-                 Metadata::Restart, Metadata::Conserved, isMHD, Metadata::WithFluxes, Metadata::Vector}, s_vector);
-    pkg->AddField("cons.B", m);
-    m = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived,
-                  isPrimitive, isMHD, Metadata::Vector}, s_vector);
+    std::vector<MetadataFlag> flags_prim, flags_cons;
+    auto grim_driver = pin->GetString("driver", "type") == "grim";
+    if (!grim_driver) {
+        flags_prim = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Derived,
+                                                isPrimitive, isMHD, Metadata::Vector});
+        flags_cons = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Independent, Metadata::FillGhost,
+                 Metadata::Restart, Metadata::Conserved, isMHD, Metadata::WithFluxes, Metadata::Vector});
+    } else {
+        flags_prim = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::FillGhost, Metadata::Restart,
+                                                isPrimitive, isMHD, Metadata::Vector});
+        flags_cons = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Independent,
+                                                Metadata::Conserved, isMHD, Metadata::WithFluxes, Metadata::Vector});
+    }
+
+    auto m = Metadata(flags_prim, s_vector);
     pkg->AddField("prims.B", m);
+    m = Metadata(flags_cons, s_vector);
+    pkg->AddField("cons.B", m);
 
     m = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::OneCopy});
     pkg->AddField("divB", m);
diff --git a/kharma/implicit/implicit.cpp b/kharma/implicit/implicit.cpp
new file mode 100644
index 00000000..825eb442
--- /dev/null
+++ b/kharma/implicit/implicit.cpp
@@ -0,0 +1,140 @@
+/* 
+ *  File: implicit.cpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// Implicit.  Per-zone implicit solve, e.g. for evolving Extended GRMHD models
+
+#include "implicit.hpp"
+
+#include "debug.hpp"
+#include "fixup.hpp"
+#include "mhd_functions.hpp"
+#include "pack.hpp"
+
+#include <batched/dense/KokkosBatched_LU_Decl.hpp>
+#include <batched/dense/impl/KokkosBatched_LU_Serial_Impl.hpp>
+#include <batched/dense/KokkosBatched_Trsv_Decl.hpp>
+using namespace KokkosBatched;
+
+namespace Implicit
+{
+
+std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
+{
+    // Build the "Implicit" package, recording the <implicit> solver parameters as Params.
+    // TODO can I just build/add/use a Prescription here, rather than building one before each call?
+    auto pkg = std::make_shared<StateDescriptor>("Implicit");
+    Params &params = pkg->AllParams();
+
+    // Implicit solver parameters, kept in their native types (a `bool` would collapse any nonzero value to `true`)
+    Real jacobian_eps = pin->GetOrAddReal("implicit", "jacobian_eps", 4.e-8);
+    params.Add("jacobian_eps", jacobian_eps);
+    Real rootfind_tol = pin->GetOrAddReal("implicit", "rootfind_tol", 1.e-3);
+    params.Add("rootfind_tol", rootfind_tol);
+    int max_nonlinear_iter = pin->GetOrAddInteger("implicit", "max_nonlinear_iter", 1);
+    params.Add("max_nonlinear_iter", max_nonlinear_iter);
+
+    // Any fields particular to the implicit solver (NOT EGRMHD IN GENERAL)
+    // Likely none...
+    // see viscosity/viscosity.cpp for EGRMHD/auxiliary fields
+
+    // Anything we need to run from this package on callbacks
+    // None of this will be crucial for the step
+    // pkg->PostFillDerivedBlock = Implicit::PostFillDerivedBlock;
+    // pkg->PostStepDiagnosticsMesh = Implicit::PostStepDiagnostics;
+
+    return pkg;
+}
+
+// Prototype per-zone implicit solve: a batched 7x7 LU factorization plus upper-triangular Trsv
+// in each zone of `domain`, repeated to emulate solver iterations. AA/B are scratch only.
+void UtoP(MeshBlockData<Real> *rc, IndexDomain domain, bool coarse)
+{
+    Flag(rc, "UtoP implicit");
+    auto pmb = rc->GetBlockPointer();
+
+    MetadataFlag isNonideal = pmb->packages.Get("Viscosity")->Param<MetadataFlag>("NonidealFlag");
+    MetadataFlag isPrimitive = pmb->packages.Get("GRMHD")->Param<MetadataFlag>("PrimitiveFlag");
+    // No need for a "map" here, we just want everything that fits these
+    auto& e_P = rc->PackVariables({isNonideal, isPrimitive});
+    auto& e_U = rc->PackVariables({isNonideal, Metadata::Conserved});
+    // And then the local density
+    GridScalar rho_U = rc->Get("cons.rho").data;
+
+    // Get array bounds from Parthenon
+    auto bounds = coarse ? pmb->c_cellbounds : pmb->cellbounds;
+    IndexRange ib = bounds.GetBoundsI(domain);
+    IndexRange jb = bounds.GetBoundsJ(domain);
+    IndexRange kb = bounds.GetBoundsK(domain);
+
+    // For speed, we will need need need to copy & reorder indices before running this
+
+    // Begin the funky kokkos bit
+    // Let's do a batched LU and Trsv!
+    const Real alpha = 1, tiny = 0;
+    const int ni = bounds.ncellsi(domain), nj = bounds.ncellsj(domain), nk = bounds.ncellsk(domain);
+    ParArray5D<Real> AA("AA", nk, nj, ni, 7, 7);
+    ParArray4D<Real> B("B", nk, nj, ni, 7);
+
+    // Simulating some iterations
+    for (int iter=0; iter < 5; iter++) {
+        // Normally, when doing multiple batched operations,
+        // we would need either a general solve function,
+        // or two reads through the full array. Not so in Kokkos!
+        // This could be faster I think -- there are versions of the inner portion
+        // that cover rows at a time, by taking member objects on a Team
+        // see e.g. fluxes.hpp for usage of teams
+        pmb->par_for("implicit_solve", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+            KOKKOS_LAMBDA_3D {
+                // This code lightly adapted from (source not recorded -- TODO fill in)
+                // AA/B are zero-based and sized to `domain`; shift the mesh indices to stay in bounds
+                auto A = Kokkos::subview(AA, k - kb.s, j - jb.s, i - ib.s, Kokkos::ALL(), Kokkos::ALL());
+                auto b = Kokkos::subview(B, k - kb.s, j - jb.s, i - ib.s, Kokkos::ALL());
+                /// [template]AlgoType: Unblocked, Blocked, CompatMKL
+                /// [in/out]A: 2d view
+                /// [in]tiny: a magnitude scalar value to avoid div/0
+                KokkosBatched::SerialLU<Algo::LU::Blocked>::invoke(A, tiny);
+                /// [template]UploType: indicates either upper triangular or lower triangular; Uplo::Upper, Uplo::Lower
+                /// [template]TransType: transpose of A; Trans::NoTranspose, Trans::Transpose
+                /// [template]DiagType: diagonals; Diag::Unit or Diag::NonUnit
+                /// [template]AlgoType: Unblocked, Blocked, CompatMKL
+                /// [in]alpha: scalar value
+                /// [in]A: 2d view
+                /// [in]b: 1d view
+                KokkosBatched::SerialTrsv<Uplo::Upper,Trans::NoTranspose,Diag::NonUnit,Algo::Trsv::Unblocked>::invoke(alpha, A, b);
+            }
+        );
+    }
+}
+
+} // namespace Implicit
diff --git a/kharma/implicit/implicit.hpp b/kharma/implicit/implicit.hpp
new file mode 100644
index 00000000..9ac7f2db
--- /dev/null
+++ b/kharma/implicit/implicit.hpp
@@ -0,0 +1,49 @@
+/* 
+ *  File: implicit.hpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+
+#include "decs.hpp"
+
+#include <parthenon/parthenon.hpp>
+
+namespace Implicit
+{
+
+/**
+ * Initialization.  Set parameters.
+ */
+std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin);
+
+
+} // namespace Implicit
diff --git a/kharma/kharma.cpp b/kharma/kharma.cpp
index da4fbb5f..205d8702 100644
--- a/kharma/kharma.cpp
+++ b/kharma/kharma.cpp
@@ -79,7 +79,8 @@ std::shared_ptr<StateDescriptor> KHARMA::InitializeGlobals(ParameterInput *pin)
 
 void KHARMA::FixParameters(std::unique_ptr<ParameterInput>& pin)
 {
-    // This would set ghost zones dynamically, or leave it up to Parthenon.  Dangerous?
+    // This would set ghost zones dynamically, or leave it up to Parthenon.
+    // TODO get this working so I don't have to when we really want to test it & get scaling happening
     // std::string recon = pin->GetOrAddString("GRMHD", "reconstruction", "weno5");
     // if (recon != "donor_cell" && recon != "linear_mc" && recon != "linear_vl") {
     //     pin->SetInteger("parthenon/mesh", "nghost", 4);
diff --git a/kharma/main.cpp b/kharma/main.cpp
index 1f407d77..77430142 100644
--- a/kharma/main.cpp
+++ b/kharma/main.cpp
@@ -88,10 +88,11 @@ using namespace parthenon;
  * sets of physical processes, while re-using particular physics packages to mix and match
  *
  * Currently available drivers:
- * HARM: GRMHD using LLF with zone-centered fields
+ * HARM: GRMHD using LLF with zone-centered fields, conserved variables are synchronized
+ * GRIM: same as HARM but primitive variables are synchronized,
+ *       optional implicit solve for doing e.g. Extended GRMHD
  *
  * Future drivers?
- * KHARMA: GRMHD using LLF with face-centered fields
  * bhlight: GRMHD with Monte Carlo particle transport
  */
 int main(int argc, char *argv[])
diff --git a/kharma/prob/iharm_restart.cpp b/kharma/prob/iharm_restart.cpp
index 563a6c3d..ce0bf702 100644
--- a/kharma/prob/iharm_restart.cpp
+++ b/kharma/prob/iharm_restart.cpp
@@ -49,10 +49,11 @@ void periodic_x3(const GRCoordinates& G, GridVars P, int nghost, int n1, int n2,
 using namespace Kokkos;
 
 // TODO
-// At least check that Rin,Rout match
-// Actually look at Rin,Rout,gamma and (re)build the Coordinates and mesh on them
-// Re-gridding algorithm
-// Start with multiple meshes i.e. find full file dimensions, where to start reading
+// Definitely check coordinate system params such that x1 in old mesh == x1 in new mesh
+// Implement Xtoijk and tri-linear (/etc) interp
+// Optimize by stashing file contents in a static pointer somewhere?
+// -> use above to re-map any restart to the given Parthenon mesh on import
+// Default to re-mapping but reintroduce option to set Parthenon mesh size to restart size
 
 void ReadIharmRestartHeader(std::string fname, std::unique_ptr<ParameterInput>& pin)
 {
diff --git a/kharma/prob/post_initialize.cpp b/kharma/prob/post_initialize.cpp
index 0ca03c3f..3defc5ad 100644
--- a/kharma/prob/post_initialize.cpp
+++ b/kharma/prob/post_initialize.cpp
@@ -228,8 +228,17 @@ void KHARMA::PostInitialize(ParameterInput *pin, Mesh *pmesh, bool is_restart)
     Flag("Boundary sync");
     SyncAllBounds(pmesh);
 
-    // TODO be able to describe to a teenager why this block is necessary
+    // TODO when (restart/non) do we need this for setting ctop?
     if (is_restart) {
+        // Recover conserved variables 
+        if (pin->GetOrAddBoolean("driver", "type", false)) {
+            for (auto &pmb : pmesh->block_list) {
+                auto rc = pmb->meshblock_data.Get();
+                // This inserts only in vicinity of some global r,th,phi
+                InsertBlob(rc.get(), pin);
+            }
+        }
+
         auto& md = pmesh->mesh_data.GetOrAdd("base", 0);
         auto pmb0 = md->GetBlockData(0)->GetBlockPointer();
         const ReconstructionType& recon = pmb0->packages.Get("GRMHD")->Param<ReconstructionType>("recon");
diff --git a/kharma/types.hpp b/kharma/types.hpp
index 5e3bb10d..37d1839f 100644
--- a/kharma/types.hpp
+++ b/kharma/types.hpp
@@ -76,10 +76,15 @@ typedef struct {
  */
 class VarMap {
     public:
-        // 127 values ought to be enough for anybody
-        int8_t RHO, UU, U1, U2, U3, B1, B2, B3, PSI;
+        // Use int8. 127 values ought to be enough for anybody, right?
+        // Basic primitive variables
+        int8_t RHO, UU, U1, U2, U3, B1, B2, B3;
+        // Tracker variables
         int8_t RHO_ADDED, UU_ADDED, PASSIVE;
+        // Electron entropy/energy tracking
         int8_t KTOT, K_CONSTANT, K_HOWES, K_KAWAZURA, K_WERNER, K_ROWAN, K_SHARMA;
+        // Implicit-solver variables: constraint damping, EGRMHD
+        int8_t PSI, Q, DP;
         // Total struct size 20 bytes, < 1 vector of 4 doubles
 
         VarMap(parthenon::PackIndexMap& name_map, bool is_cons)
@@ -103,6 +108,9 @@ class VarMap {
                 K_WERNER = name_map["cons.Kel_Werner"].first;
                 K_ROWAN = name_map["cons.Kel_Rowan"].first;
                 K_SHARMA = name_map["cons.Kel_Sharma"].first;
+                // Viscosity
+                Q = name_map["cons.q"].first;
+                DP = name_map["cons.dP"].first;
             } else {
                 // HD
                 RHO = name_map["prims.rho"].first;
@@ -122,6 +130,9 @@ class VarMap {
                 K_WERNER = name_map["prims.Kel_Werner"].first;
                 K_ROWAN = name_map["prims.Kel_Rowan"].first;
                 K_SHARMA = name_map["prims.Kel_Sharma"].first;
+                // Viscosity
+                Q = name_map["prims.q"].first;
+                DP = name_map["prims.dP"].first;
             }
             U2 = U1 + 1;
             U3 = U1 + 2;
diff --git a/kharma/viscosity/emhd_functions.hpp b/kharma/viscosity/emhd_functions.hpp
new file mode 100644
index 00000000..6d07e74c
--- /dev/null
+++ b/kharma/viscosity/emhd_functions.hpp
@@ -0,0 +1,370 @@
+/* 
+ *  File: mhd_functions.hpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+
+#include "decs.hpp"
+
+#include "gr_coordinates.hpp"
+#include "types.hpp"
+#include "kharma_utils.hpp"
+
+/**
+ * Device-side MHD functions
+ * They are specifically the subset which require the fluid primitives P & B field both
+ *
+ * These functions mostly have several overloads, related to local vs global variables
+ * Arguments can come in the form of global array or VariablePack references 
+ *
+ * This allows easy fusing/splitting of loops & use in different contexts
+ */
+
+namespace Viscosity
+{
+
+/**
+ * Find gamma-factor of the fluid w.r.t. normal observer
+ */
+KOKKOS_INLINE_FUNCTION Real lorentz_calc(const GRCoordinates& G, const GridVector uvec,
+                                         const int& k, const int& j, const int& i,
+                                         const Loci loc)
+{
+
+    Real qsq = G.gcov(loc, j, i, 1, 1) * uvec(0, k, j, i) * uvec(0, k, j, i) +
+               G.gcov(loc, j, i, 2, 2) * uvec(1, k, j, i) * uvec(1, k, j, i) +
+               G.gcov(loc, j, i, 3, 3) * uvec(2, k, j, i) * uvec(2, k, j, i) +
+            2. * (G.gcov(loc, j, i, 1, 2) * uvec(0, k, j, i) * uvec(1, k, j, i) +
+                  G.gcov(loc, j, i, 1, 3) * uvec(0, k, j, i) * uvec(2, k, j, i) +
+                  G.gcov(loc, j, i, 2, 3) * uvec(1, k, j, i) * uvec(2, k, j, i));
+
+    return sqrt(1. + qsq);
+}
+KOKKOS_INLINE_FUNCTION Real lorentz_calc(const GRCoordinates& G, const Real uv[NVEC],
+                                         const int& k, const int& j, const int& i,
+                                         const Loci loc)
+{
+    Real qsq = G.gcov(loc, j, i, 1, 1) * uv[0] * uv[0] +
+               G.gcov(loc, j, i, 2, 2) * uv[1] * uv[1] +
+               G.gcov(loc, j, i, 3, 3) * uv[2] * uv[2] +
+            2. * (G.gcov(loc, j, i, 1, 2) * uv[0] * uv[1] +
+                  G.gcov(loc, j, i, 1, 3) * uv[0] * uv[2] +
+                  G.gcov(loc, j, i, 2, 3) * uv[1] * uv[2]);
+
+    return sqrt(1. + qsq);
+}
+// Version for full primitives array
+KOKKOS_INLINE_FUNCTION Real lorentz_calc(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m,
+                                         const int& k, const int& j, const int& i, const Loci& loc)
+{
+
+    Real qsq = G.gcov(loc, j, i, 1, 1) * P(m.U1, k, j, i) * P(m.U1, k, j, i) +
+               G.gcov(loc, j, i, 2, 2) * P(m.U2, k, j, i) * P(m.U2, k, j, i) +
+               G.gcov(loc, j, i, 3, 3) * P(m.U3, k, j, i) * P(m.U3, k, j, i) +
+            2. * (G.gcov(loc, j, i, 1, 2) * P(m.U1, k, j, i) * P(m.U2, k, j, i) +
+                  G.gcov(loc, j, i, 1, 3) * P(m.U1, k, j, i) * P(m.U3, k, j, i) +
+                  G.gcov(loc, j, i, 2, 3) * P(m.U2, k, j, i) * P(m.U3, k, j, i));
+
+    return sqrt(1. + qsq);
+}
+KOKKOS_INLINE_FUNCTION Real lorentz_calc(const GRCoordinates& G, const ScratchPad2D<Real>& P, const VarMap& m,
+                                         const int& k, const int& j, const int& i, const Loci& loc)
+{
+    Real qsq = G.gcov(loc, j, i, 1, 1) * P(m.U1, i) * P(m.U1, i) +
+               G.gcov(loc, j, i, 2, 2) * P(m.U2, i) * P(m.U2, i) +
+               G.gcov(loc, j, i, 3, 3) * P(m.U3, i) * P(m.U3, i) +
+            2. * (G.gcov(loc, j, i, 1, 2) * P(m.U1, i) * P(m.U2, i) +
+                  G.gcov(loc, j, i, 1, 3) * P(m.U1, i) * P(m.U3, i) +
+                  G.gcov(loc, j, i, 2, 3) * P(m.U2, i) * P(m.U3, i));
+
+    return sqrt(1. + qsq);
+}
+
+/**
+ * Get a row of the MHD stress-energy tensor with first index up, second index down.
+ * A factor of sqrt(4 pi) is absorbed into the definition of b.
+ * See Gammie & McKinney '04
+ */
+KOKKOS_INLINE_FUNCTION void calc_tensor(const Real& rho, const Real& u, const Real& pgas,
+                                            const FourVectors& D, const int dir,
+                                            Real mhd[GR_DIM])
+{
+    Real bsq = dot(D.bcon, D.bcov);
+    Real eta = pgas + rho + u + bsq;
+    Real ptot = pgas + 0.5 * bsq;
+
+    DLOOP1 {
+        mhd[mu] = eta * D.ucon[dir] * D.ucov[mu] +
+                  ptot * (dir == mu) -
+                  D.bcon[dir] * D.bcov[mu];
+    }
+    double q       = S->q[k][j][i];
+    double delta_p = S->delta_p[k][j][i];
+    double ucon    = S->ucon[dir][k][j][i];
+    double bcon    = S->bcon[dir][k][j][i];
+    
+    DLOOP1 {
+        double bcov = S->bcov[mu][k][j][i];
+        double ucov = S->ucov[mu][k][j][i];
+
+        mhd[mu] += (q / sqrt(bsq)) * ((ucon * bcov) + (bcon * ucov)) 
+                + (-delta_p) * ((bcon * bcov / bsq) - (1./3.) * (delta(dir, mu) + ucon * ucov));
+    }
+}
+
+/**
+ * Just the velocity 4-vector
+ */
+KOKKOS_INLINE_FUNCTION void calc_ucon(const GRCoordinates &G, const GridVector uvec,
+                                      const int& k, const int& j, const int& i, const Loci loc,
+                                      Real ucon[GR_DIM])
+{
+    Real gamma = lorentz_calc(G, uvec, k, j, i, loc);
+    Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0));
+
+    ucon[0] = gamma / alpha;
+    VLOOP ucon[v+1] = uvec(v, k, j, i) - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
+}
+KOKKOS_INLINE_FUNCTION void calc_ucon(const GRCoordinates &G, const Real uvec[NVEC],
+                                      const int& k, const int& j, const int& i, const Loci loc,
+                                      Real ucon[GR_DIM])
+{
+    Real gamma = lorentz_calc(G, uvec, k, j, i, loc);
+    Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0));
+
+    ucon[0] = gamma / alpha;
+    VLOOP ucon[v+1] = uvec[v] - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
+}
+
+/**
+ * Calculate the 4-velocities ucon, ucov, and 4-fields bcon, bcov from primitive versions
+ */
+KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const Real uvec[NVEC], const Real B_P[NVEC],
+                                      const int& k, const int& j, const int& i, const Loci loc,
+                                      FourVectors& D)
+{
+    Real gamma = lorentz_calc(G, uvec, k, j, i, loc);
+    Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0));
+
+    D.ucon[0] = gamma / alpha;
+    VLOOP D.ucon[v+1] = uvec[v] - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
+
+    G.lower(D.ucon, D.ucov, k, j, i, loc);
+
+    D.bcon[0] = 0;
+    VLOOP D.bcon[0] += B_P[v] * D.ucov[v+1];
+    VLOOP D.bcon[v+1] = (B_P[v] + D.bcon[0] * D.ucon[v+1]) / D.ucon[0];
+
+    G.lower(D.bcon, D.bcov, k, j, i, loc);
+}
+KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const GridVector uvec, const GridVector B_P,
+                                      const int& k, const int& j, const int& i, const Loci loc,
+                                      FourVectors& D)
+{
+    Real gamma = lorentz_calc(G, uvec, k, j, i, loc);
+    Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0));
+
+    D.ucon[0] = gamma / alpha;
+    VLOOP D.ucon[v+1] = uvec(v, k, j, i) - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
+
+    G.lower(D.ucon, D.ucov, k, j, i, loc);
+
+    D.bcon[0] = 0;
+    VLOOP D.bcon[0] += B_P(v, k, j, i) * D.ucov[v+1];
+    VLOOP D.bcon[v+1] = (B_P(v, k, j, i) + D.bcon[0] * D.ucon[v+1]) / D.ucon[0];
+
+    G.lower(D.bcon, D.bcov, k, j, i, loc);
+}
+
+// Primitive/VarMap version of calc_4vecs for kernels that use "packed" primitives
+KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m,
+                                      const int& k, const int& j, const int& i, const Loci loc, FourVectors& D)
+{
+    Real gamma = lorentz_calc(G, P, m, k, j, i, loc);
+    Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0));
+
+    D.ucon[0] = gamma / alpha;
+    VLOOP D.ucon[v+1] = P(m.U1 + v, k, j, i) - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
+
+    G.lower(D.ucon, D.ucov, k, j, i, loc);
+
+    D.bcon[0] = 0;
+    VLOOP D.bcon[0] += P(m.B1 + v, k, j, i) * D.ucov[v+1];
+    VLOOP D.bcon[v+1] = (P(m.B1 + v, k, j, i) + D.bcon[0] * D.ucon[v+1]) / D.ucon[0];
+
+    G.lower(D.bcon, D.bcov, k, j, i, loc);
+}
+KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const ScratchPad2D<Real>& P, const VarMap& m,
+                                      const int& k, const int& j, const int& i, const Loci loc, FourVectors& D)
+{
+    Real gamma = lorentz_calc(G, P, m, k, j, i, loc);
+    Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0));
+
+    D.ucon[0] = gamma / alpha;
+    VLOOP D.ucon[v+1] = P(m.U1 + v, i) - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
+
+    G.lower(D.ucon, D.ucov, k, j, i, loc);
+
+    D.bcon[0] = 0;
+    VLOOP D.bcon[0] += P(m.B1 + v, i) * D.ucov[v+1];
+    VLOOP D.bcon[v+1] = (P(m.B1 + v, i) + D.bcon[0] * D.ucon[v+1]) / D.ucon[0];
+
+    G.lower(D.bcon, D.bcov, k, j, i, loc);
+}
+
+/**
+ * Turn the primitive variables at a location into the local conserved variables, or fluxes at a face
+ * 
+ * Note this is for the five fluid variables only -- each package defines a prim_to_flux, which are called in GetFlux
+ */
+KOKKOS_INLINE_FUNCTION void prim_to_flux(const GRCoordinates& G, const ScratchPad2D<Real>& P, const VarMap& m_p, const FourVectors D,
+                                         const Real& gam, const int& k, const int& j, const int& i, const int dir,
+                                         ScratchPad2D<Real>& flux, const VarMap m_u, const Loci loc=Loci::center)
+{
+    Real gdet = G.gdet(loc, j, i);
+
+    // Particle number flux
+    flux(m_u.RHO, i) = P(m_p.RHO, i) * D.ucon[dir] * gdet;
+
+    // MHD stress-energy tensor w/ first index up, second index down
+    Real mhd[GR_DIM];
+    calc_tensor(P(m_p.RHO, i), P(m_p.UU, i), (gam - 1) * P(m_p.UU, i), D, dir, mhd);
+    flux(m_u.UU, i)  = mhd[0] * gdet + flux(m_u.RHO, i);
+    flux(m_u.U1, i) =  mhd[1] * gdet;
+    flux(m_u.U2, i) =  mhd[2] * gdet;
+    flux(m_u.U3, i) =  mhd[3] * gdet;
+    flux(m_u.Q, i)  = P(m_p.Q, i) * D.ucon[dir] * gdet;
+    flux(m_u.DP, i) = P(m_p.DP, i) * D.ucon[dir] * gdet;
+}
+
+/**
+ * Get the conserved (fluid only!) variables corresponding to primitives in a zone. Equivalent to prim_to_flux with dir==0
+ */
+KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m_p,
+                                   const Real& gam, const int& k, const int& j, const int& i,
+                                   const VariablePack<Real>& U, const VarMap m_u, const Loci loc=Loci::center)
+{
+    Real gdet = G.gdet(loc, j, i);
+
+    FourVectors Dtmp;
+    calc_4vecs(G, P, m_p, k, j, i, loc, Dtmp);
+
+    // Particle number flux
+    U(m_u.RHO, k, j, i) = P(m_p.RHO, k, j, i) * Dtmp.ucon[0] * gdet;
+
+    // MHD stress-energy tensor w/ first index up, second index down
+    Real mhd[GR_DIM];
+    calc_tensor(P(m_p.RHO, k, j, i), P(m_p.UU, k, j, i), (gam - 1) * P(m_p.UU, k, j, i), Dtmp, 0, mhd);
+
+    U(m_u.UU, k, j, i)  = mhd[0] * gdet + U(m_u.RHO, k, j, i);
+    VLOOP U(m_u.U1 + v, k, j, i) = mhd[1 + v] * gdet;
+}
+
+/**
+ * Special p_to_u call for fluid frame floors, which require a speculative transformation to add to existing U
+ * Also used in the wind source term calculation, of all places
+ */
+KOKKOS_INLINE_FUNCTION void p_to_u_loc(const GRCoordinates& G, const Real& rho, const Real& u, const Real uvec[NVEC],
+                                   const Real B_P[NVEC], const Real& gam, const int& k, const int& j, const int& i,
+                                   Real& rho_ut, Real T[GR_DIM], const Loci loc=Loci::center)
+{
+    Real gdet = G.gdet(loc, j, i);
+
+    FourVectors Dtmp;
+    calc_4vecs(G, uvec, B_P, k, j, i, loc, Dtmp);
+
+    // Particle number flux
+    rho_ut = rho * Dtmp.ucon[0] * gdet;
+
+    // MHD stress-energy tensor w/ first index up, second index down
+    Real mhd[GR_DIM];
+    calc_tensor(rho, u, (gam - 1) * u, Dtmp, 0, mhd);
+
+    T[0]  = mhd[0] * gdet + rho_ut;
+    VLOOP T[1 + v] = mhd[1 + v] * gdet;
+}
+
+
+/**
+ * Calculate components of magnetosonic velocity from primitive variables
+ * This is only called in GetFlux, so we only provide a ScratchPad form
+ */
+KOKKOS_INLINE_FUNCTION void vchar(const GRCoordinates &G, const ScratchPad2D<Real>& P, const VarMap& m, const FourVectors& D,
+                                  const Real& gam, const int& k, const int& j, const int& i, const Loci loc, const int& dir,
+                                  Real& cmax, Real& cmin)
+{
+    // Find fast magnetosonic speed
+    Real cms2;
+    {
+        Real bsq = dot(D.bcon, D.bcov);
+        Real ef = P(m.RHO, i) + gam * P(m.UU, i);
+        Real ee = bsq + ef;
+        Real va2 = bsq / ee;
+        Real cs2 = gam * (gam - 1) * P(m.UU, i) / ef;
+        cms2 = cs2 + va2 - cs2 * va2;
+        clip(cms2, 1.e-20, 1.);
+    }
+
+    // Require that speed of wave measured by observer q.ucon is cms2
+    Real A, B, C;
+    {
+        Real Bcov[GR_DIM] = {1., 0., 0., 0.};
+        Real Acov[GR_DIM] = {0}; Acov[dir] = 1.;
+
+        Real Acon[GR_DIM], Bcon[GR_DIM];
+        G.raise(Acov, Acon, k, j, i, loc);
+        G.raise(Bcov, Bcon, k, j, i, loc);
+
+        Real Asq = dot(Acon, Acov);
+        Real Bsq = dot(Bcon, Bcov);
+        Real Au = dot(Acov, D.ucon);
+        Real Bu = dot(Bcov, D.ucon);
+        Real AB = dot(Acon, Bcov);
+        Real Au2 = Au * Au;
+        Real Bu2 = Bu * Bu;
+        Real AuBu = Au * Bu;
+
+        A = Bu2 - (Bsq + Bu2) * cms2;
+        B = 2. * (AuBu - (AB + AuBu) * cms2);
+        C = Au2 - (Asq + Au2) * cms2;
+    }
+
+    Real discr = sqrt(max(B * B - 4. * A * C, 0.));
+
+    Real vp = -(-B + discr) / (2. * A);
+    Real vm = -(-B - discr) / (2. * A);
+
+    cmax = max(vp, vm);
+    cmin = min(vp, vm);
+}
+
+}
diff --git a/kharma/viscosity/viscosity.cpp b/kharma/viscosity/viscosity.cpp
index effa92cf..f468fde4 100644
--- a/kharma/viscosity/viscosity.cpp
+++ b/kharma/viscosity/viscosity.cpp
@@ -39,11 +39,6 @@
 
 #include <parthenon/parthenon.hpp>
 
-#include <batched/dense/KokkosBatched_LU_Decl.hpp>
-#include <batched/dense/impl/KokkosBatched_LU_Serial_Impl.hpp>
-#include <batched/dense/KokkosBatched_Trsv_Decl.hpp>
-using namespace KokkosBatched;
-
 using namespace parthenon;
 
 namespace Viscosity
@@ -81,93 +76,13 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     // Pressure anisotropy
     pkg->AddField("cons.dP", m_con);
     pkg->AddField("prims.dP", m_prim);
+    // Eventually also need (most or all of) Theta, bsq, nu_emhd, chi_emhd, tau
 
-    // This ensures that UtoP is called (by way of viscosity.hpp definitions)
-    pkg->FillDerivedBlock = Viscosity::FillDerived;
-    pkg->PostFillDerivedBlock = Viscosity::PostFillDerived;
+    // If we want to register viscosity-specific UtoP for some reason?
+    // Likely we'll only use the post-step summary hook
+    //pkg->FillDerivedBlock = Viscosity::FillDerived;
+    //pkg->PostFillDerivedBlock = Viscosity::PostFillDerived;
     return pkg;
 }
 
-void UtoP(MeshBlockData<Real> *rc, IndexDomain domain, bool coarse)
-{
-    Flag(rc, "UtoP electrons");
-    auto pmb = rc->GetBlockPointer();
-
-    MetadataFlag isNonideal = pmb->packages.Get("Viscosity")->Param<MetadataFlag>("NonidealFlag");
-    MetadataFlag isPrimitive = pmb->packages.Get("GRMHD")->Param<MetadataFlag>("PrimitiveFlag");
-    // No need for a "map" here, we just want everything that fits these
-    auto& e_P = rc->PackVariables({isNonideal, isPrimitive});
-    auto& e_U = rc->PackVariables({isNonideal, Metadata::Conserved});
-    // And then the local density
-    GridScalar rho_U = rc->Get("cons.rho").data;
-
-    const auto& G = pmb->coords;
-
-    // Get array bounds from Parthenon
-    auto bounds = coarse ? pmb->c_cellbounds : pmb->cellbounds;
-    IndexRange ib = bounds.GetBoundsI(domain);
-    IndexRange jb = bounds.GetBoundsJ(domain);
-    IndexRange kb = bounds.GetBoundsK(domain);
-
-    // We will need need need to copy & reorder indices before running this
-
-    // Begin the funky kokkos bit
-    // Let's do a batched LU and Trsv!
-    const Real alpha = 1, tiny = 0;
-    const int ni = bounds.ncellsi(domain), nj = bounds.ncellsj(domain), nk = bounds.ncellsk(domain);
-    ParArray5D<Real> AA("AA", nk, nj, ni, 7, 7);
-    ParArray4D<Real> B("B", nk, nj, ni, 7);
-
-    // Simulating some iterations
-    for (int iter=0; iter < 5; iter++) {
-        // Normally, when doing multiple batched operations,
-        // we would need either a general solve function,
-        // or two reads through the full array. Not so in Kokkos!
-        // This could be faster I think -- there are versions of the inner portion
-        // that cover rows at a time, by taking member objects on a Team
-        // see e.g. fluxes.hpp for usage of teams
-        pmb->par_for("implicit_solve", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
-            KOKKOS_LAMBDA_3D {
-                // This code lightly adapted from 
-                auto A = Kokkos::subview(AA, k, j, i, Kokkos::ALL(), Kokkos::ALL());
-                auto b = Kokkos::subview(B, k, j, i, Kokkos::ALL());
-                /// [template]AlgoType: Unblocked, Blocked, CompatMKL
-                /// [in/out]A: 2d view
-                /// [in]tiny: a magnitude scalar value to avoid div/0
-                KokkosBatched::SerialLU<Algo::LU::Blocked>::invoke(A, tiny);
-                /// [template]UploType: indicates either upper triangular or lower triangular; Uplo::Upper, Uplo::Lower
-                /// [template]TransType: transpose of A; Trans::NoTranspose, Trans::Transpose
-                /// [template]DiagType: diagonals; Diag::Unit or Diag::NonUnit
-                /// [template]AlgoType: Unblocked, Blocked, CompatMKL
-                /// [in]alpha: scalar value
-                /// [in]A: 2d view
-                /// [in]b: 1d view
-                KokkosBatched::SerialTrsv<Uplo::Upper,Trans::NoTranspose,Diag::NonUnit,Algo::Trsv::Unblocked>::invoke(alpha, A, b);
-            }
-        );
-    }
-
-}
-
-void PostUtoP(MeshBlockData<Real> *rc, IndexDomain domain, bool coarse)
-{
-    // Any fixing after that... whole thing
-}
-
-TaskStatus PostStepDiagnostics(const SimTime& tm, MeshData<Real> *rc)
-{
-    Flag("Printing electron diagnostics");
-
-    // Output any diagnostics after a step completes
-
-    Flag("Printed");
-    return TaskStatus::complete;
-}
-
-void FillOutput(MeshBlock *pmb, ParameterInput *pin)
-{
-    // Any variables or diagnostics that should be computed especially for output to a file,
-    // but which are not otherwise updated.
-}
-
-} // namespace B_FluxCT
+} // namespace Viscosity
diff --git a/kharma/viscosity/viscosity.hpp b/kharma/viscosity/viscosity.hpp
index 23626f89..1872cc1f 100644
--- a/kharma/viscosity/viscosity.hpp
+++ b/kharma/viscosity/viscosity.hpp
@@ -49,56 +49,11 @@ namespace Viscosity {
 std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t packages);
 
 /**
- * In addition to the standard functions, packages can include extras.  This is called manually
- * at the end of problem initialization in problem.cpp
+ * TODO standard interface for implicit solver & what that needs, similar to UtoP/prim_to_flux definitions
  */
-TaskStatus InitElectrons(MeshBlockData<Real> *rc, ParameterInput *pin);
 
 /**
- * Determine the primitive variable values, given conserved forms
- * This is where the implicit kernel will likely be placed, as each solve is per-cell after fluxes
- * and boundaries.
- * 
- * TODO make this replace GRMHD::UtoP or make it step out of the way
- */
-void UtoP(MeshBlockData<Real> *rc, IndexDomain domain=IndexDomain::entire, bool coarse=false);
-inline void FillDerived(MeshBlockData<Real> *rc) { UtoP(rc); }
-
-/**
- * Floors, fixes, or other cleaning up after determining primitives.
- */
-void PostUtoP(MeshBlockData<Real> *rc, IndexDomain domain=IndexDomain::entire, bool coarse=false);
-inline void PostFillDerived(MeshBlockData<Real> *rc) { PostUtoP(rc); }
-
-/**
- * Diagnostics printed/computed after each step, called from kharma.cpp
- * 
- * Function in this package: Currently nothing
- */
-TaskStatus PostStepDiagnostics(const SimTime& tm, MeshData<Real> *rc);
-
-/**
- * Fill fields which are calculated only for output to dump files
- * 
- * Function in this package: Currently nothing
- */
-void FillOutput(MeshBlock *pmb, ParameterInput *pin);
-
-/**
- * KHARMA requires two forms of the function for obtaining conserved variables from primitives.
- * However, these are very different from UtoP/FillDerived in that they are called exclusively on the
- * device side, operating on a single zone rather than the whole fluid state.
- * 
- * Each should have roughly the signature used here, accepting scratchpads of size NVARxN1, and index
- * maps (see types.hpp) indicating which index corresponds to which variable in the packed array, as well
- * as indications of the desired zone location and flux direction (dir==0 for just the conserved variable forms).
- * As used extensively here, any variables not present in a pack will have index -1 in the map.
- *  
- * The two functions differ in two ways:
- * 1. The caller precalculate the four-vectors (u^mu, b^mu) and pass them in the struct D to prim_to_flux (see fluxes.hpp for call)
- * 2. p_to_u will only ever be called to obtain the conserved variables U, not fluxes (i.e. dir == 0 in calls)
- * 
- * Function in this package: primitive to flux/conserved transformation of conduction term q, pressure anisotropy dP
+ * Whatever form these take for viscous variables
  */
 KOKKOS_INLINE_FUNCTION void prim_to_flux(const GRCoordinates& G, const ScratchPad2D<Real>& P, const VarMap& m_p, const FourVectors D,
                                          const int& k, const int& j, const int& i, const int dir,
@@ -113,4 +68,8 @@ KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const VariablePack<Re
     // Calculate conserved variables from primitives
 }
 
+KOKKOS_INLINE_FUNCTION void gradient_calc(const GRCoordinates& G, const GridVector var, int loc, int i, int j, int k, double grad[NDIM]);
+
+KOKKOS_INLINE_FUNCTION void gradient_calc_vec(const GRCoordinates& G, const GridVector var, int loc, int i, int j, int k, double grad_vec[NDIM][NDIM]);
+
 }
diff --git a/pars/mhdmodes.par b/pars/mhdmodes.par
index d25151f6..ebecead4 100644
--- a/pars/mhdmodes.par
+++ b/pars/mhdmodes.par
@@ -36,7 +36,7 @@ base = cartesian_minkowski
 transform = null
 
 <parthenon/time>
-# This will be overridden depending on the problem
+# tlim will be overridden depending on the problem
 tlim = 5.0
 integrator = rk2
 dt_min = 0.0001
diff --git a/pars/mhdmodes_implicit.par b/pars/mhdmodes_implicit.par
new file mode 100644
index 00000000..f4ce1d0b
--- /dev/null
+++ b/pars/mhdmodes_implicit.par
@@ -0,0 +1,79 @@
+# GRMHD Modes problem
+# Try to propagate several analytically-amenable linear modes of the MHD equations
+
+<parthenon/job>
+problem_id = mhdmodes
+
+<parthenon/mesh>
+refinement = none
+numlevel = 1
+
+nx1 = 64
+x1min = 0.0
+x1max = 1.0
+ix1_bc = periodic
+ox1_bc = periodic
+
+nx2 = 64
+x2min = 0.0
+x2max = 1.0
+ix2_bc = periodic
+ox2_bc = periodic
+
+nx3 = 64
+x3min = 0.0
+x3max = 1.0
+ix3_bc = periodic
+ox3_bc = periodic
+
+<parthenon/meshblock>
+nx1 = 32
+nx2 = 32
+nx3 = 32
+
+<coordinates>
+base = cartesian_minkowski
+transform = null
+
+<parthenon/time>
+# tlim will be overridden depending on the problem
+tlim = 5.0
+integrator = rk2
+dt_min = 0.0001
+
+<GRMHD>
+cfl = 0.9
+gamma = 1.333333
+reconstruction = weno5
+
+<mhdmodes>
+nmode = 1
+dir = 0
+
+<floors>
+disable_floors = true
+
+<debug>
+verbose = 1
+flag_verbose = 1
+extra_checks = 1
+
+<driver>
+type = grim
+step = explicit
+
+<perf>
+pack_comms = false
+
+<parthenon/output0>
+file_type = hdf5
+# This is so as to output only the final state
+dt = 1.0
+single_precision_output = true
+variables = prims.rho, prims.u, prims.uvec, prims.B
+ghost_zones = true
+
+<parthenon/output1>
+file_type = hst
+dt = 0.1
+
diff --git a/tests/bondi/run.sh b/tests/bondi/run.sh
index 7e76e752..b71ac745 100755
--- a/tests/bondi/run.sh
+++ b/tests/bondi/run.sh
@@ -20,7 +20,7 @@ conv_2d() {
 # Test coordinates (raw ks?)
 conv_2d fmks coordinates/transform=fmks
 conv_2d mks coordinates/transform=mks
-conv_2d eks coordinates/transform=eks
+#conv_2d eks coordinates/transform=eks # TODO fix eks in pyHARM
 # Recon
 conv_2d linear_mc GRMHD/reconstruction=linear_mc
 conv_2d linear_vl GRMHD/reconstruction=linear_vl

From 077edeb810d966a4f4969b22d6831fd7d00cb50a Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Fri, 25 Feb 2022 16:48:29 -0600
Subject: [PATCH 06/26] Maim KHARMA

Adds working implicit solver for arbitrary nvar, which:
1. Calculates Jacobian and current residual of all primitive variables
2. Performs linear solve against residual using kokkos-kernels,
   producing an update dP to primitive variables.
3. Updates primitives with dP

Remains to fix:
1. This operation currently converges at first order. Likely using
   old (step start) data somewhere I shouldn't be.
2. Fix Bondi test, which was broken by something in this commit
3. Add EMHD source terms, both a call for implicit terms,
   and implicit term calculations at hooks in the implicit solve.

The latter will involve definitions which will change per-problem,
which may be difficult to handle elegantly, but with a working
implementation in iharm3d this whole thing has been/should be
straightforward.
---
 CMakeLists.txt                                |   1 +
 kharma/CMakeLists.txt                         |   6 +-
 kharma/b_cd/b_cd.hpp                          |  33 --
 kharma/b_cd/seed_B_cd.cpp                     |   9 +-
 kharma/b_flux_ct/b_flux_ct.cpp                |  30 +-
 kharma/b_flux_ct/b_flux_ct.hpp                |  40 +-
 kharma/b_flux_ct/seed_B_ct.cpp                |   8 +-
 kharma/boundaries.cpp                         |  18 +-
 kharma/boundaries.hpp                         |   2 +-
 kharma/current/current.hpp                    |   2 +-
 kharma/debug.cpp                              |   2 +-
 kharma/decs.hpp                               |   5 +
 kharma/electrons/electrons.hpp                |   4 +-
 .../viscosity.cpp => emhd/emhd.cpp}           |  16 +-
 .../viscosity.hpp => emhd/emhd.hpp}           |  13 +-
 kharma/emhd/emhd_sources.hpp                  | 189 +++++++++
 kharma/floors/floors.cpp                      |   2 +-
 kharma/floors/floors.hpp                      |  19 +-
 kharma/{fluxes.cpp => flux.cpp}               |  56 ++-
 kharma/{fluxes.hpp => flux.hpp}               |  95 ++---
 kharma/flux_functions.hpp                     | 198 ++++++++++
 kharma/grmhd/U_to_P.hpp                       |  16 +-
 kharma/grmhd/fixup.cpp                        |  16 +-
 kharma/grmhd/grmhd.cpp                        |  51 +--
 kharma/grmhd/grmhd_functions.hpp              | 335 ++++++++++++++++
 kharma/grmhd/mhd_functions.hpp                | 356 -----------------
 kharma/grmhd/source.hpp                       |   2 +-
 kharma/harm_driver.cpp                        |   2 +-
 kharma/{grim_driver.cpp => imex_driver.cpp}   | 105 +++--
 kharma/{grim_driver.hpp => imex_driver.hpp}   |  12 +-
 kharma/implicit/implicit.cpp                  | 290 +++++++++++---
 kharma/implicit/implicit.hpp                  |  95 +++++
 kharma/kharma.cpp                             |  22 +-
 kharma/main.cpp                               |  10 +-
 kharma/prob/b_field_tools.cpp                 |   2 +-
 kharma/prob/blob.hpp                          |   2 +-
 kharma/prob/bondi.hpp                         |   2 +-
 kharma/prob/post_initialize.cpp               |   4 +-
 kharma/prob/problem.cpp                       |   6 +-
 kharma/reductions/reductions.hpp              |  37 +-
 kharma/types.hpp                              |   6 +-
 kharma/viscosity/emhd_functions.hpp           | 370 ------------------
 kharma/wind/wind.cpp                          |   2 +-
 kharma/wind/wind.hpp                          |   2 +-
 pars/bondi.par                                |   1 +
 pars/mhdmodes.par                             |   1 +
 tests/bondi/check.sh                          |   7 +-
 tests/bondi/run.sh                            |   6 +-
 tests/mhdmodes/check.sh                       |   4 +
 tests/mhdmodes/run.sh                         |  10 +-
 50 files changed, 1412 insertions(+), 1110 deletions(-)
 rename kharma/{viscosity/viscosity.cpp => emhd/emhd.cpp} (90%)
 rename kharma/{viscosity/viscosity.hpp => emhd/emhd.hpp} (87%)
 create mode 100644 kharma/emhd/emhd_sources.hpp
 rename kharma/{fluxes.cpp => flux.cpp} (55%)
 rename kharma/{fluxes.hpp => flux.hpp} (75%)
 create mode 100644 kharma/flux_functions.hpp
 create mode 100644 kharma/grmhd/grmhd_functions.hpp
 delete mode 100644 kharma/grmhd/mhd_functions.hpp
 rename kharma/{grim_driver.cpp => imex_driver.cpp} (80%)
 rename kharma/{grim_driver.hpp => imex_driver.hpp} (82%)
 delete mode 100644 kharma/viscosity/emhd_functions.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 96e7d7ff..61d48916 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,6 +55,7 @@ include_directories(external/parthenon/src)
 # mpark::variant is header only, don't build anything
 include_directories(external/variant/include)
 # Kokkos kernels
+# Unbelievably, this actually needs to be compiled to use headers
 add_subdirectory(external/kokkos-kernels)
 include_directories(external/kokkos-kernels/src)
 include_directories(external/kokkos-kernels/src/batched)
diff --git a/kharma/CMakeLists.txt b/kharma/CMakeLists.txt
index 87bebdf3..742e9a8e 100644
--- a/kharma/CMakeLists.txt
+++ b/kharma/CMakeLists.txt
@@ -21,7 +21,7 @@ AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/floors EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/grmhd EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/implicit EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/reductions EXE_NAME_SRC)
-AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/viscosity EXE_NAME_SRC)
+AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/emhd EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/wind EXE_NAME_SRC)
 
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
@@ -36,7 +36,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/floors)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/grmhd)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/implicit)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/reductions)
-include_directories(${CMAKE_CURRENT_SOURCE_DIR}/viscosity)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/emhd)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/wind)
 
 add_executable(${EXE_NAME} ${EXE_NAME_SRC})
@@ -48,7 +48,7 @@ target_link_libraries(${EXE_NAME} PUBLIC parthenon)
 # OPTIONS
 # These are almost universally performance trade-offs
 # TODO is there any way to make compile options less painful in CMake?
-option(FUSE_FLUX_KERNELS "Bundle the usual four flux calculation kernels (floors,R,L,apply) into one" ON)
+option(FUSE_FLUX_KERNELS "Bundle the usual four flux calculation kernels (floors,R,L,apply) into one" OFF)
 option(FUSE_EMF_KERNELS "Bundle the three emf direction kernels into one. Likely won't affect much" ON)
 option(FUSE_FLOOR_KERNELS "Bundle applying the floors and ceilings into one kernel" ON)
 option(FAST_CARTESIAN "Break operation in curved spacetimes to make Cartesian Minkowski space computations faster" OFF)
diff --git a/kharma/b_cd/b_cd.hpp b/kharma/b_cd/b_cd.hpp
index ed57d9f4..b62840a5 100644
--- a/kharma/b_cd/b_cd.hpp
+++ b/kharma/b_cd/b_cd.hpp
@@ -91,37 +91,4 @@ TaskStatus PostStepDiagnostics(const SimTime& tm, MeshData<Real> *rc);
  */
 void FillOutput(MeshBlock *pmb, ParameterInput *pin);
 
-/**
- * Turn the primitive B field into the local conserved flux
- */
-KOKKOS_INLINE_FUNCTION void prim_to_flux(const GRCoordinates& G, const ScratchPad2D<Real> &P, const VarMap& m_p, const FourVectors D,
-                                           const int& k, const int& j, const int& i, const int& dir,
-                                           ScratchPad2D<Real>& flux, const VarMap& m_u, const Loci& loc=Loci::center)
-{
-    Real gdet = G.gdet(loc, j, i);
-    if (dir == 0) { // Parent is templated on dir, so we should get the speed here still
-        VLOOP flux(m_u.B1 + v, i) = P(m_p.B1 + v, i) * gdet;
-        flux(m_u.PSI, i) = P(m_p.PSI, i) * gdet;
-    } else {
-        // Dual of Maxwell tensor
-        // Dedner would have e.g. P(m.psip, i) * gdet,
-        // but for us this is in the source term
-        VLOOP flux(m_u.B1 + v, i) = (D.bcon[v+1] * D.ucon[dir] - D.bcon[dir] * D.ucon[v+1]) * gdet;
-        // Psi field update as in Mosta et al (IllinoisGRMHD), alternate explanation Jesse et al (2020)
-        //Real alpha = 1. / sqrt(-G.gcon(Loci::center, j, i, 0, 0));
-        //Real beta_dir = G.gcon(Loci::center, j, i, 0, dir) * alpha * alpha;
-        flux(m_u.PSI, i) = (D.bcon[dir] - G.gcon(Loci::center, j, i, 0, dir) * P(m_p.PSI, i)) * gdet;
-    }
-}
-
-KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m_p,
-                                           const int& k, const int& j, const int& i,
-                                           const VariablePack<Real>& U, const VarMap& m_u, const Loci& loc=Loci::center)
-{
-    Real gdet = G.gdet(loc, j, i);
-    VLOOP U(m_u.B1 + v, k, j, i) = P(m_p.B1 + v, k, j, i) * gdet;
-    U(m_u.PSI, k, j, i) = P(m_p.PSI, k, j, i) * gdet;
-
-}
-
 }
diff --git a/kharma/b_cd/seed_B_cd.cpp b/kharma/b_cd/seed_B_cd.cpp
index 07ea0c49..cdad79b0 100644
--- a/kharma/b_cd/seed_B_cd.cpp
+++ b/kharma/b_cd/seed_B_cd.cpp
@@ -39,7 +39,7 @@
 #include "b_field_tools.hpp"
 
 #include "b_flux_ct.hpp"
-#include "mhd_functions.hpp"
+#include "grmhd_functions.hpp"
 
 TaskStatus B_CD::SeedBField(MeshBlockData<Real> *rc, ParameterInput *pin)
 {
@@ -165,12 +165,7 @@ TaskStatus B_CD::SeedBField(MeshBlockData<Real> *rc, ParameterInput *pin)
             B_P(2, k, j, i) = 0.;
         }
     );
-    pmb->par_for("first_U_B", ks, ke, js, je, is, ie,
-        KOKKOS_LAMBDA_3D {
-            // Use the "other" P to U, because we're content that psi = 0 to begin
-            B_FluxCT::p_to_u(G, B_P, k, j, i, B_U);
-        }
-    );
+    B_FluxCT::PtoU(rc);
 
     return TaskStatus::complete;
 }
diff --git a/kharma/b_flux_ct/b_flux_ct.cpp b/kharma/b_flux_ct/b_flux_ct.cpp
index 13cf97eb..76bf7bba 100644
--- a/kharma/b_flux_ct/b_flux_ct.cpp
+++ b/kharma/b_flux_ct/b_flux_ct.cpp
@@ -73,10 +73,11 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     MetadataFlag isMHD = packages.Get("GRMHD")->Param<MetadataFlag>("MHDFlag");
 
     // B fields.  "Primitive" form is field, "conserved" is flux
-    // Note: when changing metadata, keep these in lockstep with grmhd.cpp
+    // Note: when changing metadata, keep these in lockstep with grmhd.cpp!!
+    // See notes there about changes for the Imex driver
     std::vector<MetadataFlag> flags_prim, flags_cons;
-    auto grim_driver = pin->GetString("driver", "type") == "grim";
-    if (!grim_driver) {
+    auto imex_driver = pin->GetString("driver", "type") == "grim";
+    if (!imex_driver) {
         flags_prim = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Derived,
                                                 isPrimitive, isMHD, Metadata::Vector});
         flags_cons = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Independent, Metadata::FillGhost,
@@ -156,6 +157,29 @@ void UtoP(MeshBlockData<Real> *rc, IndexDomain domain, bool coarse)
     );
 }
 
+void PtoU(MeshBlockData<Real> *rc, IndexDomain domain, bool coarse)
+{
+    Flag(rc, "B PtoU Block");
+    auto pmb = rc->GetBlockPointer();
+    // Inverse of UtoP: fill "cons.B" from "prims.B" over `domain` (coarse bounds when `coarse` is set)
+    auto B_U = rc->PackVariables(std::vector<std::string>{"cons.B"});
+    auto B_P = rc->PackVariables(std::vector<std::string>{"prims.B"});
+
+    const auto& G = pmb->coords;
+
+    auto bounds = coarse ? pmb->c_cellbounds : pmb->cellbounds;
+    const IndexRange ib = bounds.GetBoundsI(domain);
+    const IndexRange jb = bounds.GetBoundsJ(domain);
+    const IndexRange kb = bounds.GetBoundsK(domain);
+    const IndexRange vec = IndexRange({0, B_U.GetDim(4)-1});
+    pmb->par_for("PtoU_B", vec.s, vec.e, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        KOKKOS_LAMBDA_VEC {
+            // Update the conserved B-fields: primitive value scaled by gdet at the zone center
+            B_U(mu, k, j, i) = B_P(mu, k, j, i) * G.gdet(Loci::center, j, i);
+        }
+    );
+}
+
 TaskStatus FluxCT(MeshData<Real> *md)
 {
     Flag(md, "Flux CT");
diff --git a/kharma/b_flux_ct/b_flux_ct.hpp b/kharma/b_flux_ct/b_flux_ct.hpp
index bdb3f5eb..737269c3 100644
--- a/kharma/b_flux_ct/b_flux_ct.hpp
+++ b/kharma/b_flux_ct/b_flux_ct.hpp
@@ -37,7 +37,7 @@
 
 #include <parthenon/parthenon.hpp>
 
-#include "mhd_functions.hpp"
+#include "grmhd_functions.hpp"
 #include "types.hpp"
 
 using namespace parthenon;
@@ -68,6 +68,11 @@ inline void FillDerivedMesh(MeshData<Real> *md) { UtoP(md); }
 void UtoP(MeshBlockData<Real> *md, IndexDomain domain=IndexDomain::entire, bool coarse=false);
 inline void FillDerivedBlock(MeshBlockData<Real> *rc) { UtoP(rc); }
 
+/**
+ * Inverse of above. Generally only for initialization.
+ */
+void PtoU(MeshBlockData<Real> *md, IndexDomain domain=IndexDomain::entire, bool coarse=false);
+
 /**
  * Modify the B field fluxes to take a constrained-transport step as in Toth (2000)
  */
@@ -105,37 +110,4 @@ void FillOutput(MeshBlock *pmb, ParameterInput *pin);
 
 // TODO device-side divB at a single zone corner, to avoid code duplication?
 
-/**
- * Turn the primitive B field into the local conserved flux
- */
-KOKKOS_INLINE_FUNCTION void prim_to_flux(const GRCoordinates& G, ScratchPad2D<Real>& P, const VarMap& m_p, const FourVectors D,
-                                         const int& k, const int& j, const int& i, const int dir,
-                                         ScratchPad2D<Real>& flux, const VarMap& m_u, const Loci loc = Loci::center)
-{
-    Real gdet = G.gdet(loc, j, i);
-    if (dir == 0) {
-        VLOOP flux(m_u.B1 + v, i) = P(m_p.B1 + v, i) * gdet;
-    } else {
-        VLOOP flux(m_u.B1 + v, i) = (D.bcon[v+1] * D.ucon[dir] - D.bcon[dir] * D.ucon[v+1]) * gdet;
-    }
-}
-
-/**
- * Convenience functions for zone 
- */
-KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const GridVector B_P,
-                                    const int& k, const int& j, const int& i,
-                                    GridVector B_U, const Loci loc = Loci::center)
-{
-    Real gdet = G.gdet(loc, j, i);
-    VLOOP B_U(v, k, j, i) = B_P(v, k, j, i) * gdet;
-}
-KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m_p,
-                                    const int& k, const int& j, const int& i,
-                                    const VariablePack<Real>& U, const VarMap& m_u, const Loci loc = Loci::center)
-{
-    Real gdet = G.gdet(loc, j, i);
-    VLOOP U(m_u.B1 + v, k, j, i) = P(m_p.B1 + v, k, j, i) * gdet;
-}
-
 }
diff --git a/kharma/b_flux_ct/seed_B_ct.cpp b/kharma/b_flux_ct/seed_B_ct.cpp
index 547f7e01..595d63d2 100644
--- a/kharma/b_flux_ct/seed_B_ct.cpp
+++ b/kharma/b_flux_ct/seed_B_ct.cpp
@@ -39,7 +39,7 @@
 #include "b_field_tools.hpp"
 #include "b_flux_ct.hpp"
 #include "fm_torus.hpp"
-#include "mhd_functions.hpp"
+#include "grmhd_functions.hpp"
 #include "prob_common.hpp"
 
 TaskStatus B_FluxCT::SeedBField(MeshBlockData<Real> *rc, ParameterInput *pin)
@@ -113,9 +113,9 @@ TaskStatus B_FluxCT::SeedBField(MeshBlockData<Real> *rc, ParameterInput *pin)
                 B_P(V1, k, j, i) = b10;
                 B_P(V2, k, j, i) = b20;
                 B_P(V3, k, j, i) = b30;
-                B_FluxCT::p_to_u(G, B_P, k, j, i, B_U);
             }
         );
+        B_FluxCT::PtoU(rc);
         return TaskStatus::complete;
     } else if (b_field_flag == BSeedType::monopole) {
         pmb->par_for("B_field_B", ks, ke, js, je, is, ie,
@@ -124,9 +124,9 @@ TaskStatus B_FluxCT::SeedBField(MeshBlockData<Real> *rc, ParameterInput *pin)
                 B_P(V1, k, j, i) = b10 / G.gdet(Loci::center, j, i);
                 B_P(V2, k, j, i) = 0.;
                 B_P(V3, k, j, i) = 0.;
-                B_FluxCT::p_to_u(G, B_P, k, j, i, B_U);
             }
         );
+        B_FluxCT::PtoU(rc);
         return TaskStatus::complete;
     }
 
@@ -299,7 +299,7 @@ TaskStatus B_FluxCT::SeedBField(MeshBlockData<Real> *rc, ParameterInput *pin)
     }
 
     // Then make sure the primitive versions are updated, too
-    UtoP(rc);
+    B_FluxCT::UtoP(rc);
 
     return TaskStatus::complete;
 }
diff --git a/kharma/boundaries.cpp b/kharma/boundaries.cpp
index 28f5ddae..23229a4e 100644
--- a/kharma/boundaries.cpp
+++ b/kharma/boundaries.cpp
@@ -37,7 +37,7 @@
 #include "boundaries.hpp"
 
 #include "kharma.hpp"
-#include "mhd_functions.hpp"
+#include "grmhd_functions.hpp"
 #include "pack.hpp"
 #include "types.hpp"
 
@@ -72,7 +72,7 @@ void OutflowX1(std::shared_ptr<MeshBlockData<Real>> &rc, IndexDomain domain, boo
     auto P = GRMHD::PackMHDPrims(rc.get(), prims_map, coarse);
     auto q = rc->PackVariables({Metadata::FillGhost}, cons_map, coarse);
     const VarMap m_u(cons_map, true), m_p(prims_map, false);
-    // If we're running classic/GRIM, q is the primitive variables
+    // If we're running imex, q is the *primitive* variables
     bool prim_ghosts = pmb->packages.Get("GRMHD")->Param<std::string>("driver_type") == "grim";
 
     // KHARMA is very particular about corner boundaries.
@@ -128,16 +128,17 @@ void OutflowX1(std::shared_ptr<MeshBlockData<Real>> &rc, IndexDomain domain, boo
     pmb->par_for("OutflowX1_check", ks_e, ke_e, js_e, je_e, ibs, ibe,
         KOKKOS_LAMBDA_3D {
             // Inflow check
-            if (check_inflow) KBoundaries::check_inflow(G, P , m_p.U1, k, j, i, dir);
+            if (check_inflow) KBoundaries::check_inflow(G, P, m_p.U1, k, j, i, dir);
         }
     );
     if (!prim_ghosts) {
         // Recover U
         pmb->par_for("OutflowX1_PtoU", ks_e, ke_e, js_e, je_e, ibs, ibe,
             KOKKOS_LAMBDA_3D {
-                // TODO move these steps into FillDerivedDomain, make a GRMHD::PrimToFlux call the last in that series
+                // TODO move these steps into FillDerivedDomain, make a GRMHD::PtoU call the last in that series
                 // Correct primitive B
-                VLOOP P(m_p.B1 + v, k, j, i) = q(m_u.B1 + v, k, j, i) / G.gdet(Loci::center, j, i);
+                if (m_p.B1 >= 0)
+                    VLOOP P(m_p.B1 + v, k, j, i) = q(m_u.B1 + v, k, j, i) / G.gdet(Loci::center, j, i);
                 // Recover conserved vars
                 GRMHD::p_to_u(G, P, m_p, gam, k, j, i, q, m_u);
             }
@@ -160,7 +161,7 @@ void ReflectX2(std::shared_ptr<MeshBlockData<Real>> &rc, IndexDomain domain, boo
     auto P = GRMHD::PackMHDPrims(rc.get(), prims_map, coarse);
     auto q = rc->PackVariables({Metadata::FillGhost}, cons_map, coarse);
     const VarMap m_u(cons_map, true), m_p(prims_map, false);
-    // If we're running classic/GRIM, q is the primitive variables
+    // If we're running imex, q is the *primitive* variables
     bool prim_ghosts = pmb->packages.Get("GRMHD")->Param<std::string>("driver_type") == "grim";
 
     // KHARMA is very particular about corner boundaries, see above
@@ -205,7 +206,7 @@ void ReflectX2(std::shared_ptr<MeshBlockData<Real>> &rc, IndexDomain domain, boo
             q(p, k, j, i) = reflect * q(p, k, (ref + add) + (ref - j), i);
         }
     );
-    // If we're using the classic/GRIM algo, the above is all we need.
+    // If we're using imex driver, the above is all we need.
     if (!prim_ghosts) {
         // If we're using the HARM/KHARMA driver, we need to do the primitives
         // separately after the conserved vars
@@ -218,7 +219,8 @@ void ReflectX2(std::shared_ptr<MeshBlockData<Real>> &rc, IndexDomain domain, boo
         // And we need to fill the corresponding conserved vars
         pmb->par_for("ReflectX2_PtoU", ks_e, ke_e, jbs, jbe, ics, ice,
             KOKKOS_LAMBDA_3D {
-                VLOOP P(m_p.B1 + v, k, j, i) = q(m_u.B1 + v, k, j, i) / G.gdet(Loci::center, j, i);
+                if (m_p.B1 >= 0)
+                    VLOOP P(m_p.B1 + v, k, j, i) = q(m_u.B1 + v, k, j, i) / G.gdet(Loci::center, j, i);
                 GRMHD::p_to_u(G, P, m_p, gam, k, j, i, q, m_u);
             }
         );
diff --git a/kharma/boundaries.hpp b/kharma/boundaries.hpp
index a01e529c..a94b5a3e 100644
--- a/kharma/boundaries.hpp
+++ b/kharma/boundaries.hpp
@@ -4,7 +4,7 @@
 #include "decs.hpp"
 
 #include "bondi.hpp"
-#include "mhd_functions.hpp"
+#include "grmhd_functions.hpp"
 
 namespace KBoundaries {
 
diff --git a/kharma/current/current.hpp b/kharma/current/current.hpp
index 0296a105..9329ec75 100644
--- a/kharma/current/current.hpp
+++ b/kharma/current/current.hpp
@@ -37,7 +37,7 @@
 
 #include "decs.hpp"
 #include "matrix.hpp"
-#include "mhd_functions.hpp"
+#include "grmhd_functions.hpp"
 
 namespace Current
 {
diff --git a/kharma/debug.cpp b/kharma/debug.cpp
index 6be04a1e..b0c8037f 100644
--- a/kharma/debug.cpp
+++ b/kharma/debug.cpp
@@ -37,7 +37,7 @@
 #include "decs.hpp"
 
 #include "floors.hpp"
-#include "mhd_functions.hpp"
+#include "grmhd_functions.hpp"
 #include "types.hpp"
 
 using namespace Kokkos;
diff --git a/kharma/decs.hpp b/kharma/decs.hpp
index 55a7a692..59624a99 100644
--- a/kharma/decs.hpp
+++ b/kharma/decs.hpp
@@ -90,6 +90,11 @@ using GReal = double;
 #define V2 1
 #define V3 2
 
+// And an odd but useful loop for ex-iharm3d code
+// This requires nvar to be defined in caller!
+// It is not a const/global anymore.  So, use this loop carefully
+#define PLOOP for(int ip=0; ip < nvar; ++ip)
+
 // Useful Enums to avoid lots of #defines
 #define NLOC 5
 enum Loci{face1=0, face2, face3, center, corner};
diff --git a/kharma/electrons/electrons.hpp b/kharma/electrons/electrons.hpp
index 022be7d3..5b3d818d 100644
--- a/kharma/electrons/electrons.hpp
+++ b/kharma/electrons/electrons.hpp
@@ -37,7 +37,7 @@
 
 #include <parthenon/parthenon.hpp>
 
-#include "mhd_functions.hpp"
+#include "grmhd_functions.hpp"
 
 using namespace parthenon;
 
@@ -142,7 +142,7 @@ void FillOutput(MeshBlock *pmb, ParameterInput *pin);
  * As used extensively here, any variables not present in a pack will have index -1 in the map.
  *  
  * The two functions differ in two ways:
- * 1. The caller precalculate the four-vectors (u^mu, b^mu) and pass them in the struct D to prim_to_flux (see fluxes.hpp for call)
+ * 1. The caller precalculate the four-vectors (u^mu, b^mu) and pass them in the struct D to prim_to_flux (see flux.hpp for call)
  * 2. p_to_u will only ever be called to obtain the conserved variables U, not fluxes (i.e. dir == 0 in calls)
  * 
  * Function in this package: Divide or multiply by local density to get entropy/particle -- opposite of UtoP above
diff --git a/kharma/viscosity/viscosity.cpp b/kharma/emhd/emhd.cpp
similarity index 90%
rename from kharma/viscosity/viscosity.cpp
rename to kharma/emhd/emhd.cpp
index f468fde4..1819041c 100644
--- a/kharma/viscosity/viscosity.cpp
+++ b/kharma/emhd/emhd.cpp
@@ -1,5 +1,5 @@
 /* 
- *  File: viscosity.cpp
+ *  File: emhd.cpp
  *  
  *  BSD 3-Clause License
  *  
@@ -31,7 +31,7 @@
  *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
-#include "viscosity.hpp"
+#include "emhd.hpp"
 
 #include "decs.hpp"
 #include "grmhd.hpp"
@@ -41,12 +41,12 @@
 
 using namespace parthenon;
 
-namespace Viscosity
+namespace EMHD
 {
 
 std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t packages)
 {
-    auto pkg = std::make_shared<StateDescriptor>("Viscosity");
+    auto pkg = std::make_shared<StateDescriptor>("EMHD");
     Params &params = pkg->AllParams();
 
     // Diagnostic data
@@ -78,11 +78,11 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     pkg->AddField("prims.dP", m_prim);
     // Eventually also need (most or all of) Theta, bsq, nu_emhd, chi_emhd, tau
 
-    // If we want to register viscosity-specific UtoP for some reason?
+    // If we want to register an EMHD-specific UtoP for some reason?
     // Likely we'll only use the post-step summary hook
-    //pkg->FillDerivedBlock = Viscosity::FillDerived;
-    //pkg->PostFillDerivedBlock = Viscosity::PostFillDerived;
+    //pkg->FillDerivedBlock = EMHD::FillDerived;
+    //pkg->PostFillDerivedBlock = EMHD::PostFillDerived;
     return pkg;
 }
 
-} // namespace Viscosity
+} // namespace EMHD
diff --git a/kharma/viscosity/viscosity.hpp b/kharma/emhd/emhd.hpp
similarity index 87%
rename from kharma/viscosity/viscosity.hpp
rename to kharma/emhd/emhd.hpp
index 1872cc1f..4c35a9f3 100644
--- a/kharma/viscosity/viscosity.hpp
+++ b/kharma/emhd/emhd.hpp
@@ -1,5 +1,5 @@
 /* 
- *  File: viscosity.hpp
+ *  File: emhd.hpp
  *  
  *  BSD 3-Clause License
  *  
@@ -35,14 +35,19 @@
 
 #include <parthenon/parthenon.hpp>
 
-#include "mhd_functions.hpp"
+#include "grmhd_functions.hpp"
 
 using namespace parthenon;
 
 /**
- * This physics package may someday implement viscosity.  It doesn't yet!
+ * This physics package implements the Extended GRMHD ("EGRMHD") scheme of Chandra et al. 2015,
+ * first implemented in GRIM (Chandra et al. 2017).
+ * 
+ * It adds variables representing viscosity and heat conduction, with a combination of explicit
+ * and implicit source terms; thus it requires a semi-implicit scheme for evolution,
+ * implemented in KHARMA as ImexDriver.
  */
-namespace Viscosity {
+namespace EMHD {
 /**
  * Initialization: declare any fields this package will evolve, initialize any parameters
  */
diff --git a/kharma/emhd/emhd_sources.hpp b/kharma/emhd/emhd_sources.hpp
new file mode 100644
index 00000000..6ec7ad98
--- /dev/null
+++ b/kharma/emhd/emhd_sources.hpp
@@ -0,0 +1,189 @@
+
+#pragma once
+
+#include "decs.hpp"
+
+/**
+ * Implicit source terms for EMHD: sets dU(Q) and dU(DP) to -gdet * P / tau. NOTE(review): `loc`, `j`, `i` are not parameters here and no `template<typename Local>` is visible -- confirm
+ */
+KOKKOS_INLINE_FUNCTION void emhd_implicit_sources(const GRCoordinates& G, const Local& P, const VarMap& m_p,
+                                                  const Local& dU, const VarMap& m_u)
+{
+    Real gdet = G.gdet(loc, j, i);
+    Real tau = 0.; // FIXME placeholder value: tau == 0 makes both divisions below non-finite
+    dU(m_u.Q)  = -gdet * (P(m_p.Q) / tau);
+    dU(m_u.DP) = -gdet * (P(m_p.DP) / tau);
+}
+
+
+KOKKOS_INLINE_FUNCTION void emhd_time_derivative_sources(const GRCoordinates& G, const Local& P, const VarMap& m_p,
+                                                         const Local& dU, const VarMap& m_u)
+{
+    // NOTE(review): unfinished port -- S, S_new, S_old, dt, loc and k/j/i are not declared in this file
+    // Initializations
+    double rho      = P(m_p.RHO);
+    double Theta    = S->Theta[k][j][i];
+    double bsq      = S->bsq[k][j][i];
+    double chi_emhd = S->chi_emhd[k][j][i];
+    double nu_emhd  = S->nu_emhd[k][j][i];
+    double tau      = S->tau[k][j][i];
+
+    double gdet = G.gdet(loc, j, i); // G is a reference here, so use the accessor rather than G->
+
+    // Compute partial derivative of ucov
+    double dt_ucov[GR_DIM];
+    DLOOP1 {
+        double ucov_new = S_new->ucov[mu][k][j][i];
+        double ucov_old = S_old->ucov[mu][k][j][i];
+
+        dt_ucov[mu] = (ucov_new - ucov_old) / dt;
+    }
+
+    // Compute div of ucon (only temporal part is nonzero)
+    double div_ucon = 0;
+    DLOOP1 {
+        double gcon_t_mu = G.gcon(loc, j, i, 0, mu);
+
+        div_ucon += gcon_t_mu * dt_ucov[mu];
+    }
+
+    // Compute q0 and delta_P0 (temporal terms)
+    double Theta_new, Theta_old, dt_Theta;
+    Theta_new = S_new->Theta[k][j][i];
+    Theta_old = S_old->Theta[k][j][i];
+
+    dt_Theta = (Theta_new - Theta_old) / dt;
+
+    double q0, deltaP0;
+    double bcon_t  = S->bcon[0][k][j][i];
+    double ucon_t  = S->ucon[0][k][j][i]; // loop-invariant, read once
+    q0 = -rho * chi_emhd * (bcon_t / sqrt(bsq)) * dt_Theta;
+    DLOOP1 {
+        double bcon_mu = S->bcon[mu][k][j][i];
+
+        // Accumulate the velocity time-derivative contribution for each component mu
+        q0 -= rho * chi_emhd * (bcon_mu / sqrt(bsq)) * Theta * ucon_t * dt_ucov[mu];
+    }
+
+    deltaP0 = -rho * nu_emhd * div_ucon;
+    DLOOP1 {
+        double bcon_mu = S->bcon[mu][k][j][i];
+
+        deltaP0 += 3. * rho * nu_emhd * (bcon_t * bcon_mu / bsq) * dt_ucov[mu];
+    }
+
+    // Add the time derivative source terms (conduction and viscosity)
+    // NOTE: Will have to edit this when higher order terms are considered
+    dU(m_u.Q)  += gdet * (q0 / tau);
+    dU(m_u.DP) += gdet * (deltaP0 / tau);
+}
+
+// Compute explicit source terms: the parts of q0 and deltaP0 that do not involve time derivatives
+KOKKOS_INLINE_FUNCTION void emhd_explicit_sources(struct GridGeom *G, struct FluidState *S, int loc,
+                                                  int i, int j, int k, double dU_explicit)
+{
+    // Extended MHD components
+    // NOTE(review): `dU`, `Q` and `DP` used at the end are not declared here; the only output-like
+    // parameter is the by-value double `dU_explicit` -- confirm the intended interface
+    // Initializations
+    double rho      = S->P[RHO][k][j][i];
+    double Theta    = S->Theta[k][j][i];
+    double bsq      = S->bsq[k][j][i];
+    double chi_emhd = S->chi_emhd[k][j][i];
+    double nu_emhd  = S->nu_emhd[k][j][i];
+    double tau      = S->tau[k][j][i];
+
+    double gdet = G->gdet[loc][j][i];
+
+    double grad_ucov[GR_DIM][GR_DIM], grad_Theta[GR_DIM];
+
+    // Compute gradient of ucov and Theta
+    gradient_calc(G, S, loc, i, j, k, grad_ucov, grad_Theta);
+
+    // Compute div of ucon (all terms but the time-derivative ones are nonzero)
+    double div_ucon = 0;
+    DLOOP2 {
+        double gcon_mu_nu = G->gcon[loc][mu][nu][j][i];
+
+        div_ucon += gcon_mu_nu * grad_ucov[mu][nu];
+    }
+
+    // Compute q0 and deltaP0 (everything but the time-derivative terms)
+    double q0 = 0, deltaP0;
+
+    DLOOP1 {
+        double bcon_mu = S->bcon[mu][k][j][i];
+        // Sum over mu: a plain assignment here would keep only the last component
+        q0 -= rho * chi_emhd * (bcon_mu / sqrt(bsq)) * grad_Theta[mu];
+    }
+
+    DLOOP2 {
+        double bcon_mu = S->bcon[mu][k][j][i];
+        double ucon_nu = S->ucon[nu][k][j][i];
+
+        q0 -= rho * chi_emhd * (bcon_mu / sqrt(bsq)) * Theta * ucon_nu * grad_ucov[nu][mu];
+    }
+
+    deltaP0 = -rho * nu_emhd * div_ucon;
+    DLOOP2  {
+        double bcon_mu = S->bcon[mu][k][j][i];
+        double bcon_nu = S->bcon[nu][k][j][i];
+
+        deltaP0 += 3. * rho * nu_emhd * (bcon_mu * bcon_nu / bsq) * grad_ucov[mu][nu];
+    }
+
+    // Add explicit source terms (conduction and viscosity)
+    // NOTE: Will have to edit this when higher order terms are considered
+    dU(Q)  += gdet * (q0 / tau);
+    dU(DP) += gdet * (deltaP0) / tau;
+}
+
+// Compute gradient of four velocities and temperature at (k, j, i): fills grad_ucov and grad_Theta, with the dir == 0 (time) slopes set to 0
+// Called by emhd_explicit_sources
+KOKKOS_INLINE_FUNCTION void gradient_calc(struct GridGeom *G, struct FluidState *S, int loc, int i, int j, int k,
+                                          double grad_ucov[GR_DIM][GR_DIM], double grad_Theta[GR_DIM])
+{
+    // Compute gradient of ucov
+    DLOOP1 {
+        grad_ucov[0][mu] = 0;
+        // Directions 1-3. NOTE(review): relies on the slope helper writing its last argument -- confirm
+        slope_calc_4vec(S->ucov, mu, 1, i, j, k, grad_ucov[1][mu]);
+        slope_calc_4vec(S->ucov, mu, 2, i, j, k, grad_ucov[2][mu]);
+        slope_calc_4vec(S->ucov, mu, 3, i, j, k, grad_ucov[3][mu]);
+    }
+    // Subtract the connection terms: conn[gam][mu][nu] * ucov[gam], summed over gam, for every (mu, nu)
+    DLOOP2 {
+        for (int gam = 0; gam < GR_DIM; gam++)
+            grad_ucov[mu][nu] -= G->conn[gam][mu][nu][j][i] * S->ucov[gam][k][j][i];
+    }
+
+    // Compute temperature gradient
+    // Time derivative component computed in emhd_time_derivative_sources
+    grad_Theta[0] = 0;
+    slope_calc_scalar(S->Theta, 1, i, j, k, grad_Theta[1]);
+    slope_calc_scalar(S->Theta, 2, i, j, k, grad_Theta[2]);
+    slope_calc_scalar(S->Theta, 3, i, j, k, grad_Theta[3]);
+}
+
+// Compute slope for 4 vectors. `slope` is an output (hence the reference): it receives SLOPE_ALGO of the 5-point stencil along `dir` (1, 2 or 3) and is left untouched otherwise
+// TODO going to need to either keep or calculate these based on recon choices
+KOKKOS_INLINE_FUNCTION void slope_calc_4vec(GridVector u, int component, int dir, int i, int j, int k, double &slope)
+{
+    if (dir == 1)
+        slope = SLOPE_ALGO(u[component][k][j][i-2], u[component][k][j][i-1], u[component][k][j][i],
+                            u[component][k][j][i+1], u[component][k][j][i+2], dx[dir]);
+    if (dir == 2)
+        slope = SLOPE_ALGO(u[component][k][j-2][i], u[component][k][j-1][i], u[component][k][j][i],
+                            u[component][k][j+1][i], u[component][k][j+2][i], dx[dir]);
+    if (dir == 3)
+        slope = SLOPE_ALGO(u[component][k-2][j][i], u[component][k-1][j][i], u[component][k][j][i],
+                            u[component][k+1][j][i], u[component][k+2][j][i], dx[dir]);
+}
+
+// Compute slope for scalars. `slope` is an output (hence the reference): it receives SLOPE_ALGO of the 5-point stencil of T along `dir` (1, 2 or 3)
+KOKKOS_INLINE_FUNCTION void slope_calc_scalar(GridDouble T, int dir, int i, int j, int k, double &slope)
+{
+  if (dir == 1) slope = SLOPE_ALGO(T[k][j][i-2], T[k][j][i-1], T[k][j][i], T[k][j][i+1], T[k][j][i+2], dx[dir]);
+  if (dir == 2) slope = SLOPE_ALGO(T[k][j-2][i], T[k][j-1][i], T[k][j][i], T[k][j+1][i], T[k][j+2][i], dx[dir]);
+  if (dir == 3) slope = SLOPE_ALGO(T[k-2][j][i], T[k-1][j][i], T[k][j][i], T[k+1][j][i], T[k+2][j][i], dx[dir]);
+}
\ No newline at end of file
diff --git a/kharma/floors/floors.cpp b/kharma/floors/floors.cpp
index eec643f5..cf1d35f3 100644
--- a/kharma/floors/floors.cpp
+++ b/kharma/floors/floors.cpp
@@ -38,7 +38,7 @@
 
 #include "debug.hpp"
 #include "fixup.hpp"
-#include "mhd_functions.hpp"
+#include "grmhd_functions.hpp"
 #include "pack.hpp"
 
 namespace Floors
diff --git a/kharma/floors/floors.hpp b/kharma/floors/floors.hpp
index 4438894b..308eaeac 100644
--- a/kharma/floors/floors.hpp
+++ b/kharma/floors/floors.hpp
@@ -37,7 +37,7 @@
 
 
 #include "b_flux_ct.hpp"
-#include "mhd_functions.hpp"
+#include "grmhd_functions.hpp"
 #include "U_to_P.hpp"
 
 #include <parthenon/parthenon.hpp>
@@ -73,16 +73,16 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin);
 /**
  * Apply density and internal energy floors and ceilings
  * 
- * This function is called just after UtoP finishes, and
- * applies to the same subset of zones (anything "on" the grid,
- * i.e. not past a polar or outflow boundary)
+ * This function definitely applies floors (regardless of "disable_floors")
+ * to the interior domain (not ghost zones).
  * 
  * LOCKSTEP: this function respects P and returns consistent P<->U
  */
 TaskStatus ApplyFloors(MeshBlockData<Real> *rc);
 
 /**
- * Parthenon wrapper for ApplyFloors.  Decides whether to apply floors, then does so
+ * Parthenon call wrapper for ApplyFloors, called just after FillDerived == UtoP
+ * Decides whether to apply floors based on options, then does so
  */
 TaskStatus PostFillDerivedBlock(MeshBlockData<Real> *rc);
 
@@ -280,7 +280,7 @@ KOKKOS_INLINE_FUNCTION int apply_floors(const GRCoordinates& G, const VariablePa
 
             // Calculating the corresponding conserved variables
             Real rho_ut, T[GR_DIM];
-            GRMHD::p_to_u_loc(G, rho_add, u_add, uvec, B, gam, k, j, i, rho_ut, T, loc);
+            GRMHD::p_to_u_mhd(G, rho_add, u_add, uvec, B, gam, k, j, i, rho_ut, T, loc);
 
             // Add new conserved mass/energy to the current "conserved" state,
             // and to the local primitives as a guess
@@ -333,7 +333,8 @@ KOKKOS_INLINE_FUNCTION int apply_floors(const GRCoordinates& G, const VariablePa
  * 
  * LOCKSTEP: Operates on and respects primitives *only*
  */
-KOKKOS_INLINE_FUNCTION int apply_geo_floors(const GRCoordinates& G, ScratchPad2D<Real>& P, const VarMap& m,
+template<typename Local>
+KOKKOS_INLINE_FUNCTION int apply_geo_floors(const GRCoordinates& G, Local& P, const VarMap& m,
                                             const Real& gam, const int& k, const int& j, const int& i,
                                             const Floors::Prescription& floors, const Loci loc=Loci::center)
 {
@@ -367,8 +368,8 @@ KOKKOS_INLINE_FUNCTION int apply_geo_floors(const GRCoordinates& G, ScratchPad2D
     fflag |= (uflr_geom > P(m.UU, i)) * HIT_FLOOR_GEOM_U_FLUX;
 #endif
 
-    P(m.RHO, i) += max(0., rhoflr_geom - P(m.RHO, i));
-    P(m.UU, i) += max(0., uflr_geom - P(m.UU, i));
+    P(m.RHO) += max(0., rhoflr_geom - P(m.RHO));
+    P(m.UU) += max(0., uflr_geom - P(m.UU));
 
     return fflag;
 }
diff --git a/kharma/fluxes.cpp b/kharma/flux.cpp
similarity index 55%
rename from kharma/fluxes.cpp
rename to kharma/flux.cpp
index bc15a953..1420e464 100644
--- a/kharma/fluxes.cpp
+++ b/kharma/flux.cpp
@@ -32,7 +32,7 @@
  *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "fluxes.hpp"
+#include "flux.hpp"
 
 #include "source.hpp"
 
@@ -41,7 +41,7 @@ using namespace parthenon;
 // GetFlux is in the header, as it is templated on reconstruction scheme and flux direction
 // That's also why we don't have any extra includes in here
 
-TaskStatus Flux::PrimToFlux(MeshBlockData<Real> *rc, IndexDomain domain)
+TaskStatus Flux::PtoU(MeshBlockData<Real> *rc, IndexDomain domain)
 {
     Flag(rc, "Getting conserved fluxes");
     // Pointers
@@ -57,22 +57,52 @@ TaskStatus Flux::PrimToFlux(MeshBlockData<Real> *rc, IndexDomain domain)
 
     // Pack variables
     PackIndexMap prims_map, cons_map;
-    const auto& P = rc->PackVariables({isPrimitive}, prims_map);
-    const auto& U = rc->PackVariables({Metadata::Conserved}, cons_map);
+    const auto& P_all = rc->PackVariables({isPrimitive}, prims_map);
+    const auto& U_all = rc->PackVariables({Metadata::Conserved}, cons_map);
     const VarMap m_u(cons_map, true), m_p(prims_map, false);
+    const int nvar = U_all.GetDim(4);
 
-    IndexRange ib = rc->GetBoundsI(domain);
-    IndexRange jb = rc->GetBoundsJ(domain);
-    IndexRange kb = rc->GetBoundsK(domain);
+    const IndexRange ib = rc->GetBoundsI(domain);
+    const IndexRange jb = rc->GetBoundsJ(domain);
+    const IndexRange kb = rc->GetBoundsK(domain);
+    const int n1 = pmb->cellbounds.ncellsi(IndexDomain::entire);
 
     const auto& G = pmb->coords;
 
-    pmb->par_for("P_to_U", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
-        KOKKOS_LAMBDA_3D {
-            GRMHD::p_to_u(G, P, m_p, gam, k, j, i, U, m_u);
-            if (flux_ct) B_FluxCT::p_to_u(G, P, m_p, k, j, i, U, m_u);
-            else if (b_cd) B_CD::p_to_u(G, P, m_p, k, j, i, U, m_u);
-            if (use_electrons) Electrons::p_to_u(G, P, m_p, k, j, i, U, m_u);
+    // This is basically what all kernels look like if I want to stick to
+    // single, simple device side functions called over slices
+    // See flux.hpp or implicit.cpp for explanations of what everything here does
+    const int scratch_level = 1;
+    const size_t var_size_in_bytes = parthenon::ScratchPad2D<Real>::shmem_size(nvar, n1);
+    const size_t total_scratch_bytes = (2) * var_size_in_bytes;
+
+    parthenon::par_for_outer(DEFAULT_OUTER_LOOP_PATTERN, "PtoU", pmb->exec_space,
+        total_scratch_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e,
+        KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int& k, const int& j) {
+            ScratchPad2D<Real> P_s(member.team_scratch(scratch_level), nvar, n1);
+            ScratchPad2D<Real> U_s(member.team_scratch(scratch_level), nvar, n1);
+
+            PLOOP parthenon::par_for_inner(member, ib.s, ib.e,
+                [&](const int& i) {
+                    P_s(ip, i) = P_all(ip, k, j, i);
+                    U_s(ip, i) = U_all(ip, k, j, i);
+                }
+            );
+
+            parthenon::par_for_inner(member, ib.s, ib.e,
+                [&](const int& i) {
+                    auto P = Kokkos::subview(P_s, Kokkos::ALL(), i);
+                    auto U = Kokkos::subview(U_s, Kokkos::ALL(), i);
+                    Flux::p_to_u(G, P, m_p, gam, j, i, U, m_u);
+                }
+            );
+
+            PLOOP parthenon::par_for_inner(member, ib.s, ib.e,
+                [&](const int& i) {
+                    P_all(ip, k, j, i) = P_s(ip, i);
+                    U_all(ip, k, j, i) = U_s(ip, i);
+                }
+            );
         }
     );
 
diff --git a/kharma/fluxes.hpp b/kharma/flux.hpp
similarity index 75%
rename from kharma/fluxes.hpp
rename to kharma/flux.hpp
index 80ff4fd5..3ce68e95 100644
--- a/kharma/fluxes.hpp
+++ b/kharma/flux.hpp
@@ -1,5 +1,5 @@
 /* 
- *  File: fluxes.hpp
+ *  File: flux.hpp
  *  
  *  BSD 3-Clause License
  *  
@@ -39,12 +39,13 @@
 
 #include "debug.hpp"
 #include "floors.hpp"
+#include "flux_functions.hpp"
 #include "pack.hpp"
 #include "reconstruction.hpp"
 #include "types.hpp"
 
 // Package functions
-#include "mhd_functions.hpp"
+#include "grmhd_functions.hpp"
 #include "b_flux_ct.hpp"
 #include "b_cd.hpp"
 #include "electrons.hpp"
@@ -62,10 +63,11 @@ TaskStatus ApplyFluxes(MeshData<Real> *md, MeshData<Real> *mdudt);
 
 /**
  * Fill all conserved variables (U) from primitive variables (P), over the whole grid.
- * Second declaration is for Parthenon's benefit, similar to UtoP vs FillDerived in GRMHD::
+ * Second declaration is for Parthenon's benefit, similar to e.g.
+ * declaring UtoP vs FillDerived in GRMHD package.
  */
-TaskStatus PrimToFlux(MeshBlockData<Real> *rc, IndexDomain domain=IndexDomain::entire);
-inline TaskStatus PrimToFluxTask(MeshBlockData<Real> *rc) { return PrimToFlux(rc); }
+TaskStatus PtoU(MeshBlockData<Real> *rc, IndexDomain domain=IndexDomain::entire);
+inline TaskStatus PtoUTask(MeshBlockData<Real> *rc) { return PtoU(rc); } // One-argument form: relies on PtoU's default domain (IndexDomain::entire)
 
 // Fluxes a.k.a. "Approximate Riemann Solvers"
 // More complex solvers require speed estimates not calculable completely from
@@ -138,8 +140,8 @@ inline TaskStatus GetFlux(MeshData<Real> *md)
     // Pack variables.  Keep ctop separate
     PackIndexMap prims_map, cons_map;
     const auto& ctop = md->PackVariables(std::vector<std::string>{"ctop"});
-    const auto& P = md->PackVariables(std::vector<MetadataFlag>{isPrimitive}, prims_map);
-    const auto& U = md->PackVariablesAndFluxes(std::vector<MetadataFlag>{Metadata::Conserved}, cons_map);
+    const auto& P_all = md->PackVariables(std::vector<MetadataFlag>{isPrimitive}, prims_map);
+    const auto& U_all = md->PackVariablesAndFluxes(std::vector<MetadataFlag>{Metadata::Conserved}, cons_map);
     const VarMap m_u(cons_map, true), m_p(prims_map, false);
     Flag(md, "Packed variables");
 
@@ -149,7 +151,7 @@ inline TaskStatus GetFlux(MeshData<Real> *md)
     const IndexRange jb = md->GetBoundsJ(IndexDomain::interior);
     const IndexRange kb = md->GetBoundsK(IndexDomain::interior);
     const IndexRange block = IndexRange{0, ctop.GetDim(5) - 1};
-    const int nvar = U.GetDim(4);
+    const int nvar = U_all.GetDim(4);
     // 1-zone halo in nontrivial dimensions
     // We leave is/ie, js/je, ks/ke with their usual definitions for consistency, and define
     // the loop bounds separately to include the appropriate halo
@@ -175,19 +177,19 @@ inline TaskStatus GetFlux(MeshData<Real> *md)
     parthenon::par_for_outer(DEFAULT_OUTER_LOOP_PATTERN, "calc_flux", pmb0->exec_space,
         total_scratch_bytes, scratch_level, block.s, block.e, kl.s, kl.e, jl.s, jl.e,
         KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int& b, const int& k, const int& j) {
-            const auto& G = U.GetCoords(b);
-            ScratchPad2D<Real> Pl(member.team_scratch(scratch_level), nvar, n1);
-            ScratchPad2D<Real> Pr(member.team_scratch(scratch_level), nvar, n1);
-            ScratchPad2D<Real> Ul(member.team_scratch(scratch_level), nvar, n1);
-            ScratchPad2D<Real> Ur(member.team_scratch(scratch_level), nvar, n1);
-            ScratchPad2D<Real> Fl(member.team_scratch(scratch_level), nvar, n1);
-            ScratchPad2D<Real> Fr(member.team_scratch(scratch_level), nvar, n1);
+            const auto& G = U_all.GetCoords(b);
+            ScratchPad2D<Real> Pl_s(member.team_scratch(scratch_level), nvar, n1);
+            ScratchPad2D<Real> Pr_s(member.team_scratch(scratch_level), nvar, n1);
+            ScratchPad2D<Real> Ul_s(member.team_scratch(scratch_level), nvar, n1);
+            ScratchPad2D<Real> Ur_s(member.team_scratch(scratch_level), nvar, n1);
+            ScratchPad2D<Real> Fl_s(member.team_scratch(scratch_level), nvar, n1);
+            ScratchPad2D<Real> Fr_s(member.team_scratch(scratch_level), nvar, n1);
             ScratchPad1D<Real> cmax(member.team_scratch(scratch_level), n1);
             ScratchPad1D<Real> cmin(member.team_scratch(scratch_level), n1);
 
             // Wrapper for a big switch statement between reconstruction schemes. Possibly slow.
             // This function is generally a lot of if statements
-            KReconstruction::reconstruct<Recon, dir>(member, G, P(b), k, j, il.s, il.e, Pl, Pr);
+            KReconstruction::reconstruct<Recon, dir>(member, G, P_all(b), k, j, il.s, il.e, Pl_s, Pr_s);
 
             // Sync all threads in the team so that scratch memory is consistent
             member.team_barrier();
@@ -195,6 +197,8 @@ inline TaskStatus GetFlux(MeshData<Real> *md)
             // Calculate conserved fluxes at centers & faces
             parthenon::par_for_inner(member, il.s, il.e,
                 [&](const int& i) {
+                    auto Pl = Kokkos::subview(Pl_s, Kokkos::ALL(), i);
+                    auto Pr = Kokkos::subview(Pr_s, Kokkos::ALL(), i);
                     // Apply floors to the *reconstructed* primitives, because without TVD
                     // we have no guarantee they remotely resemble the *centered* primitives
                     if (Recon == ReconstructionType::weno5 && !disable_floors) {
@@ -209,30 +213,22 @@ inline TaskStatus GetFlux(MeshData<Real> *md)
             // LEFT FACES, final ctop
             parthenon::par_for_inner(member, il.s, il.e,
                 [&](const int& i) {
+                    auto Pl = Kokkos::subview(Pl_s, Kokkos::ALL(), i);
 #endif
+                    auto Ul = Kokkos::subview(Ul_s, Kokkos::ALL(), i);
+                    auto Fl = Kokkos::subview(Fl_s, Kokkos::ALL(), i);
                     // LR -> flux
                     // Declare temporary vectors
                     FourVectors Dtmp;
 
                     // Left
-                    GRMHD::calc_4vecs(G, Pl, m_p, k, j, i, loc, Dtmp);
-                    GRMHD::prim_to_flux(G, Pl, m_p, Dtmp, gam, k, j, i, 0, Ul, m_u, loc);
-                    GRMHD::prim_to_flux(G, Pl, m_p, Dtmp, gam, k, j, i, dir, Fl, m_u, loc);
-                    if (use_b_flux_ct) {
-                        B_FluxCT::prim_to_flux(G, Pl, m_p, Dtmp, k, j, i, 0, Ul, m_u, loc);
-                        B_FluxCT::prim_to_flux(G, Pl, m_p, Dtmp, k, j, i, dir, Fl, m_u, loc);
-                    } else if (use_b_cd) {
-                        B_CD::prim_to_flux(G, Pl, m_p, Dtmp, k, j, i, 0, Ul, m_u, loc);
-                        B_CD::prim_to_flux(G, Pl, m_p, Dtmp, k, j, i, dir, Fl, m_u, loc);
-                    }
-                    if (use_electrons) {
-                        Electrons::prim_to_flux(G, Pl, m_p, Dtmp, k, j, i, 0, Ul, m_u, loc);
-                        Electrons::prim_to_flux(G, Pl, m_p, Dtmp, k, j, i, dir, Fl, m_u, loc);
-                    }
+                    GRMHD::calc_4vecs(G, Pl, m_p, j, i, loc, Dtmp);
+                    Flux::prim_to_flux(G, Pl, m_p, Dtmp, gam, j, i, 0, Ul, m_u, loc);
+                    Flux::prim_to_flux(G, Pl, m_p, Dtmp, gam, j, i, dir, Fl, m_u, loc);
 
                     // Magnetosonic speeds
                     Real cmaxL, cminL;
-                    GRMHD::vchar(G, Pl, m_p, Dtmp, gam, k, j, i, loc, dir, cmaxL, cminL);
+                    Flux::vchar(G, Pl, m_p, Dtmp, gam, k, j, i, loc, dir, cmaxL, cminL);
 
 #if !FUSE_FLUX_KERNELS
                     // Record speeds
@@ -248,26 +244,19 @@ inline TaskStatus GetFlux(MeshData<Real> *md)
                     // LR -> flux
                     // Declare temporary vectors
                     FourVectors Dtmp;
+                    auto Pr = Kokkos::subview(Pr_s, Kokkos::ALL(), i);
 #endif
+                    auto Ur = Kokkos::subview(Ur_s, Kokkos::ALL(), i);
+                    auto Fr = Kokkos::subview(Fr_s, Kokkos::ALL(), i);
                     // Right
-                    GRMHD::calc_4vecs(G, Pr, m_p, k, j, i, loc, Dtmp);
-                    GRMHD::prim_to_flux(G, Pr, m_p, Dtmp, gam, k, j, i, 0, Ur, m_u, loc);
-                    GRMHD::prim_to_flux(G, Pr, m_p, Dtmp, gam, k, j, i, dir, Fr, m_u, loc);
-                    if (use_b_flux_ct) {
-                        B_FluxCT::prim_to_flux(G, Pr, m_p, Dtmp, k, j, i, 0, Ur, m_u, loc);
-                        B_FluxCT::prim_to_flux(G, Pr, m_p, Dtmp, k, j, i, dir, Fr, m_u, loc);
-                    } else if (use_b_cd) {
-                        B_CD::prim_to_flux(G, Pr, m_p, Dtmp, k, j, i, 0, Ur, m_u, loc);
-                        B_CD::prim_to_flux(G, Pr, m_p, Dtmp, k, j, i, dir, Fr, m_u, loc);
-                    }
-                    if (use_electrons) {
-                        Electrons::prim_to_flux(G, Pr, m_p, Dtmp, k, j, i, 0, Ur, m_u, loc);
-                        Electrons::prim_to_flux(G, Pr, m_p, Dtmp, k, j, i, dir, Fr, m_u, loc);
-                    }
+                    // TODO GRMHD/GRHD versions of this
+                    GRMHD::calc_4vecs(G, Pr, m_p, j, i, loc, Dtmp);
+                    Flux::prim_to_flux(G, Pr, m_p, Dtmp, gam, j, i, 0, Ur, m_u, loc);
+                    Flux::prim_to_flux(G, Pr, m_p, Dtmp, gam, j, i, dir, Fr, m_u, loc);
 
                     // Magnetosonic speeds
                     Real cmaxR, cminR;
-                    GRMHD::vchar(G, Pr, m_p, Dtmp, gam, k, j, i, loc, dir, cmaxR, cminR);
+                    Flux::vchar(G, Pr, m_p, Dtmp, gam, k, j, i, loc, dir, cmaxR, cminR);
 
 #if FUSE_FLUX_KERNELS
                     // Calculate cmax/min from local variables
@@ -276,17 +265,17 @@ inline TaskStatus GetFlux(MeshData<Real> *md)
 
                     if (use_hlle) {
                         for (int p=0; p < nvar; ++p)
-                            U(b).flux(dir, p, k, j, i) = hlle(Fl(p,i), Fr(p,i), cmax(i), cmin(i), Ul(p,i), Ur(p,i));
+                            U_all(b).flux(dir, p, k, j, i) = hlle(Fl(p), Fr(p), cmax(i), cmin(i), Ul(p), Ur(p));
                     } else {
                         for (int p=0; p < nvar; ++p)
-                            U(b).flux(dir, p, k, j, i) = llf(Fl(p,i), Fr(p,i), cmax(i), cmin(i), Ul(p,i), Ur(p,i));
+                            U_all(b).flux(dir, p, k, j, i) = llf(Fl(p), Fr(p), cmax(i), cmin(i), Ul(p), Ur(p));
                     }
                     if (use_b_cd) {
                         // The unphysical variable psi and its corrections can propagate at the max speed
                         // for the stepsize, rather than the sound speed
                         // Since the speeds are the same it will always correspond to the LLF flux
-                        U(b).flux(dir, m_u.PSI, k, j, i) = llf(Fl(m_u.PSI,i), Fr(m_u.PSI,i), ctop_max, ctop_max, Ul(m_u.PSI,i), Ur(m_u.PSI,i));
-                        U(b).flux(dir, m_u.B1+dir-1, k, j, i) = llf(Fl(m_u.B1+dir-1,i), Fr(m_u.B1+dir-1,i), ctop_max, ctop_max, Ul(m_u.B1+dir-1,i), Ur(m_u.B1+dir-1,i));
+                        U_all(b).flux(dir, m_u.PSI, k, j, i) = llf(Fl(m_u.PSI), Fr(m_u.PSI), ctop_max, ctop_max, Ul(m_u.PSI), Ur(m_u.PSI));
+                        U_all(b).flux(dir, m_u.B1+dir-1, k, j, i) = llf(Fl(m_u.B1+dir-1), Fr(m_u.B1+dir-1), ctop_max, ctop_max, Ul(m_u.B1+dir-1), Ur(m_u.B1+dir-1));
                     }
 #else
                     // Calculate cmax/min based on comparison with cached values
@@ -307,21 +296,21 @@ inline TaskStatus GetFlux(MeshData<Real> *md)
                     // Since the speeds are the same it will always correspond to the LLF flux
                     parthenon::par_for_inner(member, il.s, il.e,
                         [&](const int& i) {
-                            U(b).flux(dir, p, k, j, i) = llf(Fl(p,i), Fr(p,i), ctop_max, ctop_max, Ul(p,i), Ur(p,i));
+                            U_all(b).flux(dir, p, k, j, i) = llf(Fl_s(p,i), Fr_s(p,i), ctop_max, ctop_max, Ul_s(p,i), Ur_s(p,i));
                         }
                     );
                 } else if (use_hlle) {
                     // Option to try HLLE fluxes for everything else
                     parthenon::par_for_inner(member, il.s, il.e,
                         [&](const int& i) {
-                            U(b).flux(dir, p, k, j, i) = hlle(Fl(p,i), Fr(p,i), cmax(i), cmin(i), Ul(p,i), Ur(p,i));
+                            U_all(b).flux(dir, p, k, j, i) = hlle(Fl_s(p,i), Fr_s(p,i), cmax(i), cmin(i), Ul_s(p,i), Ur_s(p,i));
                         }
                     );
                 } else {
                     // Or LLF, probably safest option
                     parthenon::par_for_inner(member, il.s, il.e,
                         [&](const int& i) {
-                            U(b).flux(dir, p, k, j, i) = llf(Fl(p,i), Fr(p,i), cmax(i), cmin(i), Ul(p,i), Ur(p,i));
+                            U_all(b).flux(dir, p, k, j, i) = llf(Fl_s(p,i), Fr_s(p,i), cmax(i), cmin(i), Ul_s(p,i), Ur_s(p,i));
                         }
                     );
                 }
diff --git a/kharma/flux_functions.hpp b/kharma/flux_functions.hpp
new file mode 100644
index 00000000..c6765b1e
--- /dev/null
+++ b/kharma/flux_functions.hpp
@@ -0,0 +1,198 @@
+/* 
+ *  File: flux_functions.hpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+
+#include "decs.hpp"
+
+#include "gr_coordinates.hpp"
+#include "grmhd_functions.hpp"
+#include "kharma_utils.hpp"
+#include "types.hpp"
+/**
+ * Device-side functions prim_to_flux and vchar, which will depend on
+ * the set of enabled packages.
+ */
+
+namespace Flux
+{
+
+/**
+ * Turn the primitive variables P of a single zone into either:
+ * a. conserved variables (dir==0), or
+ * b. fluxes in a direction (dir!=0), written into "flux" at the indices given by m_u.
+ * Optional terms (B, PSI, KTOT & co.) are filled only if their m_p index is >= 0. loc should usually correspond to dir for perpendicular fluxes
+ */
+template<typename Local>
+KOKKOS_INLINE_FUNCTION void prim_to_flux(const GRCoordinates& G, const Local& P, const VarMap& m_p, const FourVectors& D,
+                                         const Real& gam, const int& j, const int& i, const int dir,
+                                         const Local& flux, const VarMap& m_u, const Loci loc=Loci::center)
+{
+    Real gdet = G.gdet(loc, j, i);
+    // Particle number flux
+    flux(m_u.RHO) = P(m_p.RHO) * D.ucon[dir] * gdet;
+
+    if (m_p.B1 >= 0) {
+        // MHD stress-energy tensor w/ first index up, second index down
+        Real mhd[GR_DIM];
+        GRMHD::calc_tensor(P(m_p.RHO), P(m_p.UU), (gam - 1) * P(m_p.UU), D, dir, mhd);
+        flux(m_u.UU) = mhd[0] * gdet + flux(m_u.RHO);
+        flux(m_u.U1) = mhd[1] * gdet;
+        flux(m_u.U2) = mhd[2] * gdet;
+        flux(m_u.U3) = mhd[3] * gdet;
+
+        // Magnetic field
+        if (dir == 0) {
+            VLOOP flux(m_u.B1 + v) = P(m_p.B1 + v) * gdet;
+        } else {
+            // Constraint damping w/Dedner may add also P(m_p.psi) * gdet,
+            // but for us this is in the source term
+            VLOOP flux(m_u.B1 + v) = (D.bcon[v+1] * D.ucon[dir] - D.bcon[dir] * D.ucon[v+1]) * gdet;
+        }
+    } else {
+        // HD stress-energy tensor w/ first index up, second index down
+        Real hd[GR_DIM];
+        GRHD::calc_tensor(P(m_p.RHO), P(m_p.UU), (gam - 1) * P(m_p.UU), D, dir, hd);
+        flux(m_u.UU) = hd[0] * gdet + flux(m_u.RHO);
+        flux(m_u.U1) = hd[1] * gdet;
+        flux(m_u.U2) = hd[2] * gdet;
+        flux(m_u.U3) = hd[3] * gdet;
+    }
+    if (m_p.PSI >= 0) {
+        // Extra scalar psi for constraint damping, see B_CD
+        if (dir == 0) {
+            flux(m_u.PSI) = P(m_p.PSI) * gdet;
+        } else {
+            // Psi field update as in Mosta et al (IllinoisGRMHD), alternate explanation Jesse et al (2020)
+            //Real alpha = 1. / sqrt(-G.gcon(Loci::center, j, i, 0, 0));
+            //Real beta_dir = G.gcon(Loci::center, j, i, 0, dir) * alpha * alpha;
+            flux(m_u.PSI) = (D.bcon[dir] - G.gcon(Loci::center, j, i, 0, dir) * P(m_p.PSI)) * gdet;
+        }
+    }
+
+    if (m_p.KTOT >= 0) {
+        // Take the factor from the primitives, in case we need to reorder this to happen before GRMHD::prim_to_flux later
+        flux(m_u.KTOT) = flux(m_u.RHO) * P(m_p.KTOT);
+        if (m_p.K_CONSTANT >= 0)
+            flux(m_u.K_CONSTANT) = flux(m_u.RHO) * P(m_p.K_CONSTANT);
+        if (m_p.K_HOWES >= 0)
+            flux(m_u.K_HOWES) = flux(m_u.RHO) * P(m_p.K_HOWES);
+        if (m_p.K_KAWAZURA >= 0)
+            flux(m_u.K_KAWAZURA) = flux(m_u.RHO) * P(m_p.K_KAWAZURA);
+        if (m_p.K_WERNER >= 0)
+            flux(m_u.K_WERNER) = flux(m_u.RHO) * P(m_p.K_WERNER);
+        if (m_p.K_ROWAN >= 0)
+            flux(m_u.K_ROWAN) = flux(m_u.RHO) * P(m_p.K_ROWAN);
+        if (m_p.K_SHARMA >= 0)
+            flux(m_u.K_SHARMA) = flux(m_u.RHO) * P(m_p.K_SHARMA);
+    }
+
+}
+
+/**
+ * Get the conserved variables U corresponding to the primitives P in a zone. Equivalent to prim_to_flux with dir==0
+ */
+template<typename Local>
+KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const Local& P, const VarMap& m_p,
+                                   const Real& gam, const int& j, const int& i,
+                                   const Local& U, const VarMap& m_u, const Loci& loc=Loci::center)
+{
+    FourVectors Dtmp; // Four-vector scratch space handed to calc_4vecs, then to prim_to_flux
+    GRMHD::calc_4vecs(G, P, m_p, j, i, loc, Dtmp); // TODO switch GRHD/GRMHD
+    prim_to_flux(G, P, m_p, Dtmp, gam, j, i, 0, U, m_u, loc); // dir argument 0 selects conserved variables rather than a flux
+}
+// KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m_p,
+//                                    const Real& gam, const int& k, const int& j, const int& i,
+//                                    const VariablePack<Real>& U, const VarMap& m_u, const Loci& loc=Loci::center)
+// {
+
+// }
+
+/**
+ * Calculate components of magnetosonic velocity from primitive variables
+ * Templated on the zone-local accessor: P only needs to be indexable as P(var). Results are returned in cmax/cmin
+ */
+template<typename Local>
+KOKKOS_INLINE_FUNCTION void vchar(const GRCoordinates& G, const Local& P, const VarMap& m, const FourVectors& D,
+                                  const Real& gam, const int& k, const int& j, const int& i, const Loci& loc, const int& dir,
+                                  Real& cmax, Real& cmin)
+{
+    // Find sound speed
+    const Real ef = P(m.RHO) + gam * P(m.UU);
+    const Real cs2 = gam * (gam - 1) * P(m.UU) / ef;
+    Real cms2;
+    if (m.B1 >= 0) {
+        // Find fast magnetosonic speed
+        const Real bsq = dot(D.bcon, D.bcov);
+        const Real ee = bsq + ef;
+        const Real va2 = bsq / ee;
+        cms2 = cs2 + va2 - cs2 * va2;
+    } else {
+        cms2 = cs2;
+    }
+    cms2 = min(max(cms2, 1.e-20), 1.); // Assign the clamp explicitly: a bare clip(...) statement drops any returned value
+
+    // Require that speed of wave measured by observer q.ucon is cms2
+    Real A, B, C;
+    {
+        Real Bcov[GR_DIM] = {1., 0., 0., 0.};
+        Real Acov[GR_DIM] = {0}; Acov[dir] = 1.;
+
+        Real Acon[GR_DIM], Bcon[GR_DIM];
+        G.raise(Acov, Acon, k, j, i, loc);
+        G.raise(Bcov, Bcon, k, j, i, loc);
+
+        const Real Asq = dot(Acon, Acov);
+        const Real Bsq = dot(Bcon, Bcov);
+        const Real Au = dot(Acov, D.ucon);
+        const Real Bu = dot(Bcov, D.ucon);
+        const Real AB = dot(Acon, Bcov);
+        const Real Au2 = Au * Au;
+        const Real Bu2 = Bu * Bu;
+        const Real AuBu = Au * Bu;
+
+        A = Bu2 - (Bsq + Bu2) * cms2;
+        B = 2. * (AuBu - (AB + AuBu) * cms2);
+        C = Au2 - (Asq + Au2) * cms2;
+    }
+
+    Real discr = sqrt(max(B * B - 4. * A * C, 0.));
+
+    Real vp = -(-B + discr) / (2. * A);
+    Real vm = -(-B - discr) / (2. * A);
+
+    cmax = max(vp, vm);
+    cmin = min(vp, vm);
+}
+
+} // namespace Flux
diff --git a/kharma/grmhd/U_to_P.hpp b/kharma/grmhd/U_to_P.hpp
index d3e883ba..026ddca9 100644
--- a/kharma/grmhd/U_to_P.hpp
+++ b/kharma/grmhd/U_to_P.hpp
@@ -62,8 +62,8 @@ KOKKOS_INLINE_FUNCTION Real lorentz_calc_w(const Real& Bsq, const Real& D, const
  * These are fixed later, in FixUtoP
  */
 KOKKOS_INLINE_FUNCTION InversionStatus u_to_p(const GRCoordinates &G, const VariablePack<Real>& U, const VarMap& m_u,
-                                    const Real& gam, const int& k, const int& j, const int& i, const Loci loc,
-                                    const VariablePack<Real>& P, const VarMap& m_p)
+                                              const Real& gam, const int& k, const int& j, const int& i, const Loci loc,
+                                              const VariablePack<Real>& P, const VarMap& m_p)
 {
     // Catch negative density
     if (U(m_u.RHO, k, j, i) <= 0.) {
@@ -76,10 +76,12 @@ KOKKOS_INLINE_FUNCTION InversionStatus u_to_p(const GRCoordinates &G, const Vari
     const Real a_over_g = alpha / gdet;
     const Real D = U(m_u.RHO, k, j, i) * a_over_g;
 
-    const Real Bcon[GR_DIM] = {0,
-        U(m_u.B1, k, j, i) * a_over_g,
-        U(m_u.B2, k, j, i) * a_over_g,
-        U(m_u.B3, k, j, i) * a_over_g};
+    Real Bcon[GR_DIM] = {0};
+    if (m_u.B1 >= 0) {
+        Bcon[1] = U(m_u.B1, k, j, i) * a_over_g;
+        Bcon[2] = U(m_u.B2, k, j, i) * a_over_g;
+        Bcon[3] = U(m_u.B3, k, j, i) * a_over_g;
+    }
 
     const Real Qcov[GR_DIM] =
         {(U(m_u.UU, k, j, i) - U(m_u.RHO, k, j, i)) * a_over_g,
@@ -216,7 +218,7 @@ KOKKOS_INLINE_FUNCTION Real err_eqn(const Real& gam, const Real& Bsq, const Real
  * Fluid relativistic factor gamma in terms of inversion state variables
  */
 KOKKOS_INLINE_FUNCTION Real lorentz_calc_w(const Real& Bsq, const Real& D, const Real& QdB,
-                                        const Real& Qtsq, const Real& Wp)
+                                           const Real& Qtsq, const Real& Wp)
 {
     const Real QdBsq = QdB * QdB;
     const Real W = Wp + D;
diff --git a/kharma/grmhd/fixup.cpp b/kharma/grmhd/fixup.cpp
index 3972c8c4..38f6e1ee 100644
--- a/kharma/grmhd/fixup.cpp
+++ b/kharma/grmhd/fixup.cpp
@@ -37,9 +37,9 @@
 #include "floors.hpp"
 #include "pack.hpp"
 
-// I'm undecided on reintroducing these more widely but they clearly make sense here
+// Version of PLOOP guaranteeing specifically the 5 GRMHD fixup-amenable primitive vars
 #define NPRIM 5
-#define PLOOP for(int p=0; p < NPRIM; ++p)
+#define PRIMLOOP for(int p=0; p < NPRIM; ++p)
 
 TaskStatus GRMHD::FixUtoP(MeshBlockData<Real> *rc)
 {
@@ -94,11 +94,11 @@ TaskStatus GRMHD::FixUtoP(MeshBlockData<Real> *rc)
                                 if (((int) pflag(kk, jj, ii)) == InversionStatus::success) {
                                     // Weight by distance.  Note interpolated "fixed" cells stay flagged
                                     wsum += w;
-                                    PLOOP sum[p] += w * P(p, kk, jj, ii);
+                                    PRIMLOOP sum[p] += w * P(p, kk, jj, ii);
                                 }
                                 // Just in case, keep a sum of even the bad ones
                                 wsum_x += w;
-                                PLOOP sum_x[p] += w * P(p, kk, jj, ii);
+                                PRIMLOOP sum_x[p] += w * P(p, kk, jj, ii);
                             }
                         }
                     }
@@ -110,9 +110,9 @@ TaskStatus GRMHD::FixUtoP(MeshBlockData<Real> *rc)
                     if (verbose >= 1 && inside(k, j, i, kb_b, jb_b, ib_b)) // If an interior zone...
                         printf("No neighbors were available at %d %d %d!\n", i, j, k);
 #endif
-                    PLOOP P(p, k, j, i) = sum_x[p]/wsum_x;
+                    PRIMLOOP P(p, k, j, i) = sum_x[p]/wsum_x;
                 } else {
-                    PLOOP P(p, k, j, i) = sum[p]/wsum;
+                    PRIMLOOP P(p, k, j, i) = sum[p]/wsum;
                 }
             }
         }
@@ -135,8 +135,8 @@ TaskStatus GRMHD::FixUtoP(MeshBlockData<Real> *rc)
 
                 // And make sure the fixed values still abide by floors (floors keep lockstep)
                 int fflag_local = 0;
-                fflag_local |= apply_floors(G, P, m_p, gam, k, j, i, floors, U, m_u);
-                fflag_local |= apply_ceilings(G, P, m_p, gam, k, j, i, floors, U, m_u);
+                fflag_local |= Floors::apply_floors(G, P, m_p, gam, k, j, i, floors, U, m_u);
+                fflag_local |= Floors::apply_ceilings(G, P, m_p, gam, k, j, i, floors, U, m_u);
                 fflag(k, j, i) = fflag_local;
             }
         }
diff --git a/kharma/grmhd/grmhd.cpp b/kharma/grmhd/grmhd.cpp
index 174c125e..874b588a 100644
--- a/kharma/grmhd/grmhd.cpp
+++ b/kharma/grmhd/grmhd.cpp
@@ -51,10 +51,10 @@
 #include "debug.hpp"
 #include "fixup.hpp"
 #include "floors.hpp"
-#include "fluxes.hpp"
+#include "flux.hpp"
 #include "gr_coordinates.hpp"
 #include "kharma.hpp"
-#include "mhd_functions.hpp"
+#include "grmhd_functions.hpp"
 #include "source.hpp"
 #include "U_to_P.hpp"
 
@@ -186,8 +186,9 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
 
     std::vector<int> s_vector({3});
     std::vector<MetadataFlag> flags_prim, flags_cons;
-    auto grim_driver = pin->GetString("driver", "type") == "grim";
-    if (!grim_driver) {
+    auto imex_driver = pin->GetString("driver", "type") == "grim";
+    auto explicit_step = (pin->GetOrAddString("driver", "step", "explicit") == "explicit");
+    if (!imex_driver) { // Normal operation
         // As mentioned elsewhere, KHARMA treats the conserved variables as the independent ones,
         // and the primitives as "Derived"
         // Primitives are still used for reconstruction, physical boundaries, and output, and are
@@ -200,8 +201,9 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
                                                 Metadata::WithFluxes, Metadata::FillGhost, Metadata::Restart,
                                                 Metadata::Conserved, isHD, isMHD});
     } else {
-        // For GRIM/classic HARM, however, the primitive variables are independent, and boundary syncs are performed
-        // with them.
+        // For ImexDriver, however, the primitive variables are independent, and boundary syncs are performed
+        // with them.  This is to accommodate the implicit step, which takes and returns primitive values and
+        // thus is much easier to handle by just using primitives everywhere.
         flags_prim = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Derived,
                                                 Metadata::FillGhost, Metadata::Restart, isPrimitive, isHD, isMHD});
         // Conserved variables are actualy rho*u^0 & T^0_mu, but are named after the prims for consistency
@@ -227,37 +229,24 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
     m = Metadata(flags_cons_vec, s_vector);
     pkg->AddField("cons.uvec", m);
 
-    bool use_b = (pin->GetString("b_field", "solver") != "none");
-    params.Add("use_b", use_b);
-    if (!use_b) {
-        // Declare placeholder fields only if not using another package providing B field.
-        // This should be redundant w/using the "Overridable" flag but has caused problems in the past.
-        // The ultimate goal is to support never defining these fields in the first place, i.e. true GRHD
-        // without memory or computation penalties.
-
-        // Remove the "HD" flag from B, since it is not that
-        flags_prim_vec.erase(std::remove(flags_prim_vec.begin(), flags_prim_vec.end(), isHD), flags_prim_vec.end());
-        // If prims are derived, remove the "Restart" flag, since unlike the fluid prims, prims.B is fully redundant
-        if (!grim_driver)
-            flags_prim_vec.erase(std::remove(flags_prim_vec.begin(), flags_prim_vec.end(), Metadata::Restart), flags_prim_vec.end());
-        flags_prim_vec.push_back(Metadata::Overridable);
-        m = Metadata(flags_prim_vec, s_vector);
-        pkg->AddField("prims.B", m);
-        flags_cons_vec.erase(std::remove(flags_cons_vec.begin(), flags_cons_vec.end(), isHD), flags_cons_vec.end());
-        flags_cons_vec.push_back(Metadata::Overridable);
-        m = Metadata(flags_cons_vec, s_vector);
-        pkg->AddField("cons.B", m);
-    }
+    // No magnetic fields here. KHARMA should operate fine in GRHD without them,
+    // so they are allocated only by B field packages.
 
     // Maximum signal speed (magnitude).
     // Needs to be cached from flux updates for calculating the timestep later
     m = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::OneCopy}, s_vector);
     pkg->AddField("ctop", m);
 
-    // Temporary fix just for being able to save field values
-    // Should switch these to "Integer" fields when Parthenon supports it
-    m = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::OneCopy});
-    pkg->AddField("pflag", m);
+    if (explicit_step) {
+        // Flag denoting UtoP inversion failures.
+        // Not used for implicit stepper, that has its own flag
+        if (imex_driver) {
+            m = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::OneCopy, Metadata::FillGhost});
+        } else {
+            m = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::OneCopy});
+        }
+        pkg->AddField("pflag", m);
+    }
 
     // Finally, the StateDescriptor/Package object determines the Callbacks Parthenon makes to
     // a particular package -- that is, some portion of the things that the package needs done
diff --git a/kharma/grmhd/grmhd_functions.hpp b/kharma/grmhd/grmhd_functions.hpp
new file mode 100644
index 00000000..f35dcbc3
--- /dev/null
+++ b/kharma/grmhd/grmhd_functions.hpp
@@ -0,0 +1,335 @@
+/* 
+ *  File: grmhd_functions.hpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+
+#include "decs.hpp"
+
+#include "gr_coordinates.hpp"
+#include "types.hpp"
+#include "kharma_utils.hpp"
+
+/**
+ * Device-side GR(M)HD functions
+ * Anything reasonably specific to doing GRHD/GRMHD, which will not change:
+ * lorentz factor, 4-vectors ucon/bcon
+ *
+ * These functions mostly have several overloads, related to local vs global variables.
+ * Many also have a form for split variables rho, uvec, etc, and one for a full array of primitive variables P.
+ * Where all 4 combinations are used, we get 4 overloads.
+ * 
+ * Local full-primitives versions are templated, to accept Slices/Scratch/etc equivalently 
+ */
+
+namespace GRMHD
+{
+
+/**
+ * Find gamma-factor of the fluid w.r.t. normal observer: returns sqrt(1 + qsq), with qsq = gcov_ab * uvec^a * uvec^b over spatial a,b
+ */
+KOKKOS_INLINE_FUNCTION Real lorentz_calc(const GRCoordinates& G, const GridVector uvec,
+                                         const int& k, const int& j, const int& i,
+                                         const Loci loc)
+{
+    // The metric is read at (loc, j, i); k only indexes uvec. Each off-diagonal product appears once and is doubled by the 2.
+    const Real qsq = G.gcov(loc, j, i, 1, 1) * uvec(V1, k, j, i) * uvec(V1, k, j, i) +
+                    G.gcov(loc, j, i, 2, 2) * uvec(V2, k, j, i) * uvec(V2, k, j, i) +
+                    G.gcov(loc, j, i, 3, 3) * uvec(V3, k, j, i) * uvec(V3, k, j, i) +
+                    2. * (G.gcov(loc, j, i, 1, 2) * uvec(V1, k, j, i) * uvec(V2, k, j, i) +
+                        G.gcov(loc, j, i, 1, 3) * uvec(V1, k, j, i) * uvec(V3, k, j, i) +
+                        G.gcov(loc, j, i, 2, 3) * uvec(V2, k, j, i) * uvec(V3, k, j, i));
+
+    return sqrt(1. + qsq);
+}
+KOKKOS_INLINE_FUNCTION Real lorentz_calc(const GRCoordinates& G, const Real uv[NVEC], // overload taking the velocity components as a local array
+                                         const int& k, const int& j, const int& i, // k is not read in this body
+                                         const Loci loc)
+{
+    const Real qsq = G.gcov(loc, j, i, 1, 1) * uv[V1] * uv[V1] +
+                    G.gcov(loc, j, i, 2, 2) * uv[V2] * uv[V2] +
+                    G.gcov(loc, j, i, 3, 3) * uv[V3] * uv[V3] +
+                    2. * (G.gcov(loc, j, i, 1, 2) * uv[V1] * uv[V2] +
+                        G.gcov(loc, j, i, 1, 3) * uv[V1] * uv[V3] +
+                        G.gcov(loc, j, i, 2, 3) * uv[V2] * uv[V3]);
+    // Same quadratic form as the GridVector overload, evaluated on the local array
+    return sqrt(1. + qsq);
+}
+// Versions for full primitives array: the velocity components are read from P at the indices m.U1, m.U2, m.U3
+KOKKOS_INLINE_FUNCTION Real lorentz_calc(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m,
+                                         const int& k, const int& j, const int& i, const Loci& loc=Loci::center)
+{
+    const Real qsq = G.gcov(loc, j, i, 1, 1) * P(m.U1, k, j, i) * P(m.U1, k, j, i) +
+                    G.gcov(loc, j, i, 2, 2) * P(m.U2, k, j, i) * P(m.U2, k, j, i) +
+                    G.gcov(loc, j, i, 3, 3) * P(m.U3, k, j, i) * P(m.U3, k, j, i) +
+                    2. * (G.gcov(loc, j, i, 1, 2) * P(m.U1, k, j, i) * P(m.U2, k, j, i) +
+                        G.gcov(loc, j, i, 1, 3) * P(m.U1, k, j, i) * P(m.U3, k, j, i) +
+                        G.gcov(loc, j, i, 2, 3) * P(m.U2, k, j, i) * P(m.U3, k, j, i));
+    // Global form: P is indexed (variable, k, j, i), while the metric is read at (loc, j, i) only
+    return sqrt(1. + qsq);
+}
+template<typename Local> // Local: any type indexable as P(variable), e.g. the Slices/Scratch mentioned in the file header
+KOKKOS_INLINE_FUNCTION Real lorentz_calc(const GRCoordinates& G, const Local& P, const VarMap& m,
+                                         const int& j, const int& i, const Loci& loc=Loci::center) // no k: P is indexed by variable only
+{
+    const Real qsq = G.gcov(loc, j, i, 1, 1) * P(m.U1) * P(m.U1) +
+                    G.gcov(loc, j, i, 2, 2) * P(m.U2) * P(m.U2) +
+                    G.gcov(loc, j, i, 3, 3) * P(m.U3) * P(m.U3) +
+                    2. * (G.gcov(loc, j, i, 1, 2) * P(m.U1) * P(m.U2) +
+                        G.gcov(loc, j, i, 1, 3) * P(m.U1) * P(m.U3) +
+                        G.gcov(loc, j, i, 2, 3) * P(m.U2) * P(m.U3));
+    // Same quadratic form as the VariablePack overload; j, i are used only for the metric lookup
+    return sqrt(1. + qsq);
+}
+
+/**
+ * Get a row of the MHD stress-energy tensor with first index up, second index down.
+ * A factor of sqrt(4 pi) is absorbed into the definition of b.
+ * See Gammie & McKinney '04.
+ * Fills mhd[mu] for each mu visited by DLOOP1, with the upper index fixed to dir; bsq is dot(D.bcon, D.bcov).
+ * Entirely local: reads only the scalar arguments and the four-vectors in D.
+ */
+KOKKOS_INLINE_FUNCTION void calc_tensor(const Real& rho, const Real& u, const Real& pgas,
+                                            const FourVectors& D, const int dir,
+                                            Real mhd[GR_DIM])
+{
+    const Real bsq = dot(D.bcon, D.bcov);
+    const Real eta = pgas + rho + u + bsq; // coefficient of the ucon[dir] * ucov[mu] term
+    const Real ptot = pgas + 0.5 * bsq; // coefficient of the (dir == mu) diagonal term
+
+    DLOOP1 {
+        mhd[mu] = eta * D.ucon[dir] * D.ucov[mu] +
+                  ptot * (dir == mu) -
+                  D.bcon[dir] * D.bcov[mu];
+    }
+}
+
+/**
+ * Calculate the 4-velocities ucon, ucov, and 4-fields bcon, bcov from primitive versions
+ * Results are written into D; the covariant forms are obtained by calling G.lower() on the contravariant ones.
+ * First two versions are for local stack variables and split global variables, respectively,
+ * as we sometimes want the 4-vectors without having assembled the full primitives list or anything.
+ * 
+ * The latter are the usual Local/Global versions for primitives arrays
+ */
+KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const Real uvec[NVEC], const Real B_P[NVEC],
+                                      const int& k, const int& j, const int& i, const Loci loc,
+                                      FourVectors& D)
+{
+    const Real gamma = lorentz_calc(G, uvec, k, j, i, loc);
+    const Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0)); // from the (0,0) component of gcon
+
+    D.ucon[0] = gamma / alpha;
+    VLOOP D.ucon[v+1] = uvec[v] - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
+
+    G.lower(D.ucon, D.ucov, k, j, i, loc); // ucov must exist before bcon[0], which contracts B_P with it
+
+    D.bcon[0] = 0;
+    VLOOP D.bcon[0] += B_P[v] * D.ucov[v+1]; // accumulates B_P[v] * ucov[v+1] over the VLOOP index v
+    VLOOP D.bcon[v+1] = (B_P[v] + D.bcon[0] * D.ucon[v+1]) / D.ucon[0];
+
+    G.lower(D.bcon, D.bcov, k, j, i, loc);
+}
+KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const GridVector uvec, const GridVector B_P, // split global arrays, indexed (v, k, j, i)
+                                      const int& k, const int& j, const int& i, const Loci loc,
+                                      FourVectors& D) // output: ucon, ucov, bcon, bcov are all overwritten
+{
+    const Real gamma = lorentz_calc(G, uvec, k, j, i, loc);
+    const Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0)); // from the (0,0) component of gcon
+
+    D.ucon[0] = gamma / alpha;
+    VLOOP D.ucon[v+1] = uvec(v, k, j, i) - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
+
+    G.lower(D.ucon, D.ucov, k, j, i, loc); // ucov must exist before bcon[0], which contracts B_P with it
+
+    D.bcon[0] = 0;
+    VLOOP D.bcon[0] += B_P(v, k, j, i) * D.ucov[v+1]; // accumulates B_P * ucov[v+1] over the VLOOP index v
+    VLOOP D.bcon[v+1] = (B_P(v, k, j, i) + D.bcon[0] * D.ucon[v+1]) / D.ucon[0];
+
+    G.lower(D.bcon, D.bcov, k, j, i, loc);
+}
+// Primitive/VarMap versions of calc_4vecs for kernels that use "packed" primitives. Fills D.ucon/D.ucov always; D.bcon/D.bcov only if m.B1 >= 0
+template<typename Local>
+KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const Local& P, const VarMap& m,
+                                      const int& j, const int& i, const Loci loc, FourVectors& D)
+{
+    const Real gamma = lorentz_calc(G, P, m, j, i, loc);
+    const Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0)); // from the (0,0) component of gcon
+
+    D.ucon[0] = gamma / alpha;
+    VLOOP D.ucon[v+1] = P(m.U1 + v) - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
+    // This overload has no k index: pass 0 to G.lower, since the metric accessors used here depend only on (loc, j, i)
+    G.lower(D.ucon, D.ucov, 0, j, i, loc);
+
+    if (m.B1 >= 0) { // no B field in the map: leave D.bcon/D.bcov untouched
+        D.bcon[0] = 0;
+        VLOOP D.bcon[0] += P(m.B1 + v) * D.ucov[v+1];
+        VLOOP D.bcon[v+1] = (P(m.B1 + v) + D.bcon[0] * D.ucon[v+1]) / D.ucon[0];
+
+        G.lower(D.bcon, D.bcov, 0, j, i, loc);
+    }
+}
+KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m, // global packed form: P is indexed (variable, k, j, i)
+                                      const int& k, const int& j, const int& i, const Loci loc, FourVectors& D)
+{
+    const Real gamma = lorentz_calc(G, P, m, k, j, i, loc);
+    const Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0)); // from the (0,0) component of gcon
+
+    D.ucon[0] = gamma / alpha;
+    VLOOP D.ucon[v+1] = P(m.U1 + v, k, j, i) - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
+
+    G.lower(D.ucon, D.ucov, k, j, i, loc);
+
+    if (m.B1 >= 0) { // no B field in the map: D.bcon/D.bcov are left untouched
+        D.bcon[0] = 0;
+        VLOOP D.bcon[0] += P(m.B1 + v, k, j, i) * D.ucov[v+1];
+        VLOOP D.bcon[v+1] = (P(m.B1 + v, k, j, i) + D.bcon[0] * D.ucon[v+1]) / D.ucon[0];
+
+        G.lower(D.bcon, D.bcov, k, j, i, loc);
+    }
+}
+/**
+ * Just the velocity 4-vector, in the first two styles of calc_4vecs.  For various corners.  Writes ucon only: no ucov, bcon or bcov.
+ */
+KOKKOS_INLINE_FUNCTION void calc_ucon(const GRCoordinates &G, const GridVector uvec,
+                                      const int& k, const int& j, const int& i, const Loci loc,
+                                      Real ucon[GR_DIM])
+{
+    const Real gamma = lorentz_calc(G, uvec, k, j, i, loc);
+    const Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0)); // from the (0,0) component of gcon
+
+    ucon[0] = gamma / alpha;
+    VLOOP ucon[v+1] = uvec(v, k, j, i) - gamma * alpha * G.gcon(loc, j, i, 0, v+1); // same expression as D.ucon in calc_4vecs
+}
+KOKKOS_INLINE_FUNCTION void calc_ucon(const GRCoordinates &G, const Real uvec[NVEC], // overload taking the velocity components as a local array
+                                      const int& k, const int& j, const int& i, const Loci loc,
+                                      Real ucon[GR_DIM]) // output: all of ucon[0] and ucon[v+1] are overwritten
+{
+    const Real gamma = lorentz_calc(G, uvec, k, j, i, loc);
+    const Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0)); // from the (0,0) component of gcon
+
+    ucon[0] = gamma / alpha;
+    VLOOP ucon[v+1] = uvec[v] - gamma * alpha * G.gcon(loc, j, i, 0, v+1); // same expression as D.ucon in calc_4vecs
+}
+
+/**
+ * GRMHD-only "p_to_u" call: writes just the fluid entries RHO, UU, U1-3 of U (no B!). TODO eliminate?
+ */
+template<typename Local>
+KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const Local& P, const VarMap& m_p,
+                                   const Real& gam, const int& j, const int& i,
+                                   const Local& U, const VarMap& m_u, const Loci& loc=Loci::center) // U is the output, written through operator()
+{
+    Real gdet = G.gdet(loc, j, i);
+    FourVectors Dtmp;
+    GRMHD::calc_4vecs(G, P, m_p, j, i, loc, Dtmp); // TODO switch GRHD/GRMHD
+    // Particle number flux
+    U(m_u.RHO) = P(m_p.RHO) * Dtmp.ucon[0] * gdet;
+    // NOTE(review): when m_p.B1 < 0 the UU/U1-3 entries of U are not written below -- confirm this is intended
+    if (m_p.B1 >= 0) {
+        // MHD stress-energy tensor w/ first index up, second index down
+        Real mhd[GR_DIM];
+        GRMHD::calc_tensor(P(m_p.RHO), P(m_p.UU), (gam - 1) * P(m_p.UU), Dtmp, 0, mhd); // pgas is passed as (gam - 1) * u, dir as 0
+        U(m_u.UU)  = mhd[0] * gdet + U(m_u.RHO); // energy entry has the particle number entry added in
+        U(m_u.U1) =  mhd[1] * gdet;
+        U(m_u.U2) =  mhd[2] * gdet;
+        U(m_u.U3) =  mhd[3] * gdet;
+    }
+}
+KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m_p, // global form: P and U are indexed (variable, k, j, i)
+                                   const Real& gam, const int& k, const int& j, const int& i,
+                                   const VariablePack<Real>& U, const VarMap& m_u, const Loci& loc=Loci::center) // U is the output, written through operator()
+{
+    Real gdet = G.gdet(loc, j, i);
+    FourVectors Dtmp;
+    GRMHD::calc_4vecs(G, P, m_p, k, j, i, loc, Dtmp); // TODO switch GRHD/GRMHD
+    // Particle number flux
+    U(m_u.RHO, k, j, i) = P(m_p.RHO, k, j, i) * Dtmp.ucon[0] * gdet;
+    // NOTE(review): when m_p.B1 < 0 the UU/U1-3 entries of U are not written below -- confirm this is intended
+    if (m_p.B1 >= 0) {
+        // MHD stress-energy tensor w/ first index up, second index down
+        Real mhd[GR_DIM];
+        GRMHD::calc_tensor(P(m_p.RHO, k, j, i), P(m_p.UU, k, j, i), (gam - 1) * P(m_p.UU, k, j, i), Dtmp, 0, mhd); // pgas = (gam - 1) * u, dir = 0
+        U(m_u.UU, k, j, i)  = mhd[0] * gdet + U(m_u.RHO, k, j, i); // energy entry has the particle number entry added in
+        U(m_u.U1, k, j, i) =  mhd[1] * gdet;
+        U(m_u.U2, k, j, i) =  mhd[2] * gdet;
+        U(m_u.U3, k, j, i) =  mhd[3] * gdet;
+    }
+}
+
+/**
+ * Special local "p_to_u" call for just MHD variables, used in fluid frame floors & wind source.
+ * Outputs are rho_ut and T[GR_DIM]; nothing else is written. See Flux::p_to_u in flux_functions.hpp for documentation.
+ */
+KOKKOS_INLINE_FUNCTION void p_to_u_mhd(const GRCoordinates& G, const Real& rho, const Real& u, const Real uvec[NVEC],
+                                   const Real B_P[NVEC], const Real& gam, const int& k, const int& j, const int& i,
+                                   Real& rho_ut, Real T[GR_DIM], const Loci loc=Loci::center)
+{
+    Real gdet = G.gdet(loc, j, i);
+
+    FourVectors Dtmp;
+    calc_4vecs(G, uvec, B_P, k, j, i, loc, Dtmp); // local-array overload: always computes bcon/bcov from B_P
+
+    // Particle number flux
+    rho_ut = rho * Dtmp.ucon[0] * gdet;
+
+    // MHD stress-energy tensor w/ first index up, second index down
+    Real mhd[GR_DIM];
+    calc_tensor(rho, u, (gam - 1) * u, Dtmp, 0, mhd); // pgas is passed as (gam - 1) * u, dir as 0
+
+    T[0]  = mhd[0] * gdet + rho_ut; // energy entry has the particle number entry added in
+    VLOOP T[1 + v] = mhd[1 + v] * gdet;
+}
+
+}
+
+/**
+ * This namespace is solely for calc_tensor.
+ * calc_4vecs above intelligently skips the bcon calculation if B field is not present
+ */
+namespace GRHD
+{
+/**
+ * Get a row of the hydrodynamic stress-energy tensor with first index up, second index down. Same form as GRMHD::calc_tensor without the b terms.
+ */
+KOKKOS_INLINE_FUNCTION void calc_tensor(const Real& rho, const Real& u, const Real& pgas,
+                                            const FourVectors& D, const int dir,
+                                            Real hd[GR_DIM])
+{
+    const Real eta = pgas + rho + u; // coefficient of the ucon[dir] * ucov[mu] term
+    DLOOP1 {
+        hd[mu] = eta * D.ucon[dir] * D.ucov[mu] + // only D.ucon and D.ucov are read; D.bcon/D.bcov are never touched
+                 pgas * (dir == mu);
+    }
+}
+
+}
diff --git a/kharma/grmhd/mhd_functions.hpp b/kharma/grmhd/mhd_functions.hpp
deleted file mode 100644
index 5e265663..00000000
--- a/kharma/grmhd/mhd_functions.hpp
+++ /dev/null
@@ -1,356 +0,0 @@
-/* 
- *  File: mhd_functions.hpp
- *  
- *  BSD 3-Clause License
- *  
- *  Copyright (c) 2020, AFD Group at UIUC
- *  All rights reserved.
- *  
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions are met:
- *  
- *  1. Redistributions of source code must retain the above copyright notice, this
- *     list of conditions and the following disclaimer.
- *  
- *  2. Redistributions in binary form must reproduce the above copyright notice,
- *     this list of conditions and the following disclaimer in the documentation
- *     and/or other materials provided with the distribution.
- *  
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *  
- *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-#pragma once
-
-#include "decs.hpp"
-
-#include "gr_coordinates.hpp"
-#include "types.hpp"
-#include "kharma_utils.hpp"
-
-/**
- * Device-side MHD functions
- * They are specifically the subset which require the fluid primitives P & B field both
- *
- * These functions mostly have several overloads, related to local vs global variables
- * Arguments can come in the form of global array or VariablePack references 
- *
- * This allows easy fusing/splitting of loops & use in different contexts
- */
-
-namespace GRMHD
-{
-
-/**
- * Find gamma-factor of the fluid w.r.t. normal observer
- */
-KOKKOS_INLINE_FUNCTION Real lorentz_calc(const GRCoordinates& G, const GridVector uvec,
-                                         const int& k, const int& j, const int& i,
-                                         const Loci loc)
-{
-
-    Real qsq = G.gcov(loc, j, i, 1, 1) * uvec(0, k, j, i) * uvec(0, k, j, i) +
-               G.gcov(loc, j, i, 2, 2) * uvec(1, k, j, i) * uvec(1, k, j, i) +
-               G.gcov(loc, j, i, 3, 3) * uvec(2, k, j, i) * uvec(2, k, j, i) +
-            2. * (G.gcov(loc, j, i, 1, 2) * uvec(0, k, j, i) * uvec(1, k, j, i) +
-                  G.gcov(loc, j, i, 1, 3) * uvec(0, k, j, i) * uvec(2, k, j, i) +
-                  G.gcov(loc, j, i, 2, 3) * uvec(1, k, j, i) * uvec(2, k, j, i));
-
-    return sqrt(1. + qsq);
-}
-KOKKOS_INLINE_FUNCTION Real lorentz_calc(const GRCoordinates& G, const Real uv[NVEC],
-                                         const int& k, const int& j, const int& i,
-                                         const Loci loc)
-{
-    Real qsq = G.gcov(loc, j, i, 1, 1) * uv[0] * uv[0] +
-               G.gcov(loc, j, i, 2, 2) * uv[1] * uv[1] +
-               G.gcov(loc, j, i, 3, 3) * uv[2] * uv[2] +
-            2. * (G.gcov(loc, j, i, 1, 2) * uv[0] * uv[1] +
-                  G.gcov(loc, j, i, 1, 3) * uv[0] * uv[2] +
-                  G.gcov(loc, j, i, 2, 3) * uv[1] * uv[2]);
-
-    return sqrt(1. + qsq);
-}
-// Version for full primitives array
-KOKKOS_INLINE_FUNCTION Real lorentz_calc(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m,
-                                         const int& k, const int& j, const int& i, const Loci& loc)
-{
-
-    Real qsq = G.gcov(loc, j, i, 1, 1) * P(m.U1, k, j, i) * P(m.U1, k, j, i) +
-               G.gcov(loc, j, i, 2, 2) * P(m.U2, k, j, i) * P(m.U2, k, j, i) +
-               G.gcov(loc, j, i, 3, 3) * P(m.U3, k, j, i) * P(m.U3, k, j, i) +
-            2. * (G.gcov(loc, j, i, 1, 2) * P(m.U1, k, j, i) * P(m.U2, k, j, i) +
-                  G.gcov(loc, j, i, 1, 3) * P(m.U1, k, j, i) * P(m.U3, k, j, i) +
-                  G.gcov(loc, j, i, 2, 3) * P(m.U2, k, j, i) * P(m.U3, k, j, i));
-
-    return sqrt(1. + qsq);
-}
-KOKKOS_INLINE_FUNCTION Real lorentz_calc(const GRCoordinates& G, const ScratchPad2D<Real>& P, const VarMap& m,
-                                         const int& k, const int& j, const int& i, const Loci& loc)
-{
-    Real qsq = G.gcov(loc, j, i, 1, 1) * P(m.U1, i) * P(m.U1, i) +
-               G.gcov(loc, j, i, 2, 2) * P(m.U2, i) * P(m.U2, i) +
-               G.gcov(loc, j, i, 3, 3) * P(m.U3, i) * P(m.U3, i) +
-            2. * (G.gcov(loc, j, i, 1, 2) * P(m.U1, i) * P(m.U2, i) +
-                  G.gcov(loc, j, i, 1, 3) * P(m.U1, i) * P(m.U3, i) +
-                  G.gcov(loc, j, i, 2, 3) * P(m.U2, i) * P(m.U3, i));
-
-    return sqrt(1. + qsq);
-}
-
-/**
- * Get a row of the MHD stress-energy tensor with first index up, second index down.
- * A factor of sqrt(4 pi) is absorbed into the definition of b.
- * See Gammie & McKinney '04
- */
-KOKKOS_INLINE_FUNCTION void calc_tensor(const Real& rho, const Real& u, const Real& pgas,
-                                            const FourVectors& D, const int dir,
-                                            Real mhd[GR_DIM])
-{
-    Real bsq = dot(D.bcon, D.bcov);
-    Real eta = pgas + rho + u + bsq;
-    Real ptot = pgas + 0.5 * bsq;
-
-    DLOOP1 {
-        mhd[mu] = eta * D.ucon[dir] * D.ucov[mu] +
-                  ptot * (dir == mu) -
-                  D.bcon[dir] * D.bcov[mu];
-    }
-}
-
-/**
- * Just the velocity 4-vector
- */
-KOKKOS_INLINE_FUNCTION void calc_ucon(const GRCoordinates &G, const GridVector uvec,
-                                      const int& k, const int& j, const int& i, const Loci loc,
-                                      Real ucon[GR_DIM])
-{
-    Real gamma = lorentz_calc(G, uvec, k, j, i, loc);
-    Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0));
-
-    ucon[0] = gamma / alpha;
-    VLOOP ucon[v+1] = uvec(v, k, j, i) - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
-}
-KOKKOS_INLINE_FUNCTION void calc_ucon(const GRCoordinates &G, const Real uvec[NVEC],
-                                      const int& k, const int& j, const int& i, const Loci loc,
-                                      Real ucon[GR_DIM])
-{
-    Real gamma = lorentz_calc(G, uvec, k, j, i, loc);
-    Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0));
-
-    ucon[0] = gamma / alpha;
-    VLOOP ucon[v+1] = uvec[v] - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
-}
-
-/**
- * Calculate the 4-velocities ucon, ucov, and 4-fields bcon, bcov from primitive versions
- */
-KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const Real uvec[NVEC], const Real B_P[NVEC],
-                                      const int& k, const int& j, const int& i, const Loci loc,
-                                      FourVectors& D)
-{
-    Real gamma = lorentz_calc(G, uvec, k, j, i, loc);
-    Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0));
-
-    D.ucon[0] = gamma / alpha;
-    VLOOP D.ucon[v+1] = uvec[v] - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
-
-    G.lower(D.ucon, D.ucov, k, j, i, loc);
-
-    D.bcon[0] = 0;
-    VLOOP D.bcon[0] += B_P[v] * D.ucov[v+1];
-    VLOOP D.bcon[v+1] = (B_P[v] + D.bcon[0] * D.ucon[v+1]) / D.ucon[0];
-
-    G.lower(D.bcon, D.bcov, k, j, i, loc);
-}
-KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const GridVector uvec, const GridVector B_P,
-                                      const int& k, const int& j, const int& i, const Loci loc,
-                                      FourVectors& D)
-{
-    Real gamma = lorentz_calc(G, uvec, k, j, i, loc);
-    Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0));
-
-    D.ucon[0] = gamma / alpha;
-    VLOOP D.ucon[v+1] = uvec(v, k, j, i) - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
-
-    G.lower(D.ucon, D.ucov, k, j, i, loc);
-
-    D.bcon[0] = 0;
-    VLOOP D.bcon[0] += B_P(v, k, j, i) * D.ucov[v+1];
-    VLOOP D.bcon[v+1] = (B_P(v, k, j, i) + D.bcon[0] * D.ucon[v+1]) / D.ucon[0];
-
-    G.lower(D.bcon, D.bcov, k, j, i, loc);
-}
-
-// Primitive/VarMap version of calc_4vecs for kernels that use "packed" primitives
-KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m,
-                                      const int& k, const int& j, const int& i, const Loci loc, FourVectors& D)
-{
-    Real gamma = lorentz_calc(G, P, m, k, j, i, loc);
-    Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0));
-
-    D.ucon[0] = gamma / alpha;
-    VLOOP D.ucon[v+1] = P(m.U1 + v, k, j, i) - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
-
-    G.lower(D.ucon, D.ucov, k, j, i, loc);
-
-    D.bcon[0] = 0;
-    VLOOP D.bcon[0] += P(m.B1 + v, k, j, i) * D.ucov[v+1];
-    VLOOP D.bcon[v+1] = (P(m.B1 + v, k, j, i) + D.bcon[0] * D.ucon[v+1]) / D.ucon[0];
-
-    G.lower(D.bcon, D.bcov, k, j, i, loc);
-}
-KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const ScratchPad2D<Real>& P, const VarMap& m,
-                                      const int& k, const int& j, const int& i, const Loci loc, FourVectors& D)
-{
-    Real gamma = lorentz_calc(G, P, m, k, j, i, loc);
-    Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0));
-
-    D.ucon[0] = gamma / alpha;
-    VLOOP D.ucon[v+1] = P(m.U1 + v, i) - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
-
-    G.lower(D.ucon, D.ucov, k, j, i, loc);
-
-    D.bcon[0] = 0;
-    VLOOP D.bcon[0] += P(m.B1 + v, i) * D.ucov[v+1];
-    VLOOP D.bcon[v+1] = (P(m.B1 + v, i) + D.bcon[0] * D.ucon[v+1]) / D.ucon[0];
-
-    G.lower(D.bcon, D.bcov, k, j, i, loc);
-}
-
-/**
- * Turn the primitive variables at a location into the local conserved variables, or fluxes at a face
- * 
- * Note this is for the five fluid variables only -- each package defines a prim_to_flux, which are called in GetFlux
- */
-KOKKOS_INLINE_FUNCTION void prim_to_flux(const GRCoordinates& G, const ScratchPad2D<Real>& P, const VarMap& m_p, const FourVectors D,
-                                         const Real& gam, const int& k, const int& j, const int& i, const int dir,
-                                         ScratchPad2D<Real>& flux, const VarMap m_u, const Loci loc=Loci::center)
-{
-    Real gdet = G.gdet(loc, j, i);
-
-    // Particle number flux
-    flux(m_u.RHO, i) = P(m_p.RHO, i) * D.ucon[dir] * gdet;
-
-    // MHD stress-energy tensor w/ first index up, second index down
-    Real mhd[GR_DIM];
-    calc_tensor(P(m_p.RHO, i), P(m_p.UU, i), (gam - 1) * P(m_p.UU, i), D, dir, mhd);
-    flux(m_u.UU, i)  = mhd[0] * gdet + flux(m_u.RHO, i);
-    flux(m_u.U1, i) =  mhd[1] * gdet;
-    flux(m_u.U2, i) =  mhd[2] * gdet;
-    flux(m_u.U3, i) =  mhd[3] * gdet;
-}
-
-/**
- * Get the conserved (fluid only!) variables corresponding to primitives in a zone. Equivalent to prim_to_flux with dir==0
- */
-KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m_p,
-                                   const Real& gam, const int& k, const int& j, const int& i,
-                                   const VariablePack<Real>& U, const VarMap m_u, const Loci loc=Loci::center)
-{
-    Real gdet = G.gdet(loc, j, i);
-
-    FourVectors Dtmp;
-    calc_4vecs(G, P, m_p, k, j, i, loc, Dtmp);
-
-    // Particle number flux
-    U(m_u.RHO, k, j, i) = P(m_p.RHO, k, j, i) * Dtmp.ucon[0] * gdet;
-
-    // MHD stress-energy tensor w/ first index up, second index down
-    Real mhd[GR_DIM];
-    calc_tensor(P(m_p.RHO, k, j, i), P(m_p.UU, k, j, i), (gam - 1) * P(m_p.UU, k, j, i), Dtmp, 0, mhd);
-
-    U(m_u.UU, k, j, i)  = mhd[0] * gdet + U(m_u.RHO, k, j, i);
-    VLOOP U(m_u.U1 + v, k, j, i) = mhd[1 + v] * gdet;
-}
-
-/**
- * Special p_to_u call for fluid frame floors, which require a speculative transformation to add to existing U
- * Also used in the wind source term calculation, of all places
- */
-KOKKOS_INLINE_FUNCTION void p_to_u_loc(const GRCoordinates& G, const Real& rho, const Real& u, const Real uvec[NVEC],
-                                   const Real B_P[NVEC], const Real& gam, const int& k, const int& j, const int& i,
-                                   Real& rho_ut, Real T[GR_DIM], const Loci loc=Loci::center)
-{
-    Real gdet = G.gdet(loc, j, i);
-
-    FourVectors Dtmp;
-    calc_4vecs(G, uvec, B_P, k, j, i, loc, Dtmp);
-
-    // Particle number flux
-    rho_ut = rho * Dtmp.ucon[0] * gdet;
-
-    // MHD stress-energy tensor w/ first index up, second index down
-    Real mhd[GR_DIM];
-    calc_tensor(rho, u, (gam - 1) * u, Dtmp, 0, mhd);
-
-    T[0]  = mhd[0] * gdet + rho_ut;
-    VLOOP T[1 + v] = mhd[1 + v] * gdet;
-}
-
-
-/**
- * Calculate components of magnetosonic velocity from primitive variables
- * This is only called in GetFlux, so we only provide a ScratchPad form
- */
-KOKKOS_INLINE_FUNCTION void vchar(const GRCoordinates &G, const ScratchPad2D<Real>& P, const VarMap& m, const FourVectors& D,
-                                  const Real& gam, const int& k, const int& j, const int& i, const Loci loc, const int& dir,
-                                  Real& cmax, Real& cmin)
-{
-    // Find fast magnetosonic speed
-    Real cms2;
-    {
-        Real bsq = dot(D.bcon, D.bcov);
-        Real ef = P(m.RHO, i) + gam * P(m.UU, i);
-        Real ee = bsq + ef;
-        Real va2 = bsq / ee;
-        Real cs2 = gam * (gam - 1) * P(m.UU, i) / ef;
-        cms2 = cs2 + va2 - cs2 * va2;
-        clip(cms2, 1.e-20, 1.);
-    }
-
-    // Require that speed of wave measured by observer q.ucon is cms2
-    Real A, B, C;
-    {
-        Real Bcov[GR_DIM] = {1., 0., 0., 0.};
-        Real Acov[GR_DIM] = {0}; Acov[dir] = 1.;
-
-        Real Acon[GR_DIM], Bcon[GR_DIM];
-        G.raise(Acov, Acon, k, j, i, loc);
-        G.raise(Bcov, Bcon, k, j, i, loc);
-
-        Real Asq = dot(Acon, Acov);
-        Real Bsq = dot(Bcon, Bcov);
-        Real Au = dot(Acov, D.ucon);
-        Real Bu = dot(Bcov, D.ucon);
-        Real AB = dot(Acon, Bcov);
-        Real Au2 = Au * Au;
-        Real Bu2 = Bu * Bu;
-        Real AuBu = Au * Bu;
-
-        A = Bu2 - (Bsq + Bu2) * cms2;
-        B = 2. * (AuBu - (AB + AuBu) * cms2);
-        C = Au2 - (Asq + Au2) * cms2;
-    }
-
-    Real discr = sqrt(max(B * B - 4. * A * C, 0.));
-
-    Real vp = -(-B + discr) / (2. * A);
-    Real vm = -(-B - discr) / (2. * A);
-
-    cmax = max(vp, vm);
-    cmin = min(vp, vm);
-}
-
-}
diff --git a/kharma/grmhd/source.hpp b/kharma/grmhd/source.hpp
index 9e6ef291..aa88ff27 100644
--- a/kharma/grmhd/source.hpp
+++ b/kharma/grmhd/source.hpp
@@ -35,7 +35,7 @@
 #pragma once
 
 #include "decs.hpp"
-#include "mhd_functions.hpp"
+#include "grmhd_functions.hpp"
 
 namespace GRMHD
 {
diff --git a/kharma/harm_driver.cpp b/kharma/harm_driver.cpp
index fcfe824f..56363c28 100644
--- a/kharma/harm_driver.cpp
+++ b/kharma/harm_driver.cpp
@@ -50,7 +50,7 @@
 #include "boundaries.hpp"
 #include "debug.hpp"
 #include "fixup.hpp"
-#include "fluxes.hpp"
+#include "flux.hpp"
 #include "iharm_restart.hpp"
 #include "source.hpp"
 
diff --git a/kharma/grim_driver.cpp b/kharma/imex_driver.cpp
similarity index 80%
rename from kharma/grim_driver.cpp
rename to kharma/imex_driver.cpp
index c46f5ba7..3299643f 100644
--- a/kharma/grim_driver.cpp
+++ b/kharma/imex_driver.cpp
@@ -1,5 +1,5 @@
 /* 
- *  File: grim_driver.cpp
+ *  File: imex_driver.cpp
  *  
  *  BSD 3-Clause License
  *  
@@ -31,7 +31,7 @@
  *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
-#include "grim_driver.hpp"
+#include "imex_driver.hpp"
 
 #include <iostream>
 
@@ -41,20 +41,22 @@
 
 #include "decs.hpp"
 
+//Packages
 #include "b_flux_ct.hpp"
 #include "b_cd.hpp"
 #include "electrons.hpp"
 #include "grmhd.hpp"
 #include "wind.hpp"
-
+// Other headers
 #include "boundaries.hpp"
 #include "debug.hpp"
 #include "fixup.hpp"
-#include "fluxes.hpp"
+#include "flux.hpp"
 #include "iharm_restart.hpp"
+#include "implicit.hpp"
 #include "source.hpp"
 
-TaskCollection GRIMDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
+TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
 {
     // Reminder that NOTHING YOU CALL HERE WILL GET CALLED EVERY STEP
     // this function is run *once*, and returns a list of what should be done every step.
@@ -164,6 +166,26 @@ TaskCollection GRIMDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
             auto t_flux_ct = tl.AddTask(t_fix_flux, B_FluxCT::TransportB, mc0.get());
             t_flux_fixed = t_flux_ct;
         }
+
+        // APPLY FLUXES
+        auto t_flux_div = tl.AddTask(t_none, Update::FluxDivergence<MeshData<Real>>, mc0.get(), mdudt.get());
+
+        // ADD EXPLICIT SOURCES TO CONSERVED VARIABLES
+        // Source term for GRMHD, \Gamma * T
+        // TODO take this out in Minkowski space
+        auto t_flux_apply = tl.AddTask(t_flux_div, GRMHD::AddSource, mc0.get(), mdudt.get());
+        // Source term for constraint-damping.  Applied only to B
+        auto t_b_cd_source = t_flux_apply;
+        if (use_b_cd) {
+            t_b_cd_source = tl.AddTask(t_flux_apply, B_CD::AddSource, mc0.get(), mdudt.get());
+        }
+        // Wind source.  Applied to conserved variables similar to GR source term
+        auto t_wind_source = t_b_cd_source;
+        if (use_wind) {
+            t_wind_source = tl.AddTask(t_b_cd_source, Wind::AddSource, mdudt.get());
+        }
+        // Done with source terms
+        auto t_sources = t_wind_source;
     }
 
     // This region is where GRIM and classic HARM split.
@@ -171,8 +193,8 @@ TaskCollection GRIMDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
     // then solves for the primitive variables with UtoP (here "FillDerived")
     const auto &driver_step =
         blocks[0]->packages.Get("GRMHD")->Param<std::string>("driver_step");
-    if (driver_step == "explicit") { // This is the general HARM step, with flux divergence & UtoP
-        // Apply fluxes and update conserved state
+    if (driver_step == "explicit") { // Explicit step
+        // Update conserved state with dUdt
         const int num_partitions = pmesh->DefaultNumPartitions();
         TaskRegion &single_tasklist_per_pack_region = tc.AddRegion(num_partitions);
         for (int i = 0; i < num_partitions; i++) {
@@ -181,28 +203,9 @@ TaskCollection GRIMDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
             auto &mc0 = pmesh->mesh_data.GetOrAdd(stage_name[stage - 1], i);
             auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
             auto &mdudt = pmesh->mesh_data.GetOrAdd("dUdt", i);
-            // APPLY FLUXES
-            auto t_flux_div = tl.AddTask(t_none, Update::FluxDivergence<MeshData<Real>>, mc0.get(), mdudt.get());
-
-            // ADD SOURCES TO CONSERVED VARIABLES
-            // Source term for GRMHD, \Gamma * T
-            // TODO take this out in Minkowski space
-            auto t_flux_apply = tl.AddTask(t_flux_div, GRMHD::AddSource, mc0.get(), mdudt.get());
-            // Source term for constraint-damping.  Applied only to B
-            auto t_b_cd_source = t_flux_apply;
-            if (use_b_cd) {
-                t_b_cd_source = tl.AddTask(t_flux_apply, B_CD::AddSource, mc0.get(), mdudt.get());
-            }
-            // Wind source.  Applied to conserved variables similar to GR source term
-            auto t_wind_source = t_b_cd_source;
-            if (use_wind) {
-                t_wind_source = tl.AddTask(t_b_cd_source, Wind::AddSource, mdudt.get());
-            }
-            // Done with source terms
-            auto t_sources = t_wind_source;
 
             // UPDATE BASE CONTAINER
-            auto t_avg_data = tl.AddTask(t_sources, Update::AverageIndependentData<MeshData<Real>>,
+            auto t_avg_data = tl.AddTask(t_none, Update::AverageIndependentData<MeshData<Real>>,
                                     mc0.get(), mbase.get(), beta);
             // apply du/dt to all independent fields in the container
             auto t_update = tl.AddTask(t_avg_data, Update::UpdateIndependentData<MeshData<Real>>, mc0.get(),
@@ -220,6 +223,8 @@ TaskCollection GRIMDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
             auto &sc0 = pmb->meshblock_data.Get(stage_name[stage-1]);
             auto &sc1 = pmb->meshblock_data.Get(stage_name[stage]);
 
+            // COPY PRIMITIVES
+            // These form the guess for UtoP
             auto t_copy_prims = tl.AddTask(t_none,
                 [](MeshBlockData<Real> *rc0, MeshBlockData<Real> *rc1)
                 {
@@ -232,22 +237,29 @@ TaskCollection GRIMDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
                 }, sc0.get(), sc1.get()
             );
 
-
             auto t_fill_derived = tl.AddTask(t_copy_prims, Update::FillDerived<MeshBlockData<Real>>, sc1.get());
-            // See note about syncing boundary here
-            auto t_fix_derived = tl.AddTask(t_fill_derived, GRMHD::FixUtoP, sc1.get());
-            auto t_heat_electrons = t_fix_derived;
-            if (use_electrons) {
-                auto t_heat_electrons = tl.AddTask(t_fix_derived, Electrons::ApplyElectronHeating, sc0.get(), sc1.get());
-            }
+            // This is *not* immediately corrected with FixUtoP, but synchronized (including pflags!) first.
+            // With an extra ghost zone, this *should* still allow binary-similar evolution between numbers of mesh blocks
+        }
+    } else { // Implicit step
+        const int num_partitions = pmesh->DefaultNumPartitions();
+        TaskRegion &single_tasklist_per_pack_region = tc.AddRegion(num_partitions);
+        for (int i = 0; i < num_partitions; i++) {
+            auto &tl = single_tasklist_per_pack_region[i];
+            auto &mbase = pmesh->mesh_data.GetOrAdd("base", i);
+            auto &mc0 = pmesh->mesh_data.GetOrAdd(stage_name[stage - 1], i);
+            auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
+            auto &mdudt = pmesh->mesh_data.GetOrAdd("dUdt", i);
+
+            // time-step by root-finding the residual
+            // This applies the functions of both t_update and t_fill_derived
+            auto t_implicit_solve = tl.AddTask(t_none, Implicit::Step, mbase.get(), mc0.get(), mdudt.get(), mc1.get(), dt);
         }
-    } else { // This is the GRIM step
-        // GRIM ALGO HERE
     }
 
     // MPI/MeshBlock boundary exchange.
     // Optionally "packed" to send all data in one call (num_partitions defaults to 1)
-    // Note that in GRIM driver this block syncs *primitive* variables, not conserved
+    // Note that in this driver, this block syncs *primitive* variables, not conserved
     const auto &pack_comms =
         blocks[0]->packages.Get("GRMHD")->Param<bool>("pack_comms");
     if (pack_comms) {
@@ -300,10 +312,25 @@ TaskCollection GRIMDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
             t_prolongBound = tl.AddTask(t_clear_comm_flags, ProlongateBoundaries, sc1);
         }
 
-
         auto t_set_bc = tl.AddTask(t_prolongBound, parthenon::ApplyBoundaryConditions, sc1);
 
-        auto t_ptou = tl.AddTask(t_set_bc, Flux::PrimToFluxTask, sc1.get());
+        // Syncing bounds before fixUtoP, and thus running it over the whole domain, will make
+        // behavior for different mesh breakdowns much more similar (identical?), as bad zones on boundaries
+        // will get to use all the same neighbors.
+        // As long as we sync pflags by setting FillGhosts when using this driver!
+        auto t_fix_derived = t_set_bc;
+        if (driver_step == "explicit") {
+            t_fix_derived = tl.AddTask(t_set_bc, GRMHD::FixUtoP, sc1.get());
+        }
+
+        // Electron heating goes where it does in HARMDriver, for the same reasons
+        auto t_heat_electrons = t_fix_derived;
+        if (use_electrons) {
+            t_heat_electrons = tl.AddTask(t_fix_derived, Electrons::ApplyElectronHeating, sc0.get(), sc1.get());
+        }
+
+        // Convert the final primitives back to conserved variables (P->U), after any electron heating
+        auto t_ptou = tl.AddTask(t_heat_electrons, Flux::PtoUTask, sc1.get());
 
         auto t_step_done = t_ptou;
 
diff --git a/kharma/grim_driver.hpp b/kharma/imex_driver.hpp
similarity index 82%
rename from kharma/grim_driver.hpp
rename to kharma/imex_driver.hpp
index 402109b4..6f3acfe3 100644
--- a/kharma/grim_driver.hpp
+++ b/kharma/imex_driver.hpp
@@ -1,5 +1,5 @@
 /* 
- *  File: grim_driver.cpp
+ *  File: imex_driver.hpp
  *  
  *  BSD 3-Clause License
  *  
@@ -41,15 +41,17 @@ using namespace parthenon;
 
 /**
  * A Driver object orchestrates everything that has to be done to a mesh to constitute a step.
- * Nominally GRIM is very much like HARM, but in KHARMA the two drivers have one key difference:
- * GRIMDriver syncs primitive variables, whereas HARM/KHARMA syncs conserved variables
+ * This driver does pretty much the same thing as the HARMDriver, with one important difference:
+ * ImexDriver syncs primitive variables and treats them as fundamental, whereas HARMDriver syncs conserved variables.
+ * This allows ImexDriver to optionally use a semi-implicit step, adding a per-zone implicit solve via the 'Implicit'
+ * package, instead of just explicit RK2 time-stepping.  This driver also allows explicit-only RK2 operation
  */
-class GRIMDriver : public MultiStageDriver {
+class ImexDriver : public MultiStageDriver {
     public:
         /**
          * Default constructor
          */
-        GRIMDriver(ParameterInput *pin, ApplicationInput *papp, Mesh *pm) : MultiStageDriver(pin, papp, pm) {}
+        ImexDriver(ParameterInput *pin, ApplicationInput *papp, Mesh *pm) : MultiStageDriver(pin, papp, pm) {}
 
         /**
          * All the tasks which constitute advancing the fluid in a mesh by one stage.
diff --git a/kharma/implicit/implicit.cpp b/kharma/implicit/implicit.cpp
index 825eb442..5d336e5d 100644
--- a/kharma/implicit/implicit.cpp
+++ b/kharma/implicit/implicit.cpp
@@ -38,7 +38,7 @@
 
 #include "debug.hpp"
 #include "fixup.hpp"
-#include "mhd_functions.hpp"
+#include "grmhd_functions.hpp"
 #include "pack.hpp"
 
 #include <batched/dense/KokkosBatched_LU_Decl.hpp>
@@ -51,22 +51,23 @@ namespace Implicit
 
 std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
 {
-    // TODO can I just build/add/use a Prescription here, rather than building one
-    // before each call?
     auto pkg = std::make_shared<StateDescriptor>("Implicit");
     Params &params = pkg->AllParams();
 
     // Implicit solver parameters
-    bool jacobian_eps = pin->GetOrAddReal("implicit", "jacobian_eps", 4.e-8);
-    params.Add("jacobian_eps", jacobian_eps);
-    bool rootfind_tol = pin->GetOrAddReal("implicit", "rootfind_tol", 1.e-3);
+    Real jacobian_delta = pin->GetOrAddReal("implicit", "jacobian_delta", 4.e-8);
+    params.Add("jacobian_delta", jacobian_delta);
+    Real rootfind_tol = pin->GetOrAddReal("implicit", "rootfind_tol", 1.e-3);
     params.Add("rootfind_tol", rootfind_tol);
-    bool max_nonlinear_iter = pin->GetOrAddInteger("implicit", "max_nonlinear_iter", 1);
+    Real linesearch_lambda = pin->GetOrAddReal("implicit", "linesearch_lambda", 1.0);
+    params.Add("linesearch_lambda", linesearch_lambda);
+    int max_nonlinear_iter = pin->GetOrAddInteger("implicit", "max_nonlinear_iter", 3);
     params.Add("max_nonlinear_iter", max_nonlinear_iter);
 
-    // Any fields particular to the implicit solver (NOT EGRMHD IN GENERAL)
-    // Likely none...
-    // see viscosity/viscosity.cpp for EGRMHD/auxiliary fields
+    // No field specific to implicit solving, but we keep around the residual since
+    // we need to write the whole thing out anyway
+    Metadata m = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::OneCopy});
+    pkg->AddField("pflag", m);
 
     // Anything we need to run from this package on callbacks
     // None of this will be crucial for the step
@@ -76,65 +77,226 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
     return pkg;
 }
 
-void UtoP(MeshBlockData<Real> *rc, IndexDomain domain, bool coarse)
+TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
+                MeshData<Real> *md1, const Real& dt)
 {
-    Flag(rc, "UtoP electrons");
-    auto pmb = rc->GetBlockPointer();
-
-    MetadataFlag isNonideal = pmb->packages.Get("Viscosity")->Param<MetadataFlag>("NonidealFlag");
-    MetadataFlag isPrimitive = pmb->packages.Get("GRMHD")->Param<MetadataFlag>("PrimitiveFlag");
-    // No need for a "map" here, we just want everything that fits these
-    auto& e_P = rc->PackVariables({isNonideal, isPrimitive});
-    auto& e_U = rc->PackVariables({isNonideal, Metadata::Conserved});
-    // And then the local density
-    GridScalar rho_U = rc->Get("cons.rho").data;
-
-    const auto& G = pmb->coords;
-
-    // Get array bounds from Parthenon
-    auto bounds = coarse ? pmb->c_cellbounds : pmb->cellbounds;
-    IndexRange ib = bounds.GetBoundsI(domain);
-    IndexRange jb = bounds.GetBoundsJ(domain);
-    IndexRange kb = bounds.GetBoundsK(domain);
-
-    // For speed, we will need need need to copy & reorder indices before running this
-
-    // Begin the funky kokkos bit
-    // Let's do a batched LU and Trsv!
-    const Real alpha = 1, tiny = 0;
-    const int ni = bounds.ncellsi(domain), nj = bounds.ncellsj(domain), nk = bounds.ncellsk(domain);
-    ParArray5D<Real> AA("AA", nk, nj, ni, 7, 7);
-    ParArray4D<Real> B("B", nk, nj, ni, 7);
-
-    // Simulating some iterations
-    for (int iter=0; iter < 5; iter++) {
-        // Normally, when doing multiple batched operations,
-        // we would need either a general solve function,
-        // or two reads through the full array. Not so in Kokkos!
-        // This could be faster I think -- there are versions of the inner portion
-        // that cover rows at a time, by taking member objects on a Team
-        // see e.g. fluxes.hpp for usage of teams
-        pmb->par_for("implicit_solve", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
-            KOKKOS_LAMBDA_3D {
-                // This code lightly adapted from 
-                auto A = Kokkos::subview(AA, k, j, i, Kokkos::ALL(), Kokkos::ALL());
-                auto b = Kokkos::subview(B, k, j, i, Kokkos::ALL());
-                /// [template]AlgoType: Unblocked, Blocked, CompatMKL
-                /// [in/out]A: 2d view
-                /// [in]tiny: a magnitude scalar value to avoid div/0
-                KokkosBatched::SerialLU<Algo::LU::Blocked>::invoke(A, tiny);
-                /// [template]UploType: indicates either upper triangular or lower triangular; Uplo::Upper, Uplo::Lower
-                /// [template]TransType: transpose of A; Trans::NoTranspose, Trans::Transpose
-                /// [template]DiagType: diagonals; Diag::Unit or Diag::NonUnit
-                /// [template]AlgoType: Unblocked, Blocked, CompatMKL
-                /// [in]alpha: scalar value
-                /// [in]A: 2d view
-                /// [in]b: 1d view
-                KokkosBatched::SerialTrsv<Uplo::Upper,Trans::NoTranspose,Diag::NonUnit,Algo::Trsv::Unblocked>::invoke(alpha, A, b);
+    Flag(mdi, "Implicit Iteration start, i");
+    Flag(md0, "Implicit Iteration start, 0");
+    Flag(dudt, "Implicit Iteration start, dudt");
+    auto pmb0 = mdi->GetBlockData(0)->GetBlockPointer();
+
+    const auto& implicit_par = pmb0->packages.Get("Implicit")->AllParams();
+    const int iter_max = implicit_par.Get<int>("max_nonlinear_iter");
+    const Real lambda = implicit_par.Get<Real>("linesearch_lambda");
+    const Real delta = implicit_par.Get<Real>("jacobian_delta");
+    const Real gam = pmb0->packages.Get("GRMHD")->Param<Real>("gamma");
+
+
+    //MetadataFlag isNonideal = pmb0->packages.Get("EMHD")->Param<MetadataFlag>("NonidealFlag");
+    MetadataFlag isPrimitive = pmb0->packages.Get("GRMHD")->Param<MetadataFlag>("PrimitiveFlag");
+    // Initial state.  Also mapping template
+    PackIndexMap prims_map, cons_map;
+    auto& Pi_all = mdi->PackVariables(std::vector<MetadataFlag>{isPrimitive}, prims_map);
+    auto& Ui_all = mdi->PackVariables(std::vector<MetadataFlag>{Metadata::Conserved}, cons_map);
+    const VarMap m_u(cons_map, true), m_p(prims_map, false);
+    // Current sub-step starting state.
+    auto& Ps_all = md0->PackVariables(std::vector<MetadataFlag>{isPrimitive});
+    auto& Us_all = md0->PackVariables(std::vector<MetadataFlag>{Metadata::Conserved});
+    // Flux divergence plus explicit source terms, i.e. the explicit part of dU/dt
+    auto& dUdt_all = dudt->PackVariables(std::vector<MetadataFlag>{Metadata::Conserved});
+    // Desired final state.  Note this is prims only: we sync these, then run P->U on each node.
+    // The initial guess is taken from md0's primitives on the first iteration (see the copy-in below)
+    auto& P_solver_all = md1->PackVariables(std::vector<MetadataFlag>{isPrimitive});
+
+    // Note this iterator, like all of KHARMA, requires nprim == ncons
+    // TODO Maybe should enforce that at start?
+    const int nblock = Ui_all.GetDim(5);
+    const int nvar = Ui_all.GetDim(4);
+
+    // Workspaces for iteration, include ghosts to match indices.
+    // Probably should never need coarse/entire...
+    auto bounds = pmb0->cellbounds; //coarse ? pmb0->c_cellbounds : pmb0->cellbounds;
+    const int n1 = bounds.ncellsi(IndexDomain::entire);
+    const int n2 = bounds.ncellsj(IndexDomain::entire);
+    const int n3 = bounds.ncellsk(IndexDomain::entire);
+
+    // The norm of the residual, one value per zone.  We store this so the main kernel
+    // does not also have to be a 2-stage reduction; the maximum is taken in a separate pass.
+    ParArray4D<Real> norm_all("norm_all", nblock, n3, n2, n1);
+
+    // Constants for the linear solve: 'alpha' is the scale factor handed to Trsv,
+    // 'tiny' is the small magnitude handed to LU to avoid dividing by zero.
+    const Real alpha = 1, tiny = SMALL;
+    const bool am_rank0 = MPIRank0();
+
+    // Get meshblock array bounds from Parthenon
+    const IndexDomain domain = IndexDomain::interior;
+    const IndexRange ib = bounds.GetBoundsI(domain);
+    const IndexRange jb = bounds.GetBoundsJ(domain);
+    const IndexRange kb = bounds.GetBoundsK(domain);
+    const IndexRange block = IndexRange{0, nblock - 1};
+    //const IndexRange vb = IndexRange{0, nvar - 1};
+
+    // Allocate scratch space
+    // It is impossible to declare runtime-sized arrays in CUDA
+    // of e.g. length var[nvar] (recall nvar can change at runtime in KHARMA)
+    // Instead we copy to scratch!
+    // This allows flexibility in structuring the kernel, as
+    // well as slicing, which in turn allows writing just *one* version of each operation!
+    // Older versions of KHARMA solved this with overloads, it was a mess.  This is less mess.
+    const int scratch_level = 1; // 0 is actual scratch (tiny); 1 is HBM
+    const size_t var_size_in_bytes = parthenon::ScratchPad2D<Real>::shmem_size(nvar, n1);
+    const size_t tensor_size_in_bytes = parthenon::ScratchPad3D<Real>::shmem_size(nvar, nvar, n1);
+    // Allocate enough to cache one tensor plus twelve per-variable rows:
+    // jacobian (the tensor)
+    // residual, delta_prim, dUi, three temps
+    // Pi/Ui, Ps/Us, dUdt, P_solver
+    const size_t total_scratch_bytes = tensor_size_in_bytes + (12) * var_size_in_bytes;
+
+    // Iterate.  This loop is outside the kokkos kernel in order to print max_norm
+    // There are generally a low and similar number of iterations between
+    // different zones, so probably acceptable speed loss.
+    for (int iter=0; iter < iter_max; iter++) {
+        // Flags per iter, since debugging here will be rampant
+        Flag(md0, "Implicit Iteration: md0");
+        Flag(md1, "Implicit Iteration: md1");
+
+        parthenon::par_for_outer(DEFAULT_OUTER_LOOP_PATTERN, "implicit_solve", pmb0->exec_space,
+            total_scratch_bytes, scratch_level, block.s, block.e, kb.s, kb.e, jb.s, jb.e,
+            KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int& b, const int& k, const int& j) {
+                const auto& G = Ui_all.GetCoords(b);
+                ScratchPad3D<Real> jacobian_s(member.team_scratch(scratch_level), nvar, nvar, n1);
+                ScratchPad2D<Real> residual_s(member.team_scratch(scratch_level), nvar, n1);
+                ScratchPad2D<Real> delta_prim_s(member.team_scratch(scratch_level), nvar, n1);
+                ScratchPad2D<Real> dUi_s(member.team_scratch(scratch_level), nvar, n1);
+                ScratchPad2D<Real> tmp1_s(member.team_scratch(scratch_level), nvar, n1);
+                ScratchPad2D<Real> tmp2_s(member.team_scratch(scratch_level), nvar, n1);
+                ScratchPad2D<Real> tmp3_s(member.team_scratch(scratch_level), nvar, n1);
+                // Local versions of the variables
+                ScratchPad2D<Real> Pi_s(member.team_scratch(scratch_level), nvar, n1);
+                ScratchPad2D<Real> Ui_s(member.team_scratch(scratch_level), nvar, n1);
+                ScratchPad2D<Real> Ps_s(member.team_scratch(scratch_level), nvar, n1);
+                ScratchPad2D<Real> Us_s(member.team_scratch(scratch_level), nvar, n1);
+                ScratchPad2D<Real> dUdt_s(member.team_scratch(scratch_level), nvar, n1);
+                ScratchPad2D<Real> P_solver_s(member.team_scratch(scratch_level), nvar, n1);
+
+                // Copy this (k, j) row of every variable to scratchpads, so we can slice them
+                PLOOP parthenon::par_for_inner(member, ib.s, ib.e,
+                    [&](const int& i) {
+                        Pi_s(ip, i) = Pi_all(b)(ip, k, j, i);
+                        Ui_s(ip, i) = Ui_all(b)(ip, k, j, i);
+                        Ps_s(ip, i) = Ps_all(b)(ip, k, j, i);
+                        Us_s(ip, i) = Us_all(b)(ip, k, j, i);
+                        dUdt_s(ip, i) = dUdt_all(b)(ip, k, j, i);
+                        // Finally, P_solver should actually be initialized to Ps
+                        if (iter == 0) {
+                            P_solver_s(ip, i) = Ps_all(b)(ip, k, j, i);
+                        } else {
+                            P_solver_s(ip, i) = P_solver_all(b)(ip, k, j, i);
+                        }
+                    }
+                );
+
+                parthenon::par_for_inner(member, ib.s, ib.e,
+                    [&](const int& i) {
+                        // Lots of slicing.  This is still way faster & cleaner than alternatives, trust me
+                        auto Pi = Kokkos::subview(Pi_s, Kokkos::ALL(), i);
+                        auto Ui = Kokkos::subview(Ui_s, Kokkos::ALL(), i);
+                        auto Ps = Kokkos::subview(Ps_s, Kokkos::ALL(), i);
+                        auto Us = Kokkos::subview(Us_s, Kokkos::ALL(), i);
+                        auto dUdt = Kokkos::subview(dUdt_s, Kokkos::ALL(), i);
+                        auto P_solver = Kokkos::subview(P_solver_s, Kokkos::ALL(), i);
+                        // Solver variables
+                        auto residual = Kokkos::subview(residual_s, Kokkos::ALL(), i);
+                        auto jacobian = Kokkos::subview(jacobian_s, Kokkos::ALL(), Kokkos::ALL(), i);
+                        auto delta_prim = Kokkos::subview(delta_prim_s, Kokkos::ALL(), i);
+                        // Temporaries
+                        auto tmp1 = Kokkos::subview(tmp1_s, Kokkos::ALL(), i);
+                        auto tmp2 = Kokkos::subview(tmp2_s, Kokkos::ALL(), i);
+                        auto tmp3 = Kokkos::subview(tmp3_s, Kokkos::ALL(), i);
+                        // Implicit sources at starting state
+                        auto dUi = Kokkos::subview(dUi_s, Kokkos::ALL(), i);
+                        if (m_p.Q >= 0) {
+                            //emhd_implicit_sources(G, Si, dUi);
+                        } else {
+                            PLOOP dUi(ip) = 0;
+                        }
+
+                        // Jacobian calculation
+                        // Requires calculating the residual anyway, so we grab it here
+                        // (the array will eventually hold delta_prim, after the matrix solve)
+                        calc_jacobian(G, P_solver, Ui, Us, dUdt, dUi, tmp1, tmp2, tmp3,
+                                      m_p, m_u, nvar, j, i, delta, gam, dt, jacobian, residual);
+                        // Initial delta prim is negative residual
+                        PLOOP delta_prim(ip) = -residual(ip);
+
+                        // if (am_rank0 && b == 0 && i == 8 && j == 8 && k == 8) {
+                        //     printf("\nSample Jacobian and residual:");
+                        //     for (int u=0; u < nvar; u++) {
+                        //         printf("\n");
+                        //         for (int v=0; v < nvar; v++) printf("%f ", jacobian(u, v));
+                        //     }
+                        //     printf("\nres:\n");
+                        //     for (int u=0; u < nvar; u++) printf("%f ", delta_prim(u));
+                        //     printf("\n");
+                        // }
+
+                        // Linear solve: factor the Jacobian in place as L*U, then forward-substitute with
+                        // L (implicit unit diagonal) and back-substitute with U, leaving the step in delta_prim
+                        KokkosBatched::SerialLU<Algo::LU::Unblocked>::invoke(jacobian, tiny);
+                        KokkosBatched::SerialTrsv<Uplo::Lower,Trans::NoTranspose,Diag::Unit,Algo::Trsv::Unblocked>::invoke(alpha, jacobian, delta_prim);
+                        KokkosBatched::SerialTrsv<Uplo::Upper,Trans::NoTranspose,Diag::NonUnit,Algo::Trsv::Unblocked>::invoke(alpha, jacobian, delta_prim);
+
+                        // if (am_rank0 && b == 0 && i == 8 && j == 8 && k == 8) {
+                        //     printf("\nTri Jacobian and dP:");
+                        //     for (int u=0; u < nvar; u++) {
+                        //         printf("\n");
+                        //         for (int v=0; v < nvar; v++) printf("%f ", jacobian(u, v));
+                        //     }
+                        //     printf("\ndP:\n");
+                        //     for (int u=0; u < nvar; u++) printf("%f ", delta_prim(u));
+                        //     printf("\n");
+                        // }
+
+                        // Update the guess.  For now lambda == 1, choose on the fly?
+                        PLOOP P_solver(ip) += lambda * delta_prim(ip);
+
+                        calc_residual(G, P_solver, Ui, Us, dUdt, dUi, tmp3,
+                                      m_p, m_u, nvar, j, i, gam, dt, residual);
+
+                        // Store the L2 norm of the new residual for the maximum/output below.
+                        // Accumulate in a local so the global array is written just once per zone
+                        Real norm_sq = 0;
+                        PLOOP norm_sq += residual(ip) * residual(ip);
+                        norm_all(b, k, j, i) = sqrt(norm_sq);
+
+                    }
+                );
+
+                // Copy out P_solver to the existing array
+                // This combo still works if P_solver is aliased to one of the other arrays!
+                PLOOP parthenon::par_for_inner(member, ib.s, ib.e,
+                    [&](const int& i) {
+                        P_solver_all(b)(ip, k, j, i) = P_solver_s(ip, i);
+                    }
+                );
             }
         );
+
+        // L2 norm maximum.
+        Real max_norm;
+        Kokkos::Max<Real> norm_max(max_norm);
+        pmb0->par_reduce("max_norm", block.s, block.e, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+            KOKKOS_LAMBDA_MESH_3D_REDUCE {
+                if (norm_all(b, k, j, i) > local_result) local_result = norm_all(b, k, j, i);
+            }
+        , norm_max);
+        max_norm = MPIMax(max_norm);
+        if (MPIRank0()) fprintf(stdout, "Nonlinear iter %d. Max L2 norm: %g\n", iter, max_norm);
     }
 
+    return TaskStatus::complete;
+
 }
 
 } // namespace Implicit
diff --git a/kharma/implicit/implicit.hpp b/kharma/implicit/implicit.hpp
index 9ac7f2db..cba102e2 100644
--- a/kharma/implicit/implicit.hpp
+++ b/kharma/implicit/implicit.hpp
@@ -34,6 +34,9 @@
 #pragma once
 
 #include "decs.hpp"
+#include "flux_functions.hpp"
+#include "types.hpp"
+#include "grmhd_functions.hpp"
 
 #include <parthenon/parthenon.hpp>
 
@@ -45,5 +48,97 @@ namespace Implicit
  */
 std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin);
 
+/**
+ * @brief take an implicit step.
+ * 
+ * @param mdi the fluid state at the beginning of the step
+ * @param md0 the initial fluid state
+ * @param dudt the negative flux divergence plus explicit source terms
+ * @param md1 the final fluid state
+ * @param dt the timestep (current substep)
+ */
+TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
+                MeshData<Real> *md1, const Real& dt);
+
+/**
+ * Evaluate the implicit-step residual produced by the trial primitives P_test, in one zone.
+ *
+ * "Local" is anything sliced (usually from a ScratchPad) and addressable as var(ip).
+ * 'tmp' is workspace: it is overwritten with the conserved variables matching P_test.
+ */
+template<typename Local>
+KOKKOS_INLINE_FUNCTION void calc_residual(const GRCoordinates& G, const Local& P_test, const Local& Ui, const Local& Us,
+                                          const Local& dudt_explicit, const Local& dUi, const Local& tmp,
+                                          const VarMap& m_p, const VarMap& m_u,
+                                          const int& nvar, const int& j, const int& i,
+                                          const Real& gam, const double& dt,
+                                          Local& residual)
+{
+    // Full target: res = (U_test - Ui)/dt - dudt_explicit - 0.5*(dU_new + dUi) - dU_time
+    // The workspace holds U_test, the conserved variables corresponding to the trial primitives.
+    // This goes through the Flux:: call, since it needs *all* conserved vars!
+    const Local& U_test = tmp;
+    Flux::p_to_u(G, P_test, m_p, gam, j, i, U_test, m_u);
+    // First two terms: (U_test - Ui)/dt - dudt_explicit
+    PLOOP residual(ip) = (U_test(ip) - Ui(ip)) / dt - dudt_explicit(ip);
+
+    // The remaining terms only apply when m_p.Q is a valid index; nothing is enabled for them yet
+    if (m_p.Q < 0) return;
+    // Disabled for now: the new implicit sources dU_new, giving ... - 0.5*(dU_new(ip) + dUi(ip)) ...
+    //emhd_implicit_sources(G, P_test, j, i, tmp); // dU_new
+    //PLOOP residual(ip) -= 0.5*(tmp(ip) + dUi(ip));
+    // Disabled for now: the time derivative sources dU_time, giving ... - dU_time(ip)
+    //emhd_time_derivative_sources(G, P_test, Ui, Us, dt, j, i, tmp); // dU_time
+    //PLOOP residual(ip) -= tmp(ip);
+}
+
+/**
+ * Build the Jacobian of the residual with respect to the primitives by forward differences, in one zone.
+ * Also returns the unperturbed residual, which has to be computed along the way.
+ *
+ * Local is anything addressable by (0:nvar-1), Local2 is the same for 2D (0:nvar-1, 0:nvar-1).
+ * Usually these are Kokkos subviews.  tmp1, tmp2 and tmp3 are workspace and get overwritten.
+ */
+template<typename Local, typename Local2>
+KOKKOS_INLINE_FUNCTION void calc_jacobian(const GRCoordinates& G, const Local& P, const Local& Ui, const Local& Us,
+                                          const Local& dudt_explicit, const Local& dUi,
+                                          Local& tmp1, Local& tmp2, Local& tmp3,
+                                          const VarMap& m_p, const VarMap& m_u,
+                                          const int& nvar, const int& j, const int& i,
+                                          const Real& jac_delta, const Real& gam, const double& dt,
+                                          Local2& jacobian, Local& residual)
+{
+    // Workspace roles: tmp1 carries the perturbed primitives, tmp2 the residual they produce,
+    // and tmp3 is handed down to calc_residual as its own scratch.
+    auto& P_pert = tmp1;
+    auto& res_pert = tmp2;
+
+    // Baseline residual at the unperturbed primitives
+    calc_residual(G, P, Ui, Us, dudt_explicit, dUi, tmp3, m_p, m_u, nvar, j, i, gam, dt, residual);
+
+    // The perturbed state starts out as an exact copy of P
+    PLOOP P_pert(ip) = P(ip);
+
+    // One Jacobian column per primitive variable
+    for (int var = 0; var < nvar; var++) {
+        // Nudge only this primitive: an absolute step when it is small compared
+        // to jac_delta, a relative step otherwise
+        P_pert(var) = (abs(P(var)) < (0.5 * jac_delta)) ? P(var) + jac_delta
+                                                        : (1 + jac_delta) * P(var);
+
+        // Residual at the perturbed state
+        calc_residual(G, P_pert, Ui, Us, dudt_explicit, dUi, tmp3, m_p, m_u, nvar, j, i, gam, dt, res_pert);
+
+        // Step actually taken in this primitive, padded with SMALL exactly as before
+        const auto step = P_pert(var) - P(var) + SMALL;
+
+        // Forward derivative of every residual component against this primitive
+        for (int eq = 0; eq < nvar; eq++) {
+            jacobian(eq, var) = (res_pert(eq) - residual(eq)) / step;
+        }
+
+        // Undo the nudge before moving to the next column
+        P_pert(var) = P(var);
+    }
+}
 
 } // namespace Implicit
diff --git a/kharma/kharma.cpp b/kharma/kharma.cpp
index 205d8702..c496f373 100644
--- a/kharma/kharma.cpp
+++ b/kharma/kharma.cpp
@@ -44,10 +44,11 @@
 #include "b_cd.hpp"
 #include "current.hpp"
 #include "electrons.hpp"
+#include "implicit.hpp"
 #include "floors.hpp"
 #include "grmhd.hpp"
 #include "reductions.hpp"
-#include "viscosity.hpp"
+#include "emhd.hpp"
 #include "wind.hpp"
 
 #include "bondi.hpp"
@@ -191,10 +192,11 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput>& pin)
     bool add_jcon = pin->GetOrAddBoolean("GRMHD", "add_jcon", true);
     bool do_electrons = pin->GetOrAddBoolean("electrons", "on", false);
     bool do_reductions = pin->GetOrAddBoolean("reductions", "on", true);
-    bool do_viscosity = pin->GetOrAddBoolean("viscosity", "on", false);
+    bool do_emhd = pin->GetOrAddBoolean("emhd", "on", false);
     bool do_wind = pin->GetOrAddBoolean("wind", "on", false);
 
-    // Set the default driver way up here.  TODO check for incompatibilities, etc
+    // Set the default driver way up here so packages know how to flag
+    // prims vs cons (grim stepper syncs prims)
     auto driver_type = pin->GetOrAddString("driver", "type", "harm");
 
     // Global variables "package."  Anything that just, really oughta be a global
@@ -220,7 +222,15 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput>& pin)
         packages.Add(B_FluxCT::Initialize(pin.get(), packages));
     }
 
-    if (add_jcon) {
+    // Implicit timestepping has a few of its own functions
+    bool implicit_step = pin->GetOrAddString("driver", "step", "explicit") == "implicit";
+    if (driver_type != "harm" && implicit_step) {
+        packages.Add(Implicit::Initialize(pin.get()));
+    }
+
+    // Even if we want to, there's no adding current if we don't know B.
+    // Avoid it.
+    if (add_jcon && b_field_solver != "none") {
         packages.Add(Current::Initialize(pin.get()));
     }
 
@@ -232,8 +242,8 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput>& pin)
         packages.Add(Reductions::Initialize(pin.get()));
     }
 
-    if (do_viscosity) {
-        packages.Add(Viscosity::Initialize(pin.get(), packages));
+    if (do_emhd) {
+        packages.Add(EMHD::Initialize(pin.get(), packages));
     }
 
     if (do_wind) {
diff --git a/kharma/main.cpp b/kharma/main.cpp
index 77430142..5e0dd68a 100644
--- a/kharma/main.cpp
+++ b/kharma/main.cpp
@@ -36,7 +36,7 @@
 #include "decs.hpp"
 
 #include "boundaries.hpp"
-#include "grim_driver.hpp"
+#include "imex_driver.hpp"
 #include "harm_driver.hpp"
 #include "kharma.hpp"
 #include "mpi.hpp"
@@ -89,8 +89,8 @@ using namespace parthenon;
  *
  * Currently available drivers:
  * HARM: GRMHD using LLF with zone-centered fields, conserved variables are synchronized
- * GRIM: same as HARM but primitive variables are synchronized,
- *       optional implicit solve for doing e.g. Extended GRMHD
+ * Imex: same functionality as HARM, but primitive variables are synchronized,
+ *       optionally uses per-zone implicit solve for some variables, for e.g. Extended GRMHD
  *
  * Future drivers?
  * bhlight: GRMHD with Monte Carlo particle transport
@@ -152,7 +152,7 @@ int main(int argc, char *argv[])
     if (driver_type == "harm") {
         HARMDriver driver(pin, papp, pmesh);
     } else if (driver_type == "grim") {
-        GRIMDriver driver(pin, papp, pmesh);
+        ImexDriver driver(pin, papp, pmesh);
     } else {
         throw std::invalid_argument("Expected driver type to be harm or grim!");
     }
@@ -189,7 +189,7 @@ int main(int argc, char *argv[])
         auto driver_status = driver.Execute();
     } else if (driver_type == "grim") {
         cout << "Initializing and running GRIM driver." << endl;
-        GRIMDriver driver(pin, papp, pmesh);
+        ImexDriver driver(pin, papp, pmesh);
         auto driver_status = driver.Execute();
     }
 
diff --git a/kharma/prob/b_field_tools.cpp b/kharma/prob/b_field_tools.cpp
index 95f8abf5..d4fe36a5 100644
--- a/kharma/prob/b_field_tools.cpp
+++ b/kharma/prob/b_field_tools.cpp
@@ -36,7 +36,7 @@
 
 #include "b_field_tools.hpp"
 
-#include "mhd_functions.hpp"
+#include "grmhd_functions.hpp"
 
 TaskStatus NormalizeBField(MeshBlockData<Real> *rc, Real norm)
 {
diff --git a/kharma/prob/blob.hpp b/kharma/prob/blob.hpp
index 61678a52..9772f931 100644
--- a/kharma/prob/blob.hpp
+++ b/kharma/prob/blob.hpp
@@ -37,7 +37,7 @@
 
 #include "decs.hpp"
 
-#include "mhd_functions.hpp"
+#include "grmhd_functions.hpp"
 #include "pack.hpp"
 
 #include <parthenon/parthenon.hpp>
diff --git a/kharma/prob/bondi.hpp b/kharma/prob/bondi.hpp
index a8b96263..926f30c0 100644
--- a/kharma/prob/bondi.hpp
+++ b/kharma/prob/bondi.hpp
@@ -36,7 +36,7 @@
 #include "decs.hpp"
 
 #include "gr_coordinates.hpp"
-#include "mhd_functions.hpp"
+#include "grmhd_functions.hpp"
 #include "pack.hpp"
 #include "prob_common.hpp"
 #include "types.hpp"
diff --git a/kharma/prob/post_initialize.cpp b/kharma/prob/post_initialize.cpp
index 3defc5ad..c111baaf 100644
--- a/kharma/prob/post_initialize.cpp
+++ b/kharma/prob/post_initialize.cpp
@@ -40,7 +40,7 @@
 #include "debug.hpp"
 #include "fixup.hpp"
 #include "floors.hpp"
-#include "fluxes.hpp"
+#include "flux.hpp"
 #include "gr_coordinates.hpp"
 #include "types.hpp"
 
@@ -58,7 +58,7 @@ void SyncAllBounds(Mesh *pmesh)
 
     for (auto &pmb : pmesh->block_list) {
         auto& rc = pmb->meshblock_data.Get();
-        Flux::PrimToFlux(rc.get(), IndexDomain::entire);
+        Flux::PtoU(rc.get(), IndexDomain::entire);
     }
 
     for (auto &pmb : pmesh->block_list) {
diff --git a/kharma/prob/problem.cpp b/kharma/prob/problem.cpp
index 50ba4b1b..61e90a11 100644
--- a/kharma/prob/problem.cpp
+++ b/kharma/prob/problem.cpp
@@ -39,7 +39,7 @@
 #include "debug.hpp"
 #include "fixup.hpp"
 #include "floors.hpp"
-#include "fluxes.hpp"
+#include "flux.hpp"
 #include "gr_coordinates.hpp"
 #include "types.hpp"
 
@@ -58,7 +58,7 @@
 #include "b_field_tools.hpp"
 
 // Package headers
-#include "mhd_functions.hpp"
+#include "grmhd_functions.hpp"
 
 #include "bvals/boundary_conditions.hpp"
 #include "mesh/mesh.hpp"
@@ -122,7 +122,7 @@ void KHARMA::ProblemGenerator(MeshBlock *pmb, ParameterInput *pin)
     // Fill the conserved variables U,
     // which we'll treat as the independent/fundamental state.
     // P is filled again from this later on
-    Flux::PrimToFlux(rc.get(), IndexDomain::entire);
+    Flux::PtoU(rc.get(), IndexDomain::entire);
 
     Flag(rc.get(), "Initialized Block");
 }
diff --git a/kharma/reductions/reductions.hpp b/kharma/reductions/reductions.hpp
index b4255958..42a02fd6 100644
--- a/kharma/reductions/reductions.hpp
+++ b/kharma/reductions/reductions.hpp
@@ -35,7 +35,7 @@
 
 #include "debug.hpp"
 
-#include "mhd_functions.hpp"
+#include "grmhd_functions.hpp"
 #include "types.hpp"
 
 namespace Reductions {
@@ -73,15 +73,21 @@ Real DomainSum(MeshData<Real> *md);
         if (pmb->boundary_flag[parthenon::BoundaryFace::inner_x1] == BoundaryFlag::user) { \
             GridScalar rho_U = rc->Get("cons.rho").data; \
             GridScalar u_U = rc->Get("cons.u").data; \
-            GridScalar uvec_U = rc->Get("cons.uvec").data; \
-            GridScalar B_U = rc->Get("cons.B").data; \
+            GridVector uvec_U = rc->Get("cons.uvec").data; \
             GridScalar rho_P = rc->Get("prims.rho").data; \
             GridScalar u_P = rc->Get("prims.u").data; \
-            GridScalar uvec_P = rc->Get("prims.uvec").data; \
-            GridScalar B_P = rc->Get("prims.B").data; \
+            GridVector uvec_P = rc->Get("prims.uvec").data; \
             GridScalar rho_F = rc->Get("cons.rho").flux[1]; \
             GridScalar u_F = rc->Get("cons.u").flux[1]; \
-            GridScalar uvec_F = rc->Get("cons.uvec").flux[1]; \
+            GridVector uvec_F = rc->Get("cons.uvec").flux[1]; \
+            GridVector B_P, B_U; \
+            if (rc->HasCellVariable("prims.B")) { \
+                B_P = rc->Get("prims.B").data; \
+                B_U = rc->Get("cons.B").data; \
+            } else { \
+                B_P = rc->Get("prims.uvec").data; \
+                B_U = rc->Get("cons.uvec").data; \
+            } \
             const Real gam = pmb->packages.Get("GRMHD")->Param<Real>("gamma"); \
 \
             IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior); \
@@ -101,6 +107,9 @@ Real DomainSum(MeshData<Real> *md);
 \
     return result; \
 }
+// Re: B_P and B_U above, they need to not crash but can return nonsense:
+// hence, just use an equivalent-size replacement.
+// There may be more elegant solutions...
 
 // Now we need some valid type names to use in distinguishing functions.
 // The 'enum class' lines just serve to define an arbitrary name as some valid type,
@@ -157,13 +166,19 @@ inline Real LdotEHFlux(MeshData<Real> *md) {return AccretionRate<Ldot_Flux>(md,
         auto& rc = pmb->meshblock_data.Get(); \
         GridScalar rho_U = rc->Get("cons.rho").data; \
         GridScalar u_U = rc->Get("cons.u").data; \
-        GridScalar uvec_U = rc->Get("cons.uvec").data; \
-        GridScalar B_U = rc->Get("cons.B").data; \
+        GridVector uvec_U = rc->Get("cons.uvec").data; \
         GridScalar rho_P = rc->Get("prims.rho").data; \
         GridScalar u_P = rc->Get("prims.u").data; \
-        GridScalar uvec_P = rc->Get("prims.uvec").data; \
-        GridScalar B_P = rc->Get("prims.B").data; \
+        GridVector uvec_P = rc->Get("prims.uvec").data; \
         const Real gam = pmb->packages.Get("GRMHD")->Param<Real>("gamma"); \
+        GridVector B_P, B_U; \
+        if (rc->HasCellVariable("prims.B")) { \
+            B_P = rc->Get("prims.B").data; \
+            B_U = rc->Get("cons.B").data; \
+        } else { \
+            B_P = rc->Get("prims.uvec").data; \
+            B_U = rc->Get("cons.uvec").data; \
+        } \
 \
         IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior); \
         IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::interior); \
@@ -215,7 +230,7 @@ MAKE_SUM3D_FN(JetLum, (KOKKOS_LAMBDA_3D_REDUCE {
         Real uvec_loc[NVEC] = {uvec_P(0, k, j, i), uvec_P(1, k, j, i), uvec_P(2, k, j, i)};
         Real B_loc[NVEC] = {B_P(0, k, j, i), B_P(1, k, j, i), B_P(2, k, j, i)};
         Real rho_ut, T[GR_DIM];
-        GRMHD::p_to_u_loc(G, 0., 0., uvec_loc, B_loc, gam, k, j, i, rho_ut, T);
+        GRMHD::p_to_u_mhd(G, 0., 0., uvec_loc, B_loc, gam, k, j, i, rho_ut, T); // TODO should this be just GRMHD::calc_tensor?
         local_result += -T[1] * G.dx3v(k) * G.dx2v(j);
     }
 }))
diff --git a/kharma/types.hpp b/kharma/types.hpp
index 37d1839f..31271a76 100644
--- a/kharma/types.hpp
+++ b/kharma/types.hpp
@@ -63,7 +63,7 @@ typedef struct {
 /**
  * Map of the locations of particular variables in a VariablePack
  * Used for operations conducted over all vars which must still
- * distinguish between them, e.g. fluxes.hpp
+ * distinguish between them, e.g. flux.hpp
  *
  * We use this instead of the PackIndexMap, because comparing strings
  * on the device every time we need the index of a variable is slow.
@@ -108,7 +108,7 @@ class VarMap {
                 K_WERNER = name_map["cons.Kel_Werner"].first;
                 K_ROWAN = name_map["cons.Kel_Rowan"].first;
                 K_SHARMA = name_map["cons.Kel_Sharma"].first;
-                // Viscosity
+                // Extended MHD
                 Q = name_map["cons.q"].first;
                 DP = name_map["cons.dP"].first;
             } else {
@@ -130,7 +130,7 @@ class VarMap {
                 K_WERNER = name_map["prims.Kel_Werner"].first;
                 K_ROWAN = name_map["prims.Kel_Rowan"].first;
                 K_SHARMA = name_map["prims.Kel_Sharma"].first;
-                // Viscosity
+                // Extended MHD
                 Q = name_map["prims.q"].first;
                 DP = name_map["prims.dP"].first;
             }
diff --git a/kharma/viscosity/emhd_functions.hpp b/kharma/viscosity/emhd_functions.hpp
deleted file mode 100644
index 6d07e74c..00000000
--- a/kharma/viscosity/emhd_functions.hpp
+++ /dev/null
@@ -1,370 +0,0 @@
-/* 
- *  File: mhd_functions.hpp
- *  
- *  BSD 3-Clause License
- *  
- *  Copyright (c) 2020, AFD Group at UIUC
- *  All rights reserved.
- *  
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions are met:
- *  
- *  1. Redistributions of source code must retain the above copyright notice, this
- *     list of conditions and the following disclaimer.
- *  
- *  2. Redistributions in binary form must reproduce the above copyright notice,
- *     this list of conditions and the following disclaimer in the documentation
- *     and/or other materials provided with the distribution.
- *  
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *  
- *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-#pragma once
-
-#include "decs.hpp"
-
-#include "gr_coordinates.hpp"
-#include "types.hpp"
-#include "kharma_utils.hpp"
-
-/**
- * Device-side MHD functions
- * They are specifically the subset which require the fluid primitives P & B field both
- *
- * These functions mostly have several overloads, related to local vs global variables
- * Arguments can come in the form of global array or VariablePack references 
- *
- * This allows easy fusing/splitting of loops & use in different contexts
- */
-
-namespace Viscosity
-{
-
-/**
- * Find gamma-factor of the fluid w.r.t. normal observer
- */
-KOKKOS_INLINE_FUNCTION Real lorentz_calc(const GRCoordinates& G, const GridVector uvec,
-                                         const int& k, const int& j, const int& i,
-                                         const Loci loc)
-{
-
-    Real qsq = G.gcov(loc, j, i, 1, 1) * uvec(0, k, j, i) * uvec(0, k, j, i) +
-               G.gcov(loc, j, i, 2, 2) * uvec(1, k, j, i) * uvec(1, k, j, i) +
-               G.gcov(loc, j, i, 3, 3) * uvec(2, k, j, i) * uvec(2, k, j, i) +
-            2. * (G.gcov(loc, j, i, 1, 2) * uvec(0, k, j, i) * uvec(1, k, j, i) +
-                  G.gcov(loc, j, i, 1, 3) * uvec(0, k, j, i) * uvec(2, k, j, i) +
-                  G.gcov(loc, j, i, 2, 3) * uvec(1, k, j, i) * uvec(2, k, j, i));
-
-    return sqrt(1. + qsq);
-}
-KOKKOS_INLINE_FUNCTION Real lorentz_calc(const GRCoordinates& G, const Real uv[NVEC],
-                                         const int& k, const int& j, const int& i,
-                                         const Loci loc)
-{
-    Real qsq = G.gcov(loc, j, i, 1, 1) * uv[0] * uv[0] +
-               G.gcov(loc, j, i, 2, 2) * uv[1] * uv[1] +
-               G.gcov(loc, j, i, 3, 3) * uv[2] * uv[2] +
-            2. * (G.gcov(loc, j, i, 1, 2) * uv[0] * uv[1] +
-                  G.gcov(loc, j, i, 1, 3) * uv[0] * uv[2] +
-                  G.gcov(loc, j, i, 2, 3) * uv[1] * uv[2]);
-
-    return sqrt(1. + qsq);
-}
-// Version for full primitives array
-KOKKOS_INLINE_FUNCTION Real lorentz_calc(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m,
-                                         const int& k, const int& j, const int& i, const Loci& loc)
-{
-
-    Real qsq = G.gcov(loc, j, i, 1, 1) * P(m.U1, k, j, i) * P(m.U1, k, j, i) +
-               G.gcov(loc, j, i, 2, 2) * P(m.U2, k, j, i) * P(m.U2, k, j, i) +
-               G.gcov(loc, j, i, 3, 3) * P(m.U3, k, j, i) * P(m.U3, k, j, i) +
-            2. * (G.gcov(loc, j, i, 1, 2) * P(m.U1, k, j, i) * P(m.U2, k, j, i) +
-                  G.gcov(loc, j, i, 1, 3) * P(m.U1, k, j, i) * P(m.U3, k, j, i) +
-                  G.gcov(loc, j, i, 2, 3) * P(m.U2, k, j, i) * P(m.U3, k, j, i));
-
-    return sqrt(1. + qsq);
-}
-KOKKOS_INLINE_FUNCTION Real lorentz_calc(const GRCoordinates& G, const ScratchPad2D<Real>& P, const VarMap& m,
-                                         const int& k, const int& j, const int& i, const Loci& loc)
-{
-    Real qsq = G.gcov(loc, j, i, 1, 1) * P(m.U1, i) * P(m.U1, i) +
-               G.gcov(loc, j, i, 2, 2) * P(m.U2, i) * P(m.U2, i) +
-               G.gcov(loc, j, i, 3, 3) * P(m.U3, i) * P(m.U3, i) +
-            2. * (G.gcov(loc, j, i, 1, 2) * P(m.U1, i) * P(m.U2, i) +
-                  G.gcov(loc, j, i, 1, 3) * P(m.U1, i) * P(m.U3, i) +
-                  G.gcov(loc, j, i, 2, 3) * P(m.U2, i) * P(m.U3, i));
-
-    return sqrt(1. + qsq);
-}
-
-/**
- * Get a row of the MHD stress-energy tensor with first index up, second index down.
- * A factor of sqrt(4 pi) is absorbed into the definition of b.
- * See Gammie & McKinney '04
- */
-KOKKOS_INLINE_FUNCTION void calc_tensor(const Real& rho, const Real& u, const Real& pgas,
-                                            const FourVectors& D, const int dir,
-                                            Real mhd[GR_DIM])
-{
-    Real bsq = dot(D.bcon, D.bcov);
-    Real eta = pgas + rho + u + bsq;
-    Real ptot = pgas + 0.5 * bsq;
-
-    DLOOP1 {
-        mhd[mu] = eta * D.ucon[dir] * D.ucov[mu] +
-                  ptot * (dir == mu) -
-                  D.bcon[dir] * D.bcov[mu];
-    }
-    double q       = S->q[k][j][i];
-    double delta_p = S->delta_p[k][j][i];
-    double ucon    = S->ucon[dir][k][j][i];
-    double bcon    = S->bcon[dir][k][j][i];
-    
-    DLOOP1 {
-        double bcov = S->bcov[mu][k][j][i];
-        double ucov = S->ucov[mu][k][j][i];
-
-        mhd[mu] += (q / sqrt(bsq)) * ((ucon * bcov) + (bcon * ucov)) 
-                + (-delta_p) * ((bcon * bcov / bsq) - (1./3.) * (delta(dir, mu) + ucon * ucov));
-    }
-}
-
-/**
- * Just the velocity 4-vector
- */
-KOKKOS_INLINE_FUNCTION void calc_ucon(const GRCoordinates &G, const GridVector uvec,
-                                      const int& k, const int& j, const int& i, const Loci loc,
-                                      Real ucon[GR_DIM])
-{
-    Real gamma = lorentz_calc(G, uvec, k, j, i, loc);
-    Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0));
-
-    ucon[0] = gamma / alpha;
-    VLOOP ucon[v+1] = uvec(v, k, j, i) - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
-}
-KOKKOS_INLINE_FUNCTION void calc_ucon(const GRCoordinates &G, const Real uvec[NVEC],
-                                      const int& k, const int& j, const int& i, const Loci loc,
-                                      Real ucon[GR_DIM])
-{
-    Real gamma = lorentz_calc(G, uvec, k, j, i, loc);
-    Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0));
-
-    ucon[0] = gamma / alpha;
-    VLOOP ucon[v+1] = uvec[v] - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
-}
-
-/**
- * Calculate the 4-velocities ucon, ucov, and 4-fields bcon, bcov from primitive versions
- */
-KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const Real uvec[NVEC], const Real B_P[NVEC],
-                                      const int& k, const int& j, const int& i, const Loci loc,
-                                      FourVectors& D)
-{
-    Real gamma = lorentz_calc(G, uvec, k, j, i, loc);
-    Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0));
-
-    D.ucon[0] = gamma / alpha;
-    VLOOP D.ucon[v+1] = uvec[v] - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
-
-    G.lower(D.ucon, D.ucov, k, j, i, loc);
-
-    D.bcon[0] = 0;
-    VLOOP D.bcon[0] += B_P[v] * D.ucov[v+1];
-    VLOOP D.bcon[v+1] = (B_P[v] + D.bcon[0] * D.ucon[v+1]) / D.ucon[0];
-
-    G.lower(D.bcon, D.bcov, k, j, i, loc);
-}
-KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const GridVector uvec, const GridVector B_P,
-                                      const int& k, const int& j, const int& i, const Loci loc,
-                                      FourVectors& D)
-{
-    Real gamma = lorentz_calc(G, uvec, k, j, i, loc);
-    Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0));
-
-    D.ucon[0] = gamma / alpha;
-    VLOOP D.ucon[v+1] = uvec(v, k, j, i) - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
-
-    G.lower(D.ucon, D.ucov, k, j, i, loc);
-
-    D.bcon[0] = 0;
-    VLOOP D.bcon[0] += B_P(v, k, j, i) * D.ucov[v+1];
-    VLOOP D.bcon[v+1] = (B_P(v, k, j, i) + D.bcon[0] * D.ucon[v+1]) / D.ucon[0];
-
-    G.lower(D.bcon, D.bcov, k, j, i, loc);
-}
-
-// Primitive/VarMap version of calc_4vecs for kernels that use "packed" primitives
-KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m,
-                                      const int& k, const int& j, const int& i, const Loci loc, FourVectors& D)
-{
-    Real gamma = lorentz_calc(G, P, m, k, j, i, loc);
-    Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0));
-
-    D.ucon[0] = gamma / alpha;
-    VLOOP D.ucon[v+1] = P(m.U1 + v, k, j, i) - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
-
-    G.lower(D.ucon, D.ucov, k, j, i, loc);
-
-    D.bcon[0] = 0;
-    VLOOP D.bcon[0] += P(m.B1 + v, k, j, i) * D.ucov[v+1];
-    VLOOP D.bcon[v+1] = (P(m.B1 + v, k, j, i) + D.bcon[0] * D.ucon[v+1]) / D.ucon[0];
-
-    G.lower(D.bcon, D.bcov, k, j, i, loc);
-}
-KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const ScratchPad2D<Real>& P, const VarMap& m,
-                                      const int& k, const int& j, const int& i, const Loci loc, FourVectors& D)
-{
-    Real gamma = lorentz_calc(G, P, m, k, j, i, loc);
-    Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0));
-
-    D.ucon[0] = gamma / alpha;
-    VLOOP D.ucon[v+1] = P(m.U1 + v, i) - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
-
-    G.lower(D.ucon, D.ucov, k, j, i, loc);
-
-    D.bcon[0] = 0;
-    VLOOP D.bcon[0] += P(m.B1 + v, i) * D.ucov[v+1];
-    VLOOP D.bcon[v+1] = (P(m.B1 + v, i) + D.bcon[0] * D.ucon[v+1]) / D.ucon[0];
-
-    G.lower(D.bcon, D.bcov, k, j, i, loc);
-}
-
-/**
- * Turn the primitive variables at a location into the local conserved variables, or fluxes at a face
- * 
- * Note this is for the five fluid variables only -- each package defines a prim_to_flux, which are called in GetFlux
- */
-KOKKOS_INLINE_FUNCTION void prim_to_flux(const GRCoordinates& G, const ScratchPad2D<Real>& P, const VarMap& m_p, const FourVectors D,
-                                         const Real& gam, const int& k, const int& j, const int& i, const int dir,
-                                         ScratchPad2D<Real>& flux, const VarMap m_u, const Loci loc=Loci::center)
-{
-    Real gdet = G.gdet(loc, j, i);
-
-    // Particle number flux
-    flux(m_u.RHO, i) = P(m_p.RHO, i) * D.ucon[dir] * gdet;
-
-    // MHD stress-energy tensor w/ first index up, second index down
-    Real mhd[GR_DIM];
-    calc_tensor(P(m_p.RHO, i), P(m_p.UU, i), (gam - 1) * P(m_p.UU, i), D, dir, mhd);
-    flux(m_u.UU, i)  = mhd[0] * gdet + flux(m_u.RHO, i);
-    flux(m_u.U1, i) =  mhd[1] * gdet;
-    flux(m_u.U2, i) =  mhd[2] * gdet;
-    flux(m_u.U3, i) =  mhd[3] * gdet;
-    flux(m_u.Q, i)  = P(m_p.Q, i) * D.ucon[dir] * gdet;
-    flux(m_u.DP, i) = P(m_p.DP, i) * D.ucon[dir] * gdet;
-}
-
-/**
- * Get the conserved (fluid only!) variables corresponding to primitives in a zone. Equivalent to prim_to_flux with dir==0
- */
-KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m_p,
-                                   const Real& gam, const int& k, const int& j, const int& i,
-                                   const VariablePack<Real>& U, const VarMap m_u, const Loci loc=Loci::center)
-{
-    Real gdet = G.gdet(loc, j, i);
-
-    FourVectors Dtmp;
-    calc_4vecs(G, P, m_p, k, j, i, loc, Dtmp);
-
-    // Particle number flux
-    U(m_u.RHO, k, j, i) = P(m_p.RHO, k, j, i) * Dtmp.ucon[0] * gdet;
-
-    // MHD stress-energy tensor w/ first index up, second index down
-    Real mhd[GR_DIM];
-    calc_tensor(P(m_p.RHO, k, j, i), P(m_p.UU, k, j, i), (gam - 1) * P(m_p.UU, k, j, i), Dtmp, 0, mhd);
-
-    U(m_u.UU, k, j, i)  = mhd[0] * gdet + U(m_u.RHO, k, j, i);
-    VLOOP U(m_u.U1 + v, k, j, i) = mhd[1 + v] * gdet;
-}
-
-/**
- * Special p_to_u call for fluid frame floors, which require a speculative transformation to add to existing U
- * Also used in the wind source term calculation, of all places
- */
-KOKKOS_INLINE_FUNCTION void p_to_u_loc(const GRCoordinates& G, const Real& rho, const Real& u, const Real uvec[NVEC],
-                                   const Real B_P[NVEC], const Real& gam, const int& k, const int& j, const int& i,
-                                   Real& rho_ut, Real T[GR_DIM], const Loci loc=Loci::center)
-{
-    Real gdet = G.gdet(loc, j, i);
-
-    FourVectors Dtmp;
-    calc_4vecs(G, uvec, B_P, k, j, i, loc, Dtmp);
-
-    // Particle number flux
-    rho_ut = rho * Dtmp.ucon[0] * gdet;
-
-    // MHD stress-energy tensor w/ first index up, second index down
-    Real mhd[GR_DIM];
-    calc_tensor(rho, u, (gam - 1) * u, Dtmp, 0, mhd);
-
-    T[0]  = mhd[0] * gdet + rho_ut;
-    VLOOP T[1 + v] = mhd[1 + v] * gdet;
-}
-
-
-/**
- * Calculate components of magnetosonic velocity from primitive variables
- * This is only called in GetFlux, so we only provide a ScratchPad form
- */
-KOKKOS_INLINE_FUNCTION void vchar(const GRCoordinates &G, const ScratchPad2D<Real>& P, const VarMap& m, const FourVectors& D,
-                                  const Real& gam, const int& k, const int& j, const int& i, const Loci loc, const int& dir,
-                                  Real& cmax, Real& cmin)
-{
-    // Find fast magnetosonic speed
-    Real cms2;
-    {
-        Real bsq = dot(D.bcon, D.bcov);
-        Real ef = P(m.RHO, i) + gam * P(m.UU, i);
-        Real ee = bsq + ef;
-        Real va2 = bsq / ee;
-        Real cs2 = gam * (gam - 1) * P(m.UU, i) / ef;
-        cms2 = cs2 + va2 - cs2 * va2;
-        clip(cms2, 1.e-20, 1.);
-    }
-
-    // Require that speed of wave measured by observer q.ucon is cms2
-    Real A, B, C;
-    {
-        Real Bcov[GR_DIM] = {1., 0., 0., 0.};
-        Real Acov[GR_DIM] = {0}; Acov[dir] = 1.;
-
-        Real Acon[GR_DIM], Bcon[GR_DIM];
-        G.raise(Acov, Acon, k, j, i, loc);
-        G.raise(Bcov, Bcon, k, j, i, loc);
-
-        Real Asq = dot(Acon, Acov);
-        Real Bsq = dot(Bcon, Bcov);
-        Real Au = dot(Acov, D.ucon);
-        Real Bu = dot(Bcov, D.ucon);
-        Real AB = dot(Acon, Bcov);
-        Real Au2 = Au * Au;
-        Real Bu2 = Bu * Bu;
-        Real AuBu = Au * Bu;
-
-        A = Bu2 - (Bsq + Bu2) * cms2;
-        B = 2. * (AuBu - (AB + AuBu) * cms2);
-        C = Au2 - (Asq + Au2) * cms2;
-    }
-
-    Real discr = sqrt(max(B * B - 4. * A * C, 0.));
-
-    Real vp = -(-B + discr) / (2. * A);
-    Real vm = -(-B - discr) / (2. * A);
-
-    cmax = max(vp, vm);
-    cmin = min(vp, vm);
-}
-
-}
diff --git a/kharma/wind/wind.cpp b/kharma/wind/wind.cpp
index 0d56d281..f9c99b59 100644
--- a/kharma/wind/wind.cpp
+++ b/kharma/wind/wind.cpp
@@ -109,7 +109,7 @@ TaskStatus Wind::AddSource(MeshData<Real> *mdudt)
             // Add plasma to the T^t_a component of the stress-energy tensor
             // Notice that U already contains a factor of sqrt{-g}
             Real rho_ut, T[GR_DIM];
-            GRMHD::p_to_u_loc(G, drhopdt, drhopdt * Tp * 3., uvec, B_P, gam, k, j, i, rho_ut, T);
+            GRMHD::p_to_u_mhd(G, drhopdt, drhopdt * Tp * 3., uvec, B_P, gam, k, j, i, rho_ut, T);
 
             dUdt(b, m_u.RHO, k, j, i) += rho_ut;
             dUdt(b, m_u.UU, k, j, i) += T[0];
diff --git a/kharma/wind/wind.hpp b/kharma/wind/wind.hpp
index affa47bb..f6cede09 100644
--- a/kharma/wind/wind.hpp
+++ b/kharma/wind/wind.hpp
@@ -33,7 +33,7 @@
  */
 #pragma once
 
-#include "mhd_functions.hpp"
+#include "grmhd_functions.hpp"
 
 #include <parthenon/parthenon.hpp>
 
diff --git a/pars/bondi.par b/pars/bondi.par
index b913f96f..d143a8cf 100644
--- a/pars/bondi.par
+++ b/pars/bondi.par
@@ -58,6 +58,7 @@ verbose = 0
 <driver>
 type = harm
 step = explicit
+max_nonlinear_iter = 3
 
 <parthenon/output0>
 file_type = hdf5
diff --git a/pars/mhdmodes.par b/pars/mhdmodes.par
index ebecead4..219d8861 100644
--- a/pars/mhdmodes.par
+++ b/pars/mhdmodes.par
@@ -59,6 +59,7 @@ verbose = 0
 <driver>
 type = harm
 step = explicit
+max_nonlinear_iter = 3
 
 <parthenon/output0>
 file_type = hdf5
diff --git a/tests/bondi/check.sh b/tests/bondi/check.sh
index ebf7e8c0..ad13c435 100755
--- a/tests/bondi/check.sh
+++ b/tests/bondi/check.sh
@@ -8,9 +8,12 @@ conda activate pyHARM
 res="32,48,64,96,128"
 python check.py $res "in 2D, FMKS coordinates" fmks || fail=1
 python check.py $res "in 2D, MKS coordinates" mks || fail=1
-python check.py $res "in 2D, EKS coordinates" eks || fail=1
+# TODO EKS in pyHARM
+#python check.py $res "in 2D, EKS coordinates" eks || fail=1
 python check.py $res "in 2D, linear recon with MC limiter" linear_mc || fail=1
 python check.py $res "in 2D, linear recon with VL limiter" linear_vl || fail=1
-python check.py $res "in 2D, with classic algo/boundaries" classic || fail=1
+
+python check.py $res "in 2D, with Imex driver" imex || fail=1
+python check.py $res "in 2D, with implicit stepping" imex_im || fail=1
 
 exit $fail
diff --git a/tests/bondi/run.sh b/tests/bondi/run.sh
index b71ac745..b7c7e4c6 100755
--- a/tests/bondi/run.sh
+++ b/tests/bondi/run.sh
@@ -20,10 +20,10 @@ conv_2d() {
 # Test coordinates (raw ks?)
 conv_2d fmks coordinates/transform=fmks
 conv_2d mks coordinates/transform=mks
-#conv_2d eks coordinates/transform=eks # TODO fix eks in pyHARM
+conv_2d eks coordinates/transform=eks # TODO fix eks in pyHARM
 # Recon
 conv_2d linear_mc GRMHD/reconstruction=linear_mc
 conv_2d linear_vl GRMHD/reconstruction=linear_vl
 # And the GRIM/classic driver
-conv_2d classic driver/type=grim
-#conv_2d grim driver/type=grim driver/step=implicit 
+conv_2d imex driver/type=grim
+conv_2d imex_im "driver/type=grim driver/step=implicit"
diff --git a/tests/mhdmodes/check.sh b/tests/mhdmodes/check.sh
index 8a8119c2..498df83e 100755
--- a/tests/mhdmodes/check.sh
+++ b/tests/mhdmodes/check.sh
@@ -21,6 +21,10 @@ python3 check.py $RES3D "slow mode in 3D, classic algo" slow_grim || fail=1
 python3 check.py $RES3D "Alfven mode in 3D, classic algo" alfven_grim || fail=1
 python3 check.py $RES3D "fast mode in 3D, classic algo" fast_grim || fail=1
 
+python3 check.py $RES3D "slow mode in 3D, classic algo" slow_grim_im || fail=1
+python3 check.py $RES3D "Alfven mode in 3D, classic algo" alfven_grim_im || fail=1
+python3 check.py $RES3D "fast mode in 3D, classic algo" fast_grim_im || fail=1
+
 #python3 check.py $RES2D "fast mode in 2D, WENO5" fast2d 2d || fail=1
 #python3 check.py $RES2D "fast mode in 2D, linear/MC reconstruction" fast_mc 2d || fail=1
 #python3 check.py $RES2D "fast mode in 2D, linear/VL reconstruction" fast_vl 2d || fail=1
diff --git a/tests/mhdmodes/run.sh b/tests/mhdmodes/run.sh
index cd6904d6..ad500ae7 100755
--- a/tests/mhdmodes/run.sh
+++ b/tests/mhdmodes/run.sh
@@ -55,9 +55,13 @@ conv_3d slow mhdmodes/nmode=1
 conv_3d alfven mhdmodes/nmode=2
 conv_3d fast mhdmodes/nmode=3
 # And we've got to test classic/GRIM stepping
-conv_3d slow_grim   "mhdmodes/nmode=1 driver/type=grim"
-conv_3d alfven_grim "mhdmodes/nmode=2 driver/type=grim"
-conv_3d fast_grim   "mhdmodes/nmode=3 driver/type=grim"
+conv_3d slow_imex   "mhdmodes/nmode=1 driver/type=imex"
+conv_3d alfven_imex "mhdmodes/nmode=2 driver/type=imex"
+conv_3d fast_imex   "mhdmodes/nmode=3 driver/type=imex"
+# And the implicit solver
+conv_3d slow_imex_im   "mhdmodes/nmode=1 driver/type=imex driver/step=implicit driver/max_nonlinear_iter=1"
+conv_3d alfven_imex_im "mhdmodes/nmode=2 driver/type=imex driver/step=implicit driver/max_nonlinear_iter=1"
+conv_3d fast_imex_im   "mhdmodes/nmode=3 driver/type=imex driver/step=implicit driver/max_nonlinear_iter=1"
 
 # 2D modes use small blocks, could pick up some problems at MPI ranks >> 1
 # Currently very slow, plus modes are incorrect

From f989face19b21f64dfa106ec12ef63d28b414d76 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Mon, 28 Feb 2022 10:22:49 -0600
Subject: [PATCH 07/26] Fixes to a couple of naming/typo errors from Friday. 
 Passes mhdmodes locally

---
 kharma/b_flux_ct/b_flux_ct.cpp   |  2 +-
 kharma/boundaries.cpp            | 12 ++++++------
 kharma/grmhd/grmhd.cpp           |  2 +-
 kharma/grmhd/grmhd_functions.hpp |  4 ++--
 kharma/kharma.cpp                |  4 ++--
 kharma/main.cpp                  |  6 +++---
 tests/mhdmodes/run.sh            |  6 +++---
 7 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/kharma/b_flux_ct/b_flux_ct.cpp b/kharma/b_flux_ct/b_flux_ct.cpp
index 76bf7bba..e066828f 100644
--- a/kharma/b_flux_ct/b_flux_ct.cpp
+++ b/kharma/b_flux_ct/b_flux_ct.cpp
@@ -76,7 +76,7 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     // Note: when changing metadata, keep these in lockstep with grmhd.cpp!!
     // See notes there about changes for the Imex driver
     std::vector<MetadataFlag> flags_prim, flags_cons;
-    auto imex_driver = pin->GetString("driver", "type") == "grim";
+    auto imex_driver = pin->GetString("driver", "type") == "imex";
     if (!imex_driver) {
         flags_prim = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Derived,
                                                 isPrimitive, isMHD, Metadata::Vector});
diff --git a/kharma/boundaries.cpp b/kharma/boundaries.cpp
index 23229a4e..ca8a887c 100644
--- a/kharma/boundaries.cpp
+++ b/kharma/boundaries.cpp
@@ -73,7 +73,7 @@ void OutflowX1(std::shared_ptr<MeshBlockData<Real>> &rc, IndexDomain domain, boo
     auto q = rc->PackVariables({Metadata::FillGhost}, cons_map, coarse);
     const VarMap m_u(cons_map, true), m_p(prims_map, false);
     // If we're running imex, q is the *primitive* variables
-    bool prim_ghosts = pmb->packages.Get("GRMHD")->Param<std::string>("driver_type") == "grim";
+    bool prim_ghosts = pmb->packages.Get("GRMHD")->Param<std::string>("driver_type") == "imex";
 
     // KHARMA is very particular about corner boundaries.
     // In particular, we apply the outflow boundary over ALL X2, X3,
@@ -162,7 +162,7 @@ void ReflectX2(std::shared_ptr<MeshBlockData<Real>> &rc, IndexDomain domain, boo
     auto q = rc->PackVariables({Metadata::FillGhost}, cons_map, coarse);
     const VarMap m_u(cons_map, true), m_p(prims_map, false);
     // If we're running imex, q is the *primitive* variables
-    bool prim_ghosts = pmb->packages.Get("GRMHD")->Param<std::string>("driver_type") == "grim";
+    bool prim_ghosts = pmb->packages.Get("GRMHD")->Param<std::string>("driver_type") == "imex";
 
     // KHARMA is very particular about corner boundaries, see above
     IndexDomain ldomain = IndexDomain::interior;
@@ -240,7 +240,7 @@ void KBoundaries::InnerX1(std::shared_ptr<MeshBlockData<Real>> &rc, bool coarse)
     }
     // If we're in KHARMA/HARM driver, we need primitive versions of all the
     // non-GRMHD vars
-    bool prim_ghosts = pmb->packages.Get("GRMHD")->Param<std::string>("driver_type") == "grim";
+    bool prim_ghosts = pmb->packages.Get("GRMHD")->Param<std::string>("driver_type") == "imex";
     if (!prim_ghosts) KHARMA::FillDerivedDomain(rc, IndexDomain::inner_x1, coarse);
 }
 void KBoundaries::OuterX1(std::shared_ptr<MeshBlockData<Real>> &rc, bool coarse)
@@ -256,21 +256,21 @@ void KBoundaries::OuterX1(std::shared_ptr<MeshBlockData<Real>> &rc, bool coarse)
     }
     // If we're in KHARMA/HARM driver, we need primitive versions of all the
     // non-GRMHD vars
-    bool prim_ghosts = pmb->packages.Get("GRMHD")->Param<std::string>("driver_type") == "grim";
+    bool prim_ghosts = pmb->packages.Get("GRMHD")->Param<std::string>("driver_type") == "imex";
     if (!prim_ghosts) KHARMA::FillDerivedDomain(rc, IndexDomain::outer_x1, coarse);
 }
 void KBoundaries::InnerX2(std::shared_ptr<MeshBlockData<Real>> &rc, bool coarse)
 {
     auto pmb = rc->GetBlockPointer();
     ReflectX2(rc, IndexDomain::inner_x2, coarse);
-    bool prim_ghosts = pmb->packages.Get("GRMHD")->Param<std::string>("driver_type") == "grim";
+    bool prim_ghosts = pmb->packages.Get("GRMHD")->Param<std::string>("driver_type") == "imex";
     if (!prim_ghosts) KHARMA::FillDerivedDomain(rc, IndexDomain::inner_x2, coarse);
 }
 void KBoundaries::OuterX2(std::shared_ptr<MeshBlockData<Real>> &rc, bool coarse)
 {
     auto pmb = rc->GetBlockPointer();
     ReflectX2(rc, IndexDomain::outer_x2, coarse);
-    bool prim_ghosts = pmb->packages.Get("GRMHD")->Param<std::string>("driver_type") == "grim";
+    bool prim_ghosts = pmb->packages.Get("GRMHD")->Param<std::string>("driver_type") == "imex";
     if (!prim_ghosts) KHARMA::FillDerivedDomain(rc, IndexDomain::outer_x2, coarse);
 }
 
diff --git a/kharma/grmhd/grmhd.cpp b/kharma/grmhd/grmhd.cpp
index 874b588a..4a927fd5 100644
--- a/kharma/grmhd/grmhd.cpp
+++ b/kharma/grmhd/grmhd.cpp
@@ -186,7 +186,7 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
 
     std::vector<int> s_vector({3});
     std::vector<MetadataFlag> flags_prim, flags_cons;
-    auto imex_driver = pin->GetString("driver", "type") == "grim";
+    auto imex_driver = pin->GetString("driver", "type") == "imex";
     auto explicit_step = (pin->GetOrAddString("driver", "step", "explicit") == "explicit");
     if (!imex_driver) { // Normal operation
         // As mentioned elsewhere, KHARMA treats the conserved variables as the independent ones,
diff --git a/kharma/grmhd/grmhd_functions.hpp b/kharma/grmhd/grmhd_functions.hpp
index f35dcbc3..8ef89c86 100644
--- a/kharma/grmhd/grmhd_functions.hpp
+++ b/kharma/grmhd/grmhd_functions.hpp
@@ -188,14 +188,14 @@ KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const Local& P, c
     D.ucon[0] = gamma / alpha;
     VLOOP D.ucon[v+1] = P(m.U1 + v) - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
 
-    G.lower(D.ucon, D.ucov, k, j, i, loc);
+    G.lower(D.ucon, D.ucov, 0, j, i, loc);
 
     if (m.B1 >= 0) {
         D.bcon[0] = 0;
         VLOOP D.bcon[0] += P(m.B1 + v) * D.ucov[v+1];
         VLOOP D.bcon[v+1] = (P(m.B1 + v) + D.bcon[0] * D.ucon[v+1]) / D.ucon[0];
 
-        G.lower(D.bcon, D.bcov, k, j, i, loc);
+        G.lower(D.bcon, D.bcov, 0, j, i, loc);
     }
 }
 KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m,
diff --git a/kharma/kharma.cpp b/kharma/kharma.cpp
index c496f373..eaced6e2 100644
--- a/kharma/kharma.cpp
+++ b/kharma/kharma.cpp
@@ -196,8 +196,8 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput>& pin)
     bool do_wind = pin->GetOrAddBoolean("wind", "on", false);
 
     // Set the default driver way up here so packages know how to flag
-    // prims vs cons (grim stepper syncs prims)
-    auto driver_type = pin->GetOrAddString("driver", "type", "harm");
+    // prims vs cons (imex stepper syncs prims, but packages have to mark them that way)
+    auto driver_type = pin->GetOrAddString("driver", "type", "imex");
 
     // Global variables "package."  Anything that just, really oughta be a global
     packages.Add(KHARMA::InitializeGlobals(pin.get()));
diff --git a/kharma/main.cpp b/kharma/main.cpp
index 5e0dd68a..cc93bd36 100644
--- a/kharma/main.cpp
+++ b/kharma/main.cpp
@@ -151,10 +151,10 @@ int main(int argc, char *argv[])
     auto driver_type = pin->GetString("driver", "type");
     if (driver_type == "harm") {
         HARMDriver driver(pin, papp, pmesh);
-    } else if (driver_type == "grim") {
+    } else if (driver_type == "imex") {
         ImexDriver driver(pin, papp, pmesh);
     } else {
-        throw std::invalid_argument("Expected driver type to be harm or grim!");
+        throw std::invalid_argument("Expected driver type to be harm or imex!");
     }
 
     // We could still have set parameters during driver initialization
@@ -187,7 +187,7 @@ int main(int argc, char *argv[])
         cout << "Initializing and running KHARMA driver." << endl;
         HARMDriver driver(pin, papp, pmesh);
         auto driver_status = driver.Execute();
-    } else if (driver_type == "grim") {
+    } else if (driver_type == "imex") {
         cout << "Initializing and running GRIM driver." << endl;
         ImexDriver driver(pin, papp, pmesh);
         auto driver_status = driver.Execute();
diff --git a/tests/mhdmodes/run.sh b/tests/mhdmodes/run.sh
index ad500ae7..02b129db 100755
--- a/tests/mhdmodes/run.sh
+++ b/tests/mhdmodes/run.sh
@@ -59,9 +59,9 @@ conv_3d slow_imex   "mhdmodes/nmode=1 driver/type=imex"
 conv_3d alfven_imex "mhdmodes/nmode=2 driver/type=imex"
 conv_3d fast_imex   "mhdmodes/nmode=3 driver/type=imex"
 # And the implicit solver
-conv_3d slow_imex_im   "mhdmodes/nmode=1 driver/type=imex driver/step=implicit driver/max_nonlinear_iter=1"
-conv_3d alfven_imex_im "mhdmodes/nmode=2 driver/type=imex driver/step=implicit driver/max_nonlinear_iter=1"
-conv_3d fast_imex_im   "mhdmodes/nmode=3 driver/type=imex driver/step=implicit driver/max_nonlinear_iter=1"
+conv_3d slow_imex_im   "mhdmodes/nmode=1 driver/type=imex driver/step=implicit driver/max_nonlinear_iter=3"
+conv_3d alfven_imex_im "mhdmodes/nmode=2 driver/type=imex driver/step=implicit driver/max_nonlinear_iter=3"
+conv_3d fast_imex_im   "mhdmodes/nmode=3 driver/type=imex driver/step=implicit driver/max_nonlinear_iter=3"
 
 # 2D modes use small blocks, could pick up some problems at MPI ranks >> 1
 # Currently very slow, plus modes are incorrect

From a3ccf555f0df58606ffcbc6829c7f2412c782fe6 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Wed, 2 Mar 2022 10:27:20 -0600
Subject: [PATCH 08/26] Resize meshes on restart

Updates the IharmRestart initialization so it can linearly interpolate
primitives onto a mesh given in its initialization file.  Since this
is now the primary reason to use it, the problem parameters have been
renamed to resize_restart for filename clarity.

Linear interpolation at domain boundaries could be better, and
initialization of ghost zones hasn't been well tested.  That said,
there's no issue resizing & restarting various tori.

Resizing a magnetic field requires revising B so that divB~=0. This is
done with a new package, "B_Cleanup", which implements a damped Jacobi
iterative solver for a scalar potential p, the gradient of which can
be subtracted from B to eliminate divB. This solver is implemented such
that it could be used every so often during a normal run, to cut down
on B field divergence that may have built up over ~millions of steps.

This solver is quite slow, costing the same as ~100M of evolution at
the new resolution.  I don't know if this is inevitable from problem
size, a consequence of the higher-stencil Laplacian we use for corner-
centered divB, or something about my implementation.  If the MPI syncs
could be restricted to just P, that might speed things up considerably.

This is on the 'kharmaim' branch, as it might have used, and might in
the near future use, the Kokkos kernels.
---
 kharma/CMakeLists.txt                         |   8 +-
 kharma/b_cd/b_cd.cpp                          |   2 +-
 kharma/b_cleanup/b_cleanup.cpp                | 532 ++++++++++++++++++
 kharma/b_cleanup/b_cleanup.hpp                |  96 ++++
 kharma/b_flux_ct/b_flux_ct.cpp                |  33 +-
 kharma/b_flux_ct/b_flux_ct.hpp                |  70 ++-
 kharma/decs.hpp                               |   1 +
 kharma/grmhd/grmhd.cpp                        |  17 +-
 kharma/harm_driver.cpp                        |   2 +-
 kharma/harm_driver.hpp                        |   4 +
 kharma/imex_driver.cpp                        |  11 +-
 kharma/implicit/implicit.hpp                  |   4 +-
 kharma/kharma.cpp                             |  49 +-
 kharma/kharma.hpp                             |   2 +
 kharma/main.cpp                               |  10 +-
 kharma/prob/post_initialize.cpp               |  58 +-
 kharma/prob/post_initialize.hpp               |   6 +-
 kharma/prob/problem.cpp                       |   6 +-
 kharma/prob/resize.hpp                        | 150 +++++
 .../{iharm_restart.cpp => resize_restart.cpp} | 149 ++---
 .../{iharm_restart.hpp => resize_restart.hpp} |   0
 .../{iharm_restart.par => resize_restart.par} |  33 +-
 22 files changed, 1067 insertions(+), 176 deletions(-)
 create mode 100644 kharma/b_cleanup/b_cleanup.cpp
 create mode 100644 kharma/b_cleanup/b_cleanup.hpp
 create mode 100644 kharma/prob/resize.hpp
 rename kharma/prob/{iharm_restart.cpp => resize_restart.cpp} (63%)
 rename kharma/prob/{iharm_restart.hpp => resize_restart.hpp} (100%)
 rename pars/{iharm_restart.par => resize_restart.par} (59%)

diff --git a/kharma/CMakeLists.txt b/kharma/CMakeLists.txt
index 742e9a8e..49f05d67 100644
--- a/kharma/CMakeLists.txt
+++ b/kharma/CMakeLists.txt
@@ -13,10 +13,12 @@ AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR} EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/prob EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/coordinates EXE_NAME_SRC)
 
-AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/b_flux_ct EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/b_cd EXE_NAME_SRC)
+AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/b_cleanup EXE_NAME_SRC)
+AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/b_flux_ct EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/current EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/electrons EXE_NAME_SRC)
+AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/emhd EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/floors EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/grmhd EXE_NAME_SRC)
 AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/implicit EXE_NAME_SRC)
@@ -28,10 +30,12 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/prob)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/coordinates)
 
-include_directories(${CMAKE_CURRENT_SOURCE_DIR}/b_flux_ct)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/b_cd)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/b_cleanup)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/b_flux_ct)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/current)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/electrons)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/emhd)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/floors)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/grmhd)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/implicit)
diff --git a/kharma/b_cd/b_cd.cpp b/kharma/b_cd/b_cd.cpp
index 2222ddd3..844df286 100644
--- a/kharma/b_cd/b_cd.cpp
+++ b/kharma/b_cd/b_cd.cpp
@@ -59,7 +59,7 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     Real damping = pin->GetOrAddReal("b_field", "damping", 0.1);
     params.Add("damping", damping);
 
-    std::vector<int> s_vector({3});
+    std::vector<int> s_vector({NVEC});
 
     MetadataFlag isPrimitive = packages.Get("GRMHD")->Param<MetadataFlag>("PrimitiveFlag");
 
diff --git a/kharma/b_cleanup/b_cleanup.cpp b/kharma/b_cleanup/b_cleanup.cpp
new file mode 100644
index 00000000..34779f36
--- /dev/null
+++ b/kharma/b_cleanup/b_cleanup.cpp
@@ -0,0 +1,532 @@
+/* 
+ *  File: b_cleanup.cpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <parthenon/parthenon.hpp>
+
+#include "b_cleanup.hpp"
+
+// For a bunch of utility functions
+#include "b_flux_ct.hpp"
+
+#include "decs.hpp"
+#include "grmhd.hpp"
+#include "kharma.hpp"
+
+using namespace parthenon;
+
+namespace B_Cleanup
+{
+
+std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t packages)
+{
+    auto pkg = std::make_shared<StateDescriptor>("B_Cleanup");
+    Params &params = pkg->AllParams();
+
+    // OPTIONS: each value is read from the input (or given a default) and copied into this package's Params
+    // Diagnostic data, from the shared "debug" input block
+    int verbose = pin->GetOrAddInteger("debug", "verbose", 0);
+    params.Add("verbose", verbose);
+    int flag_verbose = pin->GetOrAddInteger("debug", "flag_verbose", 0);
+    params.Add("flag_verbose", flag_verbose);
+    int extra_checks = pin->GetOrAddInteger("debug", "extra_checks", 0);
+    params.Add("extra_checks", extra_checks);
+
+    // Solver options: read back by CleanupDivergence, except sor_factor which UpdateP uses as its relaxation weight
+    Real error_tolerance = pin->GetOrAddReal("b_cleanup", "error_tolerance", 1e-8);
+    params.Add("error_tolerance", error_tolerance);
+    Real sor_factor = pin->GetOrAddReal("b_cleanup", "sor_factor", 2./3);
+    params.Add("sor_factor", sor_factor);
+    int max_iterations = pin->GetOrAddInteger("b_cleanup", "max_iterations", 1e8);
+    params.Add("max_iterations", max_iterations);
+    int check_interval = pin->GetOrAddInteger("b_cleanup", "check_interval", 1e4);
+    params.Add("check_interval", check_interval);
+    bool fail_without_convergence = pin->GetOrAddBoolean("b_cleanup", "fail_without_convergence", true);
+    params.Add("fail_without_convergence", fail_without_convergence);
+    bool warn_without_convergence = pin->GetOrAddBoolean("b_cleanup", "warn_without_convergence", false);
+    params.Add("warn_without_convergence", warn_without_convergence);
+
+    // TODO find a way to add this to the list every N steps
+    int cleanup_interval = pin->GetOrAddInteger("b_cleanup", "cleanup_interval", 0);
+    params.Add("cleanup_interval", cleanup_interval);
+
+    // Someday we could use the fancy Parthenon tools to fill a sparse matrix representing
+    // div(grad(p)).  However, this is complicated by the averaging to centers/corners required
+    // when preserving divB at corners for flux-CT
+    // It would be much easier to set up for a lone/cleanup+Dedner setup
+
+    // FIELDS
+    std::vector<int> s_vector({NVEC});
+    Metadata m = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::OneCopy, Metadata::FillGhost});
+    // Scalar potential, solution to div(grad p) = div B
+    // Thus when we subtract the gradient, div (B - grad p) == 0!
+    pkg->AddField("p", m);
+
+    // Scalar laplacian div(grad p). No need to sync this, we write/read it only on physical zones
+    m = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::OneCopy});
+    pkg->AddField("lap", m);
+    // Gradient of potential, for use when computing laplacian & for final subtraction
+    m = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::OneCopy}, s_vector);
+    pkg->AddField("dB", m);
+
+    // If there's not another B field transport (dangerous!), take care of it ourselves.
+    // Allocate the field, register most of the B_FluxCT callbacks
+    // TODO check if B is allocated and set this if not
+    bool manage_field = pin->GetOrAddBoolean("b_cleanup", "manage_field", false);
+    params.Add("manage_field", manage_field);
+    if (manage_field) {
+        MetadataFlag isPrimitive = packages.Get("GRMHD")->Param<MetadataFlag>("PrimitiveFlag");
+        MetadataFlag isMHD = packages.Get("GRMHD")->Param<MetadataFlag>("MHDFlag");
+
+        // B fields.  "Primitive" form is field, "conserved" is flux
+        // Note: when changing metadata, keep these in lockstep with grmhd.cpp!!
+        // See notes there about changes for the Imex driver
+        std::vector<MetadataFlag> flags_prim, flags_cons;
+        auto imex_driver = pin->GetString("driver", "type") == "imex";
+        if (!imex_driver) {
+            flags_prim = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Derived,
+                                                    isPrimitive, isMHD, Metadata::Vector});
+            flags_cons = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Independent, Metadata::FillGhost,
+                    Metadata::Restart, Metadata::Conserved, isMHD, Metadata::WithFluxes, Metadata::Vector});
+        } else {
+            flags_prim = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::FillGhost, Metadata::Restart,
+                                                    isPrimitive, isMHD, Metadata::Vector});
+            flags_cons = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Independent,
+                                                    Metadata::Conserved, isMHD, Metadata::WithFluxes, Metadata::Vector});
+        }
+
+        m = Metadata(flags_prim, s_vector);
+        pkg->AddField("prims.B", m);
+        m = Metadata(flags_cons, s_vector);
+        pkg->AddField("cons.B", m);
+
+        m = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::OneCopy});
+        pkg->AddField("divB", m);
+
+        pkg->FillDerivedMesh = B_FluxCT::FillDerivedMesh;
+        pkg->FillDerivedBlock = B_FluxCT::FillDerivedBlock;
+        pkg->PostStepDiagnosticsMesh = B_FluxCT::PostStepDiagnostics;
+
+        // List (vector) of HistoryOutputVar that will all be enrolled as output variables
+        parthenon::HstVar_list hst_vars = {};
+        // The definition of MaxDivB we care about actually changes per-transport. Use our function.
+        hst_vars.emplace_back(parthenon::HistoryOutputVar(UserHistoryOperation::max, B_FluxCT::MaxDivB, "MaxDivB"));
+        // add callbacks for HST output to the Params struct, identified by the `hist_param_key`
+        pkg->AddParam<>(parthenon::hist_param_key, hst_vars);
+    }
+
+    return pkg;
+}
+
+void CleanupDivergence(std::shared_ptr<MeshData<Real>>& md)
+{
+    Flag(md.get(), "Cleaning up divB");
+    // Iterates UpdateP until the summed residual is small relative to the starting sum of |divB|, then calls ApplyP.
+    AllReduce<Real> update_norm, divB_norm, divB_max;
+    AllReduce<Real> P_norm;
+
+    auto pkg = md->GetMeshPointer()->packages.Get("B_Cleanup");
+    auto max_iters = pkg->Param<int>("max_iterations");
+    auto check_interval = pkg->Param<int>("check_interval");
+    auto error_tolerance = pkg->Param<Real>("error_tolerance");
+    auto fail_flag = pkg->Param<bool>("fail_without_convergence");
+    auto warn_flag = pkg->Param<bool>("warn_without_convergence");
+    auto verbose = pkg->Param<int>("verbose");
+
+    if (MPIRank0() && verbose > 0) {
+        std::cout << "Cleaning divB" << std::endl;
+    }
+
+    // Calculate existing divB max & sum for checking relative error later
+    divB_max.val = 0.;
+    B_FluxCT::MaxDivBTask(md.get(), divB_max.val);
+    divB_max.StartReduce(MPI_MAX);
+    divB_max.CheckReduce();
+
+    divB_norm.val = 0.;
+    B_Cleanup::CalcSumDivB(md.get(), divB_norm.val);
+    divB_norm.StartReduce(MPI_SUM);
+    divB_norm.CheckReduce();
+
+    if (MPIRank0() && verbose > 0) {
+        std::cout << "Starting divB max is " << divB_max.val << " and sum is " << divB_norm.val << std::endl;
+    }
+
+    // set P = divB as guess
+    B_Cleanup::InitP(md.get());
+
+    bool converged = false;
+    int iter = 0;
+    while ( (!converged) && (iter < max_iters) ) {
+        // Start syncing bounds
+        md.get()->StartReceiving(BoundaryCommSubset::all);
+
+        // Update our guess at the potential
+        B_Cleanup::UpdateP(md.get());
+
+        // Boundary sync. We really only need p syncd here...
+        cell_centered_bvars::SendBoundaryBuffers(md);
+        cell_centered_bvars::ReceiveBoundaryBuffers(md);
+        cell_centered_bvars::SetBoundaries(md);
+
+        md.get()->ClearBoundary(BoundaryCommSubset::all);
+
+        if (iter % check_interval == 0) {
+            // Calculate the new norm & relative error in eliminating divB
+            update_norm.val = 0.;
+            B_Cleanup::SumError(md.get(), update_norm.val);
+            update_norm.StartReduce(MPI_SUM);
+            update_norm.CheckReduce();
+            // P_norm.val = 0.;
+            // B_Cleanup::SumP(md.get(), P_norm.val);
+            // P_norm.StartReduce(MPI_SUM);
+            // P_norm.CheckReduce();
+            if (MPIRank0() && verbose > 0) {
+                std::cout << "divB step " << iter << " error is "
+                        << update_norm.val / divB_norm.val << std::endl;
+                // std::cout << "P norm is " << P_norm.val << std::endl;
+            }
+
+            // Both values are already MPI reduced, but we want to make sure. Compare by product: a zero starting norm must not give NaN
+            converged = update_norm.val <= error_tolerance * divB_norm.val;
+            converged = MPIMin(converged);
+        }
+
+        iter++;
+    }
+    if (!converged) { // test the flag, not the counter: converging on the last permitted iteration is a success
+        if (fail_flag) {
+            throw std::runtime_error("Failed to converge when cleaning magnetic field divergence!");
+        } else if (warn_flag) {
+            std::cerr << "Failed to converge when cleaning magnetic field divergence!" << std::endl;
+        }
+    }
+
+    if (MPIRank0() && verbose > 0) {
+        std::cout << "Applying magnetic field correction!" << std::endl;
+    }
+
+    // Apply the correction derived from the potential to the magnetic field (see ApplyP)
+    B_Cleanup::ApplyP(md.get());
+
+    // Recalculate divB max to reassure
+    divB_max.val = 0.;
+    B_FluxCT::MaxDivBTask(md.get(), divB_max.val);
+    divB_max.StartReduce(MPI_MAX);
+    divB_max.CheckReduce();
+
+    if (MPIRank0() && verbose > 0) {
+        std::cout << "Final divB max is " << divB_max.val << std::endl;
+    }
+
+    Flag(md.get(), "Cleaned");
+}
+
+TaskStatus CalcSumDivB(MeshData<Real> *md, Real& reduce_sum)
+{
+    Flag(md, "Calculating & summing divB");
+    auto pm = md->GetParentPointer(); // NOTE(review): not referenced again in this function
+    IndexRange ib = md->GetBoundsI(IndexDomain::interior);
+    IndexRange jb = md->GetBoundsJ(IndexDomain::interior);
+    IndexRange kb = md->GetBoundsK(IndexDomain::interior);
+    auto pmb0 = md->GetBlockData(0)->GetBlockPointer();
+
+    // Get variables: "cons.B" is read, "divB" is overwritten below
+    auto B = md->PackVariables(std::vector<std::string>{"cons.B"});
+    auto divB = md->PackVariables(std::vector<std::string>{"divB"});
+
+    const int ndim = B.GetNdim();
+
+    // Sum of |divB| over the interior zones of every packed block; each zone's divB is also stored in "divB"
+    Real divB_total;
+    pmb0->par_reduce("SumDivB", 0, B.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        KOKKOS_LAMBDA_MESH_3D_REDUCE {
+            const auto& G = B.GetCoords(b);
+            divB(b, 0, k, j, i) = B_FluxCT::corner_div(G, B, b, k, j, i, ndim > 2);
+            local_result += abs(divB(b, 0, k, j, i));
+        }
+    , Kokkos::Sum<Real>(divB_total));
+
+    // Added to (not assigned to) reduce_sum, so the caller must zero it first. Parthenon/caller will take care of MPI reduction
+    reduce_sum += divB_total;
+    return TaskStatus::complete;
+}
+
+TaskStatus InitP(MeshData<Real> *md)
+{
+    Flag(md, "Initializing P");
+    auto pm = md->GetParentPointer(); // NOTE(review): not referenced again in this function
+    IndexRange ib = md->GetBoundsI(IndexDomain::interior);
+    IndexRange jb = md->GetBoundsJ(IndexDomain::interior);
+    IndexRange kb = md->GetBoundsK(IndexDomain::interior);
+    auto pmb0 = md->GetBlockData(0)->GetBlockPointer();
+
+    // Pack variables: "p" is written, "divB" is read (so it must already be filled, e.g. by CalcSumDivB)
+    auto P = md->PackVariables(std::vector<std::string>{"p"});
+    auto divB = md->PackVariables(std::vector<std::string>{"divB"});
+
+    // Initialize P = divB as the solver's starting guess, on interior zones only (ghost zones are not touched here)
+    pmb0->par_for("init_p", 0, P.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        KOKKOS_LAMBDA_MESH_3D {
+            P(b, 0, k, j, i) = divB(b, 0, k, j, i);
+        }
+    );
+
+    return TaskStatus::complete;
+}
+
+TaskStatus UpdateP(MeshData<Real> *md)
+{
+    Flag(md, "Updating P");
+    auto pmesh = md->GetParentPointer();
+    const int ndim = pmesh->ndim;
+    const IndexRange ib = md->GetBoundsI(IndexDomain::interior);
+    const IndexRange jb = md->GetBoundsJ(IndexDomain::interior);
+    const IndexRange kb = md->GetBoundsK(IndexDomain::interior);
+    const IndexRange ib_l = IndexRange{ib.s-1, ib.e};
+    const IndexRange jb_l = (ndim > 1) ? IndexRange{jb.s-1, jb.e} : jb;
+    const IndexRange kb_l = (ndim > 2) ? IndexRange{kb.s-1, kb.e} : kb;
+
+    auto pmb0 = md->GetBlockData(0)->GetBlockPointer();
+    const int n1 = pmb0->cellbounds.ncellsi(IndexDomain::interior);
+
+    // Options. sor_factor is stored as a Real by Initialize, so it is fetched as a Real here
+    auto pkg = md->GetMeshPointer()->packages.Get("B_Cleanup");
+    const auto omega = pkg->Param<Real>("sor_factor");
+
+    // Pack variables: "p" is read then updated, "dB" and "lap" are scratch outputs, "divB" is the right-hand side
+    auto P = md->PackVariables(std::vector<std::string>{"p"});
+    auto lap = md->PackVariables(std::vector<std::string>{"lap"});
+    auto dB = md->PackVariables(std::vector<std::string>{"dB"});
+    auto divB = md->PackVariables(std::vector<std::string>{"divB"});
+
+    // TODO Damped Jacobi takes a *lot* of iterations for anything bigger than a toy problem.
+    // We probably need CG
+
+    // dB = grad(p), defined at cell centers
+    // Need a halo one zone *left*, as corner_div will read that.
+    // Therefore P's ghosts need to be up to date!
+    pmb0->par_for("gradient_P", 0, P.GetDim(5) - 1, kb_l.s, kb_l.e, jb_l.s, jb_l.e, ib_l.s, ib_l.e,
+        KOKKOS_LAMBDA_MESH_3D {
+            const auto& G = P.GetCoords(b);
+            double b1, b2, b3;
+            B_FluxCT::center_grad(G, P, b, k, j, i, ndim > 2, b1, b2, b3);
+            dB(b, V1, k, j, i) = b1;
+            dB(b, V2, k, j, i) = b2;
+            dB(b, V3, k, j, i) = b3;
+        }
+    );
+
+    // lap = div(dB), defined at cell corners
+    // Then apply a damped Jacobi iteration; P is only written in this second kernel, after every gradient was taken
+    pmb0->par_for("laplacian_dB", 0, lap.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        KOKKOS_LAMBDA_MESH_3D {
+            const auto& G = lap.GetCoords(b);
+            // This is the inverse diagonal element of a fictional a_ij Laplacian operator
+            // denoted D^-1 below. Note it's not quite what a_ij might work out to for our "laplacian"
+            const double dt = (-1./6) * G.dx1v(i) * G.dx2v(j) * G.dx3v(k);
+            lap(b, 0, k, j, i) = B_FluxCT::corner_div(G, dB, b, k, j, i, ndim > 2);
+            // In matrix notation the following would be:
+            // x^k+1 = omega*D^-1*(b - (L + U) x^k) + (1-omega)*x^k
+            // But since we can't actually calculate L+U, we use A*x-D*x
+            //P(b, 0, k, j, i) = omega*dt*(divB(b, 0, k, j, i) - (lap(b, 0, k, j, i) - 1/dt*P(b, 0, k, j, i)))
+            //                    + (1 - omega)*P(b, 0, k, j, i);
+            // ...or more simply...
+            P(b, 0, k, j, i) += omega*dt*(divB(b, 0, k, j, i) - lap(b, 0, k, j, i));
+
+        }
+    );
+
+    return TaskStatus::complete;
+}
+
+TaskStatus SumError(MeshData<Real> *md, Real& reduce_sum)
+{
+    Flag(md, "Summing remaining error term");
+    // Adds sum(|lap - divB|) over the interior zones of every block in the pack to reduce_sum
+    IndexRange ib = md->GetBoundsI(IndexDomain::interior);
+    IndexRange jb = md->GetBoundsJ(IndexDomain::interior);
+    IndexRange kb = md->GetBoundsK(IndexDomain::interior);
+    auto pmb0 = md->GetBlockData(0)->GetBlockPointer();
+
+    // Get variables
+    auto lap = md->PackVariables(std::vector<std::string>{"lap"});
+    auto divB = md->PackVariables(std::vector<std::string>{"divB"});
+
+    // TODO this can be done as
+    // 1. (K*lap - divB) as here
+    // 2. (div of (B - dB)), simulating the actual result
+    // The latter needs a full/scratch vector temporary and FillGhost set on dB, but the sync is in the right spot
+    Real err_total = 0.;
+    pmb0->par_reduce("SumError", 0, lap.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        KOKKOS_LAMBDA_MESH_3D_REDUCE {
+            // fabs rather than bare abs: never resolves to the integer overload (matches B_FluxCT::MaxDivB)
+            local_result += fabs(lap(b, 0, k, j, i) - divB(b, 0, k, j, i));
+        }
+    , Kokkos::Sum<Real>(err_total));
+
+    // Parthenon/caller will take care of MPI reduction
+    reduce_sum += err_total;
+    return TaskStatus::complete;
+}
+
+TaskStatus SumP(MeshData<Real> *md, Real& reduce_sum)
+{
+    Flag(md, "Summing P");
+    // Adds sum(|p|) over the interior zones of every block in the pack to reduce_sum
+    IndexRange ib = md->GetBoundsI(IndexDomain::interior);
+    IndexRange jb = md->GetBoundsJ(IndexDomain::interior);
+    IndexRange kb = md->GetBoundsK(IndexDomain::interior);
+    auto pmb0 = md->GetBlockData(0)->GetBlockPointer();
+
+    // Get variables
+    auto P = md->PackVariables(std::vector<std::string>{"p"});
+
+    Real P_total = 0.;
+    pmb0->par_reduce("SumP", 0, P.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        KOKKOS_LAMBDA_MESH_3D_REDUCE {
+            local_result += fabs(P(b, 0, k, j, i));
+        }
+    , Kokkos::Sum<Real>(P_total));
+
+    // Parthenon/caller will take care of MPI reduction
+    reduce_sum += P_total;
+    return TaskStatus::complete;
+}
+
+TaskStatus ApplyP(MeshData<Real> *md)
+{
+    Flag(md, "Applying divB correction");
+    // Subtracts grad(p) from "cons.B" on interior zones, then calls B_FluxCT::UtoP on the result
+    IndexRange ib = md->GetBoundsI(IndexDomain::interior);
+    IndexRange jb = md->GetBoundsJ(IndexDomain::interior);
+    IndexRange kb = md->GetBoundsK(IndexDomain::interior);
+    auto pmb0 = md->GetBlockData(0)->GetBlockPointer();
+
+    // Pack variables
+    auto P = md->PackVariables(std::vector<std::string>{"p"});
+    auto B = md->PackVariables(std::vector<std::string>{"cons.B"});
+
+    const int ndim = B.GetNdim();
+
+    // Apply B -= grad(p) to actually remove divergence
+    pmb0->par_for("apply_dp", 0, P.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        KOKKOS_LAMBDA_MESH_3D {
+            const auto& G = P.GetCoords(b);
+            double b1, b2, b3;
+            B_FluxCT::center_grad(G, P, b, k, j, i, ndim > 2, b1, b2, b3);
+            B(b, V1, k, j, i) -= b1;
+            B(b, V2, k, j, i) -= b2;
+            if (ndim > 2) {
+                B(b, V3, k, j, i) -= b3;
+            } else {
+                B(b, V3, k, j, i) = 0; // without a third dimension the V3 component is zeroed outright
+            }
+        }
+    );
+
+    B_FluxCT::UtoP(md);
+
+    return TaskStatus::complete;
+}
+
+// TODO get this working later. Needs:
+// 1. Some way to call every X steps (just return converged if off-cadence?)
+// 2. Parameters, mesh pointer, or just driver pointer as arg
+// 3. Is this a good idea here?  More broadly? e.g. for MPI sync, sources, etc?
+// void AddBCleanupTasks(TaskList& tl, const TaskID& t_dep, AllReduce<Real>& update_norm) {
+//     TaskID t_none(0);
+
+//     auto pkg = md->GetMeshPointer()->packages.Get("B_Cleanup");
+//     auto max_iters = pkg->Param<int>("max_iterations");
+//     auto check_interval = pkg->Param<int>("check_interval");
+//     auto fail_flag = pkg->Param<bool>("fail_without_convergence");
+//     auto warn_flag = pkg->Param<bool>("warn_without_convergence");
+
+//     const int num_partitions = md->GetMeshPointer()->DefaultNumPartitions();
+//     TaskRegion &solver_region = tc.AddRegion(num_partitions);
+//     for (int i = 0; i < num_partitions; i++) {
+//         int reg_dep_id = 0;
+
+//         auto &t_solver = tl.AddIteration("B field cleanup");
+//         t_solver.SetMaxIterations(max_iters);
+//         t_solver.SetCheckInterval(check_interval);
+//         t_solver.SetFailWithMaxIterations(fail_flag);
+//         t_solver.SetWarnWithMaxIterations(warn_flag);
+//         auto t_start_recv = t_solver.AddTask(t_dep, &MeshData<Real>::StartReceiving, md.get(),
+//                                         BoundaryCommSubset::all);
+
+//         auto t_update = t_solver.AddTask(t_start_recv, B_Cleanup::UpdatePhi,
+//                                     md.get(), mdelta.get());
+
+//         auto t_norm = t_solver.AddTask(t_update, B_Cleanup::SumDeltaPhi,
+//                                 mdelta.get(), &update_norm.val);
+//         solver_region.AddRegionalDependencies(reg_dep_id, i, t_norm);
+//         reg_dep_id++;
+//         auto t_start_reduce_norm = (i == 0 ? t_solver.AddTask(t_norm, &AllReduce<Real>::StartReduce,
+//                                                         &update_norm, MPI_SUM)
+//                                         : t_none);
+//         auto finish_reduce_norm =
+//             t_solver.AddTask(start_reduce_norm, &AllReduce<Real>::CheckReduce, &update_norm);
+//         auto t_report_norm = (i == 0 ? t_solver.AddTask(
+//                                         finish_reduce_norm,
+//                                         [](Real *norm) {
+//                                             if (Globals::my_rank == 0) {
+//                                                 std::cout << "Update norm = " << *norm << std::endl;
+//                                             }
+//                                             *norm = 0.0;
+//                                             return TaskStatus::complete;
+//                                         },
+//                                         &update_norm.val)
+//                                 : none);
+
+//         auto t_send = t_solver.AddTask(t_update, cell_centered_bvars::SendBoundaryBuffers, md);
+
+//         auto t_recv =
+//             t_solver.AddTask(t_start_recv, cell_centered_bvars::ReceiveBoundaryBuffers, md);
+
+//         auto t_setb = t_solver.AddTask(t_recv | t_update, cell_centered_bvars::SetBoundaries, md);
+
+//         auto t_clear = t_solver.AddTask(t_send | t_setb | t_report_norm, &MeshData<Real>::ClearBoundary,
+//                                     md.get(), BoundaryCommSubset::all);
+
+//         auto t_check = t_solver.SetCompletionTask(
+//             t_clear, B_Cleanup::CheckConvergence, md.get(), mdelta.get());
+//         // mark task so that dependent tasks (below) won't execute
+//         // until all task lists have completed it
+//         solver_region.AddRegionalDependencies(reg_dep_id, i, t_check);
+//         reg_dep_id++;
+//     }
+// }
+
+} // namespace B_Cleanup
diff --git a/kharma/b_cleanup/b_cleanup.hpp b/kharma/b_cleanup/b_cleanup.hpp
new file mode 100644
index 00000000..df7c84d9
--- /dev/null
+++ b/kharma/b_cleanup/b_cleanup.hpp
@@ -0,0 +1,96 @@
+/*
+ *  File: b_cleanup.hpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+
+#include <memory>
+
+#include <parthenon/parthenon.hpp>
+
+#include "grmhd_functions.hpp"
+#include "types.hpp"
+
+using namespace parthenon;
+
+/**
+ * This physics package implements an elliptic solver which minimizes the divergence of
+ * the magnetic field B, most useful for mesh resizing.
+ * Written to leave open the possibility of using this every N steps during a run.
+ * 
+ * For now, it is mostly used when resizing input arrays
+ */
+namespace B_Cleanup {
+/**
+ * Declare fields, initialize (few) parameters
+ */
+std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t packages);
+
+/**
+ * Calculate the field divergence, and sum the absolute value as a reduction
+ * (for convergence comparisons).
+ */
+TaskStatus CalcSumDivB(MeshData<Real> *du, Real& reduce_sum);
+
+/**
+ * Set P = divB as initial guess
+ */
+TaskStatus InitP(MeshData<Real> *md);
+
+/**
+ * Take one relaxation step on P. NOTE(review): the .cpp comments say damped Jacobi, not Gauss-Seidel/SOR -- confirm
+ */
+TaskStatus UpdateP(MeshData<Real> *md);
+
+/**
+ * Sum the absolute remaining error |del^2 p - divB| over interior zones, adding the result to reduce_sum
+ */
+TaskStatus SumError(MeshData<Real> *du, Real& reduce_sum);
+TaskStatus SumP(MeshData<Real> *md, Real& reduce_sum); // As SumError, but summing |p|
+
+/**
+ * Apply B -= grad(P) to subtract divergence from the magnetic field
+ */
+TaskStatus ApplyP(MeshData<Real> *md);
+
+/**
+ * Single-call divergence cleanup.  Lots of MPI syncs, probably slow to use in task lists.
+ */
+void CleanupDivergence(std::shared_ptr<MeshData<Real>>& md);
+
+/**
+ * Add the iterative tasks required for B field cleanup to the tasklist
+ * Likely faster than above if we want to clean periodically
+ */
+//void AddBCleanupTasks(TaskList tl, TaskID t_dep);
+
+} // namespace B_Cleanup
diff --git a/kharma/b_flux_ct/b_flux_ct.cpp b/kharma/b_flux_ct/b_flux_ct.cpp
index e066828f..eb0e6e5e 100644
--- a/kharma/b_flux_ct/b_flux_ct.cpp
+++ b/kharma/b_flux_ct/b_flux_ct.cpp
@@ -53,6 +53,7 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     auto pkg = std::make_shared<StateDescriptor>("B_FluxCT");
     Params &params = pkg->AllParams();
 
+    // OPTIONS
     // Diagnostic data
     int verbose = pin->GetOrAddInteger("debug", "verbose", 0);
     params.Add("verbose", verbose);
@@ -63,11 +64,14 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
 
     bool fix_flux = pin->GetOrAddBoolean("b_field", "fix_polar_flux", true);
     params.Add("fix_polar_flux", fix_flux);
-    // WARNING this disables constrained transport, so the field will quickly pick up a divergence
+    // WARNING this disables constrained transport, so the field will quickly pick up a divergence.
+    // To use another transport, just specify it instead of this one.
     bool disable_flux_ct = pin->GetOrAddBoolean("b_field", "disable_flux_ct", false);
     params.Add("disable_flux_ct", disable_flux_ct);
 
-    std::vector<int> s_vector({3});
+    // FIELDS
+
+    std::vector<int> s_vector({NVEC});
 
     MetadataFlag isPrimitive = packages.Get("GRMHD")->Param<MetadataFlag>("PrimitiveFlag");
     MetadataFlag isMHD = packages.Get("GRMHD")->Param<MetadataFlag>("MHDFlag");
@@ -364,31 +368,12 @@ double MaxDivB(MeshData<Real> *md)
     const IndexRange jl = IndexRange{jb.s + 1, jb.e};
     const IndexRange kl = (ndim > 2) ? IndexRange{kb.s + 1, kb.e} : kb;
 
-    const double norm = (ndim > 2) ? 0.25 : 0.5;
-
     double max_divb;
     Kokkos::Max<double> max_reducer(max_divb);
     pmb0->par_reduce("divB_max", block.s, block.e, kl.s, kl.e, jl.s, jl.e, il.s, il.e,
         KOKKOS_LAMBDA_MESH_3D_REDUCE {
             const auto& G = B_U.GetCoords(b);
-            // 2D divergence, averaging to corners
-            double term1 = B_U(b, V1, k, j, i)   + B_U(b, V1, k, j-1, i)
-                         - B_U(b, V1, k, j, i-1) - B_U(b, V1, k, j-1, i-1);
-            double term2 = B_U(b, V2, k, j, i)   + B_U(b, V2, k, j, i-1)
-                         - B_U(b, V2, k, j-1, i) - B_U(b, V2, k, j-1, i-1);
-            double term3 = 0.;
-            if (ndim > 2) {
-                // Average to corners in 3D, add 3rd flux
-                term1 +=  B_U(b, V1, k-1, j, i)   + B_U(b, V1, k-1, j-1, i)
-                        - B_U(b, V1, k-1, j, i-1) - B_U(b, V1, k-1, j-1, i-1);
-                term2 +=  B_U(b, V2, k-1, j, i)   + B_U(b, V2, k-1, j, i-1)
-                        - B_U(b, V2, k-1, j-1, i) - B_U(b, V2, k-1, j-1, i-1);
-                term3 =   B_U(b, V3, k, j, i)     + B_U(b, V3, k, j-1, i)
-                        + B_U(b, V3, k, j, i-1)   + B_U(b, V3, k, j-1, i-1)
-                        - B_U(b, V3, k-1, j, i)   - B_U(b, V3, k-1, j-1, i)
-                        - B_U(b, V3, k-1, j, i-1) - B_U(b, V3, k-1, j-1, i-1);
-            }
-            double local_divb = fabs(norm*term1/G.dx1v(i) + norm*term2/G.dx2v(j) + norm*term3/G.dx3v(k));
+            double local_divb = fabs(corner_div(G, B_U, b, k, j, i, ndim > 2));
             if (local_divb > local_result) local_result = local_divb;
         }
     , max_reducer);
@@ -417,11 +402,13 @@ TaskStatus PostStepDiagnostics(const SimTime& tm, MeshData<Real> *md)
 
 void FillOutput(MeshBlock *pmb, ParameterInput *pin)
 {
+    // TODO define this on meshblock or pack vars
     auto rc = pmb->meshblock_data.Get().get();
     Flag(rc, "Calculating divB for output");
     const int ndim = pmb->pmy_mesh->ndim;
     if (ndim < 2) return;
 
+    // TODO can we call corner_div here?  Extra b=0 out front in addressing zones...
     GridVars B_U = rc->Get("cons.B").data;
     GridVars divB = rc->Get("divB").data;
 
@@ -458,7 +445,7 @@ void FillOutput(MeshBlock *pmb, ParameterInput *pin)
                         - B_U(V3, k-1, j, i)   - B_U(V3, k-1, j-1, i)
                         - B_U(V3, k-1, j, i-1) - B_U(V3, k-1, j-1, i-1);
             }
-            divB(k, j, i) = fabs(norm*term1/G.dx1v(i) + norm*term2/G.dx2v(j) + norm*term3/G.dx3v(k));
+            divB(k, j, i) = norm*term1/G.dx1v(i) + norm*term2/G.dx2v(j) + norm*term3/G.dx3v(k);
         }
     );
 
diff --git a/kharma/b_flux_ct/b_flux_ct.hpp b/kharma/b_flux_ct/b_flux_ct.hpp
index 737269c3..5f466546 100644
--- a/kharma/b_flux_ct/b_flux_ct.hpp
+++ b/kharma/b_flux_ct/b_flux_ct.hpp
@@ -96,6 +96,16 @@ TaskStatus TransportB(MeshData<Real> *md);
  * listed arguments
  */
 double MaxDivB(MeshData<Real> *md);
+// Version for Parthenon tasking as a reduction: stores MaxDivB(md) in divb_max and reports completion
+inline TaskStatus MaxDivBTask(MeshData<Real> *md, double& divb_max)
+    { divb_max = MaxDivB(md); return TaskStatus::complete; }
+
+/**
+ * Clean the magnetic field divergence via successive over-relaxation
+ * Currently only used when resizing inputs.
+ * TODO option to sprinkle into updates every N steps
+ */
+void CleanupDivergence(MeshBlockData<Real> *rc, IndexDomain domain=IndexDomain::entire, bool coarse=false);
 
 /**
  * Diagnostics printed/computed after each step
@@ -108,6 +118,64 @@ TaskStatus PostStepDiagnostics(const SimTime& tm, MeshData<Real> *md);
  */
 void FillOutput(MeshBlock *pmb, ParameterInput *pin);
 
-// TODO device-side divB at a single zone corner, to avoid code duplication?
+/**
+ * Corner-centered divergence of B_U: backward differences over zones i-1..i, j-1..j (and k-1..k if do_3D), averaged
+ */
+template<typename Global>
+KOKKOS_INLINE_FUNCTION double corner_div(const GRCoordinates& G, const Global& B_U, const int& b,
+                                         const int& k, const int& j, const int& i, const bool& do_3D)
+{
+    // Each direction averages 2 differences in 2D and 4 in 3D
+    const double avg = do_3D ? 0.25 : 0.5;
+    double d1 = B_U(b, V1, k, j, i)   + B_U(b, V1, k, j-1, i)
+              - B_U(b, V1, k, j, i-1) - B_U(b, V1, k, j-1, i-1);
+    double d2 = B_U(b, V2, k, j, i)   + B_U(b, V2, k, j, i-1)
+              - B_U(b, V2, k, j-1, i) - B_U(b, V2, k, j-1, i-1);
+    double d3 = 0.;
+    if (do_3D) {
+        // Fold the k-1 plane into the first two directions, then difference V3 across k
+        d1 +=  B_U(b, V1, k-1, j, i)   + B_U(b, V1, k-1, j-1, i)
+             - B_U(b, V1, k-1, j, i-1) - B_U(b, V1, k-1, j-1, i-1);
+        d2 +=  B_U(b, V2, k-1, j, i)   + B_U(b, V2, k-1, j, i-1)
+             - B_U(b, V2, k-1, j-1, i) - B_U(b, V2, k-1, j-1, i-1);
+        d3 =   B_U(b, V3, k, j, i)     + B_U(b, V3, k, j-1, i)
+             + B_U(b, V3, k, j, i-1)   + B_U(b, V3, k, j-1, i-1)
+             - B_U(b, V3, k-1, j, i)   - B_U(b, V3, k-1, j-1, i)
+             - B_U(b, V3, k-1, j, i-1) - B_U(b, V3, k-1, j-1, i-1);
+    }
+    return avg*d1/G.dx1v(i) + avg*d2/G.dx2v(j) + avg*d3/G.dx3v(k);
+}
+
+/**
+ * 2D or 3D gradient of the scalar P (variable index 0), averaged from corners to the cell center.
+ * Forward-differenced (reads i+1, j+1, k+1), whereas corner_div is backward-differenced
+ */
+template<typename Global>
+KOKKOS_INLINE_FUNCTION void center_grad(const GRCoordinates& G, const Global& P, const int& b,
+                                          const int& k, const int& j, const int& i, const bool& do_3D,
+                                          double& B1, double& B2, double& B3)
+{
+    const double avg = do_3D ? 0.25 : 0.5;
+    // 2D gradient: difference along each direction, summed over both positions in the other
+    double g1 =  P(b, 0, k, j+1, i+1) + P(b, 0, k, j, i+1)
+               - P(b, 0, k, j+1, i)   - P(b, 0, k, j, i);
+    double g2 =  P(b, 0, k, j+1, i+1) + P(b, 0, k, j+1, i)
+               - P(b, 0, k, j, i+1)   - P(b, 0, k, j, i);
+    double g3 = 0.;
+    if (do_3D) {
+        // Fold the k+1 plane into the first two components, then difference across k for the third
+        g1 += P(b, 0, k+1, j+1, i+1) + P(b, 0, k+1, j, i+1)
+            - P(b, 0, k+1, j+1, i)   - P(b, 0, k+1, j, i);
+        g2 += P(b, 0, k+1, j+1, i+1) + P(b, 0, k+1, j+1, i)
+            - P(b, 0, k+1, j, i+1)   - P(b, 0, k+1, j, i);
+        g3 =  P(b, 0, k+1, j+1, i+1) + P(b, 0, k+1, j, i+1)
+            + P(b, 0, k+1, j+1, i)   + P(b, 0, k+1, j, i)
+            - P(b, 0, k, j+1, i+1)   - P(b, 0, k, j, i+1)
+            - P(b, 0, k, j+1, i)     - P(b, 0, k, j, i);
+    }
+    B1 = avg*g1/G.dx1v(i);
+    B2 = avg*g2/G.dx2v(j);
+    B3 = avg*g3/G.dx3v(k);
+}
 
 }
diff --git a/kharma/decs.hpp b/kharma/decs.hpp
index 59624a99..a511a9be 100644
--- a/kharma/decs.hpp
+++ b/kharma/decs.hpp
@@ -170,3 +170,4 @@ using GeomTensor3 = parthenon::ParArrayND<Real>;
 // Versions for full mesh
 #define KOKKOS_LAMBDA_MESH_3D_REDUCE KOKKOS_LAMBDA (const int &b, const int &k, const int &j, const int &i, double &local_result)
 #define KOKKOS_LAMBDA_MESH_3D_REDUCE_INT KOKKOS_LAMBDA (const int &b, const int &k, const int &j, const int &i, int &local_result)
+#define KOKKOS_LAMBDA_MESH_4D_REDUCE KOKKOS_LAMBDA (const int &b, const int &v, const int &k, const int &j, const int &i, double &local_result)
diff --git a/kharma/grmhd/grmhd.cpp b/kharma/grmhd/grmhd.cpp
index 4a927fd5..5ef248ef 100644
--- a/kharma/grmhd/grmhd.cpp
+++ b/kharma/grmhd/grmhd.cpp
@@ -32,9 +32,6 @@
  *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/**
- * GRMHD package.  Manipulations on GRMHD 
- */
 #include "grmhd.hpp"
 
 #include <memory>
@@ -62,6 +59,10 @@ using namespace parthenon;
 // Need to access these directly for reductions
 using namespace Kokkos;
 
+
+/**
+ * GRMHD package.  Global operations on General Relativistic Magnetohydrodynamic systems.
+ */
 namespace GRMHD
 {
 
@@ -100,11 +101,13 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
         params.Add("use_hlle", false);
     }
 
+    // These parameters are put in "parthenon/time" to match others, but ultimately we should
+    // override the parthenon timestep chooser
     // Minimum timestep, if something about the sound speed goes wonky. Probably won't save you :)
-    // know what we're doing modifying "parthenon/time" -- subclass 
-    double dt_min = pin->GetOrAddReal("parthenon/time", "dt_min", 1.e-4);
+    double dt_min = pin->GetOrAddReal("parthenon/time", "dt_min", 1.e-5);
     params.Add("dt_min", dt_min);
-    // Starting timestep, in case we're restarting
+    // Starting timestep: guaranteed step 1 timestep returned by EstimateTimestep,
+    // usually matters most for restarts
     double dt_start = pin->GetOrAddReal("parthenon/time", "dt", dt_min);
     params.Add("dt_start", dt_start);
     double max_dt_increase = pin->GetOrAddReal("parthenon/time", "max_dt_increase", 2.0);
@@ -184,7 +187,7 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
     // generally inherit the size of the MeshBlock (for "Cell" fields) or some
     // closely-related size (for "Face" and "Edge" fields)
 
-    std::vector<int> s_vector({3});
+    std::vector<int> s_vector({NVEC});
     std::vector<MetadataFlag> flags_prim, flags_cons;
     auto imex_driver = pin->GetString("driver", "type") == "imex";
     auto explicit_step = (pin->GetOrAddString("driver", "step", "explicit") == "explicit");
diff --git a/kharma/harm_driver.cpp b/kharma/harm_driver.cpp
index 56363c28..f7e218f1 100644
--- a/kharma/harm_driver.cpp
+++ b/kharma/harm_driver.cpp
@@ -51,7 +51,7 @@
 #include "debug.hpp"
 #include "fixup.hpp"
 #include "flux.hpp"
-#include "iharm_restart.hpp"
+#include "resize_restart.hpp"
 #include "source.hpp"
 
 TaskCollection HARMDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
diff --git a/kharma/harm_driver.hpp b/kharma/harm_driver.hpp
index ae445164..609949f9 100644
--- a/kharma/harm_driver.hpp
+++ b/kharma/harm_driver.hpp
@@ -61,4 +61,8 @@ class HARMDriver : public MultiStageDriver {
          * usually w.r.t. fluid "state" being spread across the primitive and conserved quantities
          */
         TaskCollection MakeTaskCollection(BlockList_t &blocks, int stage);
+
+    private:
+        // Global solves need a reduction point
+        AllReduce<Real> update_norm;
 };
diff --git a/kharma/imex_driver.cpp b/kharma/imex_driver.cpp
index 3299643f..12e1a152 100644
--- a/kharma/imex_driver.cpp
+++ b/kharma/imex_driver.cpp
@@ -52,7 +52,7 @@
 #include "debug.hpp"
 #include "fixup.hpp"
 #include "flux.hpp"
-#include "iharm_restart.hpp"
+#include "resize_restart.hpp"
 #include "implicit.hpp"
 #include "source.hpp"
 
@@ -71,14 +71,14 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
     const Real dt = integrator->dt;
     auto stage_name = integrator->stage_name;
 
-    // Which packages we load affects which tasks we'll add to the list
+    // Which packages we've loaded affects which tasks we'll add to the list
     auto& pkgs = blocks[0]->packages.AllPackages();
     bool use_b_cd = pkgs.count("B_CD");
     bool use_b_flux_ct = pkgs.count("B_FluxCT");
     bool use_electrons = pkgs.count("Electrons");
     bool use_wind = pkgs.count("Wind");
 
-    // Allocate the fields ("containers") we need block by block
+    // Allocate the fluid states ("containers") we need for each block
     for (int i = 0; i < blocks.size(); i++) {
         auto &pmb = blocks[i];
         // first make other useful containers
@@ -214,8 +214,9 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
 
         // Then solve for new primitives in the fluid interior, with the primitives at step start as a guess,
         // using UtoP.  Note that since no ghost zones are updated here, and thus FixUtoP cannot use
-        // ghost zones, KHARMA behavior in this mode will dependent on the breakdown of meshblocks & possibly
-        // erratic for many fixups.  Full algo should boundary sync -> FixUtoP -> boundary sync
+        // ghost zones. Thus KHARMA behavior in this mode will depend on the breakdown of meshblocks,
+        // & possibly erratic when there are many fixups.
+        // Full algo should boundary sync -> FixUtoP -> boundary sync
         TaskRegion &async_region = tc.AddRegion(blocks.size());
         for (int i = 0; i < blocks.size(); i++) {
             auto &pmb = blocks[i];
diff --git a/kharma/implicit/implicit.hpp b/kharma/implicit/implicit.hpp
index cba102e2..08c349fb 100644
--- a/kharma/implicit/implicit.hpp
+++ b/kharma/implicit/implicit.hpp
@@ -49,10 +49,10 @@ namespace Implicit
 std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin);
 
 /**
- * @brief take an implicit step.
+ * @brief take the per-zone implicit portion of a semi-implicit scheme
  * 
  * @param mdi the fluid state at the beginning of the step
- * @param md0 the initial fluid state
+ * @param md0 the initial fluid state for this substep
  * @param dudt the negative flux divergence plus explicit source terms
  * @param md1 the final fluid state
  * @param dt the timestep (current substep)
diff --git a/kharma/kharma.cpp b/kharma/kharma.cpp
index eaced6e2..e67d7e3d 100644
--- a/kharma/kharma.cpp
+++ b/kharma/kharma.cpp
@@ -42,6 +42,7 @@
 // Packages
 #include "b_flux_ct.hpp"
 #include "b_cd.hpp"
+#include "b_cleanup.hpp"
 #include "current.hpp"
 #include "electrons.hpp"
 #include "implicit.hpp"
@@ -55,7 +56,7 @@
 #include "boundaries.hpp"
 #include "fixup.hpp"
 #include "harm_driver.hpp"
-#include "iharm_restart.hpp"
+#include "resize_restart.hpp"
 
 std::shared_ptr<StateDescriptor> KHARMA::InitializeGlobals(ParameterInput *pin)
 {
@@ -77,6 +78,19 @@ std::shared_ptr<StateDescriptor> KHARMA::InitializeGlobals(ParameterInput *pin)
 
     return pkg;
 }
+void KHARMA::ResetGlobals(ParameterInput *pin, Mesh *pmesh)
+{
+    // "Globals" has already been loaded by this point: fetch it and its parameter set
+    auto globals_pkg = pmesh->packages.Get("Globals");
+    Params &globals = globals_pkg->AllParams();
+    // Clear "in_loop" so that EstimateTimestep does not try to compute
+    // a new dt from a blank 'ctop' variable, and instead
+    // just uses whatever the next step was going to be at reset
+    globals.Update("in_loop", false);
+
+    // All other globals are per-step rather than per-run state,
+    // so it is fine for Parthenon to restore them
+}
 
 void KHARMA::FixParameters(std::unique_ptr<ParameterInput>& pin)
 {
@@ -93,8 +107,8 @@ void KHARMA::FixParameters(std::unique_ptr<ParameterInput>& pin)
 
     // If we're restarting (not via Parthenon), read the restart file to get most parameters
     std::string prob = pin->GetString("parthenon/job", "problem_id");
-    if (prob == "iharm_restart") {
-        ReadIharmRestartHeader(pin->GetString("iharm_restart", "fname"), pin);
+    if (prob == "resize_restart") {
+        ReadIharmRestartHeader(pin->GetString("resize_restart", "fname"), pin);
     }
 
     // Then handle coordinate systems and boundaries!
@@ -185,9 +199,12 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput>& pin)
     // Then put together what we're supposed to
     Packages_t packages;
 
-    // Read all options first so we can set their defaults here,
-    // before any packages are initialized.
+    // Read all package enablements first so we can set their defaults here,
+    // before any packages are initialized: thus they can know the full list
     std::string b_field_solver = pin->GetOrAddString("b_field", "solver", "flux_ct");
+    // Enable b_cleanup package if we want periodic cleanups OR are resizing a restart file
+    bool b_cleanup = pin->GetOrAddBoolean("b_cleanup", "on", false) ||
+                     pin->GetString("parthenon/job", "problem_id") == "resize_restart";
     // TODO enable this iff jcon is in the list of outputs
     bool add_jcon = pin->GetOrAddBoolean("GRMHD", "add_jcon", true);
     bool do_electrons = pin->GetOrAddBoolean("electrons", "on", false);
@@ -197,13 +214,14 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput>& pin)
 
     // Set the default driver way up here so packages know how to flag
     // prims vs cons (imex stepper syncs prims, but packages have to mark them that way)
-    auto driver_type = pin->GetOrAddString("driver", "type", "imex");
+    auto driver_type = pin->GetOrAddString("driver", "type", "harm");
 
-    // Global variables "package."  Anything that just, really oughta be a global
+    // Global variables "package."  Mutable global state Parthenon doesn't keep for us.
+    // Always enable.
     packages.Add(KHARMA::InitializeGlobals(pin.get()));
 
-    // Most functions and variables are in the GRMHD package,
-    // initialize it first among physics stuff
+    // Lots of common functions and variables are still in the GRMHD package,
+    // always initialize it first among physics stuff
     packages.Add(GRMHD::Initialize(pin.get()));
 
     // We'll also always want the floors package, even if floors are disabled
@@ -221,6 +239,14 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput>& pin)
         // Don't even error on bad values.  This is probably what you want
         packages.Add(B_FluxCT::Initialize(pin.get(), packages));
     }
+    // Additional cleanup on B field.
+    // Can be enabled with or without a per-step solver, currently used for restart resizing
+    if (b_cleanup) {
+        packages.Add(B_Cleanup::Initialize(pin.get(), packages));
+    }
+    // Unless both a field solver and cleanup routine are disabled,
+    // there is some form of B field present/declared.
+    bool b_field_exists = !(b_field_solver == "none" && !b_cleanup);
 
     // Implicit timestepping has a few of its own functions
     bool implicit_step = pin->GetOrAddString("driver", "step", "explicit") == "implicit";
@@ -228,9 +254,8 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput>& pin)
         packages.Add(Implicit::Initialize(pin.get()));
     }
 
-    // Even if we want to, there's no adding current if we don't know B.
-    // Avoid it.
-    if (add_jcon && b_field_solver != "none") {
+    // Add jcon, so long as there's a field to calculate it from
+    if (add_jcon && b_field_exists) {
         packages.Add(Current::Initialize(pin.get()));
     }
 
diff --git a/kharma/kharma.hpp b/kharma/kharma.hpp
index bf124032..1c1290be 100644
--- a/kharma/kharma.hpp
+++ b/kharma/kharma.hpp
@@ -58,6 +58,8 @@ Packages_t ProcessPackages(std::unique_ptr<ParameterInput>& pin);
  * in_loop, whether one step has been completed (for e.g. EstimateTimestep)
  */
 std::shared_ptr<StateDescriptor> InitializeGlobals(ParameterInput *pin);
+// Version for restarts, called in PostInitialize if we're restarting from a Parthenon restart file
+void ResetGlobals(ParameterInput *pin, Mesh *pmesh);
 
 /**
  * Imitate Parthenon's FillDerived call, but on only a subset of zones defined by 'domain'
diff --git a/kharma/main.cpp b/kharma/main.cpp
index cc93bd36..179689e8 100644
--- a/kharma/main.cpp
+++ b/kharma/main.cpp
@@ -129,7 +129,7 @@ int main(int argc, char *argv[])
     Flag("Parthenon Initialized");
 
 #if DEBUG
-    // Replace Parthenon signals with something that just prints a backtrace
+    // Replace Parthenon signal handlers with something that just prints a backtrace
     signal(SIGINT, print_backtrace);
     signal(SIGTERM, print_backtrace);
     signal(SIGSEGV, print_backtrace);
@@ -144,7 +144,11 @@ int main(int argc, char *argv[])
     // this usually involves global reductions for normalization
     if(MPIRank0())
         cout << "Running post-initialization tasks..." << endl;
-    KHARMA::PostInitialize(pin, pmesh, pman.IsRestart());
+
+    auto prob = pin->GetString("parthenon/job", "problem_id");
+    bool is_restart = (prob == "resize_restart") || pman.IsRestart();
+    bool is_resize = (prob == "resize_restart") && !pman.IsRestart();
+    KHARMA::PostInitialize(pin, pmesh, is_restart, is_resize);
     Flag("Post-initialization completed");
 
     // Construct a temporary driver purely for parameter parsing
@@ -188,7 +192,7 @@ int main(int argc, char *argv[])
         HARMDriver driver(pin, papp, pmesh);
         auto driver_status = driver.Execute();
     } else if (driver_type == "imex") {
-        cout << "Initializing and running GRIM driver." << endl;
+        cout << "Initializing and running IMEX driver." << endl;
         ImexDriver driver(pin, papp, pmesh);
         auto driver_status = driver.Execute();
     }
diff --git a/kharma/prob/post_initialize.cpp b/kharma/prob/post_initialize.cpp
index c111baaf..7aac23d0 100644
--- a/kharma/prob/post_initialize.cpp
+++ b/kharma/prob/post_initialize.cpp
@@ -35,6 +35,7 @@
 #include "post_initialize.hpp"
 
 #include "b_field_tools.hpp"
+#include "b_cleanup.hpp"
 #include "blob.hpp"
 #include "boundaries.hpp"
 #include "debug.hpp"
@@ -42,6 +43,7 @@
 #include "floors.hpp"
 #include "flux.hpp"
 #include "gr_coordinates.hpp"
+#include "kharma.hpp"
 #include "types.hpp"
 
 #include "seed_B_ct.hpp"
@@ -210,7 +212,7 @@ void KHARMA::SeedAndNormalizeB(ParameterInput *pin, Mesh *pmesh)
     Flag("Added B Field");
 }
 
-void KHARMA::PostInitialize(ParameterInput *pin, Mesh *pmesh, bool is_restart)
+void KHARMA::PostInitialize(ParameterInput *pin, Mesh *pmesh, bool is_restart, bool is_resize)
 {
     Flag("Post-initialization started");
     if (!is_restart)
@@ -230,46 +232,24 @@ void KHARMA::PostInitialize(ParameterInput *pin, Mesh *pmesh, bool is_restart)
 
     // TODO when (restart/non) do we need this for setting ctop?
     if (is_restart) {
-        // Recover conserved variables 
-        if (pin->GetOrAddBoolean("driver", "type", false)) {
-            for (auto &pmb : pmesh->block_list) {
-                auto rc = pmb->meshblock_data.Get();
-                // This inserts only in vicinity of some global r,th,phi
-                InsertBlob(rc.get(), pin);
-            }
-        }
 
-        auto& md = pmesh->mesh_data.GetOrAdd("base", 0);
-        auto pmb0 = md->GetBlockData(0)->GetBlockPointer();
-        const ReconstructionType& recon = pmb0->packages.Get("GRMHD")->Param<ReconstructionType>("recon");
-        switch (recon) {
-        case ReconstructionType::donor_cell:
-            Flux::GetFlux<ReconstructionType::donor_cell, X1DIR>(md.get());
-            Flux::GetFlux<ReconstructionType::donor_cell, X2DIR>(md.get());
-            Flux::GetFlux<ReconstructionType::donor_cell, X3DIR>(md.get());
-            break;
-        case ReconstructionType::linear_mc:
-            Flux::GetFlux<ReconstructionType::linear_mc, X1DIR>(md.get());
-            Flux::GetFlux<ReconstructionType::linear_mc, X2DIR>(md.get());
-            Flux::GetFlux<ReconstructionType::linear_mc, X3DIR>(md.get());
-            break;
-        case ReconstructionType::linear_vl:
-            Flux::GetFlux<ReconstructionType::linear_vl, X1DIR>(md.get());
-            Flux::GetFlux<ReconstructionType::linear_vl, X2DIR>(md.get());
-            Flux::GetFlux<ReconstructionType::linear_vl, X3DIR>(md.get());
-            break;
-        case ReconstructionType::weno5:
-            Flux::GetFlux<ReconstructionType::weno5, X1DIR>(md.get());
-            Flux::GetFlux<ReconstructionType::weno5, X2DIR>(md.get());
-            Flux::GetFlux<ReconstructionType::weno5, X3DIR>(md.get());
-            break;
-        case ReconstructionType::ppm:
-        case ReconstructionType::mp5:
-        case ReconstructionType::weno5_lower_poles:
-            cerr << "Reconstruction type not supported!  Supported reconstructions:" << endl;
-            cerr << "donor_cell, linear_mc, linear_vl, weno5" << endl;
-            exit(-5);
+        // Parthenon restored our global data for us, but we don't always want that
+        KHARMA::ResetGlobals(pin, pmesh);
+
+        // If we resized the array, cleanup any field divergence we created
+        if (is_resize) {
+            // Cleanup operates on full single MeshData as there are MPI syncs
+            auto &mbase = pmesh->mesh_data.GetOrAdd("base", 0);
+            // Clean field divergence across the whole grid
+            B_Cleanup::CleanupDivergence(mbase);
+            // Sync to make sure periodic boundaries are set
+            Flag("Boundary sync");
+            SyncAllBounds(pmesh);
         }
+
+        // TODO anything special for imex driver here?
+        // TODO there was a reconstruction here for filling ctop, but
+        // it should definitely not be necessary as first dt is set with dt_first
     }
 
     Flag("Post-initialization finished");
diff --git a/kharma/prob/post_initialize.hpp b/kharma/prob/post_initialize.hpp
index 2fc90741..28bd1008 100644
--- a/kharma/prob/post_initialize.hpp
+++ b/kharma/prob/post_initialize.hpp
@@ -52,8 +52,10 @@ void SeedAndNormalizeB(ParameterInput *pin, Mesh *pmesh);
 /**
  * Functions run over the entire mesh after per-block initialization:
  * 1. Initialize magnetic field, which must be normalized globally to respect beta_min parameter
- * 2. Initial boundary sync, including primitive values
+ * 2. Any ad-hoc additions to fluid state, e.g. add hotspots etc.
+ * 3. Initial boundary sync to populate ghost zones
+ * 4. On restarts, reset any per-run parameters & clean up B field divergence if resizing the grid
  */
-void PostInitialize(ParameterInput *pin, Mesh *pmesh, bool is_restart);
+void PostInitialize(ParameterInput *pin, Mesh *pmesh, bool is_restart, bool is_resize);
 
 }
diff --git a/kharma/prob/problem.cpp b/kharma/prob/problem.cpp
index 61e90a11..1d5c5041 100644
--- a/kharma/prob/problem.cpp
+++ b/kharma/prob/problem.cpp
@@ -47,7 +47,7 @@
 #include "bondi.hpp"
 #include "explosion.hpp"
 #include "fm_torus.hpp"
-#include "iharm_restart.hpp"
+#include "resize_restart.hpp"
 #include "kelvin_helmholtz.hpp"
 #include "bz_monopole.hpp"
 #include "mhdmodes.hpp"
@@ -92,7 +92,7 @@ void KHARMA::ProblemGenerator(MeshBlock *pmb, ParameterInput *pin)
         status = InitializeFMTorus(rc.get(), pin);
     } else if (prob == "bz_monopole") {
         status = InitializeBZMonopole(rc.get(), pin);
-    } else if (prob == "iharm_restart") {
+    } else if (prob == "resize_restart") {
         status = ReadIharmRestart(rc.get(), pin);
     } else if (prob == "noh"){
         status = InitializeNoh(rc.get(), pin);
@@ -105,7 +105,7 @@ void KHARMA::ProblemGenerator(MeshBlock *pmb, ParameterInput *pin)
     // option in perturbation->u_jitter
     // Note this defaults to zero, generally it's controlled via runtime options
     // But we *definitely* don't want it when restarting
-    if (prob != "iharm_restart" && pin->GetOrAddReal("perturbation", "u_jitter", 0.0) > 0.0) {
+    if (prob != "resize_restart" && pin->GetOrAddReal("perturbation", "u_jitter", 0.0) > 0.0) {
         PerturbU(rc.get(), pin);
     }
 
diff --git a/kharma/prob/resize.hpp b/kharma/prob/resize.hpp
new file mode 100644
index 00000000..2b0a7e1c
--- /dev/null
+++ b/kharma/prob/resize.hpp
@@ -0,0 +1,150 @@
+/* 
+ *  File: resize.hpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+
+#include "decs.hpp"
+
+// For using the ipole routines verbatim. Both macros expect n1, n2, n3 to be in the calling scope.
+// ind_sph wraps only k, ind_periodic wraps i, j, and k, so we can avoid ghost zones
+#define ind_sph(i, j, k) ( (((k)+n3) % n3) * n2 * n1 + (j) * n1 + (i))
+#define ind_periodic(i, j, k) ( (((k)+n3) % n3) * n2 * n1 + (((j)+n2) % n2) * n1 + (((i)+n1) % n1) )
+
+/**
+ * Routines for interpolating and initializing a KHARMA meshblock from the
+ * correct area of a global iharm3d restart file, used in resize_restart.cpp.
+ * Doesn't include "Elliptic maid" solver step for eliminating magnetic field
+ * divergence, see b_flux_ct for that (as it is divergence-rep dependent)
+ */
+
+/*
+ *  translates geodesic coordinates to a grid zone and returns offset
+ *  for interpolation purposes. integer index corresponds to the zone
+ *  center "below" the desired point and del[i] \in [0,1) returns the
+ *  offset from that zone center.
+ *
+ *  0    0.5    1
+ *  [     |     ]
+ *  A  B  C DE  F
+ *
+ *  startx = 0.
+ *  dx = 0.5
+ *
+ *  A -> (-1, 0.5)
+ *  B -> ( 0, 0.0)
+ *  C -> ( 0, 0.5)
+ *  D -> ( 0, 0.9)
+ *  E -> ( 1, 0.0)
+ *  F -> ( 1, 0.5)
+ */
+KOKKOS_INLINE_FUNCTION void Xtoijk(const GReal XG[GR_DIM],
+                                   const GReal startx[GR_DIM], const GReal stopx[GR_DIM],
+                                   const GReal dx[GR_DIM],
+                                   int& i, int& j, int& k, GReal del[GR_DIM])
+{
+    // If iharm3d-format restarts ever include ghosts, clip phi here (the only use of stopx, currently disabled)
+    // GReal phi = fmod(XG[3], stopx[3]);
+    // if (phi < 0.0) // TODO adapt for startx3 != 0?
+    //     phi += stopx[3];
+    GReal phi = XG[3];
+
+    // Index of the zone center at or "below" each coordinate (hence the -0.5), see note above function.
+    // (int) truncates toward zero, so the +1000/-1000 shift makes this a floor for indices above -1000.
+    i = (int) ((XG[1] - startx[1]) / dx[1] - 0.5 + 1000) - 1000;
+    j = (int) ((XG[2] - startx[2]) / dx[2] - 0.5 + 1000) - 1000;
+    k = (int) ((phi   - startx[3]) / dx[3] - 0.5 + 1000) - 1000;
+
+    // del is the offset of the point from that zone center, in units of the zone size dx
+    del[1] = (XG[1] - ((i + 0.5) * dx[1] + startx[1])) / dx[1];
+    del[2] = (XG[2] - ((j + 0.5) * dx[2] + startx[2])) / dx[2];
+    del[3] = (phi   - ((k + 0.5) * dx[3] + startx[3])) / dx[3];
+}
+
+/**
+ * Return the value at native location X of 'var', an array of n3*n2*n1 zone-centered values (i fastest) on the
+ * uniform grid starting at 'startx' with spacing 'dx'. Linear in x1/x2, and in x3 when n3 > 1. If is_spherical,
+ * X1/X2 repeat the edge zones & X3 wraps; otherwise all three indices wrap periodically.
+ */
+KOKKOS_INLINE_FUNCTION Real interp_scalar(const GReal X[GR_DIM],
+                                          const GReal startx[GR_DIM], const GReal stopx[GR_DIM],
+                                          const GReal dx[GR_DIM], const bool& is_spherical,
+                                          const int& n3, const int& n2, const int& n1,
+                                          const Real *var)
+{
+    // Lower zone index (i,j,k) and fractional offset del[1..3] of X from that zone's center
+    GReal del[GR_DIM];
+    int i, j, k;
+    Xtoijk(X, startx, stopx, dx, i, j, k, del);
+
+    Real interp;
+    if (is_spherical) {
+        // Outside the span of zone centers in the file, repeat the first/last zone in X1 and X2: clamp the
+        // index AND the weight, or the edge zones get stale weights. TODO X1 should be scaled by sqrt(-g).
+        if (i < 0) { i = 0; del[1] = 0.; }
+        if (i > n1-2) { i = n1 - 2; del[1] = 1.; }
+        if (j < 0) { j = 0; del[2] = 0.; }
+        if (j > n2-2) { j = n2 - 2; del[2] = 1.; }
+        // k auto-wraps. So do all indices for periodic boxes.
+        // interpolate in x1 and x2
+            interp = var[ind_sph(i    , j    , k)]*(1. - del[1])*(1. - del[2]) +
+                    var[ind_sph(i    , j + 1, k)]*(1. - del[1])*del[2] +
+                    var[ind_sph(i + 1, j    , k)]*del[1]*(1. - del[2]) +
+                    var[ind_sph(i + 1, j + 1, k)]*del[1]*del[2];
+
+        // then interpolate in x3 if we need
+        if (n3 > 1) {
+            interp = (1. - del[3])*interp +
+                    del[3]*(var[ind_sph(i    , j    , k + 1)]*(1. - del[1])*(1. - del[2]) +
+                            var[ind_sph(i    , j + 1, k + 1)]*(1. - del[1])*del[2] +
+                            var[ind_sph(i + 1, j    , k + 1)]*del[1]*(1. - del[2]) +
+                            var[ind_sph(i + 1, j + 1, k + 1)]*del[1]*del[2]);
+        }
+    } else {
+        // interpolate in x1 and x2
+            interp = var[ind_periodic(i    , j    , k)]*(1. - del[1])*(1. - del[2]) +
+                    var[ind_periodic(i    , j + 1, k)]*(1. - del[1])*del[2] +
+                    var[ind_periodic(i + 1, j    , k)]*del[1]*(1. - del[2]) +
+                    var[ind_periodic(i + 1, j + 1, k)]*del[1]*del[2];
+
+        // then interpolate in x3 if we need
+        if (n3 > 1) {
+            interp = (1. - del[3])*interp +
+                    del[3]*(var[ind_periodic(i    , j    , k + 1)]*(1. - del[1])*(1. - del[2]) +
+                            var[ind_periodic(i    , j + 1, k + 1)]*(1. - del[1])*del[2] +
+                            var[ind_periodic(i + 1, j    , k + 1)]*del[1]*(1. - del[2]) +
+                            var[ind_periodic(i + 1, j + 1, k + 1)]*del[1]*del[2]);
+        }
+    }
+
+    return interp;
+}
+
diff --git a/kharma/prob/iharm_restart.cpp b/kharma/prob/resize_restart.cpp
similarity index 63%
rename from kharma/prob/iharm_restart.cpp
rename to kharma/prob/resize_restart.cpp
index ce0bf702..5823c3fd 100644
--- a/kharma/prob/iharm_restart.cpp
+++ b/kharma/prob/resize_restart.cpp
@@ -1,5 +1,5 @@
 /* 
- *  File: iharm_restart.cpp
+ *  File: resize_restart.cpp
  *  
  *  BSD 3-Clause License
  *  
@@ -32,10 +32,13 @@
  *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "iharm_restart.hpp"
+#include "resize_restart.hpp"
 
+#include "b_flux_ct.hpp"
+#include "debug.hpp"
 #include "hdf5_utils.h"
 #include "mpi.hpp"
+#include "resize.hpp"
 #include "types.hpp"
 
 #include <sys/stat.h>
@@ -49,11 +52,16 @@ void periodic_x3(const GRCoordinates& G, GridVars P, int nghost, int n1, int n2,
 using namespace Kokkos;
 
 // TODO
-// Definitely check coordinate system params such that x1 in old mesh == x1 in new mesh
-// Implement Xtoijk and tri-linear (/etc) interp
-// Optimize by stashing file contents in a static pointer somewhere?
-// -> use above to re-map any restart to the given Parthenon mesh on import
-// Default to re-mapping but reintroduce option to set Parthenon mesh size to restart size
+// Record & read:
+// 1. startx/stopx/dx
+// 2. coordinate name FMKS/MKS/etc
+// 3. all coordinate params in play
+// 4. Electron MODEL bitflag param
+// 5. nprim for sanity check?
+// 6. Indication of EMHD vs MHD
+
+// TODO this code is very specific to spherical systems/boundaries or entirely periodic boxes.
+// No other boundaries/geometries are really supported.
 
 void ReadIharmRestartHeader(std::string fname, std::unique_ptr<ParameterInput>& pin)
 {
@@ -63,14 +71,24 @@ void ReadIharmRestartHeader(std::string fname, std::unique_ptr<ParameterInput>&
     // Read everything from root
     hdf5_set_directory("/");
 
-    // Get size
+    // Get the grid size
     int n1file, n2file, n3file;
     hdf5_read_single_val(&n1file, "n1", H5T_STD_I32LE);
     hdf5_read_single_val(&n2file, "n2", H5T_STD_I32LE);
     hdf5_read_single_val(&n3file, "n3", H5T_STD_I32LE);
-    pin->SetInteger("parthenon/mesh", "nx1", n1file);
-    pin->SetInteger("parthenon/mesh", "nx2", n2file);
-    pin->SetInteger("parthenon/mesh", "nx3", n3file);
+    if (pin->GetOrAddBoolean("resize_restart", "use_restart_size", false)) {
+        // This locks the mesh size to be zone-for-zone the same as the iharm3d dump file
+        pin->SetInteger("parthenon/mesh", "nx1", n1file);
+        pin->SetInteger("parthenon/mesh", "nx2", n2file);
+        pin->SetInteger("parthenon/mesh", "nx3", n3file);
+        pin->SetInteger("parthenon/meshblock", "nx1", n1file);
+        pin->SetInteger("parthenon/meshblock", "nx2", n2file);
+        pin->SetInteger("parthenon/meshblock", "nx3", n3file);
+    }
+    // Record the old values in any case
+    pin->SetInteger("parthenon/mesh", "restart_nx1", n1file);
+    pin->SetInteger("parthenon/mesh", "restart_nx2", n2file);
+    pin->SetInteger("parthenon/mesh", "restart_nx3", n3file);
 
     double gam, cour, t, dt;
     hdf5_read_single_val(&gam, "gam", H5T_IEEE_F64LE);
@@ -79,9 +97,12 @@ void ReadIharmRestartHeader(std::string fname, std::unique_ptr<ParameterInput>&
     hdf5_read_single_val(&dt, "dt", H5T_IEEE_F64LE);
 
     pin->SetReal("GRMHD", "gamma", gam);
-    //pin->SetReal("GRMHD", "cfl", cour);
+    //pin->SetReal("GRMHD", "cfl", cour);  // TODO use_cour option?
+    // Setting dt here is actually for KHARMA,
+    // which returns this from EstimateTimestep in step 0
     pin->SetReal("parthenon/time", "dt", dt);
     pin->SetReal("parthenon/time", "start_time", t);
+    // TODO NSTEP, next tdump/tlog, etc? Do KHARMA globals need anything?
 
     if (hdf5_exists("a")) {
         double a, hslope, Rout;
@@ -158,19 +179,17 @@ TaskStatus ReadIharmRestart(MeshBlockData<Real> *rc, ParameterInput *pin)
 
     auto& G = pmb->coords;
 
-    auto fname = pin->GetString("iharm_restart", "fname"); // Require this, don't guess
-    bool use_tf = pin->GetOrAddBoolean("iharm_restart", "use_tf", false);
+    auto fname = pin->GetString("resize_restart", "fname"); // Require this, don't guess
+    bool use_tf = pin->GetOrAddBoolean("resize_restart", "use_tf", false);
+    const bool is_spherical = pin->GetBoolean("coordinates", "spherical");
 
-    IndexDomain domain = IndexDomain::interior;
-    // Full mesh size
-    hsize_t n1tot = pmb->pmy_mesh->mesh_size.nx1;
-    hsize_t n2tot = pmb->pmy_mesh->mesh_size.nx2;
-    hsize_t n3tot = pmb->pmy_mesh->mesh_size.nx3;
-    // Our block size, start, and bounds for the GridVars
-    hsize_t n1 = pmb->cellbounds.ncellsi(domain);
-    hsize_t n2 = pmb->cellbounds.ncellsj(domain);
-    hsize_t n3 = pmb->cellbounds.ncellsk(domain);
+    // Size of the file mesh
+    hsize_t n1tot = pin->GetInteger("parthenon/mesh", "restart_nx1");
+    hsize_t n2tot = pin->GetInteger("parthenon/mesh", "restart_nx2");
+    hsize_t n3tot = pin->GetInteger("parthenon/mesh", "restart_nx3");
 
+    // Size/domain of the MeshBlock we're reading to
+    IndexDomain domain = IndexDomain::entire;
     int is = pmb->cellbounds.is(domain), ie = pmb->cellbounds.ie(domain);
     int js = pmb->cellbounds.js(domain), je = pmb->cellbounds.je(domain);
     int ks = pmb->cellbounds.ks(domain), ke = pmb->cellbounds.ke(domain);
@@ -179,7 +198,7 @@ TaskStatus ReadIharmRestart(MeshBlockData<Real> *rc, ParameterInput *pin)
 
     // Read everything from root
     hdf5_set_directory("/");
-
+    // Print version
     hid_t string_type = hdf5_make_str_type(20);
     char version[20];
     hdf5_read_single_val(version, "version", string_type);
@@ -187,11 +206,12 @@ TaskStatus ReadIharmRestart(MeshBlockData<Real> *rc, ParameterInput *pin)
         cout << "Restarting from " << fname << ", file version " << version << endl << endl;
     }
 
-    // Get tf here and not when reading the header, since using this
-    // value *itself* depends on a parameter, "use_tf"
-    Real tf;
+    // Get tf here and not when reading the header, since whether we use this
+    // value depends on another parameter, "use_tf," which needs to be initialized
+    double tf;
     hdf5_read_single_val(&tf, "tf", H5T_IEEE_F64LE);
 
+    // TODO do this better by recording/counting flags in MODEL
     hsize_t nfprim;
     if(hdf5_exists("game")) {
         nfprim = 10;
@@ -199,20 +219,16 @@ TaskStatus ReadIharmRestart(MeshBlockData<Real> *rc, ParameterInput *pin)
         nfprim = 8;
     }
 
-    // Declare known sizes for outputting primitives
+    // Declare known sizes for inputting/outputting primitives
+    // We'll only ever read the full block, so this is the size we want
     static hsize_t fdims[] = {nfprim, n3tot, n2tot, n1tot};
-    static hsize_t fcount[] = {nfprim, n3, n2, n1};
-
-    // TODO figure out single restart -> multi mesh
-    //hsize_t fstart[] = {0, global_start[2], global_start[1], global_start[0]};
     hsize_t fstart[] = {0, 0, 0, 0};
 
-    // These are dimensions for memory,
-    static hsize_t mdims[] = {nfprim, n3, n2, n1};
-    static hsize_t mstart[] = {0, 0, 0, 0};
-
-    Real *ptmp = new Real[nfprim*n3*n2*n1];
-    hdf5_read_array(ptmp, "p", 4, fdims, fstart, fcount, mdims, mstart, H5T_IEEE_F64LE);
+    // TODO don't repeat this read for every block!
+    // Likely requires read once in e.g. InitUserMeshData
+    // -> pass in (pointer) -> delete[] in PostInit or something
+    Real *ptmp = new double[nfprim*n3tot*n2tot*n1tot]; // These will include B & thus be double or upconverted to it
+    hdf5_read_array(ptmp, "p", 4, fdims, fstart, fdims, fdims, fstart, H5T_IEEE_F64LE);
 
     // End HDF5 reads
     hdf5_close();
@@ -222,15 +238,37 @@ TaskStatus ReadIharmRestart(MeshBlockData<Real> *rc, ParameterInput *pin)
     auto uvec_host = uvec.GetHostMirror();
     auto B_host = B_P.GetHostMirror();
 
-    // Host-side copy into the mirror.
-    // TODO traditional OpenMP still works...
+    // These are set to probably mirror the restart file,
+    // but ideally should be read straight from it.
+    const GReal startx[GR_DIM] = {0,
+        pin->GetReal("parthenon/mesh", "x1min"),
+        pin->GetReal("parthenon/mesh", "x2min"),
+        pin->GetReal("parthenon/mesh", "x3min")};
+    const GReal stopx[GR_DIM] = {0,
+        pin->GetReal("parthenon/mesh", "x1max"),
+        pin->GetReal("parthenon/mesh", "x2max"),
+        pin->GetReal("parthenon/mesh", "x3max")};
+    // Same here
+    const GReal dx[GR_DIM] = {0., (stopx[1] - startx[1])/n1tot,
+                                (stopx[2] - startx[2])/n2tot,
+                                (stopx[3] - startx[3])/n3tot};
+
+    const int block_sz = n3tot*n2tot*n1tot;
+
+    // Host-side interpolate & copy into the mirror array
+    // TODO Interpolate in native coordinates of restart
+    // NOTE: KOKKOS USES < not <=!! Therefore the RangePolicy below will seem like it is too big
     Kokkos::parallel_for("copy_restart_state",
         Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::Rank<3>>({ks, js, is}, {ke+1, je+1, ie+1}),
         KOKKOS_LAMBDA_3D {
-            rho_host(k, j, i) = ptmp[0*n3*n2*n1 + (k-ks)*n2*n1 + (j-js)*n1 + (i-is)];
-            u_host(k, j, i) = ptmp[1*n3*n2*n1 + (k-ks)*n2*n1 + (j-js)*n1 + (i-is)];
-            VLOOP uvec_host(v, k, j, i) = ptmp[(2+v)*n3*n2*n1 + (k-ks)*n2*n1 + (j-js)*n1 + (i-is)];
-            VLOOP B_host(v, k, j, i) = ptmp[(5+v)*n3*n2*n1 + (k-ks)*n2*n1 + (j-js)*n1 + (i-is)];
+            // Get the zone center location
+            GReal X[GR_DIM];
+            G.coord(k, j, i, Loci::center, X);
+            // Interpolate the value at this location from the global grid
+            rho_host(k, j, i) = interp_scalar(X, startx, stopx, dx, is_spherical, n3tot, n2tot, n1tot, &(ptmp[0*block_sz]));
+            u_host(k, j, i) = interp_scalar(X, startx, stopx, dx, is_spherical, n3tot, n2tot, n1tot, &(ptmp[1*block_sz]));
+            VLOOP uvec_host(v, k, j, i) = interp_scalar(X, startx, stopx, dx, is_spherical, n3tot, n2tot, n1tot, &(ptmp[(2+v)*block_sz]));
+            VLOOP B_host(v, k, j, i) = interp_scalar(X, startx, stopx, dx, is_spherical, n3tot, n2tot, n1tot, &(ptmp[(5+v)*block_sz]));
         }
     );
     delete[] ptmp;
@@ -242,32 +280,11 @@ TaskStatus ReadIharmRestart(MeshBlockData<Real> *rc, ParameterInput *pin)
     B_P.DeepCopy(B_host);
     Kokkos::fence();
 
-    // Initialize the guesses for fluid prims in boundary zones
-    // TODO Is this still necessary?
-    // periodic_x3(G, P, Globals::nghost, n1, n2, n3);
-
     // Set the original simulation's end time, if we wanted that
+    // Used pretty much only for MHDModes restart test
     if (use_tf) {
         pin->SetReal("parthenon/time", "tlim", tf);
     }
 
     return TaskStatus::complete;
 }
-
-// void periodic_x3(const GRCoordinates& G, GridVars P, int nghost, int n1, int n2, int n3)
-// {
-//     Kokkos::parallel_for("periodic_x3_l", MDRangePolicy<Rank<3>>({0, 0, 0}, {nghost, n2+2*nghost, n1+2*nghost}),
-//         KOKKOS_LAMBDA_3D {
-//             int kz = k + n3;
-
-//             PLOOP P(p, k, j, i) = P(p, kz, j, i);
-//         }
-//     );
-//     Kokkos::parallel_for("periodic_x3_r", MDRangePolicy<Rank<3>>({n3+nghost, 0, 0}, {n3+2*nghost, n2+2*nghost, n1+2*nghost}),
-//         KOKKOS_LAMBDA_3D {
-//             int kz = k - n3;
-
-//             PLOOP P(p, k, j, i) = P(p, kz, j, i);
-//         }
-//     );
-// }
diff --git a/kharma/prob/iharm_restart.hpp b/kharma/prob/resize_restart.hpp
similarity index 100%
rename from kharma/prob/iharm_restart.hpp
rename to kharma/prob/resize_restart.hpp
diff --git a/pars/iharm_restart.par b/pars/resize_restart.par
similarity index 59%
rename from pars/iharm_restart.par
rename to pars/resize_restart.par
index f5690b18..aebdcb59 100644
--- a/pars/iharm_restart.par
+++ b/pars/resize_restart.par
@@ -2,16 +2,19 @@
 # Very limited for the moment
 
 <parthenon/job>
-problem_id = iharm_restart
+problem_id = resize_restart
 
 <parthenon/mesh>
 refinement = none
 numlevel = 1
-nx1 = 192
+nx1 = 128
 nx2 = 128
 nx3 = 128
 
-# ONLY ONE MESH (for now)
+<parthenon/meshblock>
+nx1 = 128
+nx2 = 128
+nx3 = 64
 
 <coordinates>
 base = spherical_ks
@@ -26,26 +29,38 @@ integrator = rk2
 dt_min = 0.00001
 
 <GRMHD>
-cfl = 0.7
-gamma = 1.444444
+cfl = 0.9
+gamma = 1.666667
 
-<iharm_restart>
-fname = restarts/restart_192_gold.h5
+<resize_restart>
+fname = restart_00000001.h5
 use_tf = false
+# Ignore meshsize above and use the restart's size
+use_restart_size = false
+
+<b_cleanup>
+error_tolerance = 1e-7
+check_interval = 100
+sor_factor = 15
 
 <floors>
 rho_min_geom = 1e-6
 u_min_geom = 1e-8
 bsq_over_rho_max = 100
-bsq_over_u_max = 10000
+bsq_over_u_max = 50
 u_over_rho_max = 100
-ktot_max = 3
+
+<debug>
+verbose = 2
+flag_verbose = 1
+extra_checks = 1
 
 <parthenon/output0>
 file_type = hdf5
 dt = 1.0
 single_precision_output = true
 variables = prims.rho, prims.u, prims.uvec, prims.B
+ghost_zones = true
 
 <parthenon/output1>
 file_type = hst

From d060fc6c1957c9a506b2fcf28ed09a23f5168498 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Wed, 2 Mar 2022 15:57:42 -0600
Subject: [PATCH 09/26] Fix evolution without a B field, update tests. IMHD
 still converges at 1o

---
 .gitlab-ci.yml                   |   2 +
 kharma/grmhd/grmhd_functions.hpp |  71 ++++++++++------
 kharma/harm_driver.cpp           |   6 +-
 kharma/imex_driver.cpp           |   8 +-
 kharma/implicit/implicit.cpp     |  38 +++++----
 pars/bondi.par                   |   2 +
 pars/mhdmodes.par                |   2 +
 scripts/compare.py               |   8 +-
 scripts/quick_movie.sh           |   9 --
 scripts/quick_plot.py            | 141 -------------------------------
 tests/bondi/check.py             |   6 +-
 tests/bondi/check.sh             |   2 +-
 tests/bondi/run.sh               |   6 +-
 tests/bz_monopole/check.py       |   4 +-
 tests/bz_monopole/check.sh       |   2 +-
 tests/mhdmodes/check.py          |   4 +-
 tests/mhdmodes/check.sh          |  26 +++---
 tests/mhdmodes/run.sh            |  24 +++---
 tests/noh/check.sh               |   4 +-
 tests/restart/check.sh           |   4 +-
 tests/tilt_init/check.py         |   6 +-
 tests/tilt_init/check.sh         |   2 +-
 22 files changed, 131 insertions(+), 246 deletions(-)
 delete mode 100755 scripts/quick_movie.sh
 delete mode 100644 scripts/quick_plot.py

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 53273b62..58c9968d 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -34,6 +34,7 @@ bondi:
     paths:
       - tests/bondi/*.png
       - tests/bondi/*.hst
+      - tests/bondi/*.txt
 
 mhdmodes:
   stage: tests
@@ -47,6 +48,7 @@ mhdmodes:
     paths:
       - tests/mhdmodes/*.png
       - tests/mhdmodes/*.hst
+      - tests/mhdmodes/*.txt
 
 noh:
   stage: tests
diff --git a/kharma/grmhd/grmhd_functions.hpp b/kharma/grmhd/grmhd_functions.hpp
index 8ef89c86..134ac557 100644
--- a/kharma/grmhd/grmhd_functions.hpp
+++ b/kharma/grmhd/grmhd_functions.hpp
@@ -39,6 +39,28 @@
 #include "types.hpp"
 #include "kharma_utils.hpp"
 
+/**
+ * This namespace is solely for calc_tensor.
+ * calc_4vecs (now below this namespace) intelligently skips the bcon calculation if B field is not present
+ */
+namespace GRHD
+{
+/**
+ * Fill hd[] with row 'dir' of the hydrodynamic stress-energy tensor (first index up, second index down).
+ */
+KOKKOS_INLINE_FUNCTION void calc_tensor(const Real& rho, const Real& u, const Real& pgas,
+                                            const FourVectors& D, const int dir,
+                                            Real hd[GR_DIM])
+{
+    const Real eta = pgas + rho + u;
+    DLOOP1 {
+        hd[mu] = eta * D.ucon[dir] * D.ucov[mu] +
+                 pgas * (dir == mu);
+    }
+}
+
+}
+
 /**
  * Device-side GR(M)HD functions
  * Anything reasonably specific to doing GRHD/GRMHD, which will not change:
@@ -48,9 +70,8 @@
  * Many also have a form for split variables rho, uvec, etc, and one for a full array of primitive variables P.
  * Where all 4 combinations are used, we get 4 overloads.
  * 
- * Local full-primitives versions are templated, to accept Slices/Scratch/etc equivalently 
+ * Local full-primitives versions are templated, to accept Slices/Scratch/etc equivalently
  */
-
 namespace GRMHD
 {
 
@@ -153,6 +174,7 @@ KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const Real uvec[N
 
     G.lower(D.ucon, D.ucov, k, j, i, loc);
 
+    // This fn is guaranteed to have B values
     D.bcon[0] = 0;
     VLOOP D.bcon[0] += B_P[v] * D.ucov[v+1];
     VLOOP D.bcon[v+1] = (B_P[v] + D.bcon[0] * D.ucon[v+1]) / D.ucon[0];
@@ -171,6 +193,7 @@ KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const GridVector
 
     G.lower(D.ucon, D.ucov, k, j, i, loc);
 
+    // This fn is guaranteed to have B values
     D.bcon[0] = 0;
     VLOOP D.bcon[0] += B_P(v, k, j, i) * D.ucov[v+1];
     VLOOP D.bcon[v+1] = (B_P(v, k, j, i) + D.bcon[0] * D.ucon[v+1]) / D.ucon[0];
@@ -196,6 +219,8 @@ KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const Local& P, c
         VLOOP D.bcon[v+1] = (P(m.B1 + v) + D.bcon[0] * D.ucon[v+1]) / D.ucon[0];
 
         G.lower(D.bcon, D.bcov, 0, j, i, loc);
+    } else {
+        DLOOP1 D.bcon[mu] = D.bcov[mu] = 0.;
     }
 }
 KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m,
@@ -215,6 +240,8 @@ KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const VariablePac
         VLOOP D.bcon[v+1] = (P(m.B1 + v, k, j, i) + D.bcon[0] * D.ucon[v+1]) / D.ucon[0];
 
         G.lower(D.bcon, D.bcov, k, j, i, loc);
+    } else {
+        DLOOP1 D.bcon[mu] = D.bcov[mu] = 0.;
     }
 }
 /**
@@ -242,7 +269,7 @@ KOKKOS_INLINE_FUNCTION void calc_ucon(const GRCoordinates &G, const Real uvec[NV
 }
 
 /**
- * Global GRMHD-only "p_to_u" call: just MHD variables (no B!). TODO elminate?
+ * Global GRMHD-only "p_to_u" call: just MHD variables (uses B optionally, but no output). TODO eliminate?
  */
 template<typename Local>
 KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const Local& P, const VarMap& m_p,
@@ -263,6 +290,14 @@ KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const Local& P, const
         U(m_u.U1) =  mhd[1] * gdet;
         U(m_u.U2) =  mhd[2] * gdet;
         U(m_u.U3) =  mhd[3] * gdet;
+    } else {
+        // HD stress-energy tensor w/ first index up, second index down
+        Real hd[GR_DIM];
+        GRHD::calc_tensor(P(m_p.RHO), P(m_p.UU), (gam - 1) * P(m_p.UU), Dtmp, 0, hd);
+        U(m_u.UU) = hd[0] * gdet + U(m_u.RHO);
+        U(m_u.U1) = hd[1] * gdet;
+        U(m_u.U2) = hd[2] * gdet;
+        U(m_u.U3) = hd[3] * gdet;
     }
 }
 KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m_p,
@@ -283,6 +318,14 @@ KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const VariablePack<Re
         U(m_u.U1, k, j, i) =  mhd[1] * gdet;
         U(m_u.U2, k, j, i) =  mhd[2] * gdet;
         U(m_u.U3, k, j, i) =  mhd[3] * gdet;
+    } else {
+        // HD stress-energy tensor w/ first index up, second index down
+        Real hd[GR_DIM];
+        GRHD::calc_tensor(P(m_p.RHO, k, j, i), P(m_p.UU, k, j, i), (gam - 1) * P(m_p.UU, k, j, i), Dtmp, 0, hd);
+        U(m_u.UU, k, j, i) = hd[0] * gdet + U(m_u.RHO, k, j, i);
+        U(m_u.U1, k, j, i) = hd[1] * gdet;
+        U(m_u.U2, k, j, i) = hd[2] * gdet;
+        U(m_u.U3, k, j, i) = hd[3] * gdet;
     }
 }
 
@@ -311,25 +354,3 @@ KOKKOS_INLINE_FUNCTION void p_to_u_mhd(const GRCoordinates& G, const Real& rho,
 }
 
 }
-
-/**
- * This namespace is solely for calc_tensor.
- * calc_4vecs above intelligently skips the bcon calculation if B field is not present
- */
-namespace GRHD
-{
-/**
- * Get a row of the hydrodynamic stress-energy tensor with first index up, second index down.
- */
-KOKKOS_INLINE_FUNCTION void calc_tensor(const Real& rho, const Real& u, const Real& pgas,
-                                            const FourVectors& D, const int dir,
-                                            Real hd[GR_DIM])
-{
-    const Real eta = pgas + rho + u;
-    DLOOP1 {
-        hd[mu] = eta * D.ucon[dir] * D.ucov[mu] +
-                 pgas * (dir == mu);
-    }
-}
-
-}
diff --git a/kharma/harm_driver.cpp b/kharma/harm_driver.cpp
index f7e218f1..0bff7667 100644
--- a/kharma/harm_driver.cpp
+++ b/kharma/harm_driver.cpp
@@ -173,11 +173,11 @@ TaskCollection HARMDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
         // ADD SOURCES TO CONSERVED VARIABLES
         // Source term for GRMHD, \Gamma * T
         // TODO take this out in Minkowski space
-        auto t_flux_apply = tl.AddTask(t_flux_div, GRMHD::AddSource, mc0.get(), mdudt.get());
+        auto t_grmhd_source = tl.AddTask(t_flux_div, GRMHD::AddSource, mc0.get(), mdudt.get());
         // Source term for constraint-damping.  Applied only to B
-        auto t_b_cd_source = t_flux_apply;
+        auto t_b_cd_source = t_grmhd_source;
         if (use_b_cd) {
-            t_b_cd_source = tl.AddTask(t_flux_apply, B_CD::AddSource, mc0.get(), mdudt.get());
+            t_b_cd_source = tl.AddTask(t_grmhd_source, B_CD::AddSource, mc0.get(), mdudt.get());
         }
         // Wind source.  Applied to conserved variables similar to GR source term
         auto t_wind_source = t_b_cd_source;
diff --git a/kharma/imex_driver.cpp b/kharma/imex_driver.cpp
index 12e1a152..e19c09b9 100644
--- a/kharma/imex_driver.cpp
+++ b/kharma/imex_driver.cpp
@@ -173,11 +173,11 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
         // ADD EXPLICIT SOURCES TO CONSERVED VARIABLES
         // Source term for GRMHD, \Gamma * T
         // TODO take this out in Minkowski space
-        auto t_flux_apply = tl.AddTask(t_flux_div, GRMHD::AddSource, mc0.get(), mdudt.get());
+        auto t_grmhd_source = tl.AddTask(t_flux_div, GRMHD::AddSource, mc0.get(), mdudt.get());
         // Source term for constraint-damping.  Applied only to B
-        auto t_b_cd_source = t_flux_apply;
+        auto t_b_cd_source = t_grmhd_source;
         if (use_b_cd) {
-            t_b_cd_source = tl.AddTask(t_flux_apply, B_CD::AddSource, mc0.get(), mdudt.get());
+            t_b_cd_source = tl.AddTask(t_grmhd_source, B_CD::AddSource, mc0.get(), mdudt.get());
         }
         // Wind source.  Applied to conserved variables similar to GR source term
         auto t_wind_source = t_b_cd_source;
@@ -330,7 +330,7 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
             t_heat_electrons = tl.AddTask(t_fix_derived, Electrons::ApplyElectronHeating, sc0.get(), sc1.get());
         }
 
-        // 
+        // Make sure conserved vars are synchronized at step end
         auto t_ptou = tl.AddTask(t_heat_electrons, Flux::PtoUTask, sc1.get());
 
         auto t_step_done = t_ptou;
diff --git a/kharma/implicit/implicit.cpp b/kharma/implicit/implicit.cpp
index 5d336e5d..9b23d7a4 100644
--- a/kharma/implicit/implicit.cpp
+++ b/kharma/implicit/implicit.cpp
@@ -104,9 +104,8 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
     auto& Us_all = md0->PackVariables(std::vector<MetadataFlag>{Metadata::Conserved});
     // Flux divergence plus explicit source terms. This is what we'd be adding 
     auto& dUdt_all = dudt->PackVariables(std::vector<MetadataFlag>{Metadata::Conserved});
-    // Desired final state.  Note this is prims only: we sync these, then run P->U on each node.
-    // TODO REMEMBER TO COPY IN MD0 CONTENTS AS GUESS
-    auto& P_solver_all = md1->PackVariables(std::vector<MetadataFlag>{isPrimitive});
+    // Desired final state.
+    auto& Pf_all = md1->PackVariables(std::vector<MetadataFlag>{isPrimitive});
 
     // Note this iterator, like all of KHARMA, requires nprim == ncons
     // TODO Maybe should enforce that at start?
@@ -114,11 +113,13 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
     const int nvar = Ui_all.GetDim(4);
 
     // Workspaces for iteration, include ghosts to match indices.
-    // Probably should never need coarse/entire...
-    auto bounds = pmb0->cellbounds; //coarse ? pmb0->c_cellbounds : pmb0->cellbounds;
+    auto bounds = pmb0->cellbounds;
     const int n1 = bounds.ncellsi(IndexDomain::entire);
     const int n2 = bounds.ncellsj(IndexDomain::entire);
     const int n3 = bounds.ncellsk(IndexDomain::entire);
+    // A full space for solver iterations, as Pi/Pf may be aliased:
+    // thus we don't want to write anything until we're done.
+    ParArray5D<Real> P_solver_all("P_solver", nblock, nvar, n3, n2, n1);
 
     // The norm of the residual.  We store this to avoid the main kernel
     // also being a 2-stage reduction, which is complex and sucks.
@@ -135,15 +136,15 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
     const IndexRange jb = bounds.GetBoundsJ(domain);
     const IndexRange kb = bounds.GetBoundsK(domain);
     const IndexRange block = IndexRange{0, nblock - 1};
-    //const IndexRange vb = IndexRange{0, nvar - 1};
+    const IndexRange vb = IndexRange{0, nvar - 1};
 
     // Allocate scratch space
     // It is impossible to declare runtime-sized arrays in CUDA
     // of e.g. length var[nvar] (recall nvar can change at runtime in KHARMA)
     // Instead we copy to scratch!
-    // This allows flexibility in structuring the kernel, as
-    // well as slicing, which in turn allows writing just *one* version of each operation!
-    // Older versions of KHARMA solved this with overloads, it was a mess.  This is less mess.
+    // This allows flexibility in structuring the kernel, and the results can be sliced
+    // to avoid a bunch of indices in all the device-side operations
+    // See grmhd_functions.hpp for the other approach with overloads
     const int scratch_level = 1; // 0 is actual scratch (tiny); 1 is HBM
     const size_t var_size_in_bytes = parthenon::ScratchPad2D<Real>::shmem_size(nvar, n1);
     const size_t tensor_size_in_bytes = parthenon::ScratchPad3D<Real>::shmem_size(nvar, nvar, n1);
@@ -192,7 +193,7 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
                         if (iter == 0) {
                             P_solver_s(ip, i) = Ps_all(b)(ip, k, j, i);
                         } else {
-                            P_solver_s(ip, i) = P_solver_all(b)(ip, k, j, i);
+                            P_solver_s(ip, i) = P_solver_all(b, ip, k, j, i);
                         }
                     }
                 );
@@ -224,10 +225,9 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
 
                         // Jacobian calculation
                         // Requires calculating the residual anyway, so we grab it here
-                        // (the array will eventually hold delta_prim, after the matrix solve)
                         calc_jacobian(G, P_solver, Ui, Us, dUdt, dUi, tmp1, tmp2, tmp3,
                                       m_p, m_u, nvar, j, i, delta, gam, dt, jacobian, residual);
-                        // Initial delta prim is negative residual
+                        // Solve against the negative residual
                         PLOOP delta_prim(ip) = -residual(ip);
 
                         // if (am_rank0 && b == 0 && i == 8 && j == 8 && k == 8) {
@@ -243,6 +243,7 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
 
                         // Linear solve
                         // This code lightly adapted from Kokkos batched examples
+                        // Replaces the negative residual stored in delta_prim with the actual desired delta_prim
                         KokkosBatched::SerialLU<Algo::LU::Unblocked>::invoke(jacobian, tiny);
                         KokkosBatched::SerialTrsv<Uplo::Lower,Trans::NoTranspose,Diag::NonUnit,Algo::Trsv::Unblocked>
                         ::invoke(alpha, jacobian, delta_prim);
@@ -277,13 +278,13 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
                 // This combo still works if P_solver is aliased to one of the other arrays!
                 PLOOP parthenon::par_for_inner(member, ib.s, ib.e,
                     [&](const int& i) {
-                        P_solver_all(b)(ip, k, j, i) = P_solver_s(ip, i);
+                        P_solver_all(b, ip, k, j, i) = P_solver_s(ip, i);
                     }
                 );
             }
         );
-
-        // L2 norm maximum.
+        
+        // Take the maximum L2 norm
         Real max_norm;
         Kokkos::Max<Real> norm_max(max_norm);
         pmb0->par_reduce("max_norm", block.s, block.e, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
@@ -295,6 +296,13 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
         if (MPIRank0()) fprintf(stdout, "Nonlinear iter %d. Max L2 norm: %g\n", iter, max_norm);
     }
 
+    // Write to Pf
+    pmb0->par_for("write_Pf", block.s, block.e, vb.s, vb.e, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        KOKKOS_LAMBDA_MESH_VARS {
+            Pf_all(b)(p, k, j, i) = P_solver_all(b, p, k, j, i);
+        }
+    );
+
     return TaskStatus::complete;
 
 }
diff --git a/pars/bondi.par b/pars/bondi.par
index d143a8cf..3341de54 100644
--- a/pars/bondi.par
+++ b/pars/bondi.par
@@ -58,6 +58,8 @@ verbose = 0
 <driver>
 type = harm
 step = explicit
+
+<implicit>
 max_nonlinear_iter = 3
 
 <parthenon/output0>
diff --git a/pars/mhdmodes.par b/pars/mhdmodes.par
index 219d8861..83e22dd4 100644
--- a/pars/mhdmodes.par
+++ b/pars/mhdmodes.par
@@ -59,6 +59,8 @@ verbose = 0
 <driver>
 type = harm
 step = explicit
+
+<implicit>
 max_nonlinear_iter = 3
 
 <parthenon/output0>
diff --git a/scripts/compare.py b/scripts/compare.py
index 6caad739..a24b2fb2 100644
--- a/scripts/compare.py
+++ b/scripts/compare.py
@@ -8,10 +8,10 @@
 
 from __future__ import print_function, division
 
-import pyHARM
-from pyHARM import parameters
-import pyHARM.ana.plot as pplt
-import pyHARM.util as util
+import pyharm
+from pyharm import parameters
+import pyharm.plots.plot_dumps as pplt
+import pyharm.util as util
 
 import os,sys
 import numpy as np
diff --git a/scripts/quick_movie.sh b/scripts/quick_movie.sh
deleted file mode 100755
index 04371a2f..00000000
--- a/scripts/quick_movie.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-NAME=$1
-VAR=$2
-mkdir -p frames_${NAME}_${VAR}
-cd frames_${NAME}_${VAR}
-
-SCRIPT_DIR="$(dirname $0)"
-parallel -P 8 python $SCRIPT_DIR/quick_plot.py {} $SCRIPT_DIR/../pars/${NAME}.par $VAR frame_{#} ::: ../${NAME}.*.phdf
diff --git a/scripts/quick_plot.py b/scripts/quick_plot.py
deleted file mode 100644
index aceb814c..00000000
--- a/scripts/quick_plot.py
+++ /dev/null
@@ -1,141 +0,0 @@
-"""
- File: quick_plot.py
- 
- BSD 3-Clause License
- 
- Copyright (c) 2020, AFD Group at UIUC
- All rights reserved.
- 
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 
- 1. Redistributions of source code must retain the above copyright notice, this
-    list of conditions and the following disclaimer.
- 
- 2. Redistributions in binary form must reproduce the above copyright notice,
-    this list of conditions and the following disclaimer in the documentation
-    and/or other materials provided with the distribution.
- 
- 3. Neither the name of the copyright holder nor the names of its
-    contributors may be used to endorse or promote products derived from
-    this software without specific prior written permission.
- 
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-"""
-
-################################################################################
-#                                                                              #
-#  PLOT ONE PRIMITIVE                                                          #
-#                                                                              #
-################################################################################
-
-import sys
-import numpy as np
-import matplotlib
-matplotlib.use('Agg')
-import matplotlib.pyplot as plt
-
-import cProfile
-
-# TODO package interface...
-import pyHARM
-import pyHARM.ana.plot as pplt
-from pyHARM import pretty
-from pyHARM.ana.units import get_units_M87
-import pyHARM.parameters as parameters
-
-# TODO parse these instead of hard-coding
-USEARRSPACE = True
-
-if not USEARRSPACE:
-    SIZE = 50
-    #window = (0, SIZE, 0, SIZE)
-    window = (-SIZE, SIZE, -SIZE, SIZE)
-    # window=(-SIZE/4, SIZE/4, 0, SIZE)
-else:
-    window = (0, 1, 0, 1)
-    #window = (-0.1, 1.1, -0.1, 1.1)
-
-pdf_window = (-10, 0)
-FIGX = 10
-FIGY = 10
-
-dumpfile = sys.argv[1]
-parfile = sys.argv[2]
-var = sys.argv[3]
-# Optionally take extra name, otherwise just set it to var
-name = sys.argv[-1]
-
-if len(sys.argv) > 5:
-    munit = float(sys.argv[4])
-    cgs = get_units_M87(munit)
-    print("Uisng M_unit: ", munit)
-    unit = cgs[sys.argv[3]]
-    print("Will multiply by unit {} with value {}".format(sys.argv[3], unit))
-    name = var + "_units"
-else:
-    unit = 1
-
-#params = {'include_ghost': True}
-params = {}
-parameters.parse_parthenon_dat(params, parfile)
-parameters.fix(params)
-dump = pyHARM.load_dump(dumpfile, params=params)
-
-# Plot vectors in 4-pane layout
-# fig = plt.figure(figsize=(FIGX, FIGY))
-# plt.title(pretty(var))
-
-# if var in ['jcon', 'jcov', 'ucon', 'ucov', 'bcon', 'bcov']:
-#     axes = [plt.subplot(2, 2, i) for i in range(1, 5)]
-#     for n in range(4):
-#         pplt.plot_xy(axes[n], dump, np.log10(dump[var][n] * unit), arrayspace=USEARRSPACE, window=window)
-# elif "pdf_" in var:
-#     fig = plt.figure(figsize=(FIGX, FIGY))
-#     d_var, d_var_bins = dump[var]
-#     plt.plot(d_var_bins[:-1], d_var)
-#     if "_log_" in var:
-#         plt.xlabel("Log10 value")
-#     elif "_ln_" in var:
-#         plt.xlabel("Ln value")
-#     else:
-#         plt.xlabel("Value")
-#     plt.ylabel("Frequency")
-
-#     plt.savefig(name+".png", dpi=100)
-#     plt.close(fig)
-#     exit() # We already saved the figure, we don't need another
-# else:
-#     # TODO allow specifying vmin/max, average from command line or above
-#     ax = plt.subplot(1, 1, 1)
-#     pplt.plot_xy(ax, dump, dump[var] * unit, log=False, arrayspace=USEARRSPACE, window=window)
-
-# plt.tight_layout()
-# plt.savefig(name + "_xy.png", dpi=100)
-# plt.close(fig)
-
-# Plot XZ
-fig = plt.figure(figsize=(FIGX, FIGY))
-
-if var in ['jcon', 'jcov', 'ucon', 'ucov', 'bcon', 'bcov']:
-    axes = [plt.subplot(2, 2, i) for i in range(1, 5)]
-    for n in range(4):
-        pplt.plot_xz(axes[n], dump, np.log10(dump[var][n] * unit), arrayspace=USEARRSPACE, window=window)
-else:
-    ax = plt.subplot(1, 1, 1)
-    pplt.plot_xz(ax, dump, dump[var] * unit, log=False, arrayspace=USEARRSPACE, window=window)
-    #pplt.overlay_field(ax, dump, nlines=5, arrayspace=USEARRSPACE)
-
-plt.tight_layout()
-
-plt.savefig(name + "_xz.png", dpi=100)
-plt.close(fig)
diff --git a/tests/bondi/check.py b/tests/bondi/check.py
index 8ef6a1fb..1c0aecdf 100644
--- a/tests/bondi/check.py
+++ b/tests/bondi/check.py
@@ -6,7 +6,7 @@
 import numpy as np
 import matplotlib.pyplot as plt
 
-import pyHARM
+import pyharm
 
 RES = [int(x) for x in sys.argv[1].split(",")]
 LONG = sys.argv[2]
@@ -19,8 +19,8 @@
 
 # 2d
 for res in RES:
-    start = pyHARM.load_dump("bondi_2d_{}_start_{}.phdf".format(res, SHORT))
-    end = pyHARM.load_dump("bondi_2d_{}_end_{}.phdf".format(res, SHORT))
+    start = pyharm.load_dump("bondi_2d_{}_start_{}.phdf".format(res, SHORT))
+    end = pyharm.load_dump("bondi_2d_{}_end_{}.phdf".format(res, SHORT))
     params = start.params
 
     r = start['r'][:,start['n2']//2]
diff --git a/tests/bondi/check.sh b/tests/bondi/check.sh
index ad13c435..85304def 100755
--- a/tests/bondi/check.sh
+++ b/tests/bondi/check.sh
@@ -3,7 +3,7 @@
 # Run checks against analytic result for specified tests
 
 . ~/libs/anaconda3/etc/profile.d/conda.sh
-conda activate pyHARM
+conda activate pyharm
 
 res="32,48,64,96,128"
 python check.py $res "in 2D, FMKS coordinates" fmks || fail=1
diff --git a/tests/bondi/run.sh b/tests/bondi/run.sh
index b7c7e4c6..ac5f4435 100755
--- a/tests/bondi/run.sh
+++ b/tests/bondi/run.sh
@@ -20,10 +20,10 @@ conv_2d() {
 # Test coordinates (raw ks?)
 conv_2d fmks coordinates/transform=fmks
 conv_2d mks coordinates/transform=mks
-conv_2d eks coordinates/transform=eks # TODO fix eks in pyHARM
+conv_2d eks coordinates/transform=eks
 # Recon
 conv_2d linear_mc GRMHD/reconstruction=linear_mc
 conv_2d linear_vl GRMHD/reconstruction=linear_vl
 # And the GRIM/classic driver
-conv_2d imex driver/type=grim
-conv_2d imex_im "driver/type=grim driver/step=implicit"
+conv_2d imex driver/type=imex
+conv_2d imex_im "driver/type=imex driver/step=implicit"
diff --git a/tests/bz_monopole/check.py b/tests/bz_monopole/check.py
index c33274b7..d6c53e2f 100755
--- a/tests/bz_monopole/check.py
+++ b/tests/bz_monopole/check.py
@@ -6,8 +6,8 @@
 import numpy as np
 import matplotlib.pyplot as plt
 
-import pyHARM
-import pyHARM.ana.plot as hplt
+import pyharm
+import pyharm.plots.plot_dumps as hplt
 
 for dumpname in np.sort(glob.glob("bz_monopole.out0.*.phdf")):
     dump = pyHARM.load_dump(dumpname)
diff --git a/tests/bz_monopole/check.sh b/tests/bz_monopole/check.sh
index 3792d1d6..2bcb2b13 100755
--- a/tests/bz_monopole/check.sh
+++ b/tests/bz_monopole/check.sh
@@ -3,6 +3,6 @@
 # Run checks against analytic result for specified tests
 
 . ~/libs/anaconda3/etc/profile.d/conda.sh
-conda activate pyHARM
+conda activate pyharm
 
 python3 ./check.py
diff --git a/tests/mhdmodes/check.py b/tests/mhdmodes/check.py
index ccafb1dd..581765ef 100644
--- a/tests/mhdmodes/check.py
+++ b/tests/mhdmodes/check.py
@@ -6,7 +6,7 @@
 import numpy as np
 import matplotlib.pyplot as plt
 
-import pyHARM
+import pyharm
 
 RES = [int(x) for x in sys.argv[1].split(",")]
 LONG = sys.argv[2]
@@ -95,7 +95,7 @@
 # USE DUMPS IN FOLDERS OF GIVEN FORMAT
 for m, res in enumerate(RES):
     print(DIM, res, SHORT)
-    dump = pyHARM.load_dump("mhd_{}_{}_end_{}.phdf".format(DIM, res, SHORT))
+    dump = pyharm.load_dump("mhd_{}_{}_end_{}.phdf".format(DIM, res, SHORT))
     params = dump.params
 
     X1 = dump['x']
diff --git a/tests/mhdmodes/check.sh b/tests/mhdmodes/check.sh
index 498df83e..61c69b88 100755
--- a/tests/mhdmodes/check.sh
+++ b/tests/mhdmodes/check.sh
@@ -3,27 +3,27 @@
 # Run checks against analytic result for specified tests
 
 . ~/libs/anaconda3/etc/profile.d/conda.sh
-conda activate pyHARM
+conda activate pyharm
 
 RES3D="16,24,32,48"
 RES2D="32,64,128,256"
 
 fail=0
-python3 check.py $RES3D "entropy mode in 3D" entropy || fail=1
-python3 check.py $RES3D "slow mode in 3D" slow || fail=1
-python3 check.py $RES3D "Alfven mode in 3D" alfven || fail=1
-python3 check.py $RES3D "fast mode in 3D" fast || fail=1
+#python3 check.py $RES3D "entropy mode in 3D" entropy || fail=1
+#python3 check.py $RES3D "slow mode in 3D" slow || fail=1
+#python3 check.py $RES3D "Alfven mode in 3D" alfven || fail=1
+#python3 check.py $RES3D "fast mode in 3D" fast || fail=1
 
-python3 check.py $RES3D "entropy mode in 3D, linear/MC reconstruction" entropy_mc || fail=1
-python3 check.py $RES3D "entropy mode in 3D, linear/VL reconstruction" entropy_vl || fail=1
+#python3 check.py $RES3D "entropy mode in 3D, linear/MC reconstruction" entropy_mc || fail=1
+#python3 check.py $RES3D "entropy mode in 3D, linear/VL reconstruction" entropy_vl || fail=1
 
-python3 check.py $RES3D "slow mode in 3D, classic algo" slow_grim || fail=1
-python3 check.py $RES3D "Alfven mode in 3D, classic algo" alfven_grim || fail=1
-python3 check.py $RES3D "fast mode in 3D, classic algo" fast_grim || fail=1
+#python3 check.py $RES3D "slow mode in 3D, classic algo" slow_imex || fail=1
+#python3 check.py $RES3D "Alfven mode in 3D, classic algo" alfven_imex || fail=1
+#python3 check.py $RES3D "fast mode in 3D, classic algo" fast_imex || fail=1
 
-python3 check.py $RES3D "slow mode in 3D, classic algo" slow_grim_im || fail=1
-python3 check.py $RES3D "Alfven mode in 3D, classic algo" alfven_grim_im || fail=1
-python3 check.py $RES3D "fast mode in 3D, classic algo" fast_grim_im || fail=1
+python3 check.py $RES3D "slow mode in 3D, classic algo" slow_imex_im || fail=1
+python3 check.py $RES3D "Alfven mode in 3D, classic algo" alfven_imex_im || fail=1
+python3 check.py $RES3D "fast mode in 3D, classic algo" fast_imex_im || fail=1
 
 #python3 check.py $RES2D "fast mode in 2D, WENO5" fast2d 2d || fail=1
 #python3 check.py $RES2D "fast mode in 2D, linear/MC reconstruction" fast_mc 2d || fail=1
diff --git a/tests/mhdmodes/run.sh b/tests/mhdmodes/run.sh
index 02b129db..5aba2b09 100755
--- a/tests/mhdmodes/run.sh
+++ b/tests/mhdmodes/run.sh
@@ -47,21 +47,21 @@ conv_1d() {
 }
 
 # These 3 double as a demo of why WENO is great
-conv_3d entropy mhdmodes/nmode=0
-conv_3d entropy_mc "mhdmodes/nmode=0 GRMHD/reconstruction=linear_mc"
-conv_3d entropy_vl "mhdmodes/nmode=0 GRMHD/reconstruction=linear_vl"
+#conv_3d entropy mhdmodes/nmode=0
+#conv_3d entropy_mc "mhdmodes/nmode=0 GRMHD/reconstruction=linear_mc"
+#conv_3d entropy_vl "mhdmodes/nmode=0 GRMHD/reconstruction=linear_vl"
 # Other modes don't benefit, exercise WENO most since we use it
-conv_3d slow mhdmodes/nmode=1
-conv_3d alfven mhdmodes/nmode=2
-conv_3d fast mhdmodes/nmode=3
+#conv_3d slow mhdmodes/nmode=1
+#conv_3d alfven mhdmodes/nmode=2
+#conv_3d fast mhdmodes/nmode=3
 # And we've got to test classic/GRIM stepping
-conv_3d slow_imex   "mhdmodes/nmode=1 driver/type=imex"
-conv_3d alfven_imex "mhdmodes/nmode=2 driver/type=imex"
-conv_3d fast_imex   "mhdmodes/nmode=3 driver/type=imex"
+#conv_3d slow_imex   "mhdmodes/nmode=1 driver/type=imex"
+#conv_3d alfven_imex "mhdmodes/nmode=2 driver/type=imex"
+#conv_3d fast_imex   "mhdmodes/nmode=3 driver/type=imex"
 # And the implicit solver
-conv_3d slow_imex_im   "mhdmodes/nmode=1 driver/type=imex driver/step=implicit driver/max_nonlinear_iter=3"
-conv_3d alfven_imex_im "mhdmodes/nmode=2 driver/type=imex driver/step=implicit driver/max_nonlinear_iter=3"
-conv_3d fast_imex_im   "mhdmodes/nmode=3 driver/type=imex driver/step=implicit driver/max_nonlinear_iter=3"
+conv_3d slow_imex_im   "mhdmodes/nmode=1 driver/type=imex driver/step=implicit implicit/max_nonlinear_iter=3"
+conv_3d alfven_imex_im "mhdmodes/nmode=2 driver/type=imex driver/step=implicit implicit/max_nonlinear_iter=3"
+conv_3d fast_imex_im   "mhdmodes/nmode=3 driver/type=imex driver/step=implicit implicit/max_nonlinear_iter=3"
 
 # 2D modes use small blocks, could pick up some problems at MPI ranks >> 1
 # Currently very slow, plus modes are incorrect
diff --git a/tests/noh/check.sh b/tests/noh/check.sh
index 66c5bb68..84d26354 100755
--- a/tests/noh/check.sh
+++ b/tests/noh/check.sh
@@ -1,10 +1,10 @@
 #!/bin/bash
 
 BASEDIR=.
-PYHARMDIR=$HOME/Code/pyHARM
+PYHARMDIR=$HOME/Code/pyharm
 
 . ~/libs/anaconda3/etc/profile.d/conda.sh
-conda activate pyHARM
+conda activate pyharm
 
 python3 $PYHARMDIR/scripts/kharma_convert.py *.phdf
 python3 $BASEDIR/check.py . . 64,128,256,512,1024,2048,4096 1.666667
diff --git a/tests/restart/check.sh b/tests/restart/check.sh
index 565f5ced..6be7b456 100755
--- a/tests/restart/check.sh
+++ b/tests/restart/check.sh
@@ -1,9 +1,9 @@
 #!/bin/bash
 
 . ~/libs/anaconda3/etc/profile.d/conda.sh
-conda activate pyHARM
+conda activate pyharm
 
 # Set paths
 KHARMADIR=../..
 
-python3 $KHARMADIR/scripts/compare.py torus.out0.final.init.phdf torus.out0.final.restart.phdf init_vs_restart
\ No newline at end of file
+python3 $KHARMADIR/scripts/compare.py torus.out0.final.init.phdf torus.out0.final.restart.phdf init_vs_restart
diff --git a/tests/tilt_init/check.py b/tests/tilt_init/check.py
index 5f9142c0..e9771e08 100755
--- a/tests/tilt_init/check.py
+++ b/tests/tilt_init/check.py
@@ -6,11 +6,11 @@
 import numpy as np
 import matplotlib.pyplot as plt
 
-import pyHARM
-import pyHARM.ana.plot as hplt
+import pyharm
+import pyharm.plots.plot_dumps as hplt
 
 dumpname = "torus.out0.00000.phdf"
-dump = pyHARM.load_dump(dumpname, calc_derived=True)
+dump = pyharm.load_dump(dumpname, calc_derived=True)
 fig, ax = plt.subplots(1,1,figsize=(7,7))
 hplt.plot_xz(ax, dump, 'log_beta', window=[-200,200,-200,200])
 plt.savefig(dumpname+"_beta.png")
diff --git a/tests/tilt_init/check.sh b/tests/tilt_init/check.sh
index 827e7e8a..dd2a8ad8 100755
--- a/tests/tilt_init/check.sh
+++ b/tests/tilt_init/check.sh
@@ -3,6 +3,6 @@
 # Image the first dump to ensure tilted disk is created properly
 
 . ~/libs/anaconda3/etc/profile.d/conda.sh
-conda activate pyHARM
+conda activate pyharm
 
 python3 ./check.py

From 7b24261ed5f13ed6a70a263a74af280122ae895b Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Fri, 4 Mar 2022 11:01:08 -0600
Subject: [PATCH 10/26] Add EMHD Source terms

To avoid runtime "codegen" like in CoordinateEmbedding, which
was complicated & might be slow, the problem-specific closure
terms are implemented all together, and switched/controlled
by a new struct Closure.  This allows some code sharing as well.

Otherwise implementation was straightforward, glorified copy/paste
from working code in iharm3d courtesy of Vedant Dhruv.

Implicit stepper still converges at first order!
---
 kharma/CMakeLists.txt                  |   2 +-
 kharma/emhd/emhd.cpp                   | 144 ++++++++++++-
 kharma/emhd/emhd.hpp                   | 100 +++++++--
 kharma/emhd/emhd_sources.hpp           | 273 +++++++++----------------
 kharma/emhd/emhd_utils.hpp             | 161 +++++++++++++++
 kharma/flux.cpp                        |  10 +-
 kharma/flux.hpp                        |  17 +-
 kharma/flux_functions.hpp              |  87 ++++----
 kharma/grmhd/grmhd_functions.hpp       |  35 +++-
 kharma/imex_driver.cpp                 |   5 +
 kharma/implicit/implicit.cpp           |  29 ++-
 kharma/implicit/implicit.hpp           |  35 +++-
 kharma/kharma.cpp                      |   9 +-
 kharma/prob/anisotropic_conduction.hpp |  91 +++++++++
 kharma/prob/emhdmodes.hpp              | 127 ++++++++++++
 kharma/prob/kelvin_helmholtz.hpp       |   2 +-
 kharma/prob/noh.hpp                    |   9 +-
 kharma/prob/problem.cpp                |  22 +-
 pars/anisotropic_conduction.par        |  81 ++++++++
 pars/emhdmodes.par                     |  85 ++++++++
 pars/mhdmodes_emhd.par                 |  88 ++++++++
 pars/mhdmodes_implicit.par             |   4 +-
 22 files changed, 1139 insertions(+), 277 deletions(-)
 create mode 100644 kharma/emhd/emhd_utils.hpp
 create mode 100644 kharma/prob/anisotropic_conduction.hpp
 create mode 100644 kharma/prob/emhdmodes.hpp
 create mode 100644 pars/anisotropic_conduction.par
 create mode 100644 pars/emhdmodes.par
 create mode 100644 pars/mhdmodes_emhd.par

diff --git a/kharma/CMakeLists.txt b/kharma/CMakeLists.txt
index 49f05d67..9ec7e73b 100644
--- a/kharma/CMakeLists.txt
+++ b/kharma/CMakeLists.txt
@@ -52,7 +52,7 @@ target_link_libraries(${EXE_NAME} PUBLIC parthenon)
 # OPTIONS
 # These are almost universally performance trade-offs
 # TODO is there any way to make compile options less painful in CMake?
-option(FUSE_FLUX_KERNELS "Bundle the usual four flux calculation kernels (floors,R,L,apply) into one" OFF)
+option(FUSE_FLUX_KERNELS "Bundle the usual four flux calculation kernels (floors,R,L,apply) into one" ON)
 option(FUSE_EMF_KERNELS "Bundle the three emf direction kernels into one. Likely won't affect much" ON)
 option(FUSE_FLOOR_KERNELS "Bundle applying the floors and ceilings into one kernel" ON)
 option(FAST_CARTESIAN "Break operation in curved spacetimes to make Cartesian Minkowski space computations faster" OFF)
diff --git a/kharma/emhd/emhd.cpp b/kharma/emhd/emhd.cpp
index 1819041c..9fbdc25b 100644
--- a/kharma/emhd/emhd.cpp
+++ b/kharma/emhd/emhd.cpp
@@ -34,6 +34,8 @@
 #include "emhd.hpp"
 
 #include "decs.hpp"
+#include "emhd_sources.hpp"
+#include "emhd_utils.hpp"
 #include "grmhd.hpp"
 #include "kharma.hpp"
 
@@ -57,18 +59,48 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     int extra_checks = pin->GetOrAddInteger("debug", "extra_checks", 0);
     params.Add("extra_checks", extra_checks);
 
-    // Floors & fluid gamma
-    // Any parameters, like above
+    // EMHD Problem/Closure parameters
+    // GRIM uses a callback to a problem-specific implementation which sets these
+    // We share implementations in one function, controlled by these parameters
+    // These are always necessary for performing EGRMHD.
+    std::string closure_type = pin->GetString("emhd", "closure_type");
+    Real tau = pin->GetOrAddReal("emhd", "tau", 1.0);
+    Real conduction_alpha = pin->GetOrAddReal("emhd", "conduction_alpha", 1.0);
+    Real viscosity_alpha = pin->GetOrAddReal("emhd", "viscosity_alpha", 1.0);
+
+    Closure closure;
+    if (closure_type == "constant") { 
+        closure.type = ClosureType::constant;
+    } else if (closure_type == "sound_speed") {
+        closure.type = ClosureType::soundspeed;
+    } else {
+        closure.type = ClosureType::torus;
+    }
+    closure.tau = tau;
+    closure.conduction_alpha = conduction_alpha;
+    closure.viscosity_alpha = viscosity_alpha;
+    params.Add("closure", closure);
+
+
+    // Slope reconstruction on faces. Always linear: default to MC unless we're using VL everywhere
+    if (packages.Get("GRMHD")->Param<ReconstructionType>("recon") == ReconstructionType::linear_vl) {
+        params.Add("slope_recon", ReconstructionType::linear_vl);
+    } else {
+        params.Add("slope_recon", ReconstructionType::linear_mc);
+    }
+
+    // Floors specific to EMHD calculations? Currently only need to enforce bsq>0 in one denominator
 
     MetadataFlag isPrimitive = packages.Get("GRMHD")->Param<MetadataFlag>("PrimitiveFlag");
     MetadataFlag isNonideal = Metadata::AllocateNewFlag("Nonideal");
     params.Add("NonidealFlag", isNonideal);
 
-    // General options for primitive and conserved scalar variables in KHARMA
-    Metadata m_con  = Metadata({Metadata::Real, Metadata::Cell, Metadata::Independent, Metadata::FillGhost,
-                 Metadata::Restart, Metadata::Conserved, Metadata::WithFluxes, isNonideal});
-    Metadata m_prim = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived,
-                  isPrimitive, isNonideal});
+    // General options for primitive and conserved scalar variables in ImEx driver
+    // EMHD is supported only with the ImEx driver; note FillGhost/Restart are set on the primitives
+    Metadata m_con  = Metadata({Metadata::Real, Metadata::Cell, Metadata::Independent,
+                                Metadata::Conserved, Metadata::WithFluxes, isNonideal});
+    Metadata m_prim = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::FillGhost,
+                                Metadata::Restart, isPrimitive, isNonideal});
 
     // Heat conduction
     pkg->AddField("cons.q", m_con);
@@ -76,7 +108,6 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     // Pressure anisotropy
     pkg->AddField("cons.dP", m_con);
     pkg->AddField("prims.dP", m_prim);
-    // Eventually also need (most or all of) Theta, bsq, nu_emhd, chi_emhd, tau
 
     // If we want to register an EMHD-specific UtoP for some reason?
     // Likely we'll only use the post-step summary hook
@@ -85,4 +116,101 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     return pkg;
 }
 
+TaskStatus AddSource(MeshData<Real> *md, MeshData<Real> *mdudt)
+{
+
+    Flag(mdudt, "Adding EMHD Explicit Sources");
+    // Pointers
+    auto pmesh = mdudt->GetMeshPointer();
+    auto pmb0 = mdudt->GetBlockData(0)->GetBlockPointer();
+    // Options
+    const auto& gpars = pmb0->packages.Get("GRMHD")->AllParams();
+    const Real gam = gpars.Get<Real>("gamma");
+    const MetadataFlag isPrimitive = gpars.Get<MetadataFlag>("PrimitiveFlag");
+    const int ndim = pmesh->ndim;
+
+    const auto& pars = pmb0->packages.Get("EMHD")->AllParams();
+    const Closure& closure = pars.Get<Closure>("closure");
+
+    // Pack variables
+    PackIndexMap prims_map, cons_map;
+    auto P = md->PackVariables(std::vector<MetadataFlag>{isPrimitive}, prims_map);
+    auto U = md->PackVariables(std::vector<MetadataFlag>{Metadata::Conserved}, cons_map);
+    auto dUdt = mdudt->PackVariables(std::vector<MetadataFlag>{Metadata::Conserved});
+    const VarMap m_p(prims_map, false), m_u(cons_map, true);
+
+    // Get sizes, declare temporary ucov, Theta for gradients
+    const int n1 = pmb0->cellbounds.ncellsi(IndexDomain::entire);
+    const int n2 = pmb0->cellbounds.ncellsj(IndexDomain::entire);
+    const int n3 = pmb0->cellbounds.ncellsk(IndexDomain::entire);
+    const int nb = dUdt.GetDim(5);
+    GridVector ucov_s("ucov", nb, GR_DIM, n3, n2, n1);
+    GridScalar theta_s("Theta", nb, n3, n2, n1);
+
+    // Get ranges
+    const IndexRange ib = mdudt->GetBoundsI(IndexDomain::interior);
+    const IndexRange jb = mdudt->GetBoundsJ(IndexDomain::interior);
+    const IndexRange kb = mdudt->GetBoundsK(IndexDomain::interior);
+    const IndexRange block = IndexRange{0, nb - 1};
+    // 1-zone halo in nontrivial dimensions
+    const IndexRange il = IndexRange{ib.s-1, ib.e+1};
+    const IndexRange jl = (ndim > 1) ? IndexRange{jb.s-1, jb.e+1} : jb;
+    const IndexRange kl = (ndim > 2) ? IndexRange{kb.s-1, kb.e+1} : kb;
+
+    // Fill the ucov and Theta temporaries, over a 1-zone halo, for the gradients taken below
+    pmb0->par_for("emhd_sources_pre", block.s, block.e, kl.s, kl.e, jl.s, jl.e, il.s, il.e,
+        KOKKOS_LAMBDA_MESH_3D {
+            const auto& G = dUdt.GetCoords(b);
+            const GReal gdet = G.gdet(Loci::center, j, i);
+            // ucon
+            Real ucon[GR_DIM], ucov[GR_DIM];
+            GRMHD::calc_ucon(G, P(b), m_p, k, j, i, Loci::center, ucon);
+            G.lower(ucon, ucov, k, j, i, Loci::center);
+            DLOOP1 ucov_s(b, mu, k, j, i) = ucov[mu];
+            // theta
+            theta_s(b, k, j, i) = max((gam - 1) * P(b)(m_p.UU, k, j, i) / P(b)(m_p.RHO, k, j, i), SMALL);
+        }
+    );
+
+    // Calculate & apply source terms
+    pmb0->par_for("emhd_sources", block.s, block.e, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        KOKKOS_LAMBDA_MESH_3D {
+            const auto& G = dUdt.GetCoords(b);
+
+            // Get the EGRMHD parameters
+            Real tau, chi, nu_e;
+            EMHD::set_parameters(G, P(b), m_p, closure, gam, k, j, i, tau, chi, nu_e);
+
+            // and the 4-vectors
+            FourVectors D;
+            GRMHD::calc_4vecs(G, P(b), m_p, k, j, i, Loci::center, D);
+            double bsq = max(dot(D.bcon, D.bcov), SMALL);
+
+            // Compute gradient of ucov and Theta
+            Real grad_ucov[GR_DIM][GR_DIM], grad_Theta[GR_DIM];
+            EMHD::gradient_calc(G, P(b), ucov_s, theta_s, b, k, j, i, (ndim > 2), grad_ucov, grad_Theta);
+
+            // Compute div of ucon (all terms but the time-derivative ones are nonzero)
+            Real div_ucon = 0;
+            DLOOP2 div_ucon += G.gcon(Loci::center, mu, nu, j, i) * grad_ucov[mu][nu];
+
+            // Compute+add explicit source terms (conduction and viscosity)
+            const Real& rho = P(b)(m_p.RHO, k, j, i);
+            Real q0 = 0;
+            DLOOP1 q0 -= rho * chi * (D.bcon[mu] / sqrt(bsq)) * grad_Theta[mu];
+            DLOOP2 q0 -= rho * chi * (D.bcon[mu] / sqrt(bsq)) * theta_s(b, k, j, i) * D.ucon[nu] * grad_ucov[nu][mu];
+
+            Real deltaP0 = -rho * nu_e * div_ucon;
+            DLOOP2  deltaP0 += 3. * rho * nu_e * (D.bcon[mu] * D.bcon[nu] / bsq) * grad_ucov[mu][nu];
+
+            // TODO edit this when higher order terms are considered
+            dUdt(b, m_u.Q, k, j, i)  += G.gdet(Loci::center, j, i) * q0 / tau;
+            dUdt(b, m_u.DP, k, j, i) += G.gdet(Loci::center, j, i) * deltaP0 / tau;
+        }
+    );
+
+    Flag(mdudt, "Added");
+    return TaskStatus::complete;
+}
+
 } // namespace EMHD
diff --git a/kharma/emhd/emhd.hpp b/kharma/emhd/emhd.hpp
index 4c35a9f3..3a57c0ce 100644
--- a/kharma/emhd/emhd.hpp
+++ b/kharma/emhd/emhd.hpp
@@ -48,33 +48,105 @@ using namespace parthenon;
  * implemented in KHARMA as ImexDriver.
  */
 namespace EMHD {
+
+enum ClosureType{constant=0, soundspeed, torus};
+
+class Closure {
+    public:
+        ClosureType type;
+        Real tau;
+        Real conduction_alpha;
+        Real viscosity_alpha;
+
+};
+
 /**
- * Initialization: declare any fields this package will evolve, initialize any parameters
+ * Initialization: handle parameters, declare the fields this package evolves
  */
 std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t packages);
 
 /**
- * TODO standard interface for implicit solver & what that needs, similar to UtoP/prim_to_flux definitions
+ * Add EGRMHD explicit source terms: anything which can be calculated once
+ * and added to the general dU/dt term along with e.g. GRMHD source, wind, etc
  */
+TaskStatus AddSource(MeshData<Real> *md, MeshData<Real> *mdudt);
 
 /**
- * Whatever form these take for viscous variables
+ * Set chi, nu, tau. Problem dependent
+ * 
+ * TODO Local & Global, when we're sure
  */
-KOKKOS_INLINE_FUNCTION void prim_to_flux(const GRCoordinates& G, const ScratchPad2D<Real>& P, const VarMap& m_p, const FourVectors D,
-                                         const int& k, const int& j, const int& i, const int dir,
-                                         ScratchPad2D<Real>& flux, const VarMap m_u, const Loci loc=Loci::center)
+template<typename Local>
+KOKKOS_INLINE_FUNCTION void set_parameters(const GRCoordinates& G, const Local& P, const VarMap& m_p,
+                                           const Closure& closure, const Real& gam,
+                                           Real& tau, Real& chi, Real& nu)
 {
-    // Calculate flux through a face from primitives
+    if (closure.type == ClosureType::constant) {
+        // Set tau, nu, chi to constants
+        tau = closure.tau;
+        chi = closure.conduction_alpha;
+        nu  = closure.viscosity_alpha;
+    } else if (closure.type == ClosureType::soundspeed) {
+        // Set tau=const, chi/nu prop. to sound speed squared
+        Real cs2 = (gam * (gam - 1.) * P(m_p.UU)) / (P(m_p.RHO) + (gam * P(m_p.UU)));
+
+        tau = closure.tau;
+        chi = closure.conduction_alpha * cs2 * tau;
+        nu  = closure.viscosity_alpha * cs2 * tau;
+    } else if (closure.type == ClosureType::torus) {
+        // Something complicated
+    } // else yell
 }
-KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m_p,
-                                         const int& k, const int& j, const int& i,
-                                         const VariablePack<Real>& flux, const VarMap m_u, const Loci loc=Loci::center)
+template<typename Global>
+KOKKOS_INLINE_FUNCTION void set_parameters(const GRCoordinates& G, const Global& P, const VarMap& m_p,
+                                           const Closure& closure, const Real& gam,
+                                           const int& k, const int& j, const int& i,
+                                           Real& tau, Real& chi, Real& nu)
 {
-    // Calculate conserved variables from primitives
-}
+    if (closure.type == ClosureType::constant) {
+        // Set tau, nu, chi to constants
+        tau = closure.tau;
+        chi = closure.conduction_alpha;
+        nu  = closure.viscosity_alpha;
+    } else if (closure.type == ClosureType::soundspeed) {
+        // Set tau=const, chi/nu prop. to sound speed squared
+        const Real cs2 = (gam * (gam - 1.) * P(m_p.UU, k, j, i)) /
+                            (P(m_p.RHO, k, j, i) + (gam * P(m_p.UU, k, j, i)));
 
-KOKKOS_INLINE_FUNCTION void gradient_calc(const GRCoordinates& G, const GridVector var, int loc, int i, int j, int k, double grad[NDIM]);
+        tau = closure.tau;
+        chi = closure.conduction_alpha * cs2 * tau;
+        nu  = closure.viscosity_alpha * cs2 * tau;
+    } else if (closure.type == ClosureType::torus) {
+        // Something complicated
+    } // else yell
+}
 
-KOKKOS_INLINE_FUNCTION void gradient_calc_vec(const GRCoordinates& G, const GridVector var, int loc, int i, int j, int k, double grad_vec[NDIM][NDIM]);
+/**
+ * Get a row of the EMHD stress-energy tensor with first index up, second index down.
+ * A factor of sqrt(4 pi) is absorbed into the definition of b.
+ * Note this must be passed the full q, dP, not the primitive prims.q, usually denoted qtilde
+ *
+ * Entirely local!
+ */
+KOKKOS_INLINE_FUNCTION void calc_tensor(const Real& rho, const Real& u, const Real& pgas,
+                                        const Real& q, const Real& dP,
+                                        const FourVectors& D, const int& dir,
+                                        Real emhd[GR_DIM])
+{
+    const Real bsq = max(dot(D.bcon, D.bcov), SMALL);
+    const Real eta = pgas + rho + u + bsq;
+    const Real ptot = pgas + 0.5 * bsq;
 
+    DLOOP1 {
+        emhd[mu] = eta * D.ucon[dir] * D.ucov[mu]
+                  + ptot * (dir == mu)
+                  - D.bcon[dir] * D.bcov[mu]
+                  + (q / sqrt(bsq)) * ((D.ucon[dir] * D.bcov[mu]) +
+                                       (D.bcon[dir] * D.ucov[mu]))
+                  + (-dP) * ((D.bcon[dir] * D.bcov[mu] / bsq)
+                                  - (1./3) * ((dir == mu) + D.ucon[dir] * D.ucov[mu]));
+    }
 }
+
+
+} // namespace EMHD
diff --git a/kharma/emhd/emhd_sources.hpp b/kharma/emhd/emhd_sources.hpp
index 6ec7ad98..a3d86a8e 100644
--- a/kharma/emhd/emhd_sources.hpp
+++ b/kharma/emhd/emhd_sources.hpp
@@ -1,189 +1,116 @@
-
+/* 
+ *  File: emhd_sources.hpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
 #pragma once
 
 #include "decs.hpp"
 
+#include "emhd.hpp"
+#include "gr_coordinates.hpp"
+#include "grmhd_functions.hpp"
+
 /**
- * Implicit source terms for EMHD
+ * The various implicit/solved source terms for EGRMHD evolution.
+ * Explicit terms are added in emhd.cpp
  */
-KOKKOS_INLINE_FUNCTION void emhd_implicit_sources(const GRCoordinates& G, const Local& P, const VarMap& m_p,
-                                                  const Local& dU, const VarMap& m_u)
-{
-    Real gdet = G.gdet(loc, j, i);
-    Real tau = 0. //HFSDAJKHFASDHJLASFD
-    dU(m_u.Q)  = -gdet * (P(m_p.Q) / tau);
-    dU(m_u.DP) = -gdet * (P(m_p.DP) / tau);
-}
 
+namespace EMHD {
 
-KOKKOS_INLINE_FUNCTION void emhd_time_derivative_sources(const GRCoordinates& G, const Local& P, const VarMap& m_p,
-                                                         const Local& dU, const VarMap& m_u)
+/**
+ * Implicit source terms for EMHD, evaluated during implicit step calculation
+ */
+template<typename Local>
+KOKKOS_INLINE_FUNCTION void implicit_sources(const GRCoordinates& G, const Local& P, const VarMap& m_p,
+                                             const Real& gam, const int& j, const int& i,
+                                             const Closure& closure,
+                                             Real& dUq, Real& dUdP)
 {
-
-    // Initializations
-    double rho      = P(m_p.RHO);
-    double Theta    = S->Theta[k][j][i];
-    double bsq      = S->bsq[k][j][i];
-    double chi_emhd = S->chi_emhd[k][j][i];
-    double nu_emhd  = S->nu_emhd[k][j][i];
-    double tau      = S->tau[k][j][i];
-
-    double gdet = G->gdet[loc][j][i];
-
-    // Compute partial derivative of ucov
-    double dt_ucov[GR_DIM];
-    DLOOP1 {
-        double ucov_new = S_new->ucov[mu][k][j][i];
-        double ucov_old = S_old->ucov[mu][k][j][i];
-
-        dt_ucov[mu] = (ucov_new - ucov_old) / dt;
-    }
-
-    // Compute div of ucon (only temporal part is nonzero)
-    double div_ucon = 0;
-    DLOOP1 {
-        double gcon_t_mu = G->gcon[loc][0][mu][j][i];
-
-        div_ucon += gcon_t_mu * dt_ucov[mu];
-    }
-
-    // Compute q0 and delta_P0 (temporal terms)
-    double Theta_new, Theta_old, dt_Theta;
-    Theta_new = S_new->Theta[k][j][i];
-    Theta_old = S_old->Theta[k][j][i];
-
-    dt_Theta = (Theta_new - Theta_old) / dt;
-
-    double q0, deltaP0;
-    double bcon_t  = S->bcon[0][k][j][i];
-
-    q0 = -rho * chi_emhd * (bcon_t / sqrt(bsq)) * dt_Theta;
-    DLOOP1 {
-        double ucon_t  = S->ucon[0][k][j][i];
-        double bcon_mu = S->bcon[mu][k][j][i];
-
-        q0 -= rho * chi_emhd * (bcon_mu / sqrt(bsq)) * Theta * ucon_t * dt_ucov[mu];
-    }
-
-    deltaP0 = -rho * nu_emhd * div_ucon;
-    DLOOP1 {
-        double bcon_mu = S->bcon[mu][k][j][i];
-
-        deltaP0 += 3. * rho * nu_emhd * (bcon_t * bcon_mu / bsq) * dt_ucov[mu];
-    }
-
-    // Add the time derivative source terms (conduction and viscosity)
-    // NOTE: Will have to edit this when higher order terms are considered
-    dU(Q)  += gdet * (q0 / tau);
-    dU(DP) += gdet * (deltaP0 / tau);
+    // These are intentionally the tilde versions!
+    Real tau, chi, nu;
+    EMHD::set_parameters(G, P, m_p, closure, gam, tau, chi, nu);
+    dUq  = -G.gdet(Loci::center, j, i) * (P(m_p.Q) / tau);
+    dUdP = -G.gdet(Loci::center, j, i) * (P(m_p.DP) / tau);
 }
 
-// Compute explicit source terms
-KOKKOS_INLINE_FUNCTION void emhd_explicit_sources(struct GridGeom *G, struct FluidState *S, int loc,
-                                                  int i, int j, int k, double dU_explicit)
+/**
+ * EMHD source terms requiring time derivatives, used to evaluate residual
+ * Inputs: primitives P_new, P_old and P, the closure, plus gamma, dt, j, i
+ */
+template<typename Local>
+KOKKOS_INLINE_FUNCTION void time_derivative_sources(const GRCoordinates& G, const Local& P_new,
+                                                    const Local& P_old, const Local& P,
+                                                    const VarMap& m_p, const Closure& closure,
+                                                    const Real& gam, const Real& dt, const int& j, const int& i,
+                                                    Real& dUq, Real& dUdP)
 {
-    // Extended MHD components
+    // Parameters
+    Real tau, chi, nu;
+    EMHD::set_parameters(G, P, m_p, closure, gam, tau, chi, nu);
+
+    FourVectors Dtmp;
+    GRMHD::calc_4vecs(G, P, m_p, j, i, Loci::center, Dtmp);
+    double bsq = max(dot(Dtmp.bcon, Dtmp.bcov), SMALL);
+
+    // TIME DERIVATIVES
+    Real ucon[GR_DIM], ucov_new[GR_DIM], ucov_old[GR_DIM];
+    GRMHD::calc_ucon(G, P_old, m_p, j, i, Loci::center, ucon);
+    G.lower(ucon, ucov_old, 0, j, i, Loci::center);
+    GRMHD::calc_ucon(G, P_new, m_p, j, i, Loci::center, ucon);
+    G.lower(ucon, ucov_new, 0, j, i, Loci::center);
+    Real dt_ucov[GR_DIM];
+    DLOOP1 dt_ucov[mu] = (ucov_new[mu] - ucov_old[mu]) / dt;
+
+    // Compute div of ucon (only the temporal part is nonzero)
+    Real div_ucon = 0;
+    DLOOP1 div_ucon += G.gcon(Loci::center, 0, mu, j, i) * dt_ucov[mu];
+    // dTheta/dt
+    const Real Theta_new = max((gam-1) * P_new(m_p.UU) / P_new(m_p.RHO), SMALL);
+    const Real Theta_old = max((gam-1) * P_old(m_p.UU) / P_old(m_p.RHO), SMALL);
+    const Real dt_Theta = (Theta_new - Theta_old) / dt;
+
+    // TEMPORAL SOURCE TERMS
+    const Real& rho = P(m_p.RHO);
+    const Real& Theta = (gam-1) * P(m_p.UU) / P(m_p.RHO);
+    Real q0 = -rho * chi * (Dtmp.bcon[0] / sqrt(bsq)) * dt_Theta;
+    DLOOP1 q0 -= rho * chi * (Dtmp.bcon[mu] / sqrt(bsq)) * Theta * Dtmp.ucon[0] * dt_ucov[mu];
+
+
+    Real deltaP0 = -rho * nu * div_ucon;
+    DLOOP1 deltaP0 += 3. * rho * nu * (Dtmp.bcon[0] * Dtmp.bcon[mu] / bsq) * dt_ucov[mu];
 
-    // Initializations
-
-    double rho      = S->P[RHO][k][j][i];
-    double Theta    = S->Theta[k][j][i];
-    double bsq      = S->bsq[k][j][i];
-    double chi_emhd = S->chi_emhd[k][j][i];
-    double nu_emhd  = S->nu_emhd[k][j][i];
-    double tau      = S->tau[k][j][i];
-
-    double gdet = G->gdet[loc][j][i];
-
-    double grad_ucov[GR_DIM][GR_DIM], grad_Theta[GR_DIM];
-
-    // Compute gradient of ucov and Theta
-    gradient_calc(G, S, loc, i, j, k, grad_ucov, grad_Theta);
-
-    // Compute div of ucon (all terms but the time-derivative ones are nonzero)
-    double div_ucon = 0;
-    DLOOP2 {
-        double gcon_mu_nu = G->gcon[loc][mu][nu][j][i];
-
-        div_ucon += gcon_mu_nu * grad_ucov[mu][nu];
-    }
-
-    // Compute q0 and deltaP0 (everything but the time-derivative terms)
-    double q0, deltaP0;
-
-    DLOOP1 {
-        double bcon_mu = S->bcon[mu][k][j][i];
-
-        q0 = -rho * chi_emhd * (bcon_mu / sqrt(bsq)) * grad_Theta[mu];
-    }
-
-    DLOOP2 {
-        double bcon_mu = S->bcon[mu][k][j][i];
-        double ucon_nu = S->ucon[nu][k][j][i];
-
-        q0 -= rho * chi_emhd * (bcon_mu / sqrt(bsq)) * Theta * ucon_nu * grad_ucov[nu][mu];
-    }
-
-    deltaP0 = -rho * nu_emhd * div_ucon;
-    DLOOP2  {
-        double bcon_mu = S->bcon[mu][k][j][i];
-        double bcon_nu = S->bcon[nu][k][j][i];
-
-        deltaP0 += 3. * rho * nu_emhd * (bcon_mu * bcon_nu / bsq) * grad_ucov[mu][nu];
-    }
-
-    // Add explicit source terms (conduction and viscosity)
     // NOTE: Will have to edit this when higher order terms are considered
-    dU(Q)  += gdet * (q0 / tau);
-    dU(DP) += gdet * (deltaP0) / tau;
-}
-
-// Compute gradient of four velocities and temperature
-// Called by emhd_explicit_sources
-KOKKOS_INLINE_FUNCTION void gradient_calc(struct GridGeom *G, struct FluidState *S, int loc, int i, int j, int k,
-                                          double grad_ucov[GR_DIM][GR_DIM], double grad_Theta[GR_DIM])
-{
-    // Compute gradient of ucov
-    DLOOP1 {
-        grad_ucov[0][mu] = 0;
-
-        slope_calc_4vec(S->ucov, mu, 1, i, j, k, grad_ucov[1][mu]);
-        slope_calc_4vec(S->ucov, mu, 2, i, j, k, grad_ucov[2][mu]);
-        slope_calc_4vec(S->ucov, mu, 3, i, j, k, grad_ucov[3][mu]);
-    }
-
-    DLOOP2 {
-        for (int gam = 0; gam < GR_DIM; gam++)
-            grad_ucov[mu][nu] -= G->conn[gam][mu][nu][j][i] * S->ucov[gam][k][j][i];
-    }
-
-    // Compute temperature gradient
-    // Time derivative component computed in emhd_time_derivative_sources
-    grad_Theta[0] = 0;
-    slope_calc_scalar(S->Theta, 1, i, j, k, grad_Theta[1]);
-    slope_calc_scalar(S->Theta, 2, i, j, k, grad_Theta[2]);
-    slope_calc_scalar(S->Theta, 3, i, j, k, grad_Theta[3]);
-}
-
-// Compute slope for 4 vectors
-// TODO going to need to either keep or calculate these based on recon choices
-KOKKOS_INLINE_FUNCTION void slope_calc_4vec(GridVector u, int component, int dir, int i, int j, int k, double slope)
-{
-    if (dir == 1)
-        slope = SLOPE_ALGO(u[component][k][j][i-2], u[component][k][j][i-1], u[component][k][j][i],
-                            u[component][k][j][i+1], u[component][k][j][i+2], dx[dir]);
-    if (dir == 2)
-        slope = SLOPE_ALGO(u[component][k][j-2][i], u[component][k][j-1][i], u[component][k][j][i],
-                            u[component][k][j+1][i], u[component][k][j+2][i], dx[dir]);
-    if (dir == 3)
-        slope = SLOPE_ALGO(u[component][k-2][j][i], u[component][k-1][j][i], u[component][k][j][i],
-                            u[component][k+1][j][i], u[component][k+2][j][i], dx[dir]);
+    dUq  = G.gdet(Loci::center, j, i) * (q0 / tau);
+    dUdP = G.gdet(Loci::center, j, i) * (deltaP0 / tau);
 }
 
-// Compute slope for scalars
-KOKKOS_INLINE_FUNCTION void slope_calc_scalar(GridDouble T, int dir, int i, int j, int k, double slope)
-{
-  if (dir == 1) slope = SLOPE_ALGO(T[k][j][i-2], T[k][j][i-1], T[k][j][i], T[k][j][i+1], T[k][j][i+2], dx[dir]);
-  if (dir == 2) slope = SLOPE_ALGO(T[k][j-2][i], T[k][j-1][i], T[k][j][i], T[k][j+1][i], T[k][j+2][i], dx[dir]);
-  if (dir == 3) slope = SLOPE_ALGO(T[k-2][j][i], T[k-1][j][i], T[k][j][i], T[k+1][j][i], T[k+2][j][i], dx[dir]);
-}
\ No newline at end of file
+} // namespace EMHD
diff --git a/kharma/emhd/emhd_utils.hpp b/kharma/emhd/emhd_utils.hpp
new file mode 100644
index 00000000..20c521db
--- /dev/null
+++ b/kharma/emhd/emhd_utils.hpp
@@ -0,0 +1,161 @@
+/* 
+ *  File: emhd_utils.hpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+
+#include "decs.hpp"
+
+/**
+ * Utilities for the EMHD source terms, things we might conceivably use somewhere else,
+ * or use *from* somewhere else instead of here.
+ * 
+ * 1. Slopes at faces using various linear reconstructions.  Since this is unrelated to
+ *    reconstructing all prims, and only called zone-wise, the "same" recon algos are reimplemented here
+ * 2. Calculate gradient of each component of ucov & Theta
+ */
+
+namespace EMHD {
+
+// Linear MC slope limiter
+KOKKOS_INLINE_FUNCTION Real linear_monotonized_cd(Real x1, Real x2, Real x3, Real dx)
+{
+    const Real Dqm = 2 * (x2 - x1) / dx;
+    const Real Dqp = 2 * (x3 - x2) / dx;
+    const Real Dqc = 0.5 * (x3 - x1) / dx;
+
+    if (Dqm * Dqp <= 0) {
+        return 0;
+    } else {
+        if ((fabs(Dqm) < fabs(Dqp)) && (fabs (Dqm) < fabs(Dqc))) {
+            return Dqm;
+        } else if (fabs(Dqp) < fabs(Dqc)) {
+            return Dqp;
+        } else {
+            return Dqc;
+        }
+    }
+}
+
+// Linear Van Leer slope limiter
+KOKKOS_INLINE_FUNCTION Real linear_van_leer(Real x1, Real x2, Real x3, Real dx)
+{
+    const Real Dqm = (x2 - x1) / dx;
+    const Real Dqp = (x3 - x2) / dx;
+
+    const Real extrema = Dqm * Dqp;
+
+    if (extrema <= 0) {
+        return 0;
+    } else {
+        return (2 * extrema / (Dqm + Dqp)); 
+    }
+}
+
+/**
+ * Compute slope of scalars at faces
+ */
+template<typename Global>
+KOKKOS_INLINE_FUNCTION Real slope_calc_scalar(const GRCoordinates& G, const Global& A, const int& dir,
+                                              const int& b, const int& k, const int& j, const int& i, 
+                                              ReconstructionType recon=ReconstructionType::linear_mc)
+{
+    // TODO could generic-ize this, but with two options, screw it
+    if (recon != ReconstructionType::linear_vl) {
+        if (dir == 1) return linear_monotonized_cd(A(b, k, j, i-1), A(b, k, j, i), A(b, k, j, i+1), G.dx1v(i));
+        if (dir == 2) return linear_monotonized_cd(A(b, k, j-1, i), A(b, k, j, i), A(b, k, j+1, i), G.dx2v(j));
+        if (dir == 3) return linear_monotonized_cd(A(b, k-1, j, i), A(b, k, j, i), A(b, k+1, j, i), G.dx3v(k));
+    } else {
+        if (dir == 1) return linear_van_leer(A(b, k, j, i-1), A(b, k, j, i), A(b, k, j, i+1), G.dx1v(i));
+        if (dir == 2) return linear_van_leer(A(b, k, j-1, i), A(b, k, j, i), A(b, k, j+1, i), G.dx2v(j));
+        if (dir == 3) return linear_van_leer(A(b, k-1, j, i), A(b, k, j, i), A(b, k+1, j, i), G.dx3v(k));
+    }
+    return 0.;
+}
+
+/**
+ * Compute slope of all  vectors at faces
+ */
+template<typename Global>
+KOKKOS_INLINE_FUNCTION Real slope_calc_vector(const GRCoordinates& G, const Global& A, const int& mu,
+                                              const int& dir, const int& b, const int& k, const int& j, const int& i, 
+                                              ReconstructionType recon=ReconstructionType::linear_mc)
+{
+    // TODO could generic-ize this, but with two options, screw it
+    if (recon != ReconstructionType::linear_vl) {
+        if (dir == 1) return linear_monotonized_cd(A(b, mu, k, j, i-1), A(b, mu, k, j, i), A(b, mu, k, j, i+1), G.dx1v(i));
+        if (dir == 2) return linear_monotonized_cd(A(b, mu, k, j-1, i), A(b, mu, k, j, i), A(b, mu, k, j+1, i), G.dx2v(j));
+        if (dir == 3) return linear_monotonized_cd(A(b, mu, k-1, j, i), A(b, mu, k, j, i), A(b, mu, k+1, j, i), G.dx3v(k));
+    } else {
+        if (dir == 1) return linear_van_leer(A(b, mu, k, j, i-1), A(b, mu, k, j, i), A(b, mu, k, j, i+1), G.dx1v(i));
+        if (dir == 2) return linear_van_leer(A(b, mu, k, j-1, i), A(b, mu, k, j, i), A(b, mu, k, j+1, i), G.dx2v(j));
+        if (dir == 3) return linear_van_leer(A(b, mu, k-1, j, i), A(b, mu, k, j, i), A(b, mu, k+1, j, i), G.dx3v(k));
+    }
+    return 0.;
+}
+
+// Compute gradient of four velocities (with connection terms subtracted) and of temperature
+// Called by EMHD::AddSource; ucov_s/theta_s are indexed by block b, so b is passed on every access
+template<typename Global>
+KOKKOS_INLINE_FUNCTION void gradient_calc(const GRCoordinates& G, const Global& P,
+                                          const GridVector& ucov_s, const GridScalar& theta_s,
+                                          const int& b, const int& k, const int& j, const int& i, const bool& do_3d,
+                                          Real grad_ucov[GR_DIM][GR_DIM], Real grad_Theta[GR_DIM])
+{
+    // Compute gradient of ucov
+    DLOOP1 {
+        grad_ucov[0][mu] = 0;
+
+        // slope in direction nu of component mu
+        grad_ucov[1][mu] = slope_calc_vector(G, ucov_s, mu, 1, b, k, j, i);
+        grad_ucov[2][mu] = slope_calc_vector(G, ucov_s, mu, 2, b, k, j, i);
+        if (do_3d) {
+            grad_ucov[3][mu] = slope_calc_vector(G, ucov_s, mu, 3, b, k, j, i);
+        } else {
+            grad_ucov[3][mu] = 0.;
+        }
+    }
+    DLOOP3 grad_ucov[mu][nu] -= G.conn(lam, mu, nu, j, i) * ucov_s(b, lam, k, j, i);
+
+    // Compute temperature gradient
+    // Time derivative component is computed in time_derivative_sources
+    grad_Theta[0] = 0;
+    grad_Theta[1] = slope_calc_scalar(G, theta_s, 1, b, k, j, i);
+    grad_Theta[2] = slope_calc_scalar(G, theta_s, 2, b, k, j, i);
+    if (do_3d) {
+        grad_Theta[3] = slope_calc_scalar(G, theta_s, 3, b, k, j, i);
+    } else {
+        grad_Theta[3] = 0.;
+    }
+}
+
+} // namespace EMHD
diff --git a/kharma/flux.cpp b/kharma/flux.cpp
index 1420e464..818f2c9d 100644
--- a/kharma/flux.cpp
+++ b/kharma/flux.cpp
@@ -53,8 +53,16 @@ TaskStatus Flux::PtoU(MeshBlockData<Real> *rc, IndexDomain domain)
     const bool flux_ct = pkgs.count("B_FluxCT");
     const bool b_cd = pkgs.count("B_CD");
     const bool use_electrons = pkgs.count("Electrons");
+    const bool use_emhd = pkgs.count("EMHD");
     MetadataFlag isPrimitive = pars.Get<MetadataFlag>("PrimitiveFlag");
 
+    EMHD::Closure closure_tmp;
+    if (use_emhd) {
+        const auto& emhd_pars = pmb->packages.Get("EMHD")->AllParams();
+        closure_tmp = emhd_pars.Get<EMHD::Closure>("closure");
+    }
+    const EMHD::Closure& closure = closure_tmp;
+
     // Pack variables
     PackIndexMap prims_map, cons_map;
     const auto& P_all = rc->PackVariables({isPrimitive}, prims_map);
@@ -93,7 +101,7 @@ TaskStatus Flux::PtoU(MeshBlockData<Real> *rc, IndexDomain domain)
                 [&](const int& i) {
                     auto P = Kokkos::subview(P_s, Kokkos::ALL(), i);
                     auto U = Kokkos::subview(U_s, Kokkos::ALL(), i);
-                    Flux::p_to_u(G, P, m_p, gam, j, i, U, m_u);
+                    Flux::p_to_u(G, P, m_p, closure, gam, j, i, U, m_u);
                 }
             );
 
diff --git a/kharma/flux.hpp b/kharma/flux.hpp
index 3ce68e95..9c146408 100644
--- a/kharma/flux.hpp
+++ b/kharma/flux.hpp
@@ -45,6 +45,7 @@
 #include "types.hpp"
 
 // Package functions
+#include "emhd.hpp"
 #include "grmhd_functions.hpp"
 #include "b_flux_ct.hpp"
 #include "b_cd.hpp"
@@ -129,12 +130,20 @@ inline TaskStatus GetFlux(MeshData<Real> *md)
     const bool use_b_flux_ct = pkgs.count("B_FluxCT");
     const bool use_b_cd = pkgs.count("B_CD");
     const bool use_electrons = pkgs.count("Electrons");
+    const bool use_emhd = pkgs.count("EMHD");
     // Pull flag indicating primitive variables
     const MetadataFlag isPrimitive = pars.Get<MetadataFlag>("PrimitiveFlag");
 
     const Real gam = pars.Get<Real>("gamma");
     const double ctop_max = (use_b_cd) ? globals.Get<Real>("ctop_max_last") : 0.0;
 
+    EMHD::Closure closure_tmp;
+    if (use_emhd) {
+        const auto& emhd_pars = pmb0->packages.Get("EMHD")->AllParams();
+        closure_tmp = emhd_pars.Get<EMHD::Closure>("closure");
+    }
+    const EMHD::Closure& closure = closure_tmp;
+
     const Loci loc = loc_of(dir);
 
     // Pack variables.  Keep ctop separate
@@ -223,8 +232,8 @@ inline TaskStatus GetFlux(MeshData<Real> *md)
 
                     // Left
                     GRMHD::calc_4vecs(G, Pl, m_p, j, i, loc, Dtmp);
-                    Flux::prim_to_flux(G, Pl, m_p, Dtmp, gam, j, i, 0, Ul, m_u, loc);
-                    Flux::prim_to_flux(G, Pl, m_p, Dtmp, gam, j, i, dir, Fl, m_u, loc);
+                    Flux::prim_to_flux(G, Pl, m_p, Dtmp, closure, gam, j, i, 0, Ul, m_u, loc);
+                    Flux::prim_to_flux(G, Pl, m_p, Dtmp, closure, gam, j, i, dir, Fl, m_u, loc);
 
                     // Magnetosonic speeds
                     Real cmaxL, cminL;
@@ -251,8 +260,8 @@ inline TaskStatus GetFlux(MeshData<Real> *md)
                     // Right
                     // TODO GRMHD/GRHD versions of this
                     GRMHD::calc_4vecs(G, Pr, m_p, j, i, loc, Dtmp);
-                    Flux::prim_to_flux(G, Pr, m_p, Dtmp, gam, j, i, 0, Ur, m_u, loc);
-                    Flux::prim_to_flux(G, Pr, m_p, Dtmp, gam, j, i, dir, Fr, m_u, loc);
+                    Flux::prim_to_flux(G, Pr, m_p, Dtmp, closure, gam, j, i, 0, Ur, m_u, loc);
+                    Flux::prim_to_flux(G, Pr, m_p, Dtmp, closure, gam, j, i, dir, Fr, m_u, loc);
 
                     // Magnetosonic speeds
                     Real cmaxR, cminR;
diff --git a/kharma/flux_functions.hpp b/kharma/flux_functions.hpp
index c6765b1e..0136f1b9 100644
--- a/kharma/flux_functions.hpp
+++ b/kharma/flux_functions.hpp
@@ -35,6 +35,7 @@
 
 #include "decs.hpp"
 
+#include "emhd.hpp"
 #include "gr_coordinates.hpp"
 #include "grmhd_functions.hpp"
 #include "kharma_utils.hpp"
@@ -55,22 +56,40 @@ namespace Flux
  */
 template<typename Local>
 KOKKOS_INLINE_FUNCTION void prim_to_flux(const GRCoordinates& G, const Local& P, const VarMap& m_p, const FourVectors D,
-                                         const Real& gam, const int& j, const int& i, const int dir,
+                                         const EMHD::Closure& closure, const Real& gam, const int& j, const int& i, const int dir,
                                          const Local& flux, const VarMap& m_u, const Loci loc=Loci::center)
 {
     Real gdet = G.gdet(loc, j, i);
     // Particle number flux
     flux(m_u.RHO) = P(m_p.RHO) * D.ucon[dir] * gdet;
 
-    if (m_p.B1 >= 0) {
-        // MHD stress-energy tensor w/ first index up, second index down
-        Real mhd[GR_DIM];
-        GRMHD::calc_tensor(P(m_p.RHO), P(m_p.UU), (gam - 1) * P(m_p.UU), D, dir, mhd);
-        flux(m_u.UU) = mhd[0] * gdet + flux(m_u.RHO);
-        flux(m_u.U1) = mhd[1] * gdet;
-        flux(m_u.U2) = mhd[2] * gdet;
-        flux(m_u.U3) = mhd[3] * gdet;
+    Real T[GR_DIM];
+    if (m_p.Q >= 0) {
+        // EGRMHD stress-energy tensor w/ first index up, second index down
+        // Convert prim Qtilde/dPtilde to real q/dP
+        // Real tau, chi, nu;
+        // EMHD::set_parameters(G, P, m_p, closure, gam, tau, chi, nu);
+        //const Real Theta = (gam - 1) * P(m_p.UU) / P(m_p.RHO);
+        //const Real q = (closure.higher_order) ? P(m_p.RHO) * sqrt(chi * P(m_p.RHO) * pow(Theta, 2) / tau);
+        //const Real dP = sqrt(nu * P(m_p.RHO) * Theta / tau);
+        const Real q = P(m_p.Q);
+        const Real dP = P(m_p.DP);
+        // Then calculate the tensor
+        EMHD::calc_tensor(P(m_p.RHO), P(m_p.UU), (gam - 1) * P(m_p.UU), q, dP, D, dir, T);
+    } else if (m_p.B1 >= 0) {
+        // GRMHD stress-energy tensor w/ first index up, second index down
+        GRMHD::calc_tensor(P(m_p.RHO), P(m_p.UU), (gam - 1) * P(m_p.UU), D, dir, T);
+    } else {
+        // GRHD stress-energy tensor w/ first index up, second index down
+        GRHD::calc_tensor(P(m_p.RHO), P(m_p.UU), (gam - 1) * P(m_p.UU), D, dir, T);
+    }
+    flux(m_u.UU) = T[0] * gdet + flux(m_u.RHO);
+    flux(m_u.U1) = T[1] * gdet;
+    flux(m_u.U2) = T[2] * gdet;
+    flux(m_u.U3) = T[3] * gdet;
 
+    // Magnetic field
+    if (m_p.B1 >= 0) {
         // Magnetic field
         if (dir == 0) {
             VLOOP flux(m_u.B1 + v) = P(m_p.B1 + v) * gdet;
@@ -79,29 +98,27 @@ KOKKOS_INLINE_FUNCTION void prim_to_flux(const GRCoordinates& G, const Local& P,
             // but for us this is in the source term
             VLOOP flux(m_u.B1 + v) = (D.bcon[v+1] * D.ucon[dir] - D.bcon[dir] * D.ucon[v+1]) * gdet;
         }
-    } else {
-        // HD stress-energy tensor w/ first index up, second index down
-        Real hd[GR_DIM];
-        GRHD::calc_tensor(P(m_p.RHO), P(m_p.UU), (gam - 1) * P(m_p.UU), D, dir, hd);
-        flux(m_u.UU) = hd[0] * gdet + flux(m_u.RHO);
-        flux(m_u.U1) = hd[1] * gdet;
-        flux(m_u.U2) = hd[2] * gdet;
-        flux(m_u.U3) = hd[3] * gdet;
-    }
-    if (m_p.PSI >= 0) {
         // Extra scalar psi for constraint damping, see B_CD
-        if (dir == 0) {
-            flux(m_u.PSI) = P(m_p.PSI) * gdet;
-        } else {
-            // Psi field update as in Mosta et al (IllinoisGRMHD), alternate explanation Jesse et al (2020)
-            //Real alpha = 1. / sqrt(-G.gcon(Loci::center, j, i, 0, 0));
-            //Real beta_dir = G.gcon(Loci::center, j, i, 0, dir) * alpha * alpha;
-            flux(m_u.PSI) = (D.bcon[dir] - G.gcon(Loci::center, j, i, 0, dir) * P(m_p.PSI)) * gdet;
+        if (m_p.PSI >= 0) {
+            if (dir == 0) {
+                flux(m_u.PSI) = P(m_p.PSI) * gdet;
+            } else {
+                // Psi field update as in Mosta et al (IllinoisGRMHD), alternate explanation Jesse et al (2020)
+                //Real alpha = 1. / sqrt(-G.gcon(Loci::center, j, i, 0, 0));
+                //Real beta_dir = G.gcon(Loci::center, j, i, 0, dir) * alpha * alpha;
+                flux(m_u.PSI) = (D.bcon[dir] - G.gcon(Loci::center, j, i, 0, dir) * P(m_p.PSI)) * gdet;
+            }
         }
     }
 
+    // EMHD Variables: advect like rho
+    if (m_p.Q >= 0) {
+        flux(m_u.Q) = P(m_p.Q) * D.ucon[dir] * gdet;
+        flux(m_u.DP) = P(m_p.DP) * D.ucon[dir] * gdet;
+    }
+
+    // Electrons: normalized by density
     if (m_p.KTOT >= 0) {
-        // Take the factor from the primitives, in case we need to reorder this to happen before GRMHD::prim_to_flux later
         flux(m_u.KTOT) = flux(m_u.RHO) * P(m_p.KTOT);
         if (m_p.K_CONSTANT >= 0)
             flux(m_u.K_CONSTANT) = flux(m_u.RHO) * P(m_p.K_CONSTANT);
@@ -124,19 +141,13 @@ KOKKOS_INLINE_FUNCTION void prim_to_flux(const GRCoordinates& G, const Local& P,
  */
 template<typename Local>
 KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const Local& P, const VarMap& m_p,
-                                   const Real& gam, const int& j, const int& i,
+                                   const EMHD::Closure& closure, const Real& gam, const int& j, const int& i,
                                    const Local& U, const VarMap& m_u, const Loci& loc=Loci::center)
 {
     FourVectors Dtmp;
-    GRMHD::calc_4vecs(G, P, m_p, j, i, loc, Dtmp); // TODO switch GRHD/GRMHD
-    prim_to_flux(G, P, m_p, Dtmp, gam, j, i, 0, U, m_u, loc);
+    GRMHD::calc_4vecs(G, P, m_p, j, i, loc, Dtmp); // TODO switch GRHD/GRMHD?
+    prim_to_flux(G, P, m_p, Dtmp, closure, gam, j, i, 0, U, m_u, loc);
 }
-// KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m_p,
-//                                    const Real& gam, const int& k, const int& j, const int& i,
-//                                    const VariablePack<Real>& U, const VarMap& m_u, const Loci& loc=Loci::center)
-// {
-
-// }
 
 /**
  * Calculate components of magnetosonic velocity from primitive variables
@@ -153,14 +164,14 @@ KOKKOS_INLINE_FUNCTION void vchar(const GRCoordinates& G, const Local& P, const
     Real cms2;
     if (m.B1 >= 0) {
         // Find fast magnetosonic speed
-        const Real bsq = dot(D.bcon, D.bcov);
+        const Real bsq = max(dot(D.bcon, D.bcov), SMALL);
         const Real ee = bsq + ef;
         const Real va2 = bsq / ee;
         cms2 = cs2 + va2 - cs2 * va2;
     } else {
         cms2 = cs2;
     }
-    clip(cms2, 1.e-20, 1.);
+    clip(cms2, SMALL, 1.);
 
     // Require that speed of wave measured by observer q.ucon is cms2
     Real A, B, C;
diff --git a/kharma/grmhd/grmhd_functions.hpp b/kharma/grmhd/grmhd_functions.hpp
index 134ac557..a32280a2 100644
--- a/kharma/grmhd/grmhd_functions.hpp
+++ b/kharma/grmhd/grmhd_functions.hpp
@@ -223,7 +223,8 @@ KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const Local& P, c
         DLOOP1 D.bcon[mu] = D.bcov[mu] = 0.;
     }
 }
-KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m,
+template<typename Global>
+KOKKOS_INLINE_FUNCTION void calc_4vecs(const GRCoordinates& G, const Global& P, const VarMap& m,
                                       const int& k, const int& j, const int& i, const Loci loc, FourVectors& D)
 {
     const Real gamma = lorentz_calc(G, P, m, k, j, i, loc);
@@ -267,6 +268,28 @@ KOKKOS_INLINE_FUNCTION void calc_ucon(const GRCoordinates &G, const Real uvec[NV
     ucon[0] = gamma / alpha;
     VLOOP ucon[v+1] = uvec[v] - gamma * alpha * G.gcon(loc, j, i, 0, v+1);
 }
+// Fill ucon with the contravariant four-velocity built from the velocity primitives P(m.U1 + v)
+template<typename Local>
+KOKKOS_INLINE_FUNCTION void calc_ucon(const GRCoordinates& G, const Local& P, const VarMap& m,
+                                      const int& j, const int& i, const Loci loc, Real ucon[GR_DIM])
+{
+    const Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0));
+    const Real gamma = lorentz_calc(G, P, m, j, i, loc);
+    const Real gamma_alpha = gamma * alpha;
+    ucon[0] = gamma / alpha;
+    VLOOP ucon[v+1] = P(m.U1 + v) - gamma_alpha * G.gcon(loc, j, i, 0, v+1);
+}
+// Fill ucon with the contravariant four-velocity built from the velocity primitives P(m.U1 + v, k, j, i)
+template<typename Global>
+KOKKOS_INLINE_FUNCTION void calc_ucon(const GRCoordinates& G, const Global& P, const VarMap& m,
+                                      const int& k, const int& j, const int& i, const Loci loc, Real ucon[GR_DIM])
+{
+    const Real alpha = 1. / sqrt(-G.gcon(loc, j, i, 0, 0));
+    const Real gamma = lorentz_calc(G, P, m, k, j, i, loc);
+    const Real gamma_alpha = gamma * alpha;
+    ucon[0] = gamma / alpha;
+    VLOOP ucon[v+1] = P(m.U1 + v, k, j, i) - gamma_alpha * G.gcon(loc, j, i, 0, v+1);
+}
 
 /**
  * Global GRMHD-only "p_to_u" call: just MHD variables (uses B optionally, but no output). TODO elminate?
@@ -278,7 +301,7 @@ KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const Local& P, const
 {
     Real gdet = G.gdet(loc, j, i);
     FourVectors Dtmp;
-    GRMHD::calc_4vecs(G, P, m_p, j, i, loc, Dtmp); // TODO switch GRHD/GRMHD
+    GRMHD::calc_4vecs(G, P, m_p, j, i, loc, Dtmp); // TODO switch GRHD/GRMHD?
     // Particle number flux
     U(m_u.RHO) = P(m_p.RHO) * Dtmp.ucon[0] * gdet;
 
@@ -300,9 +323,10 @@ KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const Local& P, const
         U(m_u.U3) = hd[3] * gdet;
     }
 }
-KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m_p,
+template<typename Global>
+KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const Global& P, const VarMap& m_p,
                                    const Real& gam, const int& k, const int& j, const int& i,
-                                   const VariablePack<Real>& U, const VarMap& m_u, const Loci& loc=Loci::center)
+                                   const Global& U, const VarMap& m_u, const Loci& loc=Loci::center)
 {
     Real gdet = G.gdet(loc, j, i);
     FourVectors Dtmp;
@@ -330,8 +354,7 @@ KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const VariablePack<Re
 }
 
 /**
- * Special local "p_to_u" call for just MHD variables, used in fluid frame floors & wind source.
- * See Flux::p_to_u in flux_functions.hpp for documentation.
+ * Special all-local "p_to_u" call for just MHD variables, used in fluid frame floors & wind source.
  */
 KOKKOS_INLINE_FUNCTION void p_to_u_mhd(const GRCoordinates& G, const Real& rho, const Real& u, const Real uvec[NVEC],
                                    const Real B_P[NVEC], const Real& gam, const int& k, const int& j, const int& i,
diff --git a/kharma/imex_driver.cpp b/kharma/imex_driver.cpp
index e19c09b9..7f760087 100644
--- a/kharma/imex_driver.cpp
+++ b/kharma/imex_driver.cpp
@@ -77,6 +77,7 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
     bool use_b_flux_ct = pkgs.count("B_FluxCT");
     bool use_electrons = pkgs.count("Electrons");
     bool use_wind = pkgs.count("Wind");
+    bool use_emhd = pkgs.count("EMHD");
 
     // Allocate the fluid states ("containers") we need for each block
     for (int i = 0; i < blocks.size(); i++) {
@@ -184,6 +185,10 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
         if (use_wind) {
             t_wind_source = tl.AddTask(t_b_cd_source, Wind::AddSource, mdudt.get());
         }
+        auto t_emhd_source = t_wind_source;
+        if (use_emhd) {
+            t_emhd_source = tl.AddTask(t_wind_source, EMHD::AddSource, mc0.get(), mdudt.get());
+        }
         // Done with source terms
         auto t_sources = t_wind_source;
     }
diff --git a/kharma/implicit/implicit.cpp b/kharma/implicit/implicit.cpp
index 9b23d7a4..db28a26e 100644
--- a/kharma/implicit/implicit.cpp
+++ b/kharma/implicit/implicit.cpp
@@ -91,6 +91,11 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
     const Real delta = implicit_par.Get<Real>("jacobian_delta");
     const Real gam = pmb0->packages.Get("GRMHD")->Param<Real>("gamma");
 
+    Closure closure;
+    if (pmb0->packages.AllPackages().count("EMHD")) {
+        const auto& pars = pmb0->packages.Get("EMHD")->AllParams();
+        closure = pars.Get<Closure>("closure");
+    }
 
     //MetadataFlag isNonideal = pmb0->packages.Get("EMHD")->Param<MetadataFlag>("NonidealFlag");
     MetadataFlag isPrimitive = pmb0->packages.Get("GRMHD")->Param<MetadataFlag>("PrimitiveFlag");
@@ -189,6 +194,7 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
                         Ps_s(ip, i) = Ps_all(b)(ip, k, j, i);
                         Us_s(ip, i) = Us_all(b)(ip, k, j, i);
                         dUdt_s(ip, i) = dUdt_all(b)(ip, k, j, i);
+                        dUi_s(ip, i) = 0.; // Only a few vars are populated
                         // Finally, P_solver should actually be initialized to Ps
                         if (iter == 0) {
                             P_solver_s(ip, i) = Ps_all(b)(ip, k, j, i);
@@ -197,6 +203,7 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
                         }
                     }
                 );
+                member.team_barrier();
 
                 parthenon::par_for_inner(member, ib.s, ib.e,
                     [&](const int& i) {
@@ -218,19 +225,22 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
                         // Implicit sources at starting state
                         auto dUi = Kokkos::subview(dUi_s, Kokkos::ALL(), i);
                         if (m_p.Q >= 0) {
-                            //emhd_implicit_sources(G, Si, dUi);
-                        } else {
-                            PLOOP dUi(ip) = 0;
+                            Real dUq, dUdP;
+                            EMHD::implicit_sources(G, Pi, m_p, gam, j, i, closure, dUq, dUdP);
+                            dUi(m_u.Q) = dUq;
+                            dUi(m_u.DP) = dUdP;
                         }
 
                         // Jacobian calculation
                         // Requires calculating the residual anyway, so we grab it here
-                        calc_jacobian(G, P_solver, Ui, Us, dUdt, dUi, tmp1, tmp2, tmp3,
-                                      m_p, m_u, nvar, j, i, delta, gam, dt, jacobian, residual);
+                        calc_jacobian(G, P_solver, Pi, Ui, Ps, dUdt, dUi, tmp1, tmp2, tmp3,
+                                      m_p, m_u, closure, nvar, j, i, delta, gam, dt, jacobian, residual);
                         // Solve against the negative residual
                         PLOOP delta_prim(ip) = -residual(ip);
 
                         // if (am_rank0 && b == 0 && i == 8 && j == 8 && k == 8) {
+                        //     printf("Variable ordering: rho %d uu %d u1 %d B1 %d q %d dP %d",
+                        //             m_p.RHO, m_p.UU, m_p.U1, m_p.B1, m_p.Q, m_p.DP);
                         //     printf("\nSample Jacobian and residual:");
                         //     for (int u=0; u < nvar; u++) {
                         //         printf("\n");
@@ -244,8 +254,8 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
                         // Linear solve
                         // This code lightly adapted from Kokkos batched examples
                         // Replaces our inverse residual with the actual desired delta_prim
-                        KokkosBatched::SerialLU<Algo::LU::Unblocked>::invoke(jacobian, tiny);
-                        KokkosBatched::SerialTrsv<Uplo::Lower,Trans::NoTranspose,Diag::NonUnit,Algo::Trsv::Unblocked>
+                        KokkosBatched::SerialLU<Algo::LU::Blocked>::invoke(jacobian, tiny);
+                        KokkosBatched::SerialTrsv<Uplo::Lower,Trans::NoTranspose,Diag::NonUnit,Algo::Trsv::Blocked>
                         ::invoke(alpha, jacobian, delta_prim);
 
                         // if (am_rank0 && b == 0 && i == 8 && j == 8 && k == 8) {
@@ -262,8 +272,8 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
                         // Update the guess.  For now lambda == 1, choose on the fly?
                         PLOOP P_solver(ip) += lambda * delta_prim(ip);
 
-                        calc_residual(G, P_solver, Ui, Us, dUdt, dUi, tmp3,
-                                      m_p, m_u, nvar, j, i, gam, dt, residual);
+                        calc_residual(G, P_solver, Pi, Ui, Ps, dUdt, dUi, tmp3,
+                                      m_p, m_u, closure, nvar, j, i, gam, dt, residual);
 
                         // Store for maximum/output
                         // I would be tempted to store the whole residual, but it's of variable size
@@ -273,6 +283,7 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
 
                     }
                 );
+                member.team_barrier();
 
                 // Copy out P_solver to the existing array
                 // This combo still works if P_solver is aliased to one of the other arrays!
diff --git a/kharma/implicit/implicit.hpp b/kharma/implicit/implicit.hpp
index 08c349fb..860ce3c4 100644
--- a/kharma/implicit/implicit.hpp
+++ b/kharma/implicit/implicit.hpp
@@ -34,12 +34,20 @@
 #pragma once
 
 #include "decs.hpp"
+
+#include "emhd_sources.hpp"
+#include "emhd.hpp"
 #include "flux_functions.hpp"
 #include "types.hpp"
 #include "grmhd_functions.hpp"
 
 #include <parthenon/parthenon.hpp>
 
+// This header calls EMHD functions a lot, since that
+// is the only package with code specific to the
+// implicit solver, so its namespace is pulled in here
+using namespace EMHD;
+
 namespace Implicit
 {
 
@@ -67,9 +75,10 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
  * "Local" here is anything sliced (usually Scratch) addressable var(ip)
  */
 template<typename Local>
-KOKKOS_INLINE_FUNCTION void calc_residual(const GRCoordinates& G, const Local& P_test, const Local& Ui, const Local& Us,
+KOKKOS_INLINE_FUNCTION void calc_residual(const GRCoordinates& G, const Local& P_test,
+                                          const Local& Pi, const Local& Ui, const Local& Ps,
                                           const Local& dudt_explicit, const Local& dUi, const Local& tmp, 
-                                          const VarMap& m_p, const VarMap& m_u,
+                                          const VarMap& m_p, const VarMap& m_u, const Closure& closure,
                                           const int& nvar, const int& j, const int& i,
                                           const Real& gam, const double& dt,
                                           Local& residual)
@@ -77,18 +86,21 @@ KOKKOS_INLINE_FUNCTION void calc_residual(const GRCoordinates& G, const Local& P
     // These lines calculate res = (U_test - Ui)/dt - dudt_explicit - 0.5*(dU_new(ip) + dUi(ip)) - dU_time(ip) )
     // Start with conserved vars corresponding to test P, U_test
     // Note this uses the Flux:: call, it needs *all* conserved vars!
-    Flux::p_to_u(G, P_test, m_p, gam, j, i, tmp, m_u); // U_test
+    Flux::p_to_u(G, P_test, m_p, closure, gam, j, i, tmp, m_u); // U_test
     // (U_test - Ui)/dt - dudt_explicit ...
     PLOOP residual(ip) = (tmp(ip) - Ui(ip)) / dt - dudt_explicit(ip);
 
     if (m_p.Q >= 0) {
         // Compute new implicit source terms and time derivative source terms
-        //emhd_implicit_sources(G, P_test, j, i, tmp); // dU_new
+        Real dUq, dUdP; // Don't need full array for these
+        EMHD::implicit_sources(G, P_test, m_p, gam, j, i, closure, dUq, dUdP); // dU_new
         // ... - 0.5*(dU_new(ip) + dUi(ip)) ...
-        //PLOOP residual(ip) -= 0.5*(tmp(ip) + dUi(ip));
-        //emhd_time_derivative_sources(G, P_test, Ui, Us, dt, j, i, tmp); // dU_time
+        residual(m_u.Q) -= 0.5*(dUq + dUi(m_u.Q));
+        residual(m_u.DP) -= 0.5*(dUdP + dUi(m_u.DP));
+        EMHD::time_derivative_sources(G, P_test, Pi, Ps, m_p, closure, gam, dt, j, i, dUq, dUdP); // dU_time
         // ... - dU_time(ip)
-        //PLOOP residual(ip) -= tmp(ip);
+        residual(m_u.Q) -= dUq;
+        residual(m_u.DP) -= dUdP;
     }
 }
 
@@ -99,16 +111,17 @@ KOKKOS_INLINE_FUNCTION void calc_residual(const GRCoordinates& G, const Local& P
  * Usually these are Kokkos subviews
  */
 template<typename Local, typename Local2>
-KOKKOS_INLINE_FUNCTION void calc_jacobian(const GRCoordinates& G, const Local& P, const Local& Ui, const Local& Us,
+KOKKOS_INLINE_FUNCTION void calc_jacobian(const GRCoordinates& G, const Local& P,
+                                          const Local& Pi, const Local& Ui, const Local& Ps,
                                           const Local& dudt_explicit, const Local& dUi,
                                           Local& tmp1, Local& tmp2, Local& tmp3,
-                                          const VarMap& m_p, const VarMap& m_u,
+                                          const VarMap& m_p, const VarMap& m_u, const Closure& closure,
                                           const int& nvar, const int& j, const int& i,
                                           const Real& jac_delta, const Real& gam, const double& dt,
                                           Local2& jacobian, Local& residual)
 {
     // Calculate residual for Sf->P
-    calc_residual(G, P, Ui, Us, dudt_explicit, dUi, tmp3, m_p, m_u, nvar, j, i, gam, dt, residual);
+    calc_residual(G, P, Pi, Ui, Ps, dudt_explicit, dUi, tmp3, m_p, m_u, closure, nvar, j, i, gam, dt, residual);
 
     // Use one scratchpad as the incremented prims P_delta,
     // one as the new residual residual_delta
@@ -127,7 +140,7 @@ KOKKOS_INLINE_FUNCTION void calc_jacobian(const GRCoordinates& G, const Local& P
         }
 
         // Compute the residual for P_delta, residual_delta
-        calc_residual(G, P_delta, Ui, Us, dudt_explicit, dUi, tmp3, m_p, m_u, nvar, j, i, gam, dt, residual_delta);
+        calc_residual(G, P_delta, Pi, Ui, Ps, dudt_explicit, dUi, tmp3, m_p, m_u, closure, nvar, j, i, gam, dt, residual_delta);
 
         // Compute forward derivatives of each residual vs the primitive col
         for (int row = 0; row < nvar; row++) {
diff --git a/kharma/kharma.cpp b/kharma/kharma.cpp
index e67d7e3d..24968911 100644
--- a/kharma/kharma.cpp
+++ b/kharma/kharma.cpp
@@ -214,7 +214,14 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput>& pin)
 
     // Set the default driver way up here so packages know how to flag
     // prims vs cons (imex stepper syncs prims, but packages have to mark them that way)
-    auto driver_type = pin->GetOrAddString("driver", "type", "harm");
+    std::string driver_type;
+    if (do_emhd) {
+        // Default to implicit step for EMHD
+        driver_type = pin->GetOrAddString("driver", "type", "imex");
+        pin->GetOrAddString("driver", "step", "implicit");
+    } else {
+        driver_type = pin->GetOrAddString("driver", "type", "harm");
+    }
 
     // Global variables "package."  Mutable global state Parthenon doesn't keep for us.
     // Always enable.
diff --git a/kharma/prob/anisotropic_conduction.hpp b/kharma/prob/anisotropic_conduction.hpp
new file mode 100644
index 00000000..28cf0653
--- /dev/null
+++ b/kharma/prob/anisotropic_conduction.hpp
@@ -0,0 +1,91 @@
+/* 
+ *  File: anisotropic_conduction.hpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+
+#include <complex>
+
+#include "decs.hpp"
+
+using namespace std;
+using namespace parthenon;
+
+/**
+ * Anisotropic heat conduction problem, see Chandra+ 2017
+ * Gaussian density dip (amplitude A, squared width Rsq) centered on X[1] = X[2] = 0.5 in a
+ * static fluid, with constant B1 and B2 sinusoidal in X[1]. q and dP start at zero.
+ */
+TaskStatus InitializeAnisotropicConduction(MeshBlockData<Real> *rc, ParameterInput *pin)
+{
+    Flag(rc, "Initializing Anisotropic Conduction problem");
+    auto pmb = rc->GetBlockPointer();
+    GridScalar rho = rc->Get("prims.rho").data;
+    GridScalar u = rc->Get("prims.u").data;
+    GridVector uvec = rc->Get("prims.uvec").data;
+    // It is well and good this problem should cry if B/EMHD are disabled.
+    GridVector B_P = rc->Get("prims.B").data;
+    GridScalar q = rc->Get("prims.q").data;
+    GridScalar dP = rc->Get("prims.dP").data;
+
+    const auto& G = pmb->coords;
+
+    const Real A = pin->GetOrAddReal("anisotropic_conduction", "A", 0.2);
+    const Real Rsq = pin->GetOrAddReal("anisotropic_conduction", "Rsq", 0.005);
+    const Real B0 = pin->GetOrAddReal("anisotropic_conduction", "B0", 1e-4);
+    const Real k0 = pin->GetOrAddReal("anisotropic_conduction", "k", 4.);
+    IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::entire);
+    IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::entire);
+    IndexRange kb = pmb->cellbounds.GetBoundsK(IndexDomain::entire);
+    pmb->par_for("anisotropic_conduction_init", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        KOKKOS_LAMBDA_3D {
+            Real X[GR_DIM];
+            G.coord_embed(k, j, i, Loci::center, X);
+            // Squared distance from the center, used directly: no sqrt/pow round trip
+            const GReal rsq = (X[1] - 0.5) * (X[1] - 0.5) + (X[2] - 0.5) * (X[2] - 0.5);
+
+            // Initialize primitives
+            rho(k, j, i) = 1 - (A * exp(-rsq / Rsq));
+            u(k, j, i) = 1.;
+            uvec(0, k, j, i) = 0.;
+            uvec(1, k, j, i) = 0.;
+            uvec(2, k, j, i) = 0.;
+            B_P(0, k, j, i) = B0;
+            B_P(1, k, j, i) = B0 * sin(2*M_PI*k0*X[1]);
+            B_P(2, k, j, i) = 0;
+            q(k, j, i) = 0.;
+            dP(k, j, i) = 0.;
+        }
+    );
+
+    return TaskStatus::complete;
+}
diff --git a/kharma/prob/emhdmodes.hpp b/kharma/prob/emhdmodes.hpp
new file mode 100644
index 00000000..3f0cebf0
--- /dev/null
+++ b/kharma/prob/emhdmodes.hpp
@@ -0,0 +1,127 @@
+/* 
+ *  File: emhdmodes.hpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+
+#include <complex>
+
+#include "decs.hpp"
+
+
+using namespace std::literals::complex_literals;
+using namespace std;
+using namespace parthenon;
+
+/**
+ * Initialization of analytic wave modes in magnetized plasma w/viscosity and heat conduction
+ * 
+ * Note the end time is not set -- even after exactly 1 period, EMHD modes will
+ * have lost amplitude due to having viscosity, which is kind of the point
+ */
+TaskStatus InitializeEMHDModes(MeshBlockData<Real> *rc, ParameterInput *pin)
+{
+    Flag(rc, "Initializing EMHD Modes problem");
+    auto pmb = rc->GetBlockPointer();
+    GridScalar rho = rc->Get("prims.rho").data;
+    GridScalar u = rc->Get("prims.u").data;
+    GridVector uvec = rc->Get("prims.uvec").data;
+    // Deliberately unguarded: this problem is meant to complain if B/EMHD are disabled.
+    GridVector B_P = rc->Get("prims.B").data;
+    GridVector q = rc->Get("prims.q").data;
+    GridVector dP = rc->Get("prims.dP").data;
+
+    const auto& G = pmb->coords;
+
+    const Real amp = pin->GetOrAddReal("emhdmodes", "amp", 1e-4); // overall scale applied to every perturbation below
+
+    // TODO actually calculate the mode instead of hard-coding it?  Note omega_real/omega_imag are not used below in this function.
+    const Real omega_real = pin->GetOrAddReal("emhdmodes", "omega_real", -0.5533585207638141);
+    const Real omega_imag = pin->GetOrAddReal("emhdmodes", "omega_imag", -3.6262571286888425);
+
+    // START POSSIBLE ARGS: take all these as parameters in pin?
+    // Also note this is 2D only for now: the phase below depends only on X[1] and X[2]
+    // Mean state, i.e. the unperturbed value of each primitive
+    Real rho0 = 1.;
+    Real u0 = 2.;
+    Real u10 = 0.;
+    Real u20 = 0.;
+    Real u30 = 0.;
+    Real B10 = 0.1;
+    Real B20 = 0.3;
+    Real B30 = 0.;
+    Real q0   = 0.;
+    Real delta_p0 = 0.;
+
+    // Wavevector
+    Real k1 = 2. * M_PI; // one full cycle per unit length in X[1]
+    Real k2 = 4. * M_PI; // two full cycles per unit length in X[2]
+    // END POSSIBLE ARGS
+
+    IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::entire);
+    IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::entire);
+    IndexRange kb = pmb->cellbounds.GetBoundsK(IndexDomain::entire);
+    pmb->par_for("emhdmodes_init", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        KOKKOS_LAMBDA_3D {
+            Real X[GR_DIM];
+            G.coord_embed(k, j, i, Loci::center, X);
+            const Real cos_phi = cos(k1*X[1] + k2*X[2]); // phase evaluated at the zone center
+            const Real sin_phi = sin(k1*X[1] + k2*X[2]);
+
+            // Perturbations: no higher-order terms.  Each is amp * (a*cos_phi + b*sin_phi) with hard-coded a, b
+            Real drho     = amp * (((-0.518522524082246)*cos_phi) + ((0.1792647678001878)*sin_phi));
+            Real du       = amp * ((0.5516170736393813)*cos_phi);
+            Real du1      = amp * (((0.008463122479547856)*cos_phi) + ((-0.011862022608466367)*sin_phi));
+            Real du2      = amp * (((-0.16175466371870734)*cos_phi) + ((0.034828080823603294)*sin_phi));
+            Real du3      = 0.;
+            Real dB1      = amp * (((-0.05973794979640743)*cos_phi) + ((0.03351707506150924)*sin_phi));
+            Real dB2      = amp * (((0.02986897489820372)*cos_phi) - ((0.016758537530754618)*sin_phi));
+            Real dB3      = 0.;
+            Real dq       = amp * (((0.5233486841539436)*cos_phi) - ((0.04767672501939603)*sin_phi));
+            Real ddelta_p = amp * (((0.2909106062057657)*cos_phi) - ((0.02159452055336572)*sin_phi));
+
+            // Initialize primitives as mean state plus perturbation
+            rho(k, j, i) = rho0 + drho;
+            u(k, j, i) = u0 + du;
+            uvec(0, k, j, i) = u10 + du1;
+            uvec(1, k, j, i) = u20 + du2;
+            uvec(2, k, j, i) = u30 + du3;
+            B_P(0, k, j, i) = B10 + dB1;
+            B_P(1, k, j, i) = B20 + dB2;
+            B_P(2, k, j, i) = B30 + dB3;
+            q(k, j, i) = q0 + dq;
+            dP(k, j, i) = delta_p0 + ddelta_p;
+        }
+    );
+
+    return TaskStatus::complete;
+}
diff --git a/kharma/prob/kelvin_helmholtz.hpp b/kharma/prob/kelvin_helmholtz.hpp
index 06a2cafc..53a838dc 100644
--- a/kharma/prob/kelvin_helmholtz.hpp
+++ b/kharma/prob/kelvin_helmholtz.hpp
@@ -73,7 +73,7 @@ TaskStatus InitializeKelvinHelmholtz(MeshBlockData<Real> *rc, ParameterInput *pi
     pmb->par_for("kh_init", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
         KOKKOS_LAMBDA_3D {
             GReal X[GR_DIM];
-            G.coord_embed(i, j, k, Loci::center, X);
+            G.coord_embed(k, j, i, Loci::center, X);
 
             // Lecoanet's x <-> x1; z <-> x2
             GReal x = X[1];
diff --git a/kharma/prob/noh.hpp b/kharma/prob/noh.hpp
index 2e0fac3d..9a8ceb99 100644
--- a/kharma/prob/noh.hpp
+++ b/kharma/prob/noh.hpp
@@ -1,5 +1,5 @@
 /* 
- *  File: mhdmodes.hpp
+ *  File: noh.hpp
  *  
  *  BSD 3-Clause License
  *  
@@ -38,6 +38,9 @@
 using namespace std;
 using namespace parthenon;
 
+/**
+ * Noh shock tube test.
+ */
 TaskStatus InitializeNoh(MeshBlockData<Real> *rc, ParameterInput *pin)
 {
     Flag(rc, "Initializing 1D (Noh) Shock test");
@@ -71,11 +74,11 @@ TaskStatus InitializeNoh(MeshBlockData<Real> *rc, ParameterInput *pin)
     const Real x1max = pin->GetReal("parthenon/mesh", "x1max");
     const Real center = (x1min + x1max) / 2.;
 
+    // TODO relativistic sound speed
     Real cs2 = (gam * (gam - 1) * PL) / rhoL;
     Real v1 = mach * sqrt(cs2);
 
-    if (set_tlim)
-    {
+    if (set_tlim) {
         pin->SetReal("parthenon/time", "tlim", 0.6*(x1max - x1min)/v1);
     }
 
diff --git a/kharma/prob/problem.cpp b/kharma/prob/problem.cpp
index 1d5c5041..0081831c 100644
--- a/kharma/prob/problem.cpp
+++ b/kharma/prob/problem.cpp
@@ -44,7 +44,9 @@
 #include "types.hpp"
 
 // Problem initialization headers
+#include "anisotropic_conduction.hpp"
 #include "bondi.hpp"
+#include "emhdmodes.hpp"
 #include "explosion.hpp"
 #include "fm_torus.hpp"
 #include "resize_restart.hpp"
@@ -76,6 +78,7 @@ void KHARMA::ProblemGenerator(MeshBlock *pmb, ParameterInput *pin)
     auto prob = pin->GetString("parthenon/job", "problem_id"); // Required parameter
     if (MPIRank0()) cout << "Initializing problem: " << prob << endl;
     TaskStatus status = TaskStatus::fail;
+    // GRMHD
     if (prob == "mhdmodes") {
         status = InitializeMHDModes(rc.get(), pin);
     } else if (prob == "orszag_tang") {
@@ -88,15 +91,24 @@ void KHARMA::ProblemGenerator(MeshBlock *pmb, ParameterInput *pin)
         status = InitializeShockTube(rc.get(), pin);
     } else if (prob == "bondi") {
         status = InitializeBondi(rc.get(), pin);
-    } else if (prob == "torus") {
-        status = InitializeFMTorus(rc.get(), pin);
     } else if (prob == "bz_monopole") {
         status = InitializeBZMonopole(rc.get(), pin);
+    // Electrons
+    } else if (prob == "noh") {
+        status = InitializeNoh(rc.get(), pin);
+    // Extended GRMHD
+    } else if (prob == "emhdmodes") {
+        status = InitializeEMHDModes(rc.get(), pin);
+    } else if (prob == "anisotropic_conduction") {
+        status = InitializeAnisotropicConduction(rc.get(), pin);
+    // Everything
+    } else if (prob == "torus") {
+        status = InitializeFMTorus(rc.get(), pin);
     } else if (prob == "resize_restart") {
         status = ReadIharmRestart(rc.get(), pin);
-    } else if (prob == "noh"){
-        status = InitializeNoh(rc.get(), pin);
     }
+
+    // If we didn't initialize a problem, yell
     if (status != TaskStatus::complete) {
         throw std::invalid_argument("Invalid or incomplete problem: "+prob);
     }
@@ -109,7 +121,7 @@ void KHARMA::ProblemGenerator(MeshBlock *pmb, ParameterInput *pin)
         PerturbU(rc.get(), pin);
     }
 
-    // Initialize electron entropies if enabled
+    // Initialize electron entropies to defaults if enabled
     if (pmb->packages.AllPackages().count("Electrons")) {
         Electrons::InitElectrons(rc.get(), pin);
     }
diff --git a/pars/anisotropic_conduction.par b/pars/anisotropic_conduction.par
new file mode 100644
index 00000000..31fa910d
--- /dev/null
+++ b/pars/anisotropic_conduction.par
@@ -0,0 +1,81 @@
+# Anisotropic Conduction problem
+# Heat conduction along field lines in the EMHD theory
+# of Chandra+2015
+
+<parthenon/job>
+problem_id = anisotropic_conduction
+
+<parthenon/mesh>
+refinement = none
+numlevel = 1
+
+nx1 = 256
+x1min = 0.0
+x1max = 1.0
+ix1_bc = periodic
+ox1_bc = periodic
+
+nx2 = 256
+x2min = 0.0
+x2max = 1.0
+ix2_bc = periodic
+ox2_bc = periodic
+
+nx3 = 1
+x3min = 0.0
+x3max = 1.0
+ix3_bc = periodic
+ox3_bc = periodic
+
+<parthenon/meshblock>
+nx1 = 128
+nx2 = 128
+nx3 = 1
+
+<coordinates>
+base = cartesian_minkowski
+transform = null
+
+<parthenon/time>
+tlim = 10.0
+# "RK2" is the only option for implicit solver
+integrator = rk2
+dt_min = 1e-6
+
+<GRMHD>
+cfl = 0.7
+gamma = 1.333333
+reconstruction = weno5
+
+<floors>
+bsq_over_rho_max = 100
+u_over_rho_max = 100
+
+<debug>
+verbose = 1
+flag_verbose = 0
+extra_checks = 1
+
+# This block must be present and values filled
+# in all EGRMHD simulations
+<emhd>
+on = true
+closure_type = constant
+tau = 0.1
+conduction_alpha = 0.01
+viscosity_alpha = 0.0
+
+# Implicit driver is auto-selected for emhd
+<implicit>
+max_nonlinear_iter = 3
+
+<parthenon/output0>
+file_type = hdf5
+dt = 0.1
+single_precision_output = true
+variables = prims.rho, prims.u, prims.uvec, prims.B, prims.q, prims.dP
+
+<parthenon/output1>
+file_type = hst
+dt = 0.1
+
diff --git a/pars/emhdmodes.par b/pars/emhdmodes.par
new file mode 100644
index 00000000..409b5fc0
--- /dev/null
+++ b/pars/emhdmodes.par
@@ -0,0 +1,85 @@
+# EGRMHD Modes problem
+# Try to propagate an analytically-amenable
+# linear mode of the equations of Extended MHD (Chandra+2015)
+
+<parthenon/job>
+problem_id = emhdmodes
+
+<parthenon/mesh>
+refinement = none
+numlevel = 1
+
+nx1 = 128
+x1min = 0.0
+x1max = 1.0
+ix1_bc = periodic
+ox1_bc = periodic
+
+nx2 = 128
+x2min = 0.0
+x2max = 1.0
+ix2_bc = periodic
+ox2_bc = periodic
+
+nx3 = 1
+x3min = 0.0
+x3max = 1.0
+ix3_bc = periodic
+ox3_bc = periodic
+
+<parthenon/meshblock>
+nx1 = 64
+nx2 = 64
+nx3 = 1
+
+<coordinates>
+base = cartesian_minkowski
+transform = null
+
+<parthenon/time>
+tlim = 2.0
+# "RK2" is the only option for implicit solver
+integrator = rk2
+dt_min = 0.0001
+
+<GRMHD>
+cfl = 0.9
+gamma = 1.333333
+reconstruction = weno5
+
+<emhdmodes>
+amp = 1e-4
+
+<floors>
+disable_floors = true
+
+<debug>
+verbose = 0
+
+# This block must be present and values filled
+# in all EGRMHD simulations
+<emhd>
+on = true
+closure_type = sound_speed
+tau = 1.0
+conduction_alpha = 1.0
+viscosity_alpha = 1.0
+
+<driver>
+type = imex
+step = implicit
+
+<implicit>
+max_nonlinear_iter = 3
+
+<parthenon/output0>
+file_type = hdf5
+# This is so as to output only the final state
+dt = 100.0
+single_precision_output = true
+variables = prims.rho, prims.u, prims.uvec, prims.B, prims.q, prims.dP
+
+<parthenon/output1>
+file_type = hst
+dt = 0.1
+
diff --git a/pars/mhdmodes_emhd.par b/pars/mhdmodes_emhd.par
new file mode 100644
index 00000000..fcd2a285
--- /dev/null
+++ b/pars/mhdmodes_emhd.par
@@ -0,0 +1,88 @@
+# GRMHD Modes problem
+# Try to propagate several analytically-amenable linear modes of the MHD equations
+
+<parthenon/job>
+problem_id = mhdmodes
+
+<parthenon/mesh>
+refinement = none
+numlevel = 1
+
+nx1 = 64
+x1min = 0.0
+x1max = 1.0
+ix1_bc = periodic
+ox1_bc = periodic
+
+nx2 = 64
+x2min = 0.0
+x2max = 1.0
+ix2_bc = periodic
+ox2_bc = periodic
+
+nx3 = 64
+x3min = 0.0
+x3max = 1.0
+ix3_bc = periodic
+ox3_bc = periodic
+
+<parthenon/meshblock>
+nx1 = 32
+nx2 = 32
+nx3 = 32
+
+<coordinates>
+base = cartesian_minkowski
+transform = null
+
+<parthenon/time>
+# tlim will be overridden depending on the problem
+tlim = 5.0
+integrator = rk2
+dt_min = 0.0001
+
+<GRMHD>
+cfl = 0.9
+gamma = 1.333333
+reconstruction = weno5
+
+<mhdmodes>
+nmode = 1
+dir = 0
+
+<floors>
+disable_floors = true
+
+<debug>
+verbose = 1
+flag_verbose = 1
+extra_checks = 1
+
+<driver>
+type = imex
+step = implicit
+
+# This block must be present and values filled
+# in all EGRMHD simulations
+<emhd>
+on = true
+closure_type = sound_speed
+tau = 1.0
+conduction_alpha = 1.0
+viscosity_alpha = 1.0
+
+<perf>
+pack_comms = false
+
+<parthenon/output0>
+file_type = hdf5
+# This is so as to output only the final state
+dt = 1.0
+single_precision_output = true
+variables = prims.rho, prims.u, prims.uvec, prims.B
+ghost_zones = true
+
+<parthenon/output1>
+file_type = hst
+dt = 0.1
+
diff --git a/pars/mhdmodes_implicit.par b/pars/mhdmodes_implicit.par
index f4ce1d0b..d5b4fd19 100644
--- a/pars/mhdmodes_implicit.par
+++ b/pars/mhdmodes_implicit.par
@@ -59,8 +59,8 @@ flag_verbose = 1
 extra_checks = 1
 
 <driver>
-type = grim
-step = explicit
+type = imex
+step = implicit
 
 <perf>
 pack_comms = false

From 17801464f3387aa794f60b573e22c43d9492a716 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Fri, 4 Mar 2022 16:15:11 -0600
Subject: [PATCH 11/26] Fix a dumb bug, restore second-order convergence of
 semi-implicit scheme

---
 kharma/imex_driver.cpp       |  3 ++-
 kharma/implicit/implicit.cpp | 19 +++++++++----------
 tests/mhdmodes/check.sh      | 18 +++++++++---------
 tests/mhdmodes/run.sh        | 18 +++++++++---------
 4 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/kharma/imex_driver.cpp b/kharma/imex_driver.cpp
index 7f760087..9aa558ef 100644
--- a/kharma/imex_driver.cpp
+++ b/kharma/imex_driver.cpp
@@ -259,7 +259,8 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
 
             // time-step by root-finding the residual
             // This applies the functions of both t_update and t_fill_derived
-            auto t_implicit_solve = tl.AddTask(t_none, Implicit::Step, mbase.get(), mc0.get(), mdudt.get(), mc1.get(), dt);
+            // This takes dt for the *substep*, not the whole thing -- should be 0.5*dt
+            auto t_implicit_solve = tl.AddTask(t_none, Implicit::Step, mbase.get(), mc0.get(), mdudt.get(), mc1.get(), dt / beta);
         }
     }
 
diff --git a/kharma/implicit/implicit.cpp b/kharma/implicit/implicit.cpp
index db28a26e..166ea210 100644
--- a/kharma/implicit/implicit.cpp
+++ b/kharma/implicit/implicit.cpp
@@ -97,6 +97,8 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
         closure = pars.Get<Closure>("closure");
     }
 
+    printf("Implicit advance dt: %g\n", dt);
+
     //MetadataFlag isNonideal = pmb0->packages.Get("EMHD")->Param<MetadataFlag>("NonidealFlag");
     MetadataFlag isPrimitive = pmb0->packages.Get("GRMHD")->Param<MetadataFlag>("PrimitiveFlag");
     // Initial state.  Also mapping template
@@ -239,16 +241,14 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
                         PLOOP delta_prim(ip) = -residual(ip);
 
                         // if (am_rank0 && b == 0 && i == 8 && j == 8 && k == 8) {
-                        //     printf("Variable ordering: rho %d uu %d u1 %d B1 %d q %d dP %d",
+                        //     printf("Variable ordering: rho %d uu %d u1 %d B1 %d q %d dP %d\n",
                         //             m_p.RHO, m_p.UU, m_p.U1, m_p.B1, m_p.Q, m_p.DP);
-                        //     printf("\nSample Jacobian and residual:");
-                        //     for (int u=0; u < nvar; u++) {
-                        //         printf("\n");
-                        //         for (int v=0; v < nvar; v++) printf("%f ", jacobian(u, v));
-                        //     }
-                        //     printf("\nres:\n");
-                        //     for (int u=0; u < nvar; u++) printf("%f ", delta_prim(u));
-                        //     printf("\n");
+                        //     printf("P_solver: "); PLOOP printf("%g ", P_solver(ip)); printf("\n");
+                        //     printf("Pi: "); PLOOP printf("%g ", Pi(ip)); printf("\n");
+                        //     printf("Ui: "); PLOOP printf("%g ", Ui(ip)); printf("\n");
+                        //     printf("Ps: "); PLOOP printf("%g ", Ps(ip)); printf("\n");
+                        //     printf("Us: "); PLOOP printf("%g ", Us(ip)); printf("\n");
+                        //     printf("dUdt: "); PLOOP printf("%g ", dUdt(ip)); printf("\n");
                         // }
 
                         // Linear solve
@@ -280,7 +280,6 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
                         norm_all(b, k , j, i) = 0;
                         PLOOP norm_all(b, k, j, i) += pow(residual(ip), 2);
                         norm_all(b, k, j, i) = sqrt(norm_all(b, k, j, i)); // TODO faster to scratch cache & copy?
-
                     }
                 );
                 member.team_barrier();
diff --git a/tests/mhdmodes/check.sh b/tests/mhdmodes/check.sh
index 61c69b88..67cdfd61 100755
--- a/tests/mhdmodes/check.sh
+++ b/tests/mhdmodes/check.sh
@@ -9,17 +9,17 @@ RES3D="16,24,32,48"
 RES2D="32,64,128,256"
 
 fail=0
-#python3 check.py $RES3D "entropy mode in 3D" entropy || fail=1
-#python3 check.py $RES3D "slow mode in 3D" slow || fail=1
-#python3 check.py $RES3D "Alfven mode in 3D" alfven || fail=1
-#python3 check.py $RES3D "fast mode in 3D" fast || fail=1
+python3 check.py $RES3D "entropy mode in 3D" entropy || fail=1
+python3 check.py $RES3D "slow mode in 3D" slow || fail=1
+python3 check.py $RES3D "Alfven mode in 3D" alfven || fail=1
+python3 check.py $RES3D "fast mode in 3D" fast || fail=1
 
-#python3 check.py $RES3D "entropy mode in 3D, linear/MC reconstruction" entropy_mc || fail=1
-#python3 check.py $RES3D "entropy mode in 3D, linear/VL reconstruction" entropy_vl || fail=1
+python3 check.py $RES3D "entropy mode in 3D, linear/MC reconstruction" entropy_mc || fail=1
+python3 check.py $RES3D "entropy mode in 3D, linear/VL reconstruction" entropy_vl || fail=1
 
-#python3 check.py $RES3D "slow mode in 3D, classic algo" slow_imex || fail=1
-#python3 check.py $RES3D "Alfven mode in 3D, classic algo" alfven_imex || fail=1
-#python3 check.py $RES3D "fast mode in 3D, classic algo" fast_imex || fail=1
+python3 check.py $RES3D "slow mode in 3D, classic algo" slow_imex || fail=1
+python3 check.py $RES3D "Alfven mode in 3D, classic algo" alfven_imex || fail=1
+python3 check.py $RES3D "fast mode in 3D, classic algo" fast_imex || fail=1
 
 python3 check.py $RES3D "slow mode in 3D, classic algo" slow_imex_im || fail=1
 python3 check.py $RES3D "Alfven mode in 3D, classic algo" alfven_imex_im || fail=1
diff --git a/tests/mhdmodes/run.sh b/tests/mhdmodes/run.sh
index 5aba2b09..20aee459 100755
--- a/tests/mhdmodes/run.sh
+++ b/tests/mhdmodes/run.sh
@@ -47,17 +47,17 @@ conv_1d() {
 }
 
 # These 3 double as a demo of why WENO is great
-#conv_3d entropy mhdmodes/nmode=0
-#conv_3d entropy_mc "mhdmodes/nmode=0 GRMHD/reconstruction=linear_mc"
-#conv_3d entropy_vl "mhdmodes/nmode=0 GRMHD/reconstruction=linear_vl"
+conv_3d entropy mhdmodes/nmode=0
+conv_3d entropy_mc "mhdmodes/nmode=0 GRMHD/reconstruction=linear_mc"
+conv_3d entropy_vl "mhdmodes/nmode=0 GRMHD/reconstruction=linear_vl"
 # Other modes don't benefit, exercise WENO most since we use it
-#conv_3d slow mhdmodes/nmode=1
-#conv_3d alfven mhdmodes/nmode=2
-#conv_3d fast mhdmodes/nmode=3
+conv_3d slow mhdmodes/nmode=1
+conv_3d alfven mhdmodes/nmode=2
+conv_3d fast mhdmodes/nmode=3
 # And we've got to test classic/GRIM stepping
-#conv_3d slow_imex   "mhdmodes/nmode=1 driver/type=imex"
-#conv_3d alfven_imex "mhdmodes/nmode=2 driver/type=imex"
-#conv_3d fast_imex   "mhdmodes/nmode=3 driver/type=imex"
+conv_3d slow_imex   "mhdmodes/nmode=1 driver/type=imex"
+conv_3d alfven_imex "mhdmodes/nmode=2 driver/type=imex"
+conv_3d fast_imex   "mhdmodes/nmode=3 driver/type=imex"
 # And the implicit solver
 conv_3d slow_imex_im   "mhdmodes/nmode=1 driver/type=imex driver/step=implicit implicit/max_nonlinear_iter=3"
 conv_3d alfven_imex_im "mhdmodes/nmode=2 driver/type=imex driver/step=implicit implicit/max_nonlinear_iter=3"

From 552cac0cd2a19985fc801a17be0993538517a6e7 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Fri, 4 Mar 2022 16:56:56 -0600
Subject: [PATCH 12/26] Fix some CI for new pyharm, add EMHD modes test (NOT
 run automatically)

---
 pars/emhdmodes.par         |  11 +--
 scripts/compare.py         |   4 +-
 tests/bz_monopole/check.py |   2 +-
 tests/emhdmodes/check.py   | 137 +++++++++++++++++++++++++++++++++++++
 tests/emhdmodes/check.sh   |  18 +++++
 tests/emhdmodes/run.sh     |  25 +++++++
 tests/noh/check.sh         |   2 +-
 7 files changed, 191 insertions(+), 8 deletions(-)
 create mode 100644 tests/emhdmodes/check.py
 create mode 100755 tests/emhdmodes/check.sh
 create mode 100755 tests/emhdmodes/run.sh

diff --git a/pars/emhdmodes.par b/pars/emhdmodes.par
index 409b5fc0..df73f757 100644
--- a/pars/emhdmodes.par
+++ b/pars/emhdmodes.par
@@ -43,18 +43,21 @@ integrator = rk2
 dt_min = 0.0001
 
 <GRMHD>
-cfl = 0.9
+cfl = 0.5
 gamma = 1.333333
-reconstruction = weno5
+reconstruction = linear_mc
 
 <emhdmodes>
 amp = 1e-4
 
 <floors>
-disable_floors = true
+#disable_floors = true
+rho_min_geom=1e-6
 
 <debug>
-verbose = 0
+verbose = 1
+flag_verbose = 2
+extra_checks = 1
 
 # This block must be present and values filled
 # in all EGRMHD simulations
diff --git a/scripts/compare.py b/scripts/compare.py
index a24b2fb2..9e1c84e7 100644
--- a/scripts/compare.py
+++ b/scripts/compare.py
@@ -36,9 +36,9 @@
 dump2file = sys.argv[2]
 imname = sys.argv[3]
 
-dump1 = pyHARM.load_dump(dump1file, add_ghosts=GHOSTS)
+dump1 = pyharm.load_dump(dump1file, add_ghosts=GHOSTS)
 #Hopefully this fails for dumps that shouldn't be compared
-dump2 = pyHARM.load_dump(dump2file, add_ghosts=GHOSTS)
+dump2 = pyharm.load_dump(dump2file, add_ghosts=GHOSTS)
 
 N1 = dump1['n1']; N2 = dump1['n2']; N3 = dump1['n3']
 
diff --git a/tests/bz_monopole/check.py b/tests/bz_monopole/check.py
index d6c53e2f..56fa6a76 100755
--- a/tests/bz_monopole/check.py
+++ b/tests/bz_monopole/check.py
@@ -10,7 +10,7 @@
 import pyharm.plots.plot_dumps as hplt
 
 for dumpname in np.sort(glob.glob("bz_monopole.out0.*.phdf")):
-    dump = pyHARM.load_dump(dumpname)
+    dump = pyharm.load_dump(dumpname)
     fig, ax = plt.subplots(1,1,figsize=(7,7))
     hplt.plot_xz(ax, dump, 'log_U1', arrayspace=True, window=[0,1,0,1])
     plt.savefig(dumpname+"_U1.png")
diff --git a/tests/emhdmodes/check.py b/tests/emhdmodes/check.py
new file mode 100644
index 00000000..c076481f
--- /dev/null
+++ b/tests/emhdmodes/check.py
@@ -0,0 +1,137 @@
+import numpy as np
+import os, sys, h5py, glob
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import matplotlib as mpl
+
+if __name__=='__main__':
+    outputdir = './'
+
+    NVAR = 10
+    VARS = ['rho', 'u', 'u1', 'u2', 'u3', 'B1', 'B2', 'B3', 'q', 'deltaP']
+    RES = [16,24,32,48]
+
+    # problem params
+    var0 = np.zeros(NVAR)
+    var0[0] = 1.
+    var0[1] = 2.
+    var0[5] = 0.1
+    var0[6] = 0.3
+
+    # L1 initialization
+    L1 = np.zeros([len(RES), NVAR])
+    fit = np.zeros([len(RES), NVAR])
+
+    # perturbation (for 2D EMHD wave)
+    dvar_cos = np.zeros(NVAR)
+    dvar_cos[0] = -0.518522524082246
+    dvar_cos[1] = 0.5516170736393813
+    dvar_cos[2] = 0.008463122479547856
+    dvar_cos[3] = -0.16175466371870734
+    dvar_cos[5] = -0.05973794979640743
+    dvar_cos[6] = 0.02986897489820372
+    dvar_cos[8] = 0.5233486841539436
+    dvar_cos[9] = 0.2909106062057657
+    dvar_sin = np.zeros(NVAR)
+    dvar_sin[0] = 0.1792647678001878
+    dvar_sin[2] = -0.011862022608466367
+    dvar_sin[3] = 0.034828080823603294
+    dvar_sin[5] = 0.03351707506150924
+    dvar_sin[6] = -0.016758537530754618
+    dvar_sin[8] = -0.04767672501939603
+    dvar_sin[9] = -0.02159452055336572
+
+    # loop over RES
+    for r in range(len(RES)):
+        # load data
+        dfile = h5py.File(sorted(glob.glob(os.path.join(str(RES[r]), 'dumps', 'dump_000000*.h5')))[-1], 'r')
+        gfile = h5py.File(os.path.join(str(RES[r]), 'dumps', 'grid.h5'), 'r')
+
+        dump = {}
+
+        amp = dfile['header/problem/amp'][()]
+        k1  = 2*np.pi
+        k2  = 4*np.pi
+        real_omega  = dfile['header/problem/real_omega'][()]
+        imag_omega  = dfile['header/problem/imag_omega'][()]
+        t = dfile['t'][()]
+
+        dump['RHO'] = dfile['prims'][Ellipsis,0][()]
+        dump['U'] = dfile['prims'][Ellipsis,1][()]
+        dump['U1'] = dfile['prims'][Ellipsis,2][()]
+        dump['U2'] = dfile['prims'][Ellipsis,3][()]
+        dump['U3'] = dfile['prims'][Ellipsis,4][()]
+        dump['B1'] = dfile['prims'][Ellipsis,5][()]
+        dump['B2'] = dfile['prims'][Ellipsis,6][()]
+        dump['B3'] = dfile['prims'][Ellipsis,7][()]
+        dump['q'] = dfile['prims'][Ellipsis,8][()]
+        dump['deltaP'] = dfile['prims'][Ellipsis,9][()]
+
+        grid = {}
+        grid['x'] = gfile['X'][()]
+        grid['y'] = gfile['Y'][()]
+        cos_phi = np.cos(k1*grid['x'] + k2*grid['y'] + imag_omega*t)
+        sin_phi = np.sin(k1*grid['x'] + k2*grid['y'] + imag_omega*t)
+
+        grid['n1'] = dfile['header/n1'][()]
+        grid['n2'] = dfile['header/n2'][()]
+        grid['n3'] = dfile['header/n3'][()]
+
+        gfile.close()
+        dfile.close()
+
+        # compute analytic result
+        var_analytic  = []
+        for i in range(NVAR):    
+            var_analytic.append(var0[i] + ((amp*cos_phi*dvar_cos[i]) + (amp*sin_phi*dvar_sin[i])) * np.exp(real_omega*t))
+        var_analytic = np.asarray(var_analytic)
+
+        # numerical result
+        var_numerical = np.zeros((NVAR, grid['n1'], grid['n2'], grid['n3']), dtype=float)
+        var_numerical[0,Ellipsis] = dump['RHO'] 
+        var_numerical[1,Ellipsis] = dump['U'] 
+        var_numerical[2,Ellipsis] = dump['U1'] 
+        var_numerical[3,Ellipsis] = dump['U2'] 
+        var_numerical[4,Ellipsis] = dump['U3'] 
+        var_numerical[5,Ellipsis] = dump['B1'] 
+        var_numerical[6,Ellipsis] = dump['B2'] 
+        var_numerical[7,Ellipsis] = dump['B3'] 
+        var_numerical[8,Ellipsis] = dump['q'] 
+        var_numerical[9,Ellipsis] = dump['deltaP']
+
+        print("\n{:d}".format(RES[r]))
+        print(np.mean(np.fabs(var_numerical - var_analytic)[6,Ellipsis]))
+
+        for n in range(NVAR):
+            L1[r,n] = np.mean(np.fabs(var_numerical[n,Ellipsis] - var_analytic[n,Ellipsis]))
+
+    # plot parameters
+    mpl.rcParams['figure.dpi'] = 300
+    mpl.rcParams['savefig.dpi'] = 300
+    mpl.rcParams['figure.autolayout'] = True
+    mpl.rcParams['axes.titlesize'] = 16
+    mpl.rcParams['axes.labelsize'] = 14
+    mpl.rcParams['xtick.labelsize'] = 12
+    mpl.rcParams['ytick.labelsize'] = 12
+    mpl.rcParams['axes.xmargin'] = 0.02
+    mpl.rcParams['axes.ymargin'] = 0.02
+    mpl.rcParams['legend.fontsize'] = 'medium'
+    colors = ['indigo', 'goldenrod', 'darkgreen', 'crimson', 'xkcd:blue', 'xkcd:magenta', 'green', 'xkcd:yellowgreen', 'xkcd:teal', 'xkcd:olive']
+
+    # plot
+    fig = plt.figure(figsize=(6,6))
+    ax = fig.add_subplot(1,1,1)
+
+    # loop over prims, plotting only variables with a nonzero cos or sin perturbation coefficient
+    tracker = 0
+    for n in range(NVAR):
+        if dvar_cos[n] != 0 or dvar_sin[n] != 0:
+            color = colors[tracker]
+            ax.loglog(RES, L1[:,n], color=color, marker='o', label=VARS[n])
+            tracker+=1
+
+    ax.loglog([RES[0], RES[-1]], 100*amp*np.asarray([float(RES[0]), float(RES[-1])])**(-2), color='k', linestyle='dashed', label='$N^{-2}$')
+    plt.xscale('log', base=2)
+    ax.legend()
+    plt.savefig(os.path.join(outputdir, 'emhd_linear_mode_convergence.png'))
diff --git a/tests/emhdmodes/check.sh b/tests/emhdmodes/check.sh
new file mode 100755
index 00000000..c2ba3a77
--- /dev/null
+++ b/tests/emhdmodes/check.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# Run check.py once per reconstruction variant; exit nonzero if any check fails
+
+. ~/libs/anaconda3/etc/profile.d/conda.sh
+conda activate pyharm
+
+pyharm-convert *.phdf
+
+RES3D="16,24,32,48" # not referenced anywhere below in this script
+RES2D="16,24,32,48" # NOTE(review): tests/emhdmodes/run.sh runs 32 64 128 256 -- confirm which list is intended
+
+fail=0
+python3 check.py $RES2D "EMHD mode in 2D, WENO5" emhd2d_weno 2d || fail=1
+python3 check.py $RES2D "EMHD mode in 2D, linear/MC reconstruction" emhd2d_mc 2d || fail=1
+python3 check.py $RES2D "EMHD mode in 2D, linear/VL reconstruction" emhd2d_vl 2d || fail=1
+
+exit $fail
diff --git a/tests/emhdmodes/run.sh b/tests/emhdmodes/run.sh
new file mode 100755
index 00000000..39f10922
--- /dev/null
+++ b/tests/emhdmodes/run.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+set -euo pipefail
+
+BASE=../..
+
+# Extended MHD modes convergence in 2D to exercise basic EMHD source terms
+
+conv_2d() {
+    for res in 32 64 128 256
+    do
+      # Four blocks.  $1 is the run label; $2 is left unquoted so it splits into separate overrides
+      half=$(( $res / 2 ))
+      $BASE/run.sh -i $BASE/pars/emhdmodes.par debug/verbose=1 \
+                      parthenon/mesh/nx1=$res parthenon/mesh/nx2=$res parthenon/mesh/nx3=1 \
+                      parthenon/meshblock/nx1=$half parthenon/meshblock/nx2=$half parthenon/meshblock/nx3=1 $2
+      mv emhdmodes.out0.00000.phdf mhd_2d_${res}_start_${1}.phdf
+      mv emhdmodes.out0.final.phdf mhd_2d_${res}_end_${1}.phdf
+    done
+}
+
+# 2D modes use small blocks, could pick up some problems at MPI ranks >> 1
+# Just one default mode
+conv_2d emhd2d_vl "GRMHD/reconstruction=linear_vl"
+conv_2d emhd2d_mc "GRMHD/reconstruction=linear_mc"
+conv_2d emhd2d_weno "GRMHD/reconstruction=weno5"
diff --git a/tests/noh/check.sh b/tests/noh/check.sh
index 84d26354..6cdfe80a 100755
--- a/tests/noh/check.sh
+++ b/tests/noh/check.sh
@@ -6,5 +6,5 @@ PYHARMDIR=$HOME/Code/pyharm
 . ~/libs/anaconda3/etc/profile.d/conda.sh
 conda activate pyharm
 
-python3 $PYHARMDIR/scripts/kharma_convert.py *.phdf
+pyharm-convert *.phdf
 python3 $BASEDIR/check.py . . 64,128,256,512,1024,2048,4096 1.666667

From 26fa56e52ce24dfc50b59db9c397e4c921ae942c Mon Sep 17 00:00:00 2001
From: Vedant Dhruv <tg867798@c205-015.frontera.tacc.utexas.edu>
Date: Mon, 7 Mar 2022 09:06:36 -0600
Subject: [PATCH 13/26] higher order terms included. they have to be tested but
 at least it compiles and we now have some placeholder text for them

---
 kharma/emhd/emhd.cpp         | 49 ++++++++++++---------
 kharma/emhd/emhd.hpp         | 77 ++++++++++++++++++++++++---------
 kharma/emhd/emhd_sources.hpp | 32 ++++++++------
 kharma/flux.cpp              |  8 ++--
 kharma/flux.hpp              | 14 +++---
 kharma/flux_functions.hpp    | 26 ++++++-----
 kharma/implicit/implicit.cpp | 10 ++---
 kharma/implicit/implicit.hpp | 14 +++---
 pars/emhdshock.par           | 83 ++++++++++++++++++++++++++++++++++++
 9 files changed, 227 insertions(+), 86 deletions(-)
 create mode 100644 pars/emhdshock.par

diff --git a/kharma/emhd/emhd.cpp b/kharma/emhd/emhd.cpp
index 9fbdc25b..3c818768 100644
--- a/kharma/emhd/emhd.cpp
+++ b/kharma/emhd/emhd.cpp
@@ -63,24 +63,27 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     // GRIM uses a callback to a problem-specific implementation which sets these
     // We share implementations in one function, controlled by these parameters
     // These are always necessary for performing EGRMHD.
+
+    bool higher_order_terms = pin->GetOrAddBoolean("emhd", "higher_order_terms", false);
     std::string closure_type = pin->GetString("emhd", "closure_type");
+
     Real tau = pin->GetOrAddReal("emhd", "tau", 1.0);
     Real conduction_alpha = pin->GetOrAddReal("emhd", "conduction_alpha", 1.0);
     Real viscosity_alpha = pin->GetOrAddReal("emhd", "viscosity_alpha", 1.0);
 
-    Closure closure;
+    EMHD_parameters emhd_params;
+    emhd_params.higher_order_terms = higher_order_terms;
     if (closure_type == "constant") { 
-        closure.type = ClosureType::constant;
+        emhd_params.type = ClosureType::constant;
     } else if (closure_type == "sound_speed") {
-        closure.type = ClosureType::soundspeed;
+        emhd_params.type = ClosureType::soundspeed;
     } else {
-        closure.type = ClosureType::torus;
+        emhd_params.type = ClosureType::torus;
     }
-    closure.tau = tau;
-    closure.conduction_alpha = conduction_alpha;
-    closure.viscosity_alpha = viscosity_alpha;
-    params.Add("closure", closure);
-
+    emhd_params.tau = tau;
+    emhd_params.conduction_alpha = conduction_alpha;
+    emhd_params.viscosity_alpha = viscosity_alpha;
+    params.Add("emhd_params", emhd_params);
 
     // Slope reconstruction on faces. Always linear: default to MC unless we're using VL everywhere
     if (packages.Get("GRMHD")->Param<ReconstructionType>("recon") == ReconstructionType::linear_vl) {
@@ -130,7 +133,7 @@ TaskStatus AddSource(MeshData<Real> *md, MeshData<Real> *mdudt)
     const int ndim = pmesh->ndim;
 
     const auto& pars = pmb0->packages.Get("EMHD")->AllParams();
-    const Closure& closure = pars.Get<Closure>("closure");
+    const EMHD_parameters& emhd_params = pars.Get<EMHD_parameters>("emhd_params");
 
     // Pack variables
     PackIndexMap prims_map, cons_map;
@@ -178,8 +181,8 @@ TaskStatus AddSource(MeshData<Real> *md, MeshData<Real> *mdudt)
             const auto& G = dUdt.GetCoords(b);
 
             // Get the EGRMHD parameters
-            Real tau, chi, nu_e;
-            EMHD::set_parameters(G, P(b), m_p, closure, gam, k, j, i, tau, chi, nu_e);
+            Real tau, chi_e, nu_e;
+            EMHD::set_parameters(G, P(b), m_p, emhd_params, gam, k, j, i, tau, chi_e, nu_e);
 
             // and the 4-vectors
             FourVectors D;
@@ -197,15 +200,23 @@ TaskStatus AddSource(MeshData<Real> *md, MeshData<Real> *mdudt)
             // Compute+add explicit source terms (conduction and viscosity)
             const Real& rho = P(b)(m_p.RHO, k, j, i);
             Real q0 = 0;
-            DLOOP1 q0 -= rho * chi * (D.bcon[mu] / sqrt(bsq)) * grad_Theta[mu];
-            DLOOP2 q0 -= rho * chi * (D.bcon[mu] / sqrt(bsq)) * theta_s(b, k, j, i) * D.ucon[nu] * grad_ucov[nu][mu];
+            DLOOP1 q0 -= rho * chi_e * (D.bcon[mu] / sqrt(bsq)) * grad_Theta[mu];
+            DLOOP2 q0 -= rho * chi_e * (D.bcon[mu] / sqrt(bsq)) * theta_s(b, k, j, i) * D.ucon[nu] * grad_ucov[nu][mu];
+
+            Real dP0 = -rho * nu_e * div_ucon;
+            DLOOP2  dP0 += 3. * rho * nu_e * (D.bcon[mu] * D.bcon[nu] / bsq) * grad_ucov[mu][nu];
+
+            Real q0_tilde = 0., dP0_tilde = 0;
+            EMHD::convert_q_dP_to_prims(q0, dP0, rho, theta_s(b, k, j, i), tau, chi_e, nu_e, 
+                                        emhd_params, q0_tilde, dP0_tilde);
 
-            Real deltaP0 = -rho * nu_e * div_ucon;
-            DLOOP2  deltaP0 += 3. * rho * nu_e * (D.bcon[mu] * D.bcon[nu] / bsq) * grad_ucov[mu][nu];
+            dUdt(b, m_u.Q, k, j, i)  += G.gdet(Loci::center, j, i) * q0_tilde / tau;
+            dUdt(b, m_u.DP, k, j, i) += G.gdet(Loci::center, j, i) * dP0_tilde / tau;
 
-            // TODO edit this when higher order terms are considered
-            dUdt(b, m_u.Q, k, j, i)  += G.gdet(Loci::center, j, i) * q0 / tau;
-            dUdt(b, m_u.DP, k, j, i) += G.gdet(Loci::center, j, i) * deltaP0 / tau;
+            if (emhd_params.higher_order_terms) {
+                dUdt(b, m_u.Q, k, j, i) += G.gdet(Loci::center, j, i) * (q0_tilde / 2.) * div_ucon;
+                dUdt(b, m_u.DP, k, j, i) += G.gdet(Loci::center, j, i) * (q0_tilde / 2.) * div_ucon;
+            }
         }
     );
 
diff --git a/kharma/emhd/emhd.hpp b/kharma/emhd/emhd.hpp
index 3a57c0ce..a3c6febf 100644
--- a/kharma/emhd/emhd.hpp
+++ b/kharma/emhd/emhd.hpp
@@ -51,8 +51,10 @@ namespace EMHD {
 
 enum ClosureType{constant=0, soundspeed, torus};
 
-class Closure {
+class EMHD_parameters {
     public:
+
+        bool higher_order_terms;
         ClosureType type;
         Real tau;
         Real conduction_alpha;
@@ -78,45 +80,45 @@ TaskStatus AddSource(MeshData<Real> *md, MeshData<Real> *mdudt);
  */
 template<typename Local>
 KOKKOS_INLINE_FUNCTION void set_parameters(const GRCoordinates& G, const Local& P, const VarMap& m_p,
-                                           const Closure& closure, const Real& gam,
+                                           const EMHD_parameters& emhd_params, const Real& gam,
                                            Real& tau, Real& chi, Real& nu)
 {
-    if (closure.type == ClosureType::constant) {
+    if (emhd_params.type == ClosureType::constant) {
         // Set tau, nu, chi to constants
-        tau = closure.tau;
-        chi = closure.conduction_alpha;
-        nu  = closure.viscosity_alpha;
-    } else if (closure.type == ClosureType::soundspeed) {
+        tau = emhd_params.tau;
+        chi = emhd_params.conduction_alpha;
+        nu  = emhd_params.viscosity_alpha;
+    } else if (emhd_params.type == ClosureType::soundspeed) {
         // Set tau=const, chi/nu prop. to sound speed squared
         Real cs2 = (gam * (gam - 1.) * P(m_p.UU)) / (P(m_p.RHO) + (gam * P(m_p.UU)));
 
-        tau = closure.tau;
-        chi = closure.conduction_alpha * cs2 * tau;
-        nu  = closure.viscosity_alpha * cs2 * tau;
-    } else if (closure.type == ClosureType::torus) {
+        tau = emhd_params.tau;
+        chi = emhd_params.conduction_alpha * cs2 * tau;
+        nu  = emhd_params.viscosity_alpha * cs2 * tau;
+    } else if (emhd_params.type == ClosureType::torus) {
         // Something complicated
     } // else yell
 }
 template<typename Global>
 KOKKOS_INLINE_FUNCTION void set_parameters(const GRCoordinates& G, const Global& P, const VarMap& m_p,
-                                           const Closure& closure, const Real& gam,
+                                           const EMHD_parameters& emhd_params, const Real& gam,
                                            const int& k, const int& j, const int& i,
                                            Real& tau, Real& chi, Real& nu)
 {
-    if (closure.type == ClosureType::constant) {
+    if (emhd_params.type == ClosureType::constant) {
         // Set tau, nu, chi to constants
-        tau = closure.tau;
-        chi = closure.conduction_alpha;
-        nu  = closure.viscosity_alpha;
-    } else if (closure.type == ClosureType::soundspeed) {
+        tau = emhd_params.tau;
+        chi = emhd_params.conduction_alpha;
+        nu  = emhd_params.viscosity_alpha;
+    } else if (emhd_params.type == ClosureType::soundspeed) {
         // Set tau=const, chi/nu prop. to sound speed squared
         const Real cs2 = (gam * (gam - 1.) * P(m_p.UU, k, j, i)) /
                             (P(m_p.RHO, k, j, i) + (gam * P(m_p.UU, k, j, i)));
 
-        tau = closure.tau;
-        chi = closure.conduction_alpha * cs2 * tau;
-        nu  = closure.viscosity_alpha * cs2 * tau;
-    } else if (closure.type == ClosureType::torus) {
+        tau = emhd_params.tau;
+        chi = emhd_params.conduction_alpha * cs2 * tau;
+        nu  = emhd_params.viscosity_alpha * cs2 * tau;
+    } else if (emhd_params.type == ClosureType::torus) {
         // Something complicated
     } // else yell
 }
@@ -148,5 +150,38 @@ KOKKOS_INLINE_FUNCTION void calc_tensor(const Real& rho, const Real& u, const Re
     }
 }
 
+// Convert the primitives q_tilde, dP_tilde to the physical q, dP that the stress-energy tensor depends on.
+// With higher-order terms: q = q_tilde*sqrt(chi_e*rho*Theta^2/tau), dP = dP_tilde*sqrt(nu_e*rho*Theta/tau)
+KOKKOS_INLINE_FUNCTION void convert_prims_to_q_dP(const Real& q_tilde, const Real& dP_tilde,
+                                        const Real& rho, const Real& Theta,
+                                        const Real& tau, const Real& chi_e, const Real& nu_e,
+                                        const EMHD_parameters& emhd_params, Real& q, Real& dP)
+{
+    // Without higher-order terms the primitives are q and dP themselves
+    q  = q_tilde;
+    dP = dP_tilde;
+    if (emhd_params.higher_order_terms) {
+        q  *= sqrt(chi_e * rho * Theta * Theta / tau);
+        dP *= sqrt(nu_e * rho * Theta / tau); // pressure anisotropy scales with viscosity nu_e, not chi_e
+    }
+}
+
+// Convert the physical q, dP to the primitives q_tilde, dP_tilde.
+// With higher-order terms: q_tilde = q*sqrt(tau/(chi_e*rho*Theta^2)), dP_tilde = dP*sqrt(tau/(nu_e*rho*Theta)). Needed because,
+//          1. The source terms contain q0_tilde and dP0_tilde
+//          2. Initializations MAY require converting q and dP to q_tilde and dP_tilde
+KOKKOS_INLINE_FUNCTION void convert_q_dP_to_prims(const Real& q, const Real& dP,
+                                        const Real& rho, const Real& Theta,
+                                        const Real& tau, const Real& chi_e, const Real& nu_e,
+                                        const EMHD_parameters& emhd_params, Real& q_tilde, Real& dP_tilde)
+{
+    // Without higher-order terms the primitives are q and dP themselves
+    q_tilde  = q;
+    dP_tilde = dP;
+    if (emhd_params.higher_order_terms) {
+        q_tilde  *= sqrt(tau / (chi_e * rho * Theta * Theta));
+        dP_tilde *= sqrt(tau / (nu_e * rho * Theta)); // tau appears once; a second 1/tau would mis-scale dP_tilde
+    }
+}
 
 } // namespace EMHD
diff --git a/kharma/emhd/emhd_sources.hpp b/kharma/emhd/emhd_sources.hpp
index a3d86a8e..9cdf588a 100644
--- a/kharma/emhd/emhd_sources.hpp
+++ b/kharma/emhd/emhd_sources.hpp
@@ -52,12 +52,12 @@ namespace EMHD {
 template<typename Local>
 KOKKOS_INLINE_FUNCTION void implicit_sources(const GRCoordinates& G, const Local& P, const VarMap& m_p,
                                              const Real& gam, const int& j, const int& i,
-                                             const Closure& closure,
+                                             const EMHD_parameters& emhd_params,
                                              Real& dUq, Real& dUdP)
 {
     // These are intentionally the tilde versions!
-    Real tau, chi, nu;
-    EMHD::set_parameters(G, P, m_p, closure, gam, tau, chi, nu);
+    Real tau, chi_e, nu_e;
+    EMHD::set_parameters(G, P, m_p, emhd_params, gam, tau, chi_e, nu_e);
     dUq  = -G.gdet(Loci::center, j, i) * (P(m_p.Q) / tau);
     dUdP = -G.gdet(Loci::center, j, i) * (P(m_p.DP) / tau);
 }
@@ -69,13 +69,13 @@ KOKKOS_INLINE_FUNCTION void implicit_sources(const GRCoordinates& G, const Local
 template<typename Local>
 KOKKOS_INLINE_FUNCTION void time_derivative_sources(const GRCoordinates& G, const Local& P_new,
                                                     const Local& P_old, const Local& P,
-                                                    const VarMap& m_p, const Closure& closure,
+                                                    const VarMap& m_p, const EMHD_parameters& emhd_params,
                                                     const Real& gam, const Real& dt, const int& j, const int& i,
                                                     Real& dUq, Real& dUdP)
 {
     // Parameters
-    Real tau, chi, nu;
-    EMHD::set_parameters(G, P, m_p, closure, gam, tau, chi, nu);
+    Real tau, chi_e, nu_e;
+    EMHD::set_parameters(G, P, m_p, emhd_params, gam, tau, chi_e, nu_e);
 
     FourVectors Dtmp;
     GRMHD::calc_4vecs(G, P, m_p, j, i, Loci::center, Dtmp);
@@ -101,16 +101,24 @@ KOKKOS_INLINE_FUNCTION void time_derivative_sources(const GRCoordinates& G, cons
     // TEMPORAL SOURCE TERMS
     const Real& rho = P(m_p.RHO);
     const Real& Theta = (gam-1) * P(m_p.UU) / P(m_p.RHO);
-    Real q0 = -rho * chi * (Dtmp.bcon[0] / sqrt(bsq)) * dt_Theta;
-    DLOOP1 q0 -= rho * chi * (Dtmp.bcon[mu] / sqrt(bsq)) * Theta * Dtmp.ucon[0] * dt_ucov[mu];
+    Real q0 = -rho * chi_e * (Dtmp.bcon[0] / sqrt(bsq)) * dt_Theta;
+    DLOOP1 q0 -= rho * chi_e * (Dtmp.bcon[mu] / sqrt(bsq)) * Theta * Dtmp.ucon[0] * dt_ucov[mu];
 
+    Real dP0 = -rho * nu_e * div_ucon;
+    DLOOP1 dP0 += 3. * rho * nu_e * (Dtmp.bcon[0] * Dtmp.bcon[mu] / bsq) * dt_ucov[mu];
 
-    Real deltaP0 = -rho * nu * div_ucon;
-    DLOOP1 deltaP0 += 3. * rho * nu * (Dtmp.bcon[0] * Dtmp.bcon[mu] / bsq) * dt_ucov[mu];
+    Real q0_tilde = 0., dP0_tilde = 0;
+    EMHD::convert_q_dP_to_prims(q0, dP0, rho, Theta, tau, chi_e, nu_e, 
+                                emhd_params, q0_tilde, dP0_tilde);
 
     // NOTE: Will have to edit this when higher order terms are considered
-    dUq  = G.gdet(Loci::center, j, i) * (q0 / tau);
-    dUdP = G.gdet(Loci::center, j, i) * (deltaP0 / tau);
+    dUq  = G.gdet(Loci::center, j, i) * (q0_tilde / tau);
+    dUdP = G.gdet(Loci::center, j, i) * (dP0_tilde / tau);
+
+    if (emhd_params.higher_order_terms) {
+        dUq  += G.gdet(Loci::center, j, i) * (q0_tilde / 2.) * div_ucon;
+        dUdP += G.gdet(Loci::center, j, i) * (dP0_tilde / 2.) * div_ucon;
+    }
 }
 
 } // namespace EMHD
diff --git a/kharma/flux.cpp b/kharma/flux.cpp
index 818f2c9d..b1fbc857 100644
--- a/kharma/flux.cpp
+++ b/kharma/flux.cpp
@@ -56,12 +56,12 @@ TaskStatus Flux::PtoU(MeshBlockData<Real> *rc, IndexDomain domain)
     const bool use_emhd = pkgs.count("EMHD");
     MetadataFlag isPrimitive = pars.Get<MetadataFlag>("PrimitiveFlag");
 
-    EMHD::Closure closure_tmp;
+    EMHD::EMHD_parameters emhd_params_tmp;
     if (use_emhd) {
         const auto& emhd_pars = pmb->packages.Get("EMHD")->AllParams();
-        closure_tmp = emhd_pars.Get<EMHD::Closure>("closure");
+        emhd_params_tmp = emhd_pars.Get<EMHD::EMHD_parameters>("emhd_params");
     }
-    const EMHD::Closure& closure = closure_tmp;
+    const EMHD::EMHD_parameters& emhd_params = emhd_params_tmp;
 
     // Pack variables
     PackIndexMap prims_map, cons_map;
@@ -101,7 +101,7 @@ TaskStatus Flux::PtoU(MeshBlockData<Real> *rc, IndexDomain domain)
                 [&](const int& i) {
                     auto P = Kokkos::subview(P_s, Kokkos::ALL(), i);
                     auto U = Kokkos::subview(U_s, Kokkos::ALL(), i);
-                    Flux::p_to_u(G, P, m_p, closure, gam, j, i, U, m_u);
+                    Flux::p_to_u(G, P, m_p, emhd_params, gam, j, i, U, m_u);
                 }
             );
 
diff --git a/kharma/flux.hpp b/kharma/flux.hpp
index 9c146408..c78b2ea9 100644
--- a/kharma/flux.hpp
+++ b/kharma/flux.hpp
@@ -137,12 +137,12 @@ inline TaskStatus GetFlux(MeshData<Real> *md)
     const Real gam = pars.Get<Real>("gamma");
     const double ctop_max = (use_b_cd) ? globals.Get<Real>("ctop_max_last") : 0.0;
 
-    EMHD::Closure closure_tmp;
+    EMHD::EMHD_parameters emhd_params_tmp;
     if (use_emhd) {
         const auto& emhd_pars = pmb0->packages.Get("EMHD")->AllParams();
-        closure_tmp = emhd_pars.Get<EMHD::Closure>("closure");
+        emhd_params_tmp = emhd_pars.Get<EMHD::EMHD_parameters>("emhd_params");
     }
-    const EMHD::Closure& closure = closure_tmp;
+    const EMHD::EMHD_parameters& emhd_params = emhd_params_tmp;
 
     const Loci loc = loc_of(dir);
 
@@ -232,8 +232,8 @@ inline TaskStatus GetFlux(MeshData<Real> *md)
 
                     // Left
                     GRMHD::calc_4vecs(G, Pl, m_p, j, i, loc, Dtmp);
-                    Flux::prim_to_flux(G, Pl, m_p, Dtmp, closure, gam, j, i, 0, Ul, m_u, loc);
-                    Flux::prim_to_flux(G, Pl, m_p, Dtmp, closure, gam, j, i, dir, Fl, m_u, loc);
+                    Flux::prim_to_flux(G, Pl, m_p, Dtmp, emhd_params, gam, j, i, 0, Ul, m_u, loc);
+                    Flux::prim_to_flux(G, Pl, m_p, Dtmp, emhd_params, gam, j, i, dir, Fl, m_u, loc);
 
                     // Magnetosonic speeds
                     Real cmaxL, cminL;
@@ -260,8 +260,8 @@ inline TaskStatus GetFlux(MeshData<Real> *md)
                     // Right
                     // TODO GRMHD/GRHD versions of this
                     GRMHD::calc_4vecs(G, Pr, m_p, j, i, loc, Dtmp);
-                    Flux::prim_to_flux(G, Pr, m_p, Dtmp, closure, gam, j, i, 0, Ur, m_u, loc);
-                    Flux::prim_to_flux(G, Pr, m_p, Dtmp, closure, gam, j, i, dir, Fr, m_u, loc);
+                    Flux::prim_to_flux(G, Pr, m_p, Dtmp, emhd_params, gam, j, i, 0, Ur, m_u, loc);
+                    Flux::prim_to_flux(G, Pr, m_p, Dtmp, emhd_params, gam, j, i, dir, Fr, m_u, loc);
 
                     // Magnetosonic speeds
                     Real cmaxR, cminR;
diff --git a/kharma/flux_functions.hpp b/kharma/flux_functions.hpp
index 0136f1b9..d5b44a02 100644
--- a/kharma/flux_functions.hpp
+++ b/kharma/flux_functions.hpp
@@ -56,7 +56,7 @@ namespace Flux
  */
 template<typename Local>
 KOKKOS_INLINE_FUNCTION void prim_to_flux(const GRCoordinates& G, const Local& P, const VarMap& m_p, const FourVectors D,
-                                         const EMHD::Closure& closure, const Real& gam, const int& j, const int& i, const int dir,
+                                         const EMHD::EMHD_parameters& emhd_params, const Real& gam, const int& j, const int& i, const int dir,
                                          const Local& flux, const VarMap& m_u, const Loci loc=Loci::center)
 {
     Real gdet = G.gdet(loc, j, i);
@@ -66,14 +66,18 @@ KOKKOS_INLINE_FUNCTION void prim_to_flux(const GRCoordinates& G, const Local& P,
     Real T[GR_DIM];
     if (m_p.Q >= 0) {
         // EGRMHD stress-energy tensor w/ first index up, second index down
-        // Convert prim Qtilde/dPtilde to real q/dP
-        // Real tau, chi, nu;
-        // EMHD::set_parameters(G, P, m_p, closure, gam, tau, chi, nu);
-        //const Real Theta = (gam - 1) * P(m_p.UU) / P(m_p.RHO);
-        //const Real q = (closure.higher_order) ? P(m_p.RHO) * sqrt(chi * P(m_p.RHO) * pow(Theta, 2) / tau);
-        //const Real dP = sqrt(nu * P(m_p.RHO) * Theta / tau);
-        const Real q = P(m_p.Q);
-        const Real dP = P(m_p.DP);
+        
+        const Real& rho     = P(m_p.RHO);
+        const Real q_tilde  = P(m_p.Q);
+        const Real dP_tilde = P(m_p.DP);
+        const Real& Theta   = (gam - 1) * P(m_p.UU) / P(m_p.RHO);
+
+        Real tau, chi_e, nu_e;
+        EMHD::set_parameters(G, P, m_p, emhd_params, gam, tau, chi_e, nu_e);
+
+        Real q = 0., dP = 0.;
+        EMHD::convert_prims_to_q_dP(q_tilde, dP_tilde, rho, Theta, tau, chi_e, nu_e, emhd_params, q, dP);
+
         // Then calculate the tensor
         EMHD::calc_tensor(P(m_p.RHO), P(m_p.UU), (gam - 1) * P(m_p.UU), q, dP, D, dir, T);
     } else if (m_p.B1 >= 0) {
@@ -141,12 +145,12 @@ KOKKOS_INLINE_FUNCTION void prim_to_flux(const GRCoordinates& G, const Local& P,
  */
 template<typename Local>
 KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const Local& P, const VarMap& m_p,
-                                   const EMHD::Closure& closure, const Real& gam, const int& j, const int& i,
+                                   const EMHD::EMHD_parameters& emhd_params, const Real& gam, const int& j, const int& i,
                                    const Local& U, const VarMap& m_u, const Loci& loc=Loci::center)
 {
     FourVectors Dtmp;
     GRMHD::calc_4vecs(G, P, m_p, j, i, loc, Dtmp); // TODO switch GRHD/GRMHD?
-    prim_to_flux(G, P, m_p, Dtmp, closure, gam, j, i, 0, U, m_u, loc);
+    prim_to_flux(G, P, m_p, Dtmp, emhd_params, gam, j, i, 0, U, m_u, loc);
 }
 
 /**
diff --git a/kharma/implicit/implicit.cpp b/kharma/implicit/implicit.cpp
index 166ea210..953244df 100644
--- a/kharma/implicit/implicit.cpp
+++ b/kharma/implicit/implicit.cpp
@@ -91,10 +91,10 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
     const Real delta = implicit_par.Get<Real>("jacobian_delta");
     const Real gam = pmb0->packages.Get("GRMHD")->Param<Real>("gamma");
 
-    Closure closure;
+    EMHD_parameters emhd_params;
     if (pmb0->packages.AllPackages().count("EMHD")) {
         const auto& pars = pmb0->packages.Get("EMHD")->AllParams();
-        closure = pars.Get<Closure>("closure");
+        emhd_params = pars.Get<EMHD_parameters>("emhd_params");
     }
 
     printf("Implicit advance dt: %g\n", dt);
@@ -228,7 +228,7 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
                         auto dUi = Kokkos::subview(dUi_s, Kokkos::ALL(), i);
                         if (m_p.Q >= 0) {
                             Real dUq, dUdP;
-                            EMHD::implicit_sources(G, Pi, m_p, gam, j, i, closure, dUq, dUdP);
+                            EMHD::implicit_sources(G, Pi, m_p, gam, j, i, emhd_params, dUq, dUdP);
                             dUi(m_u.Q) = dUq;
                             dUi(m_u.DP) = dUdP;
                         }
@@ -236,7 +236,7 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
                         // Jacobian calculation
                         // Requires calculating the residual anyway, so we grab it here
                         calc_jacobian(G, P_solver, Pi, Ui, Ps, dUdt, dUi, tmp1, tmp2, tmp3,
-                                      m_p, m_u, closure, nvar, j, i, delta, gam, dt, jacobian, residual);
+                                      m_p, m_u, emhd_params, nvar, j, i, delta, gam, dt, jacobian, residual);
                         // Solve against the negative residual
                         PLOOP delta_prim(ip) = -residual(ip);
 
@@ -273,7 +273,7 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
                         PLOOP P_solver(ip) += lambda * delta_prim(ip);
 
                         calc_residual(G, P_solver, Pi, Ui, Ps, dUdt, dUi, tmp3,
-                                      m_p, m_u, closure, nvar, j, i, gam, dt, residual);
+                                      m_p, m_u, emhd_params, nvar, j, i, gam, dt, residual);
 
                         // Store for maximum/output
                         // I would be tempted to store the whole residual, but it's of variable size
diff --git a/kharma/implicit/implicit.hpp b/kharma/implicit/implicit.hpp
index 860ce3c4..550aab56 100644
--- a/kharma/implicit/implicit.hpp
+++ b/kharma/implicit/implicit.hpp
@@ -78,7 +78,7 @@ template<typename Local>
 KOKKOS_INLINE_FUNCTION void calc_residual(const GRCoordinates& G, const Local& P_test,
                                           const Local& Pi, const Local& Ui, const Local& Ps,
                                           const Local& dudt_explicit, const Local& dUi, const Local& tmp, 
-                                          const VarMap& m_p, const VarMap& m_u, const Closure& closure,
+                                          const VarMap& m_p, const VarMap& m_u, const EMHD_parameters& emhd_params,
                                           const int& nvar, const int& j, const int& i,
                                           const Real& gam, const double& dt,
                                           Local& residual)
@@ -86,18 +86,18 @@ KOKKOS_INLINE_FUNCTION void calc_residual(const GRCoordinates& G, const Local& P
     // These lines calculate res = (U_test - Ui)/dt - dudt_explicit - 0.5*(dU_new(ip) + dUi(ip)) - dU_time(ip) )
     // Start with conserved vars corresponding to test P, U_test
     // Note this uses the Flux:: call, it needs *all* conserved vars!
-    Flux::p_to_u(G, P_test, m_p, closure, gam, j, i, tmp, m_u); // U_test
+    Flux::p_to_u(G, P_test, m_p, emhd_params, gam, j, i, tmp, m_u); // U_test
     // (U_test - Ui)/dt - dudt_explicit ...
     PLOOP residual(ip) = (tmp(ip) - Ui(ip)) / dt - dudt_explicit(ip);
 
     if (m_p.Q >= 0) {
         // Compute new implicit source terms and time derivative source terms
         Real dUq, dUdP; // Don't need full array for these
-        EMHD::implicit_sources(G, P_test, m_p, gam, j, i, closure, dUq, dUdP); // dU_new
+        EMHD::implicit_sources(G, P_test, m_p, gam, j, i, emhd_params, dUq, dUdP); // dU_new
         // ... - 0.5*(dU_new(ip) + dUi(ip)) ...
         residual(m_u.Q) -= 0.5*(dUq + dUi(m_u.Q));
         residual(m_u.DP) -= 0.5*(dUdP + dUi(m_u.DP));
-        EMHD::time_derivative_sources(G, P_test, Pi, Ps, m_p, closure, gam, dt, j, i, dUq, dUdP); // dU_time
+        EMHD::time_derivative_sources(G, P_test, Pi, Ps, m_p, emhd_params, gam, dt, j, i, dUq, dUdP); // dU_time
         // ... - dU_time(ip)
         residual(m_u.Q) -= dUq;
         residual(m_u.DP) -= dUdP;
@@ -115,13 +115,13 @@ KOKKOS_INLINE_FUNCTION void calc_jacobian(const GRCoordinates& G, const Local& P
                                           const Local& Pi, const Local& Ui, const Local& Ps,
                                           const Local& dudt_explicit, const Local& dUi,
                                           Local& tmp1, Local& tmp2, Local& tmp3,
-                                          const VarMap& m_p, const VarMap& m_u, const Closure& closure,
+                                          const VarMap& m_p, const VarMap& m_u, const EMHD_parameters& emhd_params,
                                           const int& nvar, const int& j, const int& i,
                                           const Real& jac_delta, const Real& gam, const double& dt,
                                           Local2& jacobian, Local& residual)
 {
     // Calculate residual for Sf->P
-    calc_residual(G, P, Pi, Ui, Ps, dudt_explicit, dUi, tmp3, m_p, m_u, closure, nvar, j, i, gam, dt, residual);
+    calc_residual(G, P, Pi, Ui, Ps, dudt_explicit, dUi, tmp3, m_p, m_u, emhd_params, nvar, j, i, gam, dt, residual);
 
     // Use one scratchpad as the incremented prims P_delta,
     // one as the new residual residual_delta
@@ -140,7 +140,7 @@ KOKKOS_INLINE_FUNCTION void calc_jacobian(const GRCoordinates& G, const Local& P
         }
 
         // Compute the residual for P_delta, residual_delta
-        calc_residual(G, P_delta, Pi, Ui, Ps, dudt_explicit, dUi, tmp3, m_p, m_u, closure, nvar, j, i, gam, dt, residual_delta);
+        calc_residual(G, P_delta, Pi, Ui, Ps, dudt_explicit, dUi, tmp3, m_p, m_u, emhd_params, nvar, j, i, gam, dt, residual_delta);
 
         // Compute forward derivatives of each residual vs the primitive col
         for (int row = 0; row < nvar; row++) {
diff --git a/pars/emhdshock.par b/pars/emhdshock.par
new file mode 100644
index 00000000..0299b0d7
--- /dev/null
+++ b/pars/emhdshock.par
@@ -0,0 +1,83 @@
+# EMHD Shock problem
+# Try to maintain the BVP solution to a discontinuity
+# Checks the higher order terms implementation in flat space
+# IMPORTANT: This test is different from the other tests in its initialization
+#            It reads in ".txt" files that correspond to the BVP solution
+
+<parthenon/job>
+problem_id = emhdmodes
+
+<parthenon/mesh>
+refinement = none
+numlevel = 1
+
+nx1 = 1024
+x1min = 0.0
+x1max = 1.0
+ix1_bc = outflow
+ox1_bc = outflow
+
+nx2 = 1
+x2min = 0.0
+x2max = 1.0
+ix2_bc = periodic
+ox2_bc = periodic
+
+nx3 = 1
+x3min = 0.0
+x3max = 1.0
+ix3_bc = periodic
+ox3_bc = periodic
+
+<parthenon/meshblock>
+nx1 = 512
+nx2 = 1
+nx3 = 1
+
+<coordinates>
+base = cartesian_minkowski
+transform = null
+
+<parthenon/time>
+tlim = 2.0
+# "RK2" is the only option for implicit solver
+integrator = rk2
+dt_min = 0.0001
+
+<GRMHD>
+cfl = 0.25
+gamma = 1.333333
+reconstruction = weno5
+
+<floors>
+disable_floors = true
+
+<debug>
+verbose = 0
+
+# IMPORTANT: This block must be present and values filled in all EGRMHD simulations
+<emhd>
+on = true
+higher_order_terms = true
+closure_type = soundspeed
+tau = 0.1
+conduction_alpha = 5.0
+viscosity_alpha = 3.0
+
+<driver>
+type = imex
+step = implicit
+
+<implicit>
+max_nonlinear_iter = 3
+
+<parthenon/output0>
+file_type = hdf5
+# This is so as to output only the final state
+dt = 100.0
+single_precision_output = true
+variables = prims.rho, prims.u, prims.uvec, prims.B, prims.q, prims.dP
+
+<parthenon/output1>
+file_type = hst
+dt = 0.1
\ No newline at end of file

From 7f88a718c7bb6192738e4b7b3217a156e55d28ff Mon Sep 17 00:00:00 2001
From: Vedant Dhruv <tg867798@login1.frontera.tacc.utexas.edu>
Date: Mon, 7 Mar 2022 13:19:38 -0600
Subject: [PATCH 14/26] EMHD shock test initialization. NOTE: This problem (as
 of now) will not run and converge at second order successfully!! The issue
 will be fixed in iharm3d and then carried forward to KHARMA

---
 kharma/debug.cpp             |   4 +-
 kharma/emhd/emhd.hpp         |  28 ++---
 kharma/emhd/emhd_sources.hpp |   1 -
 kharma/prob/emhdshock.hpp    | 206 +++++++++++++++++++++++++++++++++++
 pars/emhdmodes.par           |   4 +-
 pars/emhdshock.par           |   9 +-
 6 files changed, 232 insertions(+), 20 deletions(-)
 create mode 100644 kharma/prob/emhdshock.hpp

diff --git a/kharma/debug.cpp b/kharma/debug.cpp
index b0c8037f..6510835e 100644
--- a/kharma/debug.cpp
+++ b/kharma/debug.cpp
@@ -171,7 +171,7 @@ int CountPFlags(MeshData<Real> *md, IndexDomain domain, int verbose)
         auto& rc = pmb->meshblock_data.Get();
         auto pflag = rc->Get("pflag").data.GetHostMirrorAndCopy();
 
-#pragma omp parallel for simd collapse(3) reduction(+:n_cells,n_tot,n_neg_in,n_max_iter,n_utsq,n_gamma,n_neg_u,n_neg_rho,n_neg_both)
+//#pragma omp parallel for simd collapse(3) reduction(+:n_cells,n_tot,n_neg_in,n_max_iter,n_utsq,n_gamma,n_neg_u,n_neg_rho,n_neg_both)
         for(int k=ks; k <= ke; ++k)
             for(int j=js; j <= je; ++j)
                 for(int i=is; i <= ie; ++i)
@@ -236,7 +236,7 @@ int CountFFlags(MeshData<Real> *md, IndexDomain domain, int verbose)
         auto& rc = pmb->meshblock_data.Get();
         auto fflag = rc->Get("fflag").data.GetHostMirrorAndCopy();
 
-#pragma omp parallel for simd collapse(3) reduction(+:n_cells,n_tot,n_geom_rho,n_geom_u,n_b_rho,n_b_u,n_temp,n_gamma,n_ktot)
+//#pragma omp parallel for simd collapse(3) reduction(+:n_cells,n_tot,n_geom_rho,n_geom_u,n_b_rho,n_b_u,n_temp,n_gamma,n_ktot)
         for(int k=ks; k <= ke; ++k)
             for(int j=js; j <= je; ++j)
                 for(int i=is; i <= ie; ++i)
diff --git a/kharma/emhd/emhd.hpp b/kharma/emhd/emhd.hpp
index a3c6febf..59ea605f 100644
--- a/kharma/emhd/emhd.hpp
+++ b/kharma/emhd/emhd.hpp
@@ -81,20 +81,20 @@ TaskStatus AddSource(MeshData<Real> *md, MeshData<Real> *mdudt);
 template<typename Local>
 KOKKOS_INLINE_FUNCTION void set_parameters(const GRCoordinates& G, const Local& P, const VarMap& m_p,
                                            const EMHD_parameters& emhd_params, const Real& gam,
-                                           Real& tau, Real& chi, Real& nu)
+                                           Real& tau, Real& chi_e, Real& nu_e)
 {
     if (emhd_params.type == ClosureType::constant) {
         // Set tau, nu, chi to constants
-        tau = emhd_params.tau;
-        chi = emhd_params.conduction_alpha;
-        nu  = emhd_params.viscosity_alpha;
+        tau   = emhd_params.tau;
+        chi_e = emhd_params.conduction_alpha;
+        nu_e  = emhd_params.viscosity_alpha;
     } else if (emhd_params.type == ClosureType::soundspeed) {
         // Set tau=const, chi/nu prop. to sound speed squared
         Real cs2 = (gam * (gam - 1.) * P(m_p.UU)) / (P(m_p.RHO) + (gam * P(m_p.UU)));
 
-        tau = emhd_params.tau;
-        chi = emhd_params.conduction_alpha * cs2 * tau;
-        nu  = emhd_params.viscosity_alpha * cs2 * tau;
+        tau   = emhd_params.tau;
+        chi_e = emhd_params.conduction_alpha * cs2 * tau;
+        nu_e  = emhd_params.viscosity_alpha * cs2 * tau;
     } else if (emhd_params.type == ClosureType::torus) {
         // Something complicated
     } // else yell
@@ -103,21 +103,21 @@ template<typename Global>
 KOKKOS_INLINE_FUNCTION void set_parameters(const GRCoordinates& G, const Global& P, const VarMap& m_p,
                                            const EMHD_parameters& emhd_params, const Real& gam,
                                            const int& k, const int& j, const int& i,
-                                           Real& tau, Real& chi, Real& nu)
+                                           Real& tau, Real& chi_e, Real& nu_e)
 {
     if (emhd_params.type == ClosureType::constant) {
         // Set tau, nu, chi to constants
-        tau = emhd_params.tau;
-        chi = emhd_params.conduction_alpha;
-        nu  = emhd_params.viscosity_alpha;
+        tau   = emhd_params.tau;
+        chi_e = emhd_params.conduction_alpha;
+        nu_e  = emhd_params.viscosity_alpha;
     } else if (emhd_params.type == ClosureType::soundspeed) {
         // Set tau=const, chi/nu prop. to sound speed squared
         const Real cs2 = (gam * (gam - 1.) * P(m_p.UU, k, j, i)) /
                             (P(m_p.RHO, k, j, i) + (gam * P(m_p.UU, k, j, i)));
 
-        tau = emhd_params.tau;
-        chi = emhd_params.conduction_alpha * cs2 * tau;
-        nu  = emhd_params.viscosity_alpha * cs2 * tau;
+        tau   = emhd_params.tau;
+        chi_e = emhd_params.conduction_alpha * cs2 * tau;
+        nu_e  = emhd_params.viscosity_alpha * cs2 * tau;
     } else if (emhd_params.type == ClosureType::torus) {
         // Something complicated
     } // else yell
diff --git a/kharma/emhd/emhd_sources.hpp b/kharma/emhd/emhd_sources.hpp
index 9cdf588a..19673bd0 100644
--- a/kharma/emhd/emhd_sources.hpp
+++ b/kharma/emhd/emhd_sources.hpp
@@ -111,7 +111,6 @@ KOKKOS_INLINE_FUNCTION void time_derivative_sources(const GRCoordinates& G, cons
     EMHD::convert_q_dP_to_prims(q0, dP0, rho, Theta, tau, chi_e, nu_e, 
                                 emhd_params, q0_tilde, dP0_tilde);
 
-    // NOTE: Will have to edit this when higher order terms are considered
     dUq  = G.gdet(Loci::center, j, i) * (q0_tilde / tau);
     dUdP = G.gdet(Loci::center, j, i) * (dP0_tilde / tau);
 
diff --git a/kharma/prob/emhdshock.hpp b/kharma/prob/emhdshock.hpp
new file mode 100644
index 00000000..f33faebf
--- /dev/null
+++ b/kharma/prob/emhdshock.hpp
@@ -0,0 +1,206 @@
+/* 
+ *  File: emhdshock.hpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+
+#include "decs.hpp"
+#include "emhd.hpp"
+
+using namespace std::literals::complex_literals;
+using namespace std;
+using namespace parthenon;
+
+#define STRLEN 2048
+
+/**
+ * Initialization of the EMHD shock test in magnetized plasma w/viscosity and heat conduction
+ * 
+ * The BVP solution (EMHD_shock_test.ipynb) is the input to the code.
+ * Since the BVP solution is a steady-state, time-independent solution of the EMHD equations,
+ * the code should maintain the solution.
+ * 
+ * An alternate option is to initialize with the ideal MHD Rankine-Hugoniot jump condition.
+ * If higher order terms have been implemented correctly, the primitives should relax to the
+ * steady state solution. However, they may differ by a translation to the BVP solution.
+ * Therefore, to quantitatively check the EMHD implementation, we prefer the BVP solution as the input
+ * 
+ * The mode is read from <emhdshock>/input: "BVP" (default) reads shock_soln_*.txt from the working directory
+ **/
+TaskStatus InitializeEMHDShock(MeshBlockData<Real> *rc, ParameterInput *pin)
+{
+    Flag(rc, "Initializing EMHD shock problem");
+    auto pmb = rc->GetBlockPointer();
+    GridScalar rho = rc->Get("prims.rho").data;
+    GridScalar u = rc->Get("prims.u").data;
+    GridVector uvec = rc->Get("prims.uvec").data;
+    // It is well and good this problem should cry if B/EMHD are disabled.
+    GridVector B_P = rc->Get("prims.B").data;
+    GridVector q = rc->Get("prims.q").data;
+    GridVector dP = rc->Get("prims.dP").data;
+
+    // Need P for EMHD::set_parameters
+    PackIndexMap prims_map;
+    const MetadataFlag isPrimitive = pmb->packages.Get("GRMHD")->Param<MetadataFlag>("PrimitiveFlag");
+    auto P = rc->PackVariables(std::vector<MetadataFlag>{isPrimitive}, prims_map);
+    const VarMap m_p(prims_map, false);
+
+    // Need fluid adiabatic index to compute pgas and sound speed (for higher order terms)
+    const Real gam = pmb->packages.Get("GRMHD")->Param<Real>("gamma");
+
+    const auto& G = pmb->coords;
+    // Initialization mode: "BVP" reads the tabulated solution, anything else uses the jump conditions
+    const std::string input = pin->GetOrAddString("emhdshock", "input", "BVP");
+
+    IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::entire);
+    IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::entire);
+    IndexRange kb = pmb->cellbounds.GetBoundsK(IndexDomain::entire);
+
+    // Need the EMHD package if higher order terms are considered
+    const bool use_emhd = pmb->packages.AllPackages().count("EMHD");
+    EMHD::EMHD_parameters emhd_params_tmp{};
+    if (use_emhd) {
+        const auto& emhd_pars = pmb->packages.Get("EMHD")->AllParams();
+        emhd_params_tmp       = emhd_pars.Get<EMHD::EMHD_parameters>("emhd_params");
+    }
+    const EMHD::EMHD_parameters& emhd_params = emhd_params_tmp;
+
+    if (input == "BVP"){
+
+        // Load file names into strings
+        char fbvp_rho[STRLEN], fbvp_u[STRLEN], fbvp_u1[STRLEN], fbvp_q[STRLEN], fbvp_dP[STRLEN];
+        sprintf(fbvp_rho, "shock_soln_rho.txt");
+        sprintf(fbvp_u,   "shock_soln_u.txt");
+        sprintf(fbvp_u1,  "shock_soln_u1.txt");
+        sprintf(fbvp_q,   "shock_soln_q.txt");
+        sprintf(fbvp_dP,  "shock_soln_dP.txt");
+
+        // Assign file pointers
+        FILE *fp_rho, *fp_u, *fp_u1, *fp_q, *fp_dP;
+        fp_rho = fopen(fbvp_rho, "r");
+        fp_u   = fopen(fbvp_u,   "r");
+        fp_u1  = fopen(fbvp_u1,  "r");
+        fp_q   = fopen(fbvp_q,   "r");
+        fp_dP  = fopen(fbvp_dP,  "r");
+        // NOTE(review): the kernel below calls fscanf, which is host-only and order-dependent -- confirm it only ever runs serially on the host
+        pmb->par_for("emhdshock_init", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+            KOKKOS_LAMBDA_3D {
+                Real X[GR_DIM];
+                G.coord_embed(k, j, i, Loci::center, X);
+
+                // First initialize primitives that are read from .txt files
+                fscanf(fp_rho, "%lf", &(rho(k, j, i)));
+                fscanf(fp_u,   "%lf", &(u(k, j, i)));
+                fscanf(fp_u1,  "%lf", &(uvec(0, k, j, i)));
+                fscanf(fp_q,   "%lf", &(q(k, j, i)));
+                fscanf(fp_dP,  "%lf", &(dP(k, j, i)));
+
+                // Now the remaining primitives
+                uvec(1, k, j, i) = 0.;
+                uvec(2, k, j, i) = 0.;
+                B_P(0, k, j, i)  = 0.;
+                B_P(1, k, j, i)  = 0.;
+                B_P(2, k, j, i)  = 0.;
+
+                if (emhd_params.higher_order_terms) {
+
+                    // Local copies, named so they do not shadow the captured rho/u arrays
+                    const Real rho_loc = rho(k, j, i);
+                    const Real u_loc   = u(k, j, i);
+                    const Real Theta   = (gam - 1.) * u_loc / rho_loc;
+
+                    // Set EMHD parameters
+                    Real tau, chi_e, nu_e;
+                    EMHD::set_parameters(G, P, m_p, emhd_params, gam, k, j, i, tau, chi_e, nu_e);
+
+                    // Update q and dP (which now are q_tilde and dP_tilde)
+                    q(k, j, i)  *= sqrt(tau / (chi_e * rho_loc * Theta * Theta));
+                    dP(k, j, i) *= sqrt(tau / (nu_e * rho_loc * Theta));
+                }
+
+            }
+        );
+
+        // disassociate file pointer
+        fclose(fp_rho);
+        fclose(fp_u);
+        fclose(fp_u1);
+        fclose(fp_q);
+        fclose(fp_dP);
+    }
+
+    else {
+
+        // Need the limits of the problem size to determine center
+        const Real x1min = pin->GetReal("parthenon/mesh", "x1min");
+        const Real x1max = pin->GetReal("parthenon/mesh", "x1max");
+
+        // Left and right states
+        double rhoL = 1.,     rhoR = 3.08312999;
+        double uL   = 1.,     uR   = 4.94577705;
+        double u1L  = 1.,     u1R  = 0.32434571;
+        double u2L  = 0.,     u2R  = 0.;
+        double u3L  = 0.,     u3R  = 0.;
+        double B1L  = 1.e-5,  B1R  = 1.e-5;
+        double B2L  = 0,      B2R  = 0.;
+        double B3L  = 0.,     B3R  = 0.;
+
+        pmb->par_for("emhdshock_init", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+            KOKKOS_LAMBDA_3D {
+
+                Real X[GR_DIM];
+                G.coord_embed(k, j, i, Loci::center, X);
+                const Real x1_center = (x1min + x1max) / 2.;
+
+                bool lhs = X[1] < x1_center;
+
+                // Initialize primitives
+                rho(k, j, i)     = (lhs) ? rhoL : rhoR;
+                u(k, j, i)       = (lhs) ? uL : uR;
+                uvec(0, k, j, i) = (lhs) ? u1L : u1R;
+                uvec(1, k, j, i) = (lhs) ? u2L : u2R;
+                uvec(2, k, j, i) = (lhs) ? u3L : u3R;
+                B_P(0, k, j, i)  = (lhs) ? B1L : B1R;
+                B_P(1, k, j, i)  = (lhs) ? B2L : B2R;
+                B_P(2, k, j, i)  = (lhs) ? B3L : B3R;
+                q(k ,j, i)       = 0.;   
+                dP(k ,j, i)      = 0.;   
+
+            }
+
+        );
+
+    }
+
+    return TaskStatus::complete;
+
+}
\ No newline at end of file
diff --git a/pars/emhdmodes.par b/pars/emhdmodes.par
index df73f757..6355d330 100644
--- a/pars/emhdmodes.par
+++ b/pars/emhdmodes.par
@@ -63,6 +63,7 @@ extra_checks = 1
 # in all EGRMHD simulations
 <emhd>
 on = true
+higher_order_terms = false
 closure_type = soundspeed
 tau = 1.0
 conduction_alpha = 1.0
@@ -84,5 +85,4 @@ variables = prims.rho, prims.u, prims.uvec, prims.B, prims.q, prims.dP
 
 <parthenon/output1>
 file_type = hst
-dt = 0.1
-
+dt = 0.1
\ No newline at end of file
diff --git a/pars/emhdshock.par b/pars/emhdshock.par
index 0299b0d7..f68f084d 100644
--- a/pars/emhdshock.par
+++ b/pars/emhdshock.par
@@ -2,7 +2,10 @@
 # Try to maintain the BVP solution to a discontuinity
 # Checks the higher order terms implementation in flat space
 # IMPORTANT: This test is different from the other tests in its initialization
-             It reads in ".txt" files that correspond to the BVP solution
+#            It reads in ".txt" files that correspond to the BVP solution (set input to "BVP" in <emhdshock>).
+#            One can, in principle, run this problem with the usual ideal MHD jump conditions, but this
+#            may not allow a quantitative check.
+#            Run it with a single MPI task.
 
 <parthenon/job>
 problem_id = emhdmodes
@@ -49,6 +52,10 @@ cfl = 0.25
 gamma = 1.333333
 reconstruction = weno5
 
+<emhdshock>
+# The input can be the BVP solution or the ideal MHD Rankine-Hugoniot jump conditions
+input = BVP
+
 <floors>
 disable_floors = true
 

From 2181fc597f167004e589cf7c8a24aa5c4c65b67c Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Mon, 7 Mar 2022 13:20:53 -0600
Subject: [PATCH 15/26] Fix an initialization issue in EMHD modes test,
 compiling on BH. some tracing code.

---
 kharma/emhd/emhd.cpp         |  8 ++--
 kharma/implicit/implicit.cpp | 28 ++++++------
 kharma/implicit/implicit.hpp | 13 ++++++
 kharma/types.hpp             | 89 +++++++++++++++---------------------
 make.sh                      | 10 ++++
 pars/emhdmodes.par           |  2 +-
 run.sh                       |  6 ++-
 7 files changed, 85 insertions(+), 71 deletions(-)

diff --git a/kharma/emhd/emhd.cpp b/kharma/emhd/emhd.cpp
index 3c818768..da9e6baa 100644
--- a/kharma/emhd/emhd.cpp
+++ b/kharma/emhd/emhd.cpp
@@ -65,7 +65,7 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     // These are always necessary for performing EGRMHD.
 
     bool higher_order_terms = pin->GetOrAddBoolean("emhd", "higher_order_terms", false);
-    std::string closure_type = pin->GetString("emhd", "closure_type");
+    std::string closure_type = pin->GetOrAddString("emhd", "closure_type", "torus");
 
     Real tau = pin->GetOrAddReal("emhd", "tau", 1.0);
     Real conduction_alpha = pin->GetOrAddReal("emhd", "conduction_alpha", 1.0);
@@ -75,10 +75,12 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     emhd_params.higher_order_terms = higher_order_terms;
     if (closure_type == "constant") { 
         emhd_params.type = ClosureType::constant;
-    } else if (closure_type == "sound_speed") {
+    } else if (closure_type == "sound_speed" || closure_type == "soundspeed") {
         emhd_params.type = ClosureType::soundspeed;
-    } else {
+    } else if (closure_type == "torus") {
         emhd_params.type = ClosureType::torus;
+    } else {
+        throw std::invalid_argument("Invalid Closure type: "+closure_type+". Use constant, sound_speed, or torus");
     }
     emhd_params.tau = tau;
     emhd_params.conduction_alpha = conduction_alpha;
diff --git a/kharma/implicit/implicit.cpp b/kharma/implicit/implicit.cpp
index 953244df..8fcd03a3 100644
--- a/kharma/implicit/implicit.cpp
+++ b/kharma/implicit/implicit.cpp
@@ -91,7 +91,7 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
     const Real delta = implicit_par.Get<Real>("jacobian_delta");
     const Real gam = pmb0->packages.Get("GRMHD")->Param<Real>("gamma");
 
-    EMHD_parameters emhd_params;
+    EMHD_parameters emhd_params = {0};
     if (pmb0->packages.AllPackages().count("EMHD")) {
         const auto& pars = pmb0->packages.Get("EMHD")->AllParams();
         emhd_params = pars.Get<EMHD_parameters>("emhd_params");
@@ -167,7 +167,6 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
     for (int iter=0; iter < iter_max; iter++) {
         // Flags per iter, since debugging here will be rampant
         Flag(md0, "Implicit Iteration: md0");
-        Flag(md1, "Implicit Iteration: md1");
 
         parthenon::par_for_outer(DEFAULT_OUTER_LOOP_PATTERN, "implicit_solve", pmb0->exec_space,
             total_scratch_bytes, scratch_level, block.s, block.e, kb.s, kb.e, jb.s, jb.e,
@@ -240,7 +239,7 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
                         // Solve against the negative residual
                         PLOOP delta_prim(ip) = -residual(ip);
 
-                        // if (am_rank0 && b == 0 && i == 8 && j == 8 && k == 8) {
+                        // if (am_rank0 && b == 0 && i == 8 && j == 8 && k == 0) {
                         //     printf("Variable ordering: rho %d uu %d u1 %d B1 %d q %d dP %d\n",
                         //             m_p.RHO, m_p.UU, m_p.U1, m_p.B1, m_p.Q, m_p.DP);
                         //     printf("P_solver: "); PLOOP printf("%g ", P_solver(ip)); printf("\n");
@@ -249,6 +248,8 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
                         //     printf("Ps: "); PLOOP printf("%g ", Ps(ip)); printf("\n");
                         //     printf("Us: "); PLOOP printf("%g ", Us(ip)); printf("\n");
                         //     printf("dUdt: "); PLOOP printf("%g ", dUdt(ip)); printf("\n");
+                        //     printf("Initial residual: "); PLOOP printf("%g ", residual(ip)); printf("\n");
+                        //     printf("Initial delta_prim: "); PLOOP printf("%g ", delta_prim(ip)); printf("\n");
                         // }
 
                         // Linear solve
@@ -258,23 +259,21 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
                         KokkosBatched::SerialTrsv<Uplo::Lower,Trans::NoTranspose,Diag::NonUnit,Algo::Trsv::Blocked>
                         ::invoke(alpha, jacobian, delta_prim);
 
-                        // if (am_rank0 && b == 0 && i == 8 && j == 8 && k == 8) {
-                        //     printf("\nTri Jacobian and dP:");
-                        //     for (int u=0; u < nvar; u++) {
-                        //         printf("\n");
-                        //         for (int v=0; v < nvar; v++) printf("%f ", jacobian(u, v));
-                        //     }
-                        //     printf("\ndP:\n");
-                        //     for (int u=0; u < nvar; u++) printf("%f ", delta_prim(u));
-                        //     printf("\n");
-                        // }
-
                         // Update the guess.  For now lambda == 1, choose on the fly?
                         PLOOP P_solver(ip) += lambda * delta_prim(ip);
 
                         calc_residual(G, P_solver, Pi, Ui, Ps, dUdt, dUi, tmp3,
                                       m_p, m_u, emhd_params, nvar, j, i, gam, dt, residual);
 
+                        // if (am_rank0 && b == 0 && i == 8 && j == 8 && k == 0) {
+                        //     printf("Variable ordering: rho %d uu %d u1 %d B1 %d q %d dP %d\n",
+                        //             m_p.RHO, m_p.UU, m_p.U1, m_p.B1, m_p.Q, m_p.DP);
+                        //     // JACOBIAN
+                        //     printf("Final residual: "); PLOOP printf("%g ", residual(ip)); printf("\n");
+                        //     printf("Final delta_prim: "); PLOOP printf("%g ", delta_prim(ip)); printf("\n");
+                        //     printf("Final P_solver: "); PLOOP printf("%g ", P_solver(ip)); printf("\n");
+                        // }
+
                         // Store for maximum/output
                         // I would be tempted to store the whole residual, but it's of variable size
                         norm_all(b, k , j, i) = 0;
@@ -312,6 +311,7 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
             Pf_all(b)(p, k, j, i) = P_solver_all(b, p, k, j, i);
         }
     );
+    Flag(md1, "Implicit Iteration: final");
 
     return TaskStatus::complete;
 
diff --git a/kharma/implicit/implicit.hpp b/kharma/implicit/implicit.hpp
index 550aab56..c4a90ff2 100644
--- a/kharma/implicit/implicit.hpp
+++ b/kharma/implicit/implicit.hpp
@@ -89,6 +89,9 @@ KOKKOS_INLINE_FUNCTION void calc_residual(const GRCoordinates& G, const Local& P
     Flux::p_to_u(G, P_test, m_p, emhd_params, gam, j, i, tmp, m_u); // U_test
     // (U_test - Ui)/dt - dudt_explicit ...
     PLOOP residual(ip) = (tmp(ip) - Ui(ip)) / dt - dudt_explicit(ip);
+    // if (i == 8 && j == 8) {
+    //     printf("Explicit residual: "); PLOOP printf("%g ", residual(ip)); printf("\n");
+    // }
 
     if (m_p.Q >= 0) {
         // Compute new implicit source terms and time derivative source terms
@@ -97,10 +100,20 @@ KOKKOS_INLINE_FUNCTION void calc_residual(const GRCoordinates& G, const Local& P
         // ... - 0.5*(dU_new(ip) + dUi(ip)) ...
         residual(m_u.Q) -= 0.5*(dUq + dUi(m_u.Q));
         residual(m_u.DP) -= 0.5*(dUdP + dUi(m_u.DP));
+        // if (i == 8 && j == 8) {
+        //     Real tau = 0, chi_e = 0, nu_e = 0;
+        //     EMHD::set_parameters(G, P_test, m_p, emhd_params, gam, tau, chi_e, nu_e);
+        //     printf("EMHD Params: "); printf("%g %g %g", tau, chi_e, nu_e); printf("\n");
+        //     printf("Implicit sources new: "); printf("%g %g %g %g", P_test(m_p.Q), P_test(m_p.DP), dUq, dUdP); printf("\n");
+        //     printf("Implicit sources residual: "); PLOOP printf("%g ", residual(ip)); printf("\n");
+        // }
         EMHD::time_derivative_sources(G, P_test, Pi, Ps, m_p, emhd_params, gam, dt, j, i, dUq, dUdP); // dU_time
         // ... - dU_time(ip)
         residual(m_u.Q) -= dUq;
         residual(m_u.DP) -= dUdP;
+        // if (i == 8 && j == 8) {
+        //     printf("Sources residual: "); PLOOP printf("%g ", residual(ip)); printf("\n");
+        // }
     }
 }
 
diff --git a/kharma/types.hpp b/kharma/types.hpp
index 31271a76..35403bc5 100644
--- a/kharma/types.hpp
+++ b/kharma/types.hpp
@@ -156,78 +156,63 @@ KOKKOS_INLINE_FUNCTION bool outside(const int& k, const int& j, const int& i,
 }
 
 #if TRACE
+#define PRINTCORNERS 0
+/**
+ * Debugging aid: print q(0, j, i) and dP(0, j, i) for j, i in [0, 8) to stderr,
+ * one row of the table per value of j.
+ * Only compiled when TRACE is set; Flag() calls it only if PRINTCORNERS is nonzero.
+ * NOTE(review): assumes the block holds at least 8 zones (ghosts included) along j and i.
+ */
+inline void PrintCorner(MeshBlockData<Real> *rc)
+{
+    // Work on host copies of the two fields that are actually printed
+    auto q = rc->Get("prims.q").data.GetHostMirrorAndCopy();
+    auto dP = rc->Get("prims.dP").data.GetHostMirrorAndCopy();
+    // Row breaks go to stderr too, so the table is not split between stdout and stderr
+    cerr << "q:";
+    for (int j=0; j<8; j++) {
+        cerr << endl;
+        for (int i=0; i<8; i++) {
+            fprintf(stderr, "%.5g\t", q(0, j, i));
+        }
+    }
+    cerr << endl << "dP:";
+    for (int j=0; j<8; j++) {
+        cerr << endl;
+        for (int i=0; i<8; i++) {
+            fprintf(stderr, "%.5g\t", dP(0, j, i));
+        }
+    }
+    cerr << endl << endl;
+}
+
 inline void Flag(std::string label)
 {
 #pragma omp critical
     if(MPIRank0()) std::cerr << label << std::endl;
 }
+
 inline void Flag(MeshBlockData<Real> *rc, std::string label)
 {
 #pragma omp critical
 {
     if(MPIRank0()) std::cerr << label << std::endl;
-    if(0) {
-        auto rhop = rc->Get("prims.rho").data.GetHostMirrorAndCopy();
-        auto up = rc->Get("prims.u").data.GetHostMirrorAndCopy();
-        auto uvecp = rc->Get("prims.uvec").data.GetHostMirrorAndCopy();
-        auto Bp = rc->Get("prims.B").data.GetHostMirrorAndCopy();
-        auto rhoc = rc->Get("cons.rho").data.GetHostMirrorAndCopy();
-        auto uc = rc->Get("cons.u").data.GetHostMirrorAndCopy();
-        auto uvecc = rc->Get("cons.uvec").data.GetHostMirrorAndCopy();
-        auto Bu = rc->Get("cons.B").data.GetHostMirrorAndCopy();
-        cerr << "P:";
-        for (int j=0; j<8; j++) {
-            cout << endl;
-            for (int i=0; i<8; i++) {
-                fprintf(stderr, "%.5g\t", uvecp(2, 0, j, i));
-            }
-        }
-        cerr << endl << "U:";
-        for (int j=0; j<8; j++) {
-            cerr << endl;
-            for (int i=0; i<8; i++) {
-                fprintf(stderr, "%.5g\t", uvecc(2, 0, j, i));
-            }
-        }
-        cerr << endl << endl;
-    }
+    if(PRINTCORNERS) PrintCorner(rc);
 }
 }
+
 inline void Flag(MeshData<Real> *md, std::string label)
 {
 #pragma omp critical
 {
     if(MPIRank0()) std::cerr << label << std::endl;
-    if(0) {
-        cerr << label << ":" << std::endl;
-        auto rc = md->GetBlockData(0);
-        auto rhop = rc->Get("prims.rho").data.GetHostMirrorAndCopy();
-        auto up = rc->Get("prims.u").data.GetHostMirrorAndCopy();
-        auto uvecp = rc->Get("prims.uvec").data.GetHostMirrorAndCopy();
-        auto Bp = rc->Get("prims.B").data.GetHostMirrorAndCopy();
-        auto rhoc = rc->Get("cons.rho").data.GetHostMirrorAndCopy();
-        auto uc = rc->Get("cons.u").data.GetHostMirrorAndCopy();
-        auto uvecc = rc->Get("cons.uvec").data.GetHostMirrorAndCopy();
-        auto Bu = rc->Get("cons.B").data.GetHostMirrorAndCopy();
-        cerr << "P:";
-        for (int j=0; j<8; j++) {
-            cout << endl;
-            for (int i=0; i<8; i++) {
-                fprintf(stderr, "%.5g\t", uvecp(2, 0, j, i));
-            }
-        }
-        cerr << endl;
-        cerr << "U:";
-        for (int j=0; j<8; j++) {
-            cerr << endl;
-            for (int i=0; i<8; i++) {
-                fprintf(stderr, "%.5g\t", uvecc(2, 0, j, i));
-            }
-        }
-        cerr << endl << endl;
+    if(PRINTCORNERS) {
+        auto rc = md->GetBlockData(0).get();
+        PrintCorner(rc);
     }
 }
 }
+
 #else
 inline void Flag(std::string label) {}
 inline void Flag(MeshBlockData<Real> *rc, std::string label) {}
diff --git a/make.sh b/make.sh
index db8ec66e..900a0120 100755
--- a/make.sh
+++ b/make.sh
@@ -174,6 +174,7 @@ elif [[ "$ARGS" == *"cuda"* ]]; then
     export CXXFLAGS="-dryrun $CXXFLAGS"
     echo "Dry-running with $CXXFLAGS"
   fi
+  export NVCC_WRAPPER_DEFAULT_COMPILER="$CXX_NATIVE"
   # I've occasionally needed this. CUDA version thing?
   #export CXXFLAGS="--expt-relaxed-constexpr $CXXFLAGS"
   OUTER_LAYOUT="MANUAL1D_LOOP"
@@ -202,6 +203,13 @@ else
   ENABLE_HIP="OFF"
 fi
 
+# Link with $LINKER if the environment sets it, otherwise fall back to the C++ compiler
+if [[ -v LINKER ]]; then
+  LINKER="$LINKER"
+else
+  LINKER="$CXX"
+fi
+
 # Make build dir. Recall "clean" means "clean and build"
 if [[ "$ARGS" == *"clean"* ]]; then
   rm -rf build
@@ -214,6 +222,8 @@ if [[ "$ARGS" == *"clean"* ]]; then
   cmake ..\
     -DCMAKE_C_COMPILER="$CC" \
     -DCMAKE_CXX_COMPILER="$CXX" \
+    -DCMAKE_LINKER="$LINKER" \
+    -DCMAKE_CXX_LINK_EXECUTABLE='<CMAKE_LINKER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>' \
     -DCMAKE_PREFIX_PATH="$PREFIX_PATH:$CMAKE_PREFIX_PATH" \
     -DCMAKE_BUILD_TYPE=$TYPE \
     -DPAR_LOOP_LAYOUT=$OUTER_LAYOUT \
diff --git a/pars/emhdmodes.par b/pars/emhdmodes.par
index df73f757..c210bdc3 100644
--- a/pars/emhdmodes.par
+++ b/pars/emhdmodes.par
@@ -63,7 +63,7 @@ extra_checks = 1
 # in all EGRMHD simulations
 <emhd>
 on = true
-closure_type = soundspeed
+closure_type = sound_speed
 tau = 1.0
 conduction_alpha = 1.0
 viscosity_alpha = 1.0
diff --git a/run.sh b/run.sh
index 9c36408c..525fca56 100755
--- a/run.sh
+++ b/run.sh
@@ -30,7 +30,7 @@ export CUDA_LAUNCH_BLOCKING=0
 #export KOKKOS_DEVICE_ID=0
 
 # Choose the kharma from compiled options in order of preference
-KHARMA_DIR="$(dirname $0)"
+KHARMA_DIR="$(dirname "${BASH_SOURCE[0]}")"
 if [ -f $KHARMA_DIR/kharma.cuda ]; then
   EXE_NAME=kharma.cuda
 elif [ -f $KHARMA_DIR/kharma.sycl ]; then
@@ -42,6 +42,10 @@ else
   exit
 fi
 
+# Optionally use the Kokkos tools to profile kernels
+#export KOKKOS_PROFILE_LIBRARY=$KHARMA_DIR/../kokkos-tools/kp_kernel_timer.so
+#export KOKKOS_PROFILE_LIBRARY=$KHARMA_DIR/../kokkos-tools/kp_nvprof_connector.so
+
 # Load environment from the same files as the compile process
 HOST=$(hostname -f)
 ARGS=$(cat $KHARMA_DIR/make_args)

From 0e49438da94d7d099e40264a37a765480916b8fd Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Tue, 15 Mar 2022 14:55:14 -0500
Subject: [PATCH 16/26] ImEx Updates

* Support evolving some fields explicitly and some implicitly via flags
* Reorder prims in implicit solve & use upper triangular solve for accuracy
* Option to cleanup field with SOR at start, regardless of initialization
* EMHD modes test added to regression tests
* Other test fixes & new tests for imex behavior
* Fix bug in Mdot reduction

Note implicit solve is now triggered by one of:
1. EMHD enabled at all
2. driver/type=imex
In the latter case, solves for GRMHD/B can be disabled with implicit=false
---
 .gitlab-ci.yml                         |  12 ++
 kharma/b_cleanup/b_cleanup.cpp         |   3 +-
 kharma/b_flux_ct/b_flux_ct.cpp         |  46 +++--
 kharma/b_flux_ct/b_flux_ct.hpp         |   2 +
 kharma/boundaries.cpp                  |   3 +
 kharma/electrons/electrons.cpp         |   6 +-
 kharma/electrons/electrons.hpp         |  51 +-----
 kharma/emhd/emhd.cpp                   |  11 +-
 kharma/emhd/emhd.hpp                   |  27 ++-
 kharma/emhd/emhd_sources.hpp           |   2 +-
 kharma/emhd/emhd_utils.hpp             |   2 +-
 kharma/floors/floors.hpp               |   4 +
 kharma/flux_functions.hpp              |  16 +-
 kharma/grmhd/fixup.cpp                 |   2 +
 kharma/grmhd/grmhd.cpp                 | 206 +++++++++++++++++-----
 kharma/grmhd/grmhd.hpp                 |   6 +-
 kharma/grmhd/grmhd_functions.hpp       |   2 +-
 kharma/imex_driver.cpp                 | 198 ++++++++++++---------
 kharma/implicit/implicit.cpp           | 228 +++++++++++++++----------
 kharma/implicit/implicit.hpp           |  43 ++---
 kharma/kharma.cpp                      |  20 +--
 kharma/prob/anisotropic_conduction.hpp |   2 +-
 kharma/prob/bondi.cpp                  |   1 +
 kharma/prob/bondi.hpp                  |   1 +
 kharma/prob/emhdmodes.hpp              |  73 ++++----
 kharma/prob/post_initialize.cpp        | 144 +++++++++-------
 kharma/prob/resize_restart.cpp         |   5 -
 kharma/reductions/reductions.hpp       |  23 +--
 kharma/types.hpp                       |  21 ++-
 pars/bondi.par                         |   2 +-
 pars/emhdmodes.par                     |  25 +--
 pars/mhdmodes.par                      |   8 +-
 pars/mhdmodes_emhd.par                 |  88 ----------
 pars/mhdmodes_implicit.par             |  20 ++-
 scripts/compare.py                     |   4 +-
 tests/bondi/check.sh                   |   4 +-
 tests/bondi/run.sh                     |   5 +-
 tests/emhdmodes/check.py               |  36 ++--
 tests/emhdmodes/check.sh               |   9 +-
 tests/emhdmodes/run.sh                 |   7 +-
 tests/mhdmodes/check.sh                |  17 +-
 tests/mhdmodes/run.sh                  |  12 +-
 tests/tilt_init/check.py               |   2 +-
 43 files changed, 804 insertions(+), 595 deletions(-)
 delete mode 100644 pars/mhdmodes_emhd.par

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 58c9968d..a979b721 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -50,6 +50,18 @@ mhdmodes:
       - tests/mhdmodes/*.hst
       - tests/mhdmodes/*.txt
 
+emhdmodes:
+  stage: tests
+  before_script:
+    - cd tests/emhdmodes/
+  script:
+    - bash run.sh
+    - bash check.sh
+  artifacts:
+    when: always
+    paths:
+      - tests/emhdmodes/*.png
+
 noh:
   stage: tests
   before_script:
diff --git a/kharma/b_cleanup/b_cleanup.cpp b/kharma/b_cleanup/b_cleanup.cpp
index 34779f36..2e1f277f 100644
--- a/kharma/b_cleanup/b_cleanup.cpp
+++ b/kharma/b_cleanup/b_cleanup.cpp
@@ -63,7 +63,8 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     params.Add("extra_checks", extra_checks);
 
     // Solver options
-    Real error_tolerance = pin->GetOrAddReal("b_cleanup", "error_tolerance", 1e-8);
+    // This tolerance corresponds to divB_max ~ 1e-12. TODO use that as the indicator?
+    Real error_tolerance = pin->GetOrAddReal("b_cleanup", "error_tolerance", 1e-10);
     params.Add("error_tolerance", error_tolerance);
     Real sor_factor = pin->GetOrAddReal("b_cleanup", "sor_factor", 2./3);
     params.Add("sor_factor", sor_factor);
diff --git a/kharma/b_flux_ct/b_flux_ct.cpp b/kharma/b_flux_ct/b_flux_ct.cpp
index eb0e6e5e..a0859234 100644
--- a/kharma/b_flux_ct/b_flux_ct.cpp
+++ b/kharma/b_flux_ct/b_flux_ct.cpp
@@ -62,6 +62,7 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     int extra_checks = pin->GetOrAddInteger("debug", "extra_checks", 0);
     params.Add("extra_checks", extra_checks);
 
+    // Diagnostic & inadvisable flags
     bool fix_flux = pin->GetOrAddBoolean("b_field", "fix_polar_flux", true);
     params.Add("fix_polar_flux", fix_flux);
     // WARNING this disables constrained transport, so the field will quickly pick up a divergence.
@@ -69,6 +70,13 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     bool disable_flux_ct = pin->GetOrAddBoolean("b_field", "disable_flux_ct", false);
     params.Add("disable_flux_ct", disable_flux_ct);
 
+    // Driver type & implicit marker
+    // By default, evolve B implicitly only if GRMHD is also evolved implicitly (imex driver only)
+    auto driver_type = pin->GetString("driver", "type");
+    bool grmhd_implicit = packages.Get("GRMHD")->Param<bool>("implicit");
+    bool implicit_b = (driver_type == "imex" && pin->GetOrAddBoolean("b_field", "implicit", grmhd_implicit));
+    params.Add("implicit", implicit_b);
+
     // FIELDS
 
     std::vector<int> s_vector({NVEC});
@@ -77,20 +85,22 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     MetadataFlag isMHD = packages.Get("GRMHD")->Param<MetadataFlag>("MHDFlag");
 
     // B fields.  "Primitive" form is field, "conserved" is flux
-    // Note: when changing metadata, keep these in lockstep with grmhd.cpp!!
     // See notes there about changes for the Imex driver
     std::vector<MetadataFlag> flags_prim, flags_cons;
-    auto imex_driver = pin->GetString("driver", "type") == "imex";
-    if (!imex_driver) {
+    if (driver_type == "harm") {
         flags_prim = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Derived,
                                                 isPrimitive, isMHD, Metadata::Vector});
         flags_cons = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Independent, Metadata::FillGhost,
-                 Metadata::Restart, Metadata::Conserved, isMHD, Metadata::WithFluxes, Metadata::Vector});
-    } else {
-        flags_prim = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::FillGhost, Metadata::Restart,
-                                                isPrimitive, isMHD, Metadata::Vector});
-        flags_cons = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Independent,
-                                                Metadata::Conserved, isMHD, Metadata::WithFluxes, Metadata::Vector});
+                                    Metadata::Restart, Metadata::Conserved, isMHD, Metadata::WithFluxes, Metadata::Vector});
+    } else if (driver_type == "imex") {
+        // See grmhd.cpp for full notes on flag changes for ImEx driver
+        // Note that B follows GRMHD by default, so it is evolved *explicitly* unless GRMHD is implicit
+        MetadataFlag areWeImplicit = (implicit_b) ? packages.Get("Implicit")->Param<MetadataFlag>("ImplicitFlag")
+                                                  : packages.Get("Implicit")->Param<MetadataFlag>("ExplicitFlag");
+        flags_prim = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::FillGhost,
+                                                Metadata::Restart, isPrimitive, isMHD, areWeImplicit, Metadata::Vector});
+        flags_cons = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Independent, Metadata::Conserved,
+                                                Metadata::WithFluxes, isMHD, areWeImplicit, Metadata::Vector});
     }
 
     auto m = Metadata(flags_prim, s_vector);
@@ -101,13 +111,19 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     m = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::OneCopy});
     pkg->AddField("divB", m);
 
-    pkg->FillDerivedMesh = B_FluxCT::FillDerivedMesh;
-    pkg->FillDerivedBlock = B_FluxCT::FillDerivedBlock;
+    // Ensure that prims get filled
+    if (!implicit_b) {
+        //pkg->FillDerivedMesh = B_FluxCT::FillDerivedMesh;
+        pkg->FillDerivedBlock = B_FluxCT::FillDerivedBlock;
+    }
+
+    // Register the other callbacks
     pkg->PostStepDiagnosticsMesh = B_FluxCT::PostStepDiagnostics;
 
-    // List (vector) of HistoryOutputVar that will all be enrolled as output variables
+    // List (vector) of HistoryOutputVars that will all be enrolled as output variables
     parthenon::HstVar_list hst_vars = {};
-    // The definition of MaxDivB we care about actually changes per-transport. Use our function.
+    // The definition of MaxDivB we care about actually changes per-transport. Use our function,
+    // which calculates divB at cell corners
     hst_vars.emplace_back(parthenon::HistoryOutputVar(UserHistoryOperation::max, B_FluxCT::MaxDivB, "MaxDivB"));
     // add callbacks for HST output to the Params struct, identified by the `hist_param_key`
     pkg->AddParam<>(parthenon::hist_param_key, hst_vars);
@@ -117,7 +133,7 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
 
 void UtoP(MeshData<Real> *md, IndexDomain domain, bool coarse)
 {
-    Flag(md, "B UtoP Mesh"); // 
+    Flag(md, "B UtoP Mesh");
     auto pmb0 = md->GetBlockData(0)->GetBlockPointer();
 
     const auto& B_U = md->PackVariables(std::vector<std::string>{"cons.B"});
@@ -163,7 +179,7 @@ void UtoP(MeshBlockData<Real> *rc, IndexDomain domain, bool coarse)
 
 void PtoU(MeshBlockData<Real> *rc, IndexDomain domain, bool coarse)
 {
-    Flag(rc, "B UtoP Block");
+    Flag(rc, "B PtoU Block");
     auto pmb = rc->GetBlockPointer();
 
     auto B_U = rc->PackVariables(std::vector<std::string>{"cons.B"});
diff --git a/kharma/b_flux_ct/b_flux_ct.hpp b/kharma/b_flux_ct/b_flux_ct.hpp
index 5f466546..bdb516a4 100644
--- a/kharma/b_flux_ct/b_flux_ct.hpp
+++ b/kharma/b_flux_ct/b_flux_ct.hpp
@@ -65,8 +65,10 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
  */
 void UtoP(MeshData<Real> *md, IndexDomain domain=IndexDomain::entire, bool coarse=false);
 inline void FillDerivedMesh(MeshData<Real> *md) { UtoP(md); }
+inline TaskStatus FillDerivedMeshTask(MeshData<Real> *md) { UtoP(md); return TaskStatus::complete; }
 void UtoP(MeshBlockData<Real> *md, IndexDomain domain=IndexDomain::entire, bool coarse=false);
 inline void FillDerivedBlock(MeshBlockData<Real> *rc) { UtoP(rc); }
+inline TaskStatus FillDerivedBlockTask(MeshBlockData<Real> *rc) { UtoP(rc); return TaskStatus::complete; }
 
 /**
  * Inverse of above. Generally only for initialization.
diff --git a/kharma/boundaries.cpp b/kharma/boundaries.cpp
index ca8a887c..7089961f 100644
--- a/kharma/boundaries.cpp
+++ b/kharma/boundaries.cpp
@@ -37,6 +37,7 @@
 #include "boundaries.hpp"
 
 #include "kharma.hpp"
+#include "flux_functions.hpp"
 #include "grmhd_functions.hpp"
 #include "pack.hpp"
 #include "types.hpp"
@@ -140,6 +141,7 @@ void OutflowX1(std::shared_ptr<MeshBlockData<Real>> &rc, IndexDomain domain, boo
                 if (m_p.B1 >= 0)
                     VLOOP P(m_p.B1 + v, k, j, i) = q(m_u.B1 + v, k, j, i) / G.gdet(Loci::center, j, i);
                 // Recover conserved vars
+                // TODO all flux
                 GRMHD::p_to_u(G, P, m_p, gam, k, j, i, q, m_u);
             }
         );
@@ -221,6 +223,7 @@ void ReflectX2(std::shared_ptr<MeshBlockData<Real>> &rc, IndexDomain domain, boo
             KOKKOS_LAMBDA_3D {
                 if (m_p.B1 >= 0)
                     VLOOP P(m_p.B1 + v, k, j, i) = q(m_u.B1 + v, k, j, i) / G.gdet(Loci::center, j, i);
+                // TODO all flux
                 GRMHD::p_to_u(G, P, m_p, gam, k, j, i, q, m_u);
             }
         );
diff --git a/kharma/electrons/electrons.cpp b/kharma/electrons/electrons.cpp
index 0be44b37..5133873b 100644
--- a/kharma/electrons/electrons.cpp
+++ b/kharma/electrons/electrons.cpp
@@ -151,8 +151,7 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     // TODO if nKs == 1 then rename Kel_Whatever -> Kel?
     // TODO record nKs and find a nice way to loop/vector the device-side layout?
 
-    pkg->FillDerivedBlock = Electrons::FillDerived;
-    pkg->PostFillDerivedBlock = Electrons::PostFillDerived;
+    pkg->FillDerivedBlock = Electrons::FillDerivedBlock;
     return pkg;
 }
 
@@ -221,8 +220,6 @@ void UtoP(MeshBlockData<Real> *rc, IndexDomain domain, bool coarse)
 
 }
 
-void PostUtoP(MeshBlockData<Real> *rc, IndexDomain domain, bool coarse) {}
-
 TaskStatus ApplyElectronHeating(MeshBlockData<Real> *rc_old, MeshBlockData<Real> *rc)
 {
     Flag(rc, "Applying electron heating");
@@ -372,6 +369,7 @@ TaskStatus ApplyElectronHeating(MeshBlockData<Real> *rc_old, MeshBlockData<Real>
             KOKKOS_LAMBDA_3D {
                 const Real Q = -(ug0 * v0 * (gam - 2) / pow(1 + v0 * t, 3));
                 P_new(m_p.UU, k, j, i) += Q * dt;
+                // TODO all flux
                 GRMHD::p_to_u(G, P_new, m_p, gam, k, j, i, U_new, m_u);
             }
         );
diff --git a/kharma/electrons/electrons.hpp b/kharma/electrons/electrons.hpp
index 5b3d818d..fb9e11db 100644
--- a/kharma/electrons/electrons.hpp
+++ b/kharma/electrons/electrons.hpp
@@ -85,17 +85,7 @@ TaskStatus InitElectrons(MeshBlockData<Real> *rc, ParameterInput *pin);
  * Function in this package: Get the specific entropy primitive value, by dividing the total entropy K/(rho*u^0)
  */
 void UtoP(MeshBlockData<Real> *rc, IndexDomain domain=IndexDomain::entire, bool coarse=false);
-inline void FillDerived(MeshBlockData<Real> *rc) { UtoP(rc); }
-
-/**
- * Anything which should be applied after every package has performed "UtoP"
- * Generally floors, fixes, or very basic source terms for primitive variables.
- * 
- * Currently a no-op in this package, as floors *before* ApplyElectronHeating are applied in GRMHD::PostUtoP,
- * and floors *after* electron heating are applied immediately in ApplyElectronHeating
- */
-void PostUtoP(MeshBlockData<Real> *rc, IndexDomain domain=IndexDomain::entire, bool coarse=false);
-inline void PostFillDerived(MeshBlockData<Real> *rc) { PostUtoP(rc); }
+inline void FillDerivedBlock(MeshBlockData<Real> *rc) { UtoP(rc); }
 
 /**
  * This heating step is custom for this package:
@@ -132,41 +122,16 @@ TaskStatus PostStepDiagnostics(const SimTime& tm, MeshData<Real> *rc);
 void FillOutput(MeshBlock *pmb, ParameterInput *pin);
 
 /**
- * KHARMA requires two forms of the function for obtaining conserved variables from primitives.
- * However, these are very different from UtoP/FillDerived in that they are called exclusively on the
- * device side, operating on a single zone rather than the whole fluid state.
+ * KHARMA requires some method for getting conserved variables from primitives, as well.
  * 
- * Each should have roughly the signature used here, accepting scratchpads of size NVARxN1, and index
- * maps (see types.hpp) indicating which index corresponds to which variable in the packed array, as well
- * as indications of the desired zone location and flux direction (dir==0 for just the conserved variable forms).
- * As used extensively here, any variables not present in a pack will have index -1 in the map.
- *  
- * The two functions differ in two ways:
- * 1. The caller precalculate the four-vectors (u^mu, b^mu) and pass them in the struct D to prim_to_flux (see flux.hpp for call)
- * 2. p_to_u will only ever be called to obtain the conserved variables U, not fluxes (i.e. dir == 0 in calls)
+ * However, unlike UtoP, p_to_u is implemented device-side. That means that any
+ * package defining new primitive/conserved vars must add them to Flux::prim_to_flux
+ * in addition to providing a UtoP function.
  * 
- * Function in this package: Divide or multiply by local density to get entropy/particle -- opposite of UtoP above
+ * Some packages may wish to have their own local p_to_u functions as well, to avoid
+ * calling Flux::PtoU where not all conserved variables need to be calculated. This is
+ * an example.
  */
-KOKKOS_INLINE_FUNCTION void prim_to_flux(const GRCoordinates& G, const ScratchPad2D<Real>& P, const VarMap& m_p, const FourVectors D,
-                                         const int& k, const int& j, const int& i, const int dir,
-                                         ScratchPad2D<Real>& flux, const VarMap m_u, const Loci loc=Loci::center)
-{
-    // Take the factor from the primitives, in case we need to reorder this to happen before GRMHD::prim_to_flux later
-    const Real rho_ut = P(m_p.RHO, i) * D.ucon[dir] * G.gdet(loc, j, i);
-    flux(m_u.KTOT, i) = rho_ut * P(m_p.KTOT, i);
-    if (m_p.K_CONSTANT >= 0)
-        flux(m_u.K_CONSTANT, i) = rho_ut * P(m_p.K_CONSTANT, i);
-    if (m_p.K_HOWES >= 0)
-        flux(m_u.K_HOWES, i) = rho_ut * P(m_p.K_HOWES, i);
-    if (m_p.K_KAWAZURA >= 0)
-        flux(m_u.K_KAWAZURA, i) = rho_ut * P(m_p.K_KAWAZURA, i);
-    if (m_p.K_WERNER >= 0)
-        flux(m_u.K_WERNER, i) = rho_ut * P(m_p.K_WERNER, i);
-    if (m_p.K_ROWAN >= 0)
-        flux(m_u.K_ROWAN, i) = rho_ut * P(m_p.K_ROWAN, i);
-    if (m_p.K_SHARMA >= 0)
-        flux(m_u.K_SHARMA, i) = rho_ut * P(m_p.K_SHARMA, i);
-}
 KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const VariablePack<Real>& P, const VarMap& m_p,
                                          const int& k, const int& j, const int& i,
                                          const VariablePack<Real>& flux, const VarMap m_u, const Loci loc=Loci::center)
diff --git a/kharma/emhd/emhd.cpp b/kharma/emhd/emhd.cpp
index da9e6baa..33e08be1 100644
--- a/kharma/emhd/emhd.cpp
+++ b/kharma/emhd/emhd.cpp
@@ -101,11 +101,12 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     params.Add("NonidealFlag", isNonideal);
 
     // General options for primitive and conserved scalar variables in ImEx driver
-    // EMHD is supported only with imex driver and 
-    Metadata m_con  = Metadata({Metadata::Real, Metadata::Cell, Metadata::Independent,
+    // EMHD is supported only with imex driver and implicit evolution
+    MetadataFlag isImplicit = packages.Get("Implicit")->Param<MetadataFlag>("ImplicitFlag");
+    Metadata m_con  = Metadata({Metadata::Real, Metadata::Cell, Metadata::Independent, isImplicit,
                                 Metadata::Conserved, Metadata::WithFluxes, isNonideal});
-    Metadata m_prim = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::FillGhost,
-                                Metadata::Restart, isPrimitive, isNonideal});
+    Metadata m_prim = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, isImplicit,
+                                Metadata::FillGhost, Metadata::Restart, isPrimitive, isNonideal});
 
     // Heat conduction
     pkg->AddField("cons.q", m_con);
@@ -197,7 +198,7 @@ TaskStatus AddSource(MeshData<Real> *md, MeshData<Real> *mdudt)
 
             // Compute div of ucon (all terms but the time-derivative ones are nonzero)
             Real div_ucon = 0;
-            DLOOP2 div_ucon += G.gcon(Loci::center, mu, nu, j, i) * grad_ucov[mu][nu];
+            DLOOP2 div_ucon += G.gcon(Loci::center, j, i, mu, nu) * grad_ucov[mu][nu];
 
             // Compute+add explicit source terms (conduction and viscosity)
             const Real& rho = P(b)(m_p.RHO, k, j, i);
diff --git a/kharma/emhd/emhd.hpp b/kharma/emhd/emhd.hpp
index 59ea605f..0711d979 100644
--- a/kharma/emhd/emhd.hpp
+++ b/kharma/emhd/emhd.hpp
@@ -97,7 +97,7 @@ KOKKOS_INLINE_FUNCTION void set_parameters(const GRCoordinates& G, const Local&
         nu_e  = emhd_params.viscosity_alpha * cs2 * tau;
     } else if (emhd_params.type == ClosureType::torus) {
         // Something complicated
-    } // else yell
+    } // else yell?
 }
 template<typename Global>
 KOKKOS_INLINE_FUNCTION void set_parameters(const GRCoordinates& G, const Global& P, const VarMap& m_p,
@@ -120,7 +120,26 @@ KOKKOS_INLINE_FUNCTION void set_parameters(const GRCoordinates& G, const Global&
         nu_e  = emhd_params.viscosity_alpha * cs2 * tau;
     } else if (emhd_params.type == ClosureType::torus) {
         // Something complicated
-    } // else yell
+    } // else yell?
+}
+// Local version for use in initialization, as q/dP need to be converted to prim tilde forms
+KOKKOS_INLINE_FUNCTION void set_parameters(const GRCoordinates& G, const Real& rho, const Real& u,
+                                           const EMHD_parameters& emhd_params, const Real& gam,
+                                           const int& k, const int& j, const int& i,
+                                           Real& tau, Real& chi_e, Real& nu_e)
+{
+    if (emhd_params.type == ClosureType::constant) {
+        // Set tau, nu, chi to constants
+        tau   = emhd_params.tau;
+        chi_e = emhd_params.conduction_alpha;
+        nu_e  = emhd_params.viscosity_alpha;
+    } else if (emhd_params.type == ClosureType::soundspeed) {
+        // Set tau=const, chi/nu prop. to sound speed squared
+        const Real cs2 = (gam * (gam - 1.) * u) / (rho + (gam * u));
+        tau   = emhd_params.tau;
+        chi_e = emhd_params.conduction_alpha * cs2 * tau;
+        nu_e  = emhd_params.viscosity_alpha * cs2 * tau;
+    } // else yell?
 }
 
 /**
@@ -145,8 +164,8 @@ KOKKOS_INLINE_FUNCTION void calc_tensor(const Real& rho, const Real& u, const Re
                   - D.bcon[dir] * D.bcov[mu]
                   + (q / sqrt(bsq)) * ((D.ucon[dir] * D.bcov[mu]) +
                                        (D.bcon[dir] * D.ucov[mu]))
-                  + (-dP) * ((D.bcon[dir] * D.bcov[mu] / bsq)
-                                  - (1./3) * ((dir == mu) + D.ucon[dir] * D.ucov[mu]));
+                  - dP * ((D.bcon[dir] * D.bcov[mu] / bsq)
+                          - (1./3) * ((dir == mu) + D.ucon[dir] * D.ucov[mu]));
     }
 }
 
diff --git a/kharma/emhd/emhd_sources.hpp b/kharma/emhd/emhd_sources.hpp
index 19673bd0..d3187003 100644
--- a/kharma/emhd/emhd_sources.hpp
+++ b/kharma/emhd/emhd_sources.hpp
@@ -92,7 +92,7 @@ KOKKOS_INLINE_FUNCTION void time_derivative_sources(const GRCoordinates& G, cons
 
     // Compute div of ucon (only the temporal part is nonzero)
     Real div_ucon = 0;
-    DLOOP1 div_ucon += G.gcon(Loci::center, 0, mu, j, i) * dt_ucov[mu];
+    DLOOP1 div_ucon += G.gcon(Loci::center, j, i, 0, mu) * dt_ucov[mu];
     // dTheta/dt
     const Real Theta_new = max((gam-1) * P_new(m_p.UU) / P_new(m_p.RHO), SMALL);
     const Real Theta_old = max((gam-1) * P_old(m_p.UU) / P_old(m_p.RHO), SMALL);
diff --git a/kharma/emhd/emhd_utils.hpp b/kharma/emhd/emhd_utils.hpp
index 20c521db..b368a154 100644
--- a/kharma/emhd/emhd_utils.hpp
+++ b/kharma/emhd/emhd_utils.hpp
@@ -144,7 +144,7 @@ KOKKOS_INLINE_FUNCTION void gradient_calc(const GRCoordinates& G, const Global&
             grad_ucov[3][mu] = 0.;
         }
     }
-    DLOOP3 grad_ucov[mu][nu] -= G.conn(lam, mu, nu, j, i) * ucov_s(lam, k, j, i);
+    DLOOP3 grad_ucov[mu][nu] -= G.conn(j, i, lam, mu, nu) * ucov_s(lam, k, j, i);
 
     // Compute temperature gradient
     // Time derivative component is computed in time_derivative_sources
diff --git a/kharma/floors/floors.hpp b/kharma/floors/floors.hpp
index 308eaeac..821f7ff2 100644
--- a/kharma/floors/floors.hpp
+++ b/kharma/floors/floors.hpp
@@ -37,6 +37,7 @@
 
 
 #include "b_flux_ct.hpp"
+#include "flux_functions.hpp"
 #include "grmhd_functions.hpp"
 #include "U_to_P.hpp"
 
@@ -177,6 +178,7 @@ KOKKOS_INLINE_FUNCTION int apply_ceilings(const GRCoordinates& G, const Variable
 
     if (fflag) {
         // Keep lockstep!
+        // TODO all flux
         GRMHD::p_to_u(G, P, m_p, gam, k, j, i, U, m_u, loc);
     }
 
@@ -270,6 +272,7 @@ KOKKOS_INLINE_FUNCTION int apply_floors(const GRCoordinates& G, const VariablePa
         if (use_ff) {
             P(m_p.RHO, k, j, i) += max(0., rhoflr_max - rho);
             P(m_p.UU, k, j, i) += max(0., uflr_max - u);
+            // TODO should be all Flux
             GRMHD::p_to_u(G, P, m_p, gam, k, j, i, U, m_u, loc);
         } else {
             // Add the material in the normal observer frame, by:
@@ -298,6 +301,7 @@ KOKKOS_INLINE_FUNCTION int apply_floors(const GRCoordinates& G, const VariablePa
             // If that fails, we've effectively already applied the floors in fluid-frame to the prims,
             // so we just formalize that
             if (pflag) {
+                // TODO should be all Flux
                 GRMHD::p_to_u(G, P, m_p, gam, k, j, i, U, m_u, loc);
             }
         }
diff --git a/kharma/flux_functions.hpp b/kharma/flux_functions.hpp
index d5b44a02..c8591fcd 100644
--- a/kharma/flux_functions.hpp
+++ b/kharma/flux_functions.hpp
@@ -66,17 +66,14 @@ KOKKOS_INLINE_FUNCTION void prim_to_flux(const GRCoordinates& G, const Local& P,
     Real T[GR_DIM];
     if (m_p.Q >= 0) {
         // EGRMHD stress-energy tensor w/ first index up, second index down
-        
-        const Real& rho     = P(m_p.RHO);
-        const Real q_tilde  = P(m_p.Q);
-        const Real dP_tilde = P(m_p.DP);
-        const Real& Theta   = (gam - 1) * P(m_p.UU) / P(m_p.RHO);
-
+        // Get problem closure parameters
         Real tau, chi_e, nu_e;
         EMHD::set_parameters(G, P, m_p, emhd_params, gam, tau, chi_e, nu_e);
 
-        Real q = 0., dP = 0.;
-        EMHD::convert_prims_to_q_dP(q_tilde, dP_tilde, rho, Theta, tau, chi_e, nu_e, emhd_params, q, dP);
+        // Apply higher-order terms conversion if necessary
+        Real q, dP;
+        const Real Theta   = (gam - 1) * P(m_p.UU) / P(m_p.RHO);
+        EMHD::convert_prims_to_q_dP(P(m_p.Q), P(m_p.DP), P(m_p.RHO), Theta, tau, chi_e, nu_e, emhd_params, q, dP);
 
         // Then calculate the tensor
         EMHD::calc_tensor(P(m_p.RHO), P(m_p.UU), (gam - 1) * P(m_p.UU), q, dP, D, dir, T);
@@ -87,6 +84,7 @@ KOKKOS_INLINE_FUNCTION void prim_to_flux(const GRCoordinates& G, const Local& P,
         // GRHD stress-energy tensor w/ first index up, second index down
         GRHD::calc_tensor(P(m_p.RHO), P(m_p.UU), (gam - 1) * P(m_p.UU), D, dir, T);
     }
+    //if (i == 11 && j == 11) printf("mhd: %g %g %g %g\n", T[0], T[1], T[2], T[3]);
     flux(m_u.UU) = T[0] * gdet + flux(m_u.RHO);
     flux(m_u.U1) = T[1] * gdet;
     flux(m_u.U2) = T[2] * gdet;
@@ -141,7 +139,7 @@ KOKKOS_INLINE_FUNCTION void prim_to_flux(const GRCoordinates& G, const Local& P,
 }
 
 /**
- * Get the conserved GRHD variables corresponding to primitives in a zone. Equivalent to prim_to_flux with dir==0
+ * Get the conserved (E)GRMHD variables corresponding to primitives in a zone. Equivalent to prim_to_flux with dir==0
  */
 template<typename Local>
 KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const Local& P, const VarMap& m_p,
diff --git a/kharma/grmhd/fixup.cpp b/kharma/grmhd/fixup.cpp
index 38f6e1ee..4b194434 100644
--- a/kharma/grmhd/fixup.cpp
+++ b/kharma/grmhd/fixup.cpp
@@ -35,6 +35,7 @@
 #include "fixup.hpp"
 
 #include "floors.hpp"
+#include "flux_functions.hpp"
 #include "pack.hpp"
 
 // Version of PLOOP guaranteeing specifically the 5 GRMHD fixup-amenable primitive vars
@@ -131,6 +132,7 @@ TaskStatus GRMHD::FixUtoP(MeshBlockData<Real> *rc)
         KOKKOS_LAMBDA_3D {
             if (((int) pflag(k, j, i)) > InversionStatus::success) {
                 // Make sure to keep lockstep
+                // This will only be run for GRMHD, so we can call its p_to_u
                 GRMHD::p_to_u(G, P, m_p, gam, k, j, i, U, m_u);
 
                 // And make sure the fixed values still abide by floors (floors keep lockstep)
diff --git a/kharma/grmhd/grmhd.cpp b/kharma/grmhd/grmhd.cpp
index 5ef248ef..d77bff48 100644
--- a/kharma/grmhd/grmhd.cpp
+++ b/kharma/grmhd/grmhd.cpp
@@ -66,18 +66,20 @@ using namespace Kokkos;
 namespace GRMHD
 {
 
-std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
+std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t packages)
 {
     // This function builds and returns a "StateDescriptor" or "Package" object.
     // The most important part of this object is a member of type "Params",
     // which acts more or less like a Python dictionary:
     // it puts values into a map of names->objects, where "objects" are usually
     // floats, strings, and ints, but can be arbitrary classes.
-    // This "dictionary" is *not* immutable, but should be treated as such
-    // in every package except "Globals".
+    // This "dictionary" is *not* totally immutable, but should be treated
+    // as such in every package except "Globals".
     auto pkg = std::make_shared<StateDescriptor>("GRMHD");
     Params &params = pkg->AllParams();
 
+    // =================================== PARAMETERS ===================================
+
     // Add the problem name, so we can be C++ noobs and special-case on string contents
     std::string problem_name = pin->GetString("parthenon/job", "problem_id");
     params.Add("problem", problem_name);
@@ -113,6 +115,15 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
     double max_dt_increase = pin->GetOrAddReal("parthenon/time", "max_dt_increase", 2.0);
     params.Add("max_dt_increase", max_dt_increase);
 
+    // Alternatively, you can start with (or just always use) the light (phase) speed crossing time
+    // of the smallest zone.  Useful when you're not sure of/modeling the characteristic velocities
+    bool start_dt_light = pin->GetOrAddBoolean("parthenon/time", "start_dt_light", false);
+    params.Add("start_dt_light", start_dt_light);
+    bool use_dt_light = pin->GetOrAddBoolean("parthenon/time", "use_dt_light", false);
+    params.Add("use_dt_light", use_dt_light);
+    bool use_dt_light_phase_speed = pin->GetOrAddBoolean("parthenon/time", "use_dt_light_phase_speed", false);
+    params.Add("use_dt_light_phase_speed", use_dt_light_phase_speed);
+
     // Reconstruction scheme: plm, weno5, ppm...
     std::string recon = pin->GetOrAddString("GRMHD", "reconstruction", "weno5");
     if (recon == "donor_cell") {
@@ -139,19 +150,27 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
     int extra_checks = pin->GetOrAddInteger("debug", "extra_checks", 0);
     params.Add("extra_checks", extra_checks);
 
-    // Option to disable checking the fluxes at boundaries
+    // Option to disable checking the fluxes at boundaries:
+    // Prevent inflow through the inner and outer boundaries
     bool check_inflow_inner = pin->GetOrAddBoolean("bounds", "check_inflow_inner", true);
     params.Add("check_inflow_inner", check_inflow_inner);
     bool check_inflow_outer = pin->GetOrAddBoolean("bounds", "check_inflow_outer", true);
     params.Add("check_inflow_outer", check_inflow_outer);
+    // Ensure fluxes through the zero-size face at the pole are zero
     bool fix_flux_pole = pin->GetOrAddBoolean("bounds", "fix_flux_pole", true);
     params.Add("fix_flux_pole", fix_flux_pole);
 
     // Driver options
+    // The two current drivers are "harm" and "imex", with the former being the usual KHARMA
+    // driver, and the latter supporting implicit stepping of some or all variables
     auto driver_type = pin->GetString("driver", "type"); // This is set in kharma.cpp
     params.Add("driver_type", driver_type);
-    auto driver_step = pin->GetOrAddString("driver", "step", "explicit");
-    params.Add("driver_step", driver_step);
+    // The ImEx driver is necessary to evolve implicitly, but doesn't require it.  Using explicit
+    // updates for GRMHD vars is useful for testing, or if adding just a couple of implicit variables
+    // Doing EGRMHD requires implicit evolution of GRMHD variables, of course
+    auto implicit_grmhd = (driver_type == "imex") &&
+                          (pin->GetBoolean("emhd", "on") || pin->GetOrAddBoolean("GRMHD", "implicit", false));
+    params.Add("implicit", implicit_grmhd);
 
     // Performance options
     // Packed communications kernels, exchanging all boundary buffers of an MPI process
@@ -166,6 +185,13 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
     Real derefine_tol = pin->GetOrAddReal("GRMHD", "derefine_tol", 0.05);
     params.Add("derefine_tol", derefine_tol);
 
+    // =================================== FIELDS ===================================
+
+    // In addition to "params", the StateDescriptor/Package object carries "Fields"
+    // These represent any variables we want to keep track of across the grid, and
+    // generally inherit the size of the MeshBlock (for "Cell" fields) or some
+    // closely-related size (for "Face" and "Edge" fields)
+
     // Add flags to distinguish groups of fields.
     // This is stretching what the "Params" object should really be carrying,
     // but the flag values are necessary in many places, and this was the
@@ -182,16 +208,8 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
     MetadataFlag isMHD = Metadata::AllocateNewFlag("MHD");
     params.Add("MHDFlag", isMHD);
 
-    // In addition to "params", the StateDescriptor/Package object carries "Fields"
-    // These represent any variables we want to keep track of across the grid, and
-    // generally inherit the size of the MeshBlock (for "Cell" fields) or some
-    // closely-related size (for "Face" and "Edge" fields)
-
-    std::vector<int> s_vector({NVEC});
     std::vector<MetadataFlag> flags_prim, flags_cons;
-    auto imex_driver = pin->GetString("driver", "type") == "imex";
-    auto explicit_step = (pin->GetOrAddString("driver", "step", "explicit") == "explicit");
-    if (!imex_driver) { // Normal operation
+    if (driver_type == "harm") { // Normal operation
         // As mentioned elsewhere, KHARMA treats the conserved variables as the independent ones,
         // and the primitives as "Derived"
         // Primitives are still used for reconstruction, physical boundaries, and output, and are
@@ -203,15 +221,18 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
         flags_cons = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Independent,
                                                 Metadata::WithFluxes, Metadata::FillGhost, Metadata::Restart,
                                                 Metadata::Conserved, isHD, isMHD});
-    } else {
-        // For ImexDriver, however, the primitive variables are independent, and boundary syncs are performed
-        // with them.  This is to accommodate the implicit step, which takes and returns primitive values and
-        // thus is much easier to handle by just using primitives everywhere.
-        flags_prim = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Derived,
+    } else if (driver_type == "imex") { // ImEx driver
+        // When evolving (E)GRMHD implicitly, we instead mark the primitive variables to be synchronized.
+        // This won't work for AMR, but it fits much better with the implicit solver, which expects
+        // primitive variable inputs and produces primitive variable results.
+
+        // Mark whether to evolve our variables via the explicit or implicit step inside the driver
+        MetadataFlag areWeImplicit = (implicit_grmhd) ? packages.Get("Implicit")->Param<MetadataFlag>("ImplicitFlag")
+                                                      : packages.Get("Implicit")->Param<MetadataFlag>("ExplicitFlag");
+
+        flags_prim = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Derived, areWeImplicit,
                                                 Metadata::FillGhost, Metadata::Restart, isPrimitive, isHD, isMHD});
-        // Conserved variables are actualy rho*u^0 & T^0_mu, but are named after the prims for consistency
-        // We will rarely need the conserved variables by name, we will mostly be treating them as a group
-        flags_cons = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Independent,
+        flags_cons = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Independent, areWeImplicit,
                                                 Metadata::WithFluxes, Metadata::Conserved, isHD, isMHD});
     }
 
@@ -219,8 +240,10 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
     auto m = Metadata(flags_prim);
     pkg->AddField("prims.rho", m);
     pkg->AddField("prims.u", m);
+    // We add the "Vector" flag and a size to vectors
     auto flags_prim_vec(flags_prim);
     flags_prim_vec.push_back(Metadata::Vector);
+    std::vector<int> s_vector({NVEC});
     m = Metadata(flags_prim_vec, s_vector);
     pkg->AddField("prims.uvec", m);
 
@@ -240,22 +263,26 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
     m = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::OneCopy}, s_vector);
     pkg->AddField("ctop", m);
 
-    if (explicit_step) {
-        // Flag denoting UtoP inversion failures.
-        // Not used for implicit stepper, that has its own flag
-        if (imex_driver) {
-            m = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::OneCopy, Metadata::FillGhost});
-        } else {
-            m = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::OneCopy});
-        }
-        pkg->AddField("pflag", m);
+    // Flag denoting UtoP inversion failures
+    // Only needed if we're actually calling UtoP, but always allocated as it's retrieved often
+    // Needs boundary sync if treating primitive variables as fundamental
+    if (driver_type == "imex" && !implicit_grmhd) {
+        m = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::OneCopy, Metadata::FillGhost});
+    } else {
+        m = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::OneCopy});
+    }
+    pkg->AddField("pflag", m);
+
+    if (!implicit_grmhd) {
+        // If we're using a step that requires calling UtoP, register it
+        // Calling this messes up implicit stepping, so we only register it here
+        pkg->FillDerivedBlock = GRMHD::FillDerivedBlock;
     }
 
     // Finally, the StateDescriptor/Package object determines the Callbacks Parthenon makes to
     // a particular package -- that is, some portion of the things that the package needs done
     // at each step, which must be done at specific times.
-    // See the documentation on each of these functions for their purpose and call context.
-    pkg->FillDerivedBlock = GRMHD::FillDerivedBlock;
+    // See the header files defining each of these functions for their purpose and call context.
     pkg->CheckRefinementBlock = GRMHD::CheckRefinement;
     pkg->EstimateTimestepBlock = GRMHD::EstimateTimestep;
     pkg->PostStepDiagnosticsMesh = GRMHD::PostStepDiagnostics;
@@ -328,11 +355,34 @@ Real EstimateTimestep(MeshBlockData<Real> *rc)
     // TODO: move timestep limiter into an override of SetGlobalTimestep
     // TODO: move diagnostic printing to PostStepDiagnostics, now it's broken here
 
-    if (!pmb->packages.Get("Globals")->Param<bool>("in_loop")) {
-        double dt = pmb->packages.Get("GRMHD")->Param<double>("dt_start");
-        // Record this, since we'll use it to determine the max step next
-        pmb->packages.Get("Globals")->UpdateParam<double>("dt_last", dt);
-        return dt;
+    auto& globals = pmb->packages.Get("Globals")->AllParams();
+    const auto& grmhd_pars = pmb->packages.Get("GRMHD")->AllParams();
+
+    if (!globals.Get<bool>("in_loop")) {
+        if (grmhd_pars.Get<bool>("start_dt_light") ||
+            grmhd_pars.Get<bool>("use_dt_light")) {
+            // Estimate based on light crossing time
+            double dt = EstimateRadiativeTimestep(rc);
+            // This records a per-rank minimum,
+            // but Parthenon calls MPIMin per-step anyway
+            if (globals.hasKey("dt_light")) {
+                if (dt < globals.Get<double>("dt_light"))
+                    globals.Update<double>("dt_light", dt);
+            } else {
+                globals.Add<double>("dt_light", dt);
+            }
+            return dt;
+        } else {
+            // Or just take from parameters
+            double dt = grmhd_pars.Get<double>("dt_start");
+            // Record this, since we'll use it to determine the max step increase
+            globals.Update<double>("dt_last", dt);
+            return dt;
+        }
+    }
+    // If we're still using the light crossing time, skip the rest
+    if (grmhd_pars.Get<bool>("use_dt_light")) {
+        return globals.Get<double>("dt_light");
     }
 
     typename Kokkos::MinMax<Real>::value_type minmax;
@@ -356,21 +406,87 @@ Real EstimateTimestep(MeshBlockData<Real> *rc)
     const double nctop = minmax.max_val;
 
     // Apply limits
-    const double cfl = pmb->packages.Get("GRMHD")->Param<double>("cfl");
-    const double dt_min = pmb->packages.Get("GRMHD")->Param<double>("dt_min");
-    const double dt_last = pmb->packages.Get("Globals")->Param<double>("dt_last");
-    const double dt_max = pmb->packages.Get("GRMHD")->Param<double>("max_dt_increase") * dt_last;
+    const double cfl = grmhd_pars.Get<double>("cfl");
+    const double dt_min = grmhd_pars.Get<double>("dt_min");
+    const double dt_last = globals.Get<double>("dt_last");
+    const double dt_max = grmhd_pars.Get<double>("max_dt_increase") * dt_last;
     const double ndt = clip(min_ndt * cfl, dt_min, dt_max);
 
     // Record max ctop, for constraint damping
-    if (nctop > pmb->packages.Get("Globals")->Param<Real>("ctop_max")) {
-        pmb->packages.Get("Globals")->UpdateParam<Real>("ctop_max", nctop);
+    if (nctop > globals.Get<Real>("ctop_max")) {
+        globals.Update<Real>("ctop_max", nctop);
     }
 
     Flag(rc, "Estimated");
     return ndt;
 }
 
+Real EstimateRadiativeTimestep(MeshBlockData<Real> *rc)
+{
+    Flag(rc, "Estimating shortest light crossing time");
+    auto pmb = rc->GetBlockPointer();
+    IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior);
+    IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::interior);
+    IndexRange kb = pmb->cellbounds.GetBoundsK(IndexDomain::interior);
+    const auto& G = pmb->coords;
+    // Goal: cfl * (smallest light crossing time of any interior zone), optionally weighted by coordinate light speeds
+    const auto& grmhd_pars = pmb->packages.Get("GRMHD")->AllParams();
+    const bool phase_speed = grmhd_pars.Get<bool>("use_dt_light_phase_speed");
+    // Zone widths per spatial direction, read at index 0; slot 0 (time) is never read by the loops below
+    const Real dx[GR_DIM] = {0., G.dx1v(0), G.dx2v(0), G.dx3v(0)};
+
+    // Reduce min (local dt) and max (phase speed) together; only min is used below, max is kept in case it is useful
+    typename Kokkos::MinMax<Real>::value_type minmax;
+    pmb->par_reduce("ndt_min", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        KOKKOS_LAMBDA(const int k, const int j, const int i,
+                      typename Kokkos::MinMax<Real>::value_type &lminmax) {
+            // SMALL is the fallback speed when the discriminant below is negative or phase speeds are not requested
+            double light_phase_speed = SMALL;
+            double dt_light_local = 0.;
+
+            if (phase_speed) {
+                for (int mu = 1; mu < GR_DIM; mu++) {
+                    if(pow(G.gcon(Loci::center, j, i, 0, mu), 2) -
+                        G.gcon(Loci::center, j, i, mu, mu)*G.gcon(Loci::center, j, i, 0, 0) >= 0.) {
+                        // Roots of g^00 v^2 + 2 g^0mu v + g^mumu = 0 (presumably coordinate light speeds along mu)
+                        double cplus = fabs((-G.gcon(Loci::center, j, i, 0, mu) +
+                                            sqrt(pow(G.gcon(Loci::center, j, i, 0, mu), 2) -
+                                                G.gcon(Loci::center, j, i, mu, mu)*G.gcon(Loci::center, j, i, 0, 0)))/
+                                            G.gcon(Loci::center, j, i, 0, 0));
+
+                        double cminus = fabs((-G.gcon(Loci::center, j, i, 0, mu) -
+                                            sqrt(pow(G.gcon(Loci::center, j, i, 0, mu), 2) -
+                                                G.gcon(Loci::center, j, i, mu, mu)*G.gcon(Loci::center, j, i, 0, 0)))/
+                                            G.gcon(Loci::center, j, i, 0, 0));
+
+                        light_phase_speed = max(cplus,cminus);
+                    } else {
+                        light_phase_speed = SMALL;
+                    }
+                    // Accumulate this direction's inverse crossing time, i.e. speed / dx
+                    dt_light_local += 1./(dx[mu]/light_phase_speed);
+                }
+            } else {
+                for (int mu = 1; mu < GR_DIM; mu++)
+                    dt_light_local += 1./dx[mu];
+            }
+            dt_light_local = 1/dt_light_local;
+            // NaNs are skipped in the reduction. Note light_phase_speed holds only the last direction's value here
+            if (!isnan(dt_light_local) && (dt_light_local < lminmax.min_val))
+                lminmax.min_val = dt_light_local;
+            if (!isnan(light_phase_speed) && (light_phase_speed > lminmax.max_val))
+                lminmax.max_val = light_phase_speed;
+        }
+    , Kokkos::MinMax<Real>(minmax));
+
+    // Just spit out dt: the reduced minimum scaled by cfl; no dt_min/dt_max clipping is applied in this function
+    const double cfl = grmhd_pars.Get<double>("cfl");
+    const double ndt = minmax.min_val * cfl;
+
+    Flag(rc, "Estimated");
+    return ndt;
+}
+
 AmrTag CheckRefinement(MeshBlockData<Real> *rc)
 {
     auto pmb = rc->GetBlockPointer();
diff --git a/kharma/grmhd/grmhd.hpp b/kharma/grmhd/grmhd.hpp
index ba6c08e6..90ce8aae 100644
--- a/kharma/grmhd/grmhd.hpp
+++ b/kharma/grmhd/grmhd.hpp
@@ -58,7 +58,7 @@ using namespace parthenon;
  */
 namespace GRMHD {
 // For declaring meshes, as well as the full intermediates we need (right & left fluxes etc)
-std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin);
+std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t packages);
 
 /**
  * Get the primitive variables
@@ -72,6 +72,7 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin);
 // inline void FillDerivedMesh(MeshData<Real> *md) { UtoP(md); }
 void UtoP(MeshBlockData<Real> *rc, IndexDomain domain=IndexDomain::entire, bool coarse=false);
 inline void FillDerivedBlock(MeshBlockData<Real> *rc) { UtoP(rc); }
+inline TaskStatus FillDerivedBlockTask(MeshBlockData<Real> *rc) { UtoP(rc); return TaskStatus::complete; } // Task form of FillDerivedBlock: runs UtoP, always reports complete
 
 /**
  * Fix the primitive variables
@@ -91,6 +92,9 @@ void PostUtoP(MeshBlockData<Real> *rc);
  */
 Real EstimateTimestep(MeshBlockData<Real> *rc);
 
+// Internal version for the light phase speed crossing time of smallest zone
+Real EstimateRadiativeTimestep(MeshBlockData<Real> *rc);
+
 /**
  * Return a tag per-block indicating whether to refine it
  * 
diff --git a/kharma/grmhd/grmhd_functions.hpp b/kharma/grmhd/grmhd_functions.hpp
index a32280a2..80d7c903 100644
--- a/kharma/grmhd/grmhd_functions.hpp
+++ b/kharma/grmhd/grmhd_functions.hpp
@@ -292,7 +292,7 @@ KOKKOS_INLINE_FUNCTION void calc_ucon(const GRCoordinates& G, const Global& P, c
 }
 
 /**
- * Global GRMHD-only "p_to_u" call: just MHD variables (uses B optionally, but no output). TODO elminate?
+ * Global GRMHD-only "p_to_u" call: for areas where nonideal terms are *always* 0!
  */
 template<typename Local>
 KOKKOS_INLINE_FUNCTION void p_to_u(const GRCoordinates& G, const Local& P, const VarMap& m_p,
diff --git a/kharma/imex_driver.cpp b/kharma/imex_driver.cpp
index 9aa558ef..013dba62 100644
--- a/kharma/imex_driver.cpp
+++ b/kharma/imex_driver.cpp
@@ -64,6 +64,9 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
 
     // This is *not* likely the task list you are looking for, and is not well commented yet.
     // See harm_driver.cpp for KHARMA's main driver.
+    // This driver *requires* the "Implicit" package to be loaded, in order to read the flags
+    // it defines for marking variables as explicitly or implicitly evolved
+
     TaskCollection tc;
     TaskID t_none(0);
 
@@ -91,6 +94,9 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
             // At the end of the step, updating "sc1" updates the base
             // So we have to keep a copy at the beginning to calculate jcon
             pmb->meshblock_data.Add("preserve", base);
+            // When solving, we need a temporary copy with any explicit updates,
+            // but not overwriting the beginning- or mid-step values
+            pmb->meshblock_data.Add("solver", base);
         }
     }
 
@@ -104,6 +110,7 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
         auto &mc0 = pmesh->mesh_data.GetOrAdd(stage_name[stage - 1], i);
         auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
         auto &mdudt = pmesh->mesh_data.GetOrAdd("dUdt", i);
+        auto &mc_solver = pmesh->mesh_data.GetOrAdd("solver", i);
 
         auto t_start_recv = tl.AddTask(t_none, &MeshData<Real>::StartReceiving, mc1.get(),
                                     BoundaryCommSubset::all);
@@ -111,7 +118,7 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
         // Calculate the HLL fluxes in each direction
         // This reconstructs the primitives (P) at faces and uses them to calculate fluxes
         // of the conserved variables (U)
-        const ReconstructionType& recon = blocks[0]->packages.Get("GRMHD")->Param<ReconstructionType>("recon");
+        const ReconstructionType& recon = pkgs.at("GRMHD")->Param<ReconstructionType>("recon");
         TaskID t_calculate_flux1, t_calculate_flux2, t_calculate_flux3;
         switch (recon) {
         case ReconstructionType::donor_cell:
@@ -145,6 +152,7 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
 
         auto t_recv_flux = t_calculate_flux;
         // TODO this appears to be implemented *only* block-wise, split it into its own region if so
+        // TODO should probably keep track of/wait on all tasks!! Might be a race condition!!
         if (pmesh->multilevel) {
             // Get flux corrections from AMR neighbors
             for (auto &pmb : pmesh->block_list) {
@@ -190,127 +198,161 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
             t_emhd_source = tl.AddTask(t_wind_source, EMHD::AddSource, mc0.get(), mdudt.get());
         }
         // Done with source terms
-        auto t_sources = t_wind_source;
+        auto t_sources = t_emhd_source;
+
+        // UPDATE VARIABLES
+        // This block is designed to intelligently update a set of variables partially marked "Implicit"
+        // and partially "Explicit," by first doing any explicit updates, then using them as elements
+        // of the "guess" for the implicit solve
+
+        // Indicators for Explicit/Implicit variables to evolve
+        MetadataFlag isExplicit = pkgs.at("Implicit")->Param<MetadataFlag>("ExplicitFlag");
+        MetadataFlag isImplicit = pkgs.at("Implicit")->Param<MetadataFlag>("ImplicitFlag");
+        MetadataFlag isPrimitive = pkgs.at("GRMHD")->Param<MetadataFlag>("PrimitiveFlag");
+        // Substep timestep
+        const double beta_this = integrator->beta[stage % integrator->nstages];
+        const double dt_this = dt * beta_this;
+
+        // Update any variables for which we should take an explicit step.
+        // These calls are the equivalent of what's in HARMDriver
+        // auto t_average = tl.AddTask(t_sources, Update::WeightedSumData<MetadataFlag, MeshData<Real>>,
+        //                             std::vector<MetadataFlag>({isExplicit, Metadata::Independent}),
+        //                             mc0.get(), mbase.get(), beta, (1.0 - beta), mc_solver.get());
+        // auto t_explicit_U = tl.AddTask(t_average, Update::WeightedSumData<MetadataFlag, MeshData<Real>>,
+        //                             std::vector<MetadataFlag>({isExplicit, Metadata::Independent}),
+        //                             mc_solver.get(), mdudt.get(), 1.0, beta * dt, mc_solver.get());
+        // Version with half/whole step to match implicit solver
+        auto t_explicit_U = tl.AddTask(t_sources, Update::WeightedSumData<MetadataFlag, MeshData<Real>>,
+                                    std::vector<MetadataFlag>({isExplicit, Metadata::Independent}),
+                                    mbase.get(), mdudt.get(), 1.0, dt_this, mc_solver.get());
+
+        // Make sure the primitive values of any explicit fields are filled
+        auto t_explicit_UtoP_B = t_explicit_U;
+        if (!pkgs.at("B_FluxCT")->Param<bool>("implicit"))
+            t_explicit_UtoP_B = tl.AddTask(t_explicit_U, B_FluxCT::FillDerivedMeshTask, mc_solver.get());
+        // If GRMHD is not implicit, but we're still going to be taking an implicit step, call its FillDerived function
+        // TODO Would be faster/more flexible if this supported MeshData. Also maybe race condition
+        auto t_explicit_UtoP_G = t_explicit_UtoP_B;
+        if (!pkgs.at("GRMHD")->Param<bool>("implicit") && use_b_cd) {
+            // Add a GRMHD UtoP task per block. NOTE(review): `auto` below shadows the outer t_explicit_UtoP_G
+            for (auto &pmb : pmesh->block_list) {
+                auto& rc = pmb->meshblock_data.Get();
+                auto t_explicit_UtoP_G = tl.AddTask(t_explicit_UtoP_B, GRMHD::FillDerivedBlockTask, rc.get());
+            }
+        }
+        auto t_explicit = t_explicit_UtoP_G;
+
+        // Copy the current implicit vars in as a guess.  This needs at least the primitive vars
+        auto t_copy_guess = tl.AddTask(t_sources, Update::WeightedSumData<MetadataFlag, MeshData<Real>>,
+                                    std::vector<MetadataFlag>({isImplicit}),
+                                    mc0.get(), mc0.get(), 1.0, 0.0, mc_solver.get());
+
+        // Time-step implicit variables by root-finding the residual
+        // This applies the functions of both the update above and FillDerived call below for "isImplicit" variables
+        // This takes dt for the *substep*, not the whole thing, so we multiply total dt by *this step's* beta
+        auto t_guess_ready = t_explicit | t_copy_guess;
+        auto t_implicit = tl.AddTask(t_guess_ready, Implicit::Step, mbase.get(), mc0.get(), mdudt.get(), mc_solver.get(), dt_this);
+
+        // Copy the solver state into the final state mc1
+        auto t_copy_result = tl.AddTask(t_implicit, Update::WeightedSumData<MetadataFlag, MeshData<Real>>, std::vector<MetadataFlag>({}),
+                                        mc_solver.get(), mc_solver.get(), 1.0, 0.0, mc1.get());
+
     }
 
-    // This region is where GRIM and classic HARM split.
-    // Classic HARM applies the fluxes to calculate a new state of conserved variables,
-    // then solves for the primitive variables with UtoP (here "FillDerived")
-    const auto &driver_step =
-        blocks[0]->packages.Get("GRMHD")->Param<std::string>("driver_step");
-    if (driver_step == "explicit") { // Explicit step
-        // Update conserved state with dUdt
-        const int num_partitions = pmesh->DefaultNumPartitions();
-        TaskRegion &single_tasklist_per_pack_region = tc.AddRegion(num_partitions);
-        for (int i = 0; i < num_partitions; i++) {
-            auto &tl = single_tasklist_per_pack_region[i];
-            auto &mbase = pmesh->mesh_data.GetOrAdd("base", i);
-            auto &mc0 = pmesh->mesh_data.GetOrAdd(stage_name[stage - 1], i);
-            auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
-            auto &mdudt = pmesh->mesh_data.GetOrAdd("dUdt", i);
-
-            // UPDATE BASE CONTAINER
-            auto t_avg_data = tl.AddTask(t_none, Update::AverageIndependentData<MeshData<Real>>,
-                                    mc0.get(), mbase.get(), beta);
-            // apply du/dt to all independent fields in the container
-            auto t_update = tl.AddTask(t_avg_data, Update::UpdateIndependentData<MeshData<Real>>, mc0.get(),
-                                    mdudt.get(), beta * dt, mc1.get());
-        }
+    // Even though we filled some primitive vars above, FillDerived is still called on every block here
+    TaskRegion &async_region1 = tc.AddRegion(blocks.size());
+    for (int i = 0; i < blocks.size(); i++) {
+        auto &pmb = blocks[i];
+        auto &tl = async_region1[i];
+        auto &sc0 = pmb->meshblock_data.Get(stage_name[stage-1]);
+        auto &sc1 = pmb->meshblock_data.Get(stage_name[stage]);
 
-        // Then solve for new primitives in the fluid interior, with the primitives at step start as a guess,
-        // using UtoP.  Note that since no ghost zones are updated here, and thus FixUtoP cannot use
-        // ghost zones. Thus KHARMA behavior in this mode will dependent on the breakdown of meshblocks,
-        // & possibly erratic when there are many fixups.
-        // Full algo should boundary sync -> FixUtoP -> boundary sync
-        TaskRegion &async_region = tc.AddRegion(blocks.size());
-        for (int i = 0; i < blocks.size(); i++) {
-            auto &pmb = blocks[i];
-            auto &tl = async_region[i];
-            auto &sc0 = pmb->meshblock_data.Get(stage_name[stage-1]);
-            auto &sc1 = pmb->meshblock_data.Get(stage_name[stage]);
-
-            // COPY PRIMITIVES
-            // These form the guess for UtoP
-            auto t_copy_prims = tl.AddTask(t_none,
-                [](MeshBlockData<Real> *rc0, MeshBlockData<Real> *rc1)
-                {
-                    Flag(rc1, "Copying prims");
-                    rc1->Get("prims.rho").data.DeepCopy(rc0->Get("prims.rho").data);
-                    rc1->Get("prims.u").data.DeepCopy(rc0->Get("prims.u").data);
-                    rc1->Get("prims.uvec").data.DeepCopy(rc0->Get("prims.uvec").data);
-                    Flag(rc1, "Copied");
-                    return TaskStatus::complete;
-                }, sc0.get(), sc1.get()
-            );
-
-            auto t_fill_derived = tl.AddTask(t_copy_prims, Update::FillDerived<MeshBlockData<Real>>, sc1.get());
-            // This is *not* immediately corrected with FixUtoP, but synchronized (including pflags!) first.
-            // With an extra ghost zone, this *should* still allow binary-similar evolution between numbers of mesh blocks
+        // Copy primitives to form the guess for GRMHD::UtoP
+        // Only needed if GRMHD vars are updated explicitly. NOTE(review): `auto` inside the `if` shadows t_copy_prims
+        auto t_copy_prims = t_none;
+        if (!pkgs.at("GRMHD")->Param<bool>("implicit")) {
+            MetadataFlag isPrimitive = pkgs.at("GRMHD")->Param<MetadataFlag>("PrimitiveFlag");
+            MetadataFlag isHD = pkgs.at("GRMHD")->Param<MetadataFlag>("HDFlag");
+            auto t_copy_prims = tl.AddTask(t_none, Update::WeightedSumData<MetadataFlag, MeshBlockData<Real>>,
+                                        std::vector<MetadataFlag>({isHD, isPrimitive}),
+                                        sc0.get(), sc0.get(), 1.0, 0.0, sc1.get());
         }
-    } else { // Implicit step
-        const int num_partitions = pmesh->DefaultNumPartitions();
-        TaskRegion &single_tasklist_per_pack_region = tc.AddRegion(num_partitions);
-        for (int i = 0; i < num_partitions; i++) {
-            auto &tl = single_tasklist_per_pack_region[i];
-            auto &mbase = pmesh->mesh_data.GetOrAdd("base", i);
-            auto &mc0 = pmesh->mesh_data.GetOrAdd(stage_name[stage - 1], i);
-            auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
-            auto &mdudt = pmesh->mesh_data.GetOrAdd("dUdt", i);
 
-            // time-step by root-finding the residual
-            // This applies the functions of both t_update and t_fill_derived
-            // This takes dt for the *substep*, not the whole thing -- should be 0.5*dt
-            auto t_implicit_solve = tl.AddTask(t_none, Implicit::Step, mbase.get(), mc0.get(), mdudt.get(), mc1.get(), dt / beta);
-        }
+        // Note that floors are applied (to all variables!) immediately after this FillDerived call.
+        // However, it is *not* immediately corrected with FixUtoP, but synchronized (including pflags!) first.
+        // With an extra ghost zone, this *should* still allow binary-similar evolution between numbers of mesh blocks,
+        // but hasn't been tested.
+        auto t_fill_derived = tl.AddTask(t_copy_prims, Update::FillDerived<MeshBlockData<Real>>, sc1.get());
     }
 
     // MPI/MeshBlock boundary exchange.
     // Optionally "packed" to send all data in one call (num_partitions defaults to 1)
     // Note that in this driver, this block syncs *primitive* variables, not conserved
-    const auto &pack_comms =
-        blocks[0]->packages.Get("GRMHD")->Param<bool>("pack_comms");
+    const auto &pack_comms = pkgs.at("GRMHD")->Param<bool>("pack_comms");
     if (pack_comms) {
         TaskRegion &tr1 = tc.AddRegion(num_partitions);
         for (int i = 0; i < num_partitions; i++) {
             auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
+            tr1[i].AddTask(t_none,
+                [](MeshData<Real> *mc1){ Flag(mc1, "Boundary 1"); return TaskStatus::complete; }
+            , mc1.get());
             tr1[i].AddTask(t_none, cell_centered_bvars::SendBoundaryBuffers, mc1);
         }
         TaskRegion &tr2 = tc.AddRegion(num_partitions);
         for (int i = 0; i < num_partitions; i++) {
             auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
+            tr2[i].AddTask(t_none,
+                [](MeshData<Real> *mc1){ Flag(mc1, "Boundary 2"); return TaskStatus::complete; }
+            , mc1.get());
             tr2[i].AddTask(t_none, cell_centered_bvars::ReceiveBoundaryBuffers, mc1);
         }
         TaskRegion &tr3 = tc.AddRegion(num_partitions);
         for (int i = 0; i < num_partitions; i++) {
             auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
+            tr3[i].AddTask(t_none,
+                [](MeshData<Real> *mc1){ Flag(mc1, "Boundary 3"); return TaskStatus::complete; }
+            , mc1.get());
             tr3[i].AddTask(t_none, cell_centered_bvars::SetBoundaries, mc1);
         }
     } else {
         TaskRegion &tr1 = tc.AddRegion(blocks.size());
         for (int i = 0; i < blocks.size(); i++) {
             auto &sc1 = blocks[i]->meshblock_data.Get(stage_name[stage]);
+            tr1[i].AddTask(t_none,
+                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Boundary 1"); return TaskStatus::complete; }
+            , sc1.get());
             tr1[i].AddTask(t_none, &MeshBlockData<Real>::SendBoundaryBuffers, sc1.get());
         }
         TaskRegion &tr2 = tc.AddRegion(blocks.size());
         for (int i = 0; i < blocks.size(); i++) {
             auto &sc1 = blocks[i]->meshblock_data.Get(stage_name[stage]);
+            tr2[i].AddTask(t_none,
+                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Boundary 2"); return TaskStatus::complete; }
+            , sc1.get());
             tr2[i].AddTask(t_none, &MeshBlockData<Real>::ReceiveBoundaryBuffers, sc1.get());
         }
         TaskRegion &tr3 = tc.AddRegion(blocks.size());
         for (int i = 0; i < blocks.size(); i++) {
             auto &sc1 = blocks[i]->meshblock_data.Get(stage_name[stage]);
+            tr3[i].AddTask(t_none,
+                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Boundary 3"); return TaskStatus::complete; }
+            , sc1.get());
             tr3[i].AddTask(t_none, &MeshBlockData<Real>::SetBoundaries, sc1.get());
         }
     }
 
-    // Async Region: Any post-sync tasks.  Timestep & AMR things.
-    TaskRegion &async_region = tc.AddRegion(blocks.size());
+    // Async Region: Any post-sync tasks.  Fixups, timestep & AMR things.
+    TaskRegion &async_region2 = tc.AddRegion(blocks.size());
     for (int i = 0; i < blocks.size(); i++) {
         auto &pmb = blocks[i];
-        auto &tl = async_region[i];
+        auto &tl = async_region2[i];
         auto &sc0 = pmb->meshblock_data.Get(stage_name[stage-1]);
         auto &sc1 = pmb->meshblock_data.Get(stage_name[stage]);
 
+        auto t_flag = tl.AddTask(t_none,
+                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Copying prims"); return TaskStatus::complete; }
+            , sc1.get());
+
         auto t_clear_comm_flags = tl.AddTask(t_none, &MeshBlockData<Real>::ClearBoundary,
                                         sc1.get(), BoundaryCommSubset::all);
 
@@ -321,12 +363,12 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
 
         auto t_set_bc = tl.AddTask(t_prolongBound, parthenon::ApplyBoundaryConditions, sc1);
 
-        // Syncing bounds before fixUtoP, and thus running it over the whole domain, will make
-        // behavior for different mesh breakdowns much more similar (identical?), as bad zones on boundaries
-        // will get to use all the same neighbors.
-        // As long as we sync pflags by setting FillGhosts when using this driver!
+        // If we're evolving even the GRMHD variables explicitly, we need to fix UtoP variable inversion failures
+        // Syncing bounds before calling this, and then running it over the whole domain, will make
+        // behavior for different mesh breakdowns much more similar (identical?), since bad zones in
+        // relevant ghost zone ranks will get to use all the same neighbors as if they were in the bulk
         auto t_fix_derived = t_set_bc;
-        if (driver_step == "explicit") {
+        if (!pkgs.at("GRMHD")->Param<bool>("implicit")) {
             t_fix_derived = tl.AddTask(t_set_bc, GRMHD::FixUtoP, sc1.get());
         }
 
diff --git a/kharma/implicit/implicit.cpp b/kharma/implicit/implicit.cpp
index 8fcd03a3..7d361da5 100644
--- a/kharma/implicit/implicit.cpp
+++ b/kharma/implicit/implicit.cpp
@@ -49,8 +49,31 @@ using namespace KokkosBatched;
 namespace Implicit
 {
 
+std::vector<std::string> get_ordered_names(MeshBlockData<Real> *rc, const MetadataFlag& flag, bool only_implicit=false) {
+    auto pmb0 = rc->GetBlockPointer();
+    MetadataFlag isImplicit = pmb0->packages.Get("Implicit")->Param<MetadataFlag>("ImplicitFlag");
+    MetadataFlag isExplicit = pmb0->packages.Get("Implicit")->Param<MetadataFlag>("ExplicitFlag");
+    std::vector<std::string> out; // Result: names of variables carrying `flag`, implicit ones first, then explicit ones
+    auto vars = rc->GetVariablesByFlag(std::vector<MetadataFlag>({isImplicit, flag}), true).labels(); // implicit + `flag`
+    for (int i=0; i < vars.size(); ++i) {
+        if (rc->Contains(vars[i])) { // keep only names this container reports as present
+            out.push_back(vars[i]);
+        }
+    }
+    if (!only_implicit) { // explicit names are appended after the implicit ones unless the caller opts out
+        vars = rc->GetVariablesByFlag(std::vector<MetadataFlag>({isExplicit, flag}), true).labels(); // explicit + `flag`
+        for (int i=0; i < vars.size(); ++i) {
+            if (rc->Contains(vars[i])) {
+                out.push_back(vars[i]);
+            }
+        }
+    }
+    return out;
+}
+
 std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
 {
+    Flag("Initializing Implicit Package");
     auto pkg = std::make_shared<StateDescriptor>("Implicit");
     Params &params = pkg->AllParams();
 
@@ -64,26 +87,34 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
     int max_nonlinear_iter = pin->GetOrAddInteger("implicit", "max_nonlinear_iter", 3);
     params.Add("max_nonlinear_iter", max_nonlinear_iter);
 
-    // No field specific to implicit solving, but we keep around the residual since
-    // we need to write the whole thing out anyway
-    Metadata m = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::OneCopy});
-    pkg->AddField("pflag", m);
+    // Denote failures/non-converged zones with the same flag as UtoP
+    // This does NOT share the same mapping of values
+    // TODO currently unused
+    // Metadata m = Metadata({Metadata::Real, Metadata::Cell, Metadata::Derived, Metadata::OneCopy});
+    // pkg->AddField("pflag", m);
+
+    // When using this package we'll need to distinguish Implicitly and Explicitly-updated variables
+    MetadataFlag isImplicit = Metadata::AllocateNewFlag("Implicit");
+    params.Add("ImplicitFlag", isImplicit);
+    MetadataFlag isExplicit = Metadata::AllocateNewFlag("Explicit");
+    params.Add("ExplicitFlag", isExplicit);
 
     // Anything we need to run from this package on callbacks
     // None of this will be crucial for the step
     // pkg->PostFillDerivedBlock = Implicit::PostFillDerivedBlock;
     // pkg->PostStepDiagnosticsMesh = Implicit::PostStepDiagnostics;
 
+    Flag("Initialized");
     return pkg;
 }
 
-TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
-                MeshData<Real> *md1, const Real& dt)
+TaskStatus Step(MeshData<Real> *mci, MeshData<Real> *mc0, MeshData<Real> *dudt,
+                MeshData<Real> *mc_solver, const Real& dt)
 {
-    Flag(mdi, "Implicit Iteration start, i");
-    Flag(md0, "Implicit Iteration start, 0");
+    Flag(mci, "Implicit Iteration start, i");
+    Flag(mc0, "Implicit Iteration start, 0");
     Flag(dudt, "Implicit Iteration start, dudt");
-    auto pmb0 = mdi->GetBlockData(0)->GetBlockPointer();
+    auto pmb0 = mci->GetBlockData(0)->GetBlockPointer();
 
     const auto& implicit_par = pmb0->packages.Get("Implicit")->AllParams();
     const int iter_max = implicit_par.Get<int>("max_nonlinear_iter");
@@ -91,42 +122,49 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
     const Real delta = implicit_par.Get<Real>("jacobian_delta");
     const Real gam = pmb0->packages.Get("GRMHD")->Param<Real>("gamma");
 
-    EMHD_parameters emhd_params = {0};
+    EMHD_parameters emhd_params;
     if (pmb0->packages.AllPackages().count("EMHD")) {
         const auto& pars = pmb0->packages.Get("EMHD")->AllParams();
         emhd_params = pars.Get<EMHD_parameters>("emhd_params");
     }
 
-    printf("Implicit advance dt: %g\n", dt);
-
-    //MetadataFlag isNonideal = pmb0->packages.Get("EMHD")->Param<MetadataFlag>("NonidealFlag");
+    // I don't normally do this, but we *really* care about variable ordering here.
+    // The implicit variables need to be first, so we know how to iterate over just them to fill
+    // just the residual & Jacobian we care about, which makes the solve much faster.
+    // This strategy is ugly but potentially gives us complete control,
+    // in case Kokkos's un-pivoted LU proves problematic
     MetadataFlag isPrimitive = pmb0->packages.Get("GRMHD")->Param<MetadataFlag>("PrimitiveFlag");
+     auto& rci = mci->GetBlockData(0); // MeshBlockData object, more member functions
+    auto ordered_prims = get_ordered_names(rci.get(), isPrimitive);
+    auto ordered_cons = get_ordered_names(rci.get(), Metadata::Conserved);
+    //cerr << "Ordered prims:"; for(auto prim: ordered_prims) cerr << " " << prim; cerr << endl;
+    //cerr << "Ordered cons:"; for(auto con: ordered_cons) cerr << " " << con; cerr << endl;
+
     // Initial state.  Also mapping template
     PackIndexMap prims_map, cons_map;
-    auto& Pi_all = mdi->PackVariables(std::vector<MetadataFlag>{isPrimitive}, prims_map);
-    auto& Ui_all = mdi->PackVariables(std::vector<MetadataFlag>{Metadata::Conserved}, cons_map);
+    auto& Pi_all = mci->PackVariables(ordered_prims, prims_map);
+    auto& Ui_all = mci->PackVariables(ordered_cons, cons_map);
     const VarMap m_u(cons_map, true), m_p(prims_map, false);
     // Current sub-step starting state.
-    auto& Ps_all = md0->PackVariables(std::vector<MetadataFlag>{isPrimitive});
-    auto& Us_all = md0->PackVariables(std::vector<MetadataFlag>{Metadata::Conserved});
-    // Flux divergence plus explicit source terms. This is what we'd be adding 
-    auto& dUdt_all = dudt->PackVariables(std::vector<MetadataFlag>{Metadata::Conserved});
-    // Desired final state.
-    auto& Pf_all = md1->PackVariables(std::vector<MetadataFlag>{isPrimitive});
-
-    // Note this iterator, like all of KHARMA, requires nprim == ncons
-    // TODO Maybe should enforce that at start?
+    auto& Ps_all = mc0->PackVariables(ordered_prims);
+    auto& Us_all = mc0->PackVariables(ordered_cons);
+    // Flux divergence plus explicit source terms. This is what we'd be adding.
+    auto& dUdt_all = dudt->PackVariables(ordered_cons);
+    // Guess at initial state. We update only the implicit primitive vars
+    auto& P_solver_all = mc_solver->PackVariables(get_ordered_names(rci.get(), isPrimitive, true));
+
+    // Sizes and scratchpads
     const int nblock = Ui_all.GetDim(5);
     const int nvar = Ui_all.GetDim(4);
-
-    // Workspaces for iteration, include ghosts to match indices.
+    const int nfvar = P_solver_all.GetDim(4);
     auto bounds = pmb0->cellbounds;
     const int n1 = bounds.ncellsi(IndexDomain::entire);
     const int n2 = bounds.ncellsj(IndexDomain::entire);
     const int n3 = bounds.ncellsk(IndexDomain::entire);
-    // A full space for solver iterations, as Pi/Pf may be aliased:
-    // thus we don't want to write anything until we're done.
-    ParArray5D<Real> P_solver_all("P_solver", nblock, nvar, n3, n2, n1);
+
+    // RETURN if there aren't any implicit variables to evolve
+    //cerr << "Solve size " << nfvar << " on prim size " << nvar << endl;
+    if (nfvar == 0) return TaskStatus::complete;
 
     // The norm of the residual.  We store this to avoid the main kernel
     // also being a 2-stage reduction, which is complex and sucks.
@@ -143,7 +181,6 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
     const IndexRange jb = bounds.GetBoundsJ(domain);
     const IndexRange kb = bounds.GetBoundsK(domain);
     const IndexRange block = IndexRange{0, nblock - 1};
-    const IndexRange vb = IndexRange{0, nvar - 1};
 
     // Allocate scratch space
     // It is impossible to declare runtime-sized arrays in CUDA
@@ -154,32 +191,34 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
     // See grmhd_functions.hpp for the other approach with overloads
     const int scratch_level = 1; // 0 is actual scratch (tiny); 1 is HBM
     const size_t var_size_in_bytes = parthenon::ScratchPad2D<Real>::shmem_size(nvar, n1);
-    const size_t tensor_size_in_bytes = parthenon::ScratchPad3D<Real>::shmem_size(nvar, nvar, n1);
+    const size_t fvar_size_in_bytes = parthenon::ScratchPad2D<Real>::shmem_size(nfvar, n1);
+    const size_t tensor_size_in_bytes = parthenon::ScratchPad3D<Real>::shmem_size(nfvar, nvar, n1);
     // Allocate enough to cache:
     // jacobian (2D)
-    // residual, deltaP, dUi, two temps
-    // Pi/Ui, Ps/Us, dUdt, P_solver
-    const size_t total_scratch_bytes = tensor_size_in_bytes + (12) * var_size_in_bytes;
+    // residual, deltaP (implicit only)
+    // Pi/Ui, Ps/Us, dUdt, P_solver, dUi, three temps (all vars)
+    const size_t total_scratch_bytes = tensor_size_in_bytes + (2) * fvar_size_in_bytes + (10) * var_size_in_bytes;
 
     // Iterate.  This loop is outside the kokkos kernel in order to print max_norm
     // There are generally a low and similar number of iterations between
     // different zones, so probably acceptable speed loss.
     for (int iter=0; iter < iter_max; iter++) {
         // Flags per iter, since debugging here will be rampant
-        Flag(md0, "Implicit Iteration: md0");
+        Flag(mc_solver, "Implicit Iteration:");
 
         parthenon::par_for_outer(DEFAULT_OUTER_LOOP_PATTERN, "implicit_solve", pmb0->exec_space,
             total_scratch_bytes, scratch_level, block.s, block.e, kb.s, kb.e, jb.s, jb.e,
             KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int& b, const int& k, const int& j) {
                 const auto& G = Ui_all.GetCoords(b);
-                ScratchPad3D<Real> jacobian_s(member.team_scratch(scratch_level), nvar, nvar, n1);
-                ScratchPad2D<Real> residual_s(member.team_scratch(scratch_level), nvar, n1);
-                ScratchPad2D<Real> delta_prim_s(member.team_scratch(scratch_level), nvar, n1);
+                // Scratchpads for implicit vars
+                ScratchPad3D<Real> jacobian_s(member.team_scratch(scratch_level), nfvar, nfvar, n1);
+                ScratchPad2D<Real> residual_s(member.team_scratch(scratch_level), nfvar, n1);
+                ScratchPad2D<Real> delta_prim_s(member.team_scratch(scratch_level), nfvar, n1);
+                // Scratchpads for all vars
                 ScratchPad2D<Real> dUi_s(member.team_scratch(scratch_level), nvar, n1);
                 ScratchPad2D<Real> tmp1_s(member.team_scratch(scratch_level), nvar, n1);
                 ScratchPad2D<Real> tmp2_s(member.team_scratch(scratch_level), nvar, n1);
                 ScratchPad2D<Real> tmp3_s(member.team_scratch(scratch_level), nvar, n1);
-                // Local versions of the variables
                 ScratchPad2D<Real> Pi_s(member.team_scratch(scratch_level), nvar, n1);
                 ScratchPad2D<Real> Ui_s(member.team_scratch(scratch_level), nvar, n1);
                 ScratchPad2D<Real> Ps_s(member.team_scratch(scratch_level), nvar, n1);
@@ -188,27 +227,36 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
                 ScratchPad2D<Real> P_solver_s(member.team_scratch(scratch_level), nvar, n1);
 
                 // Copy some file contents to scratchpads, so we can slice them
-                PLOOP parthenon::par_for_inner(member, ib.s, ib.e,
-                    [&](const int& i) {
-                        Pi_s(ip, i) = Pi_all(b)(ip, k, j, i);
-                        Ui_s(ip, i) = Ui_all(b)(ip, k, j, i);
-                        Ps_s(ip, i) = Ps_all(b)(ip, k, j, i);
-                        Us_s(ip, i) = Us_all(b)(ip, k, j, i);
-                        dUdt_s(ip, i) = dUdt_all(b)(ip, k, j, i);
-                        dUi_s(ip, i) = 0.; // Only a few vars are populated
-                        // Finally, P_solver should actually be initialized to Ps
-                        if (iter == 0) {
+                PLOOP {
+                    parthenon::par_for_inner(member, ib.s, ib.e,
+                        [&](const int& i) {
+                            Pi_s(ip, i) = Pi_all(b)(ip, k, j, i);
+                            Ui_s(ip, i) = Ui_all(b)(ip, k, j, i);
+                            Ps_s(ip, i) = Ps_all(b)(ip, k, j, i);
+                            Us_s(ip, i) = Us_all(b)(ip, k, j, i);
+                            dUdt_s(ip, i) = dUdt_all(b)(ip, k, j, i);
                             P_solver_s(ip, i) = Ps_all(b)(ip, k, j, i);
-                        } else {
-                            P_solver_s(ip, i) = P_solver_all(b, ip, k, j, i);
+                            dUi_s(ip, i) = 0.;
                         }
-                    }
-                );
+                    );
+                }
+                member.team_barrier();
+
+                // Copy in the guess or current solution
+                // Note this replaces the implicit portion of P_solver_s --
+                // any explicit portion was initialized above
+                FLOOP { // Loop over just the implicit "fluid" portion of primitive vars
+                    parthenon::par_for_inner(member, ib.s, ib.e,
+                        [&](const int& i) {
+                            P_solver_s(ip, i) = P_solver_all(b)(ip, k, j, i);
+                        }
+                    );
+                }
                 member.team_barrier();
 
                 parthenon::par_for_inner(member, ib.s, ib.e,
                     [&](const int& i) {
-                        // Lots of slicing.  This is still way faster & cleaner than alternatives, trust me
+                        // Lots of slicing.  This still ends up faster & cleaner than alternatives I tried
                         auto Pi = Kokkos::subview(Pi_s, Kokkos::ALL(), i);
                         auto Ui = Kokkos::subview(Ui_s, Kokkos::ALL(), i);
                         auto Ps = Kokkos::subview(Ps_s, Kokkos::ALL(), i);
@@ -226,70 +274,70 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
                         // Implicit sources at starting state
                         auto dUi = Kokkos::subview(dUi_s, Kokkos::ALL(), i);
                         if (m_p.Q >= 0) {
-                            Real dUq, dUdP;
-                            EMHD::implicit_sources(G, Pi, m_p, gam, j, i, emhd_params, dUq, dUdP);
-                            dUi(m_u.Q) = dUq;
-                            dUi(m_u.DP) = dUdP;
+                            EMHD::implicit_sources(G, Pi, m_p, gam, j, i, emhd_params, dUi(m_u.Q), dUi(m_u.DP));
                         }
 
                         // Jacobian calculation
                         // Requires calculating the residual anyway, so we grab it here
                         calc_jacobian(G, P_solver, Pi, Ui, Ps, dUdt, dUi, tmp1, tmp2, tmp3,
-                                      m_p, m_u, emhd_params, nvar, j, i, delta, gam, dt, jacobian, residual);
+                                      m_p, m_u, emhd_params, nvar, nfvar, j, i, delta, gam, dt, jacobian, residual);
                         // Solve against the negative residual
-                        PLOOP delta_prim(ip) = -residual(ip);
+                        FLOOP delta_prim(ip) = -residual(ip);
 
-                        // if (am_rank0 && b == 0 && i == 8 && j == 8 && k == 0) {
+                        // if (am_rank0 && b == 0 && i == 4 && j == 4 && k == 4) {
                         //     printf("Variable ordering: rho %d uu %d u1 %d B1 %d q %d dP %d\n",
                         //             m_p.RHO, m_p.UU, m_p.U1, m_p.B1, m_p.Q, m_p.DP);
-                        //     printf("P_solver: "); PLOOP printf("%g ", P_solver(ip)); printf("\n");
-                        //     printf("Pi: "); PLOOP printf("%g ", Pi(ip)); printf("\n");
-                        //     printf("Ui: "); PLOOP printf("%g ", Ui(ip)); printf("\n");
-                        //     printf("Ps: "); PLOOP printf("%g ", Ps(ip)); printf("\n");
-                        //     printf("Us: "); PLOOP printf("%g ", Us(ip)); printf("\n");
-                        //     printf("dUdt: "); PLOOP printf("%g ", dUdt(ip)); printf("\n");
-                        //     printf("Initial residual: "); PLOOP printf("%g ", residual(ip)); printf("\n");
-                        //     printf("Initial delta_prim: "); PLOOP printf("%g ", delta_prim(ip)); printf("\n");
+                        //     printf("Variable ordering: rho %d uu %d u1 %d B1 %d q %d dP %d\n",
+                        //             m_u.RHO, m_u.UU, m_u.U1, m_u.B1, m_u.Q, m_u.DP);
+                        //     // printf("P_solver: "); PLOOP printf("%g ", P_solver(ip)); printf("\n");
+                        //     // printf("Pi: "); PLOOP printf("%g ", Pi(ip)); printf("\n");
+                        //     // printf("Ui: "); PLOOP printf("%g ", Ui(ip)); printf("\n");
+                        //     // printf("Ps: "); PLOOP printf("%g ", Ps(ip)); printf("\n");
+                        //     // printf("Us: "); PLOOP printf("%g ", Us(ip)); printf("\n");
+                        //     // printf("dUdt: "); PLOOP printf("%g ", dUdt(ip)); printf("\n");
+                        //     printf("Initial Jacobian:\n"); for (int jp=0; jp<nvar; ++jp) {PLOOP printf("%g\t", jacobian(jp,ip)); printf("\n");}
+                        //     // printf("Initial residual: "); PLOOP printf("%g ", residual(ip)); printf("\n");
+                        //     // printf("Initial delta_prim: "); PLOOP printf("%g ", delta_prim(ip)); printf("\n");
                         // }
 
                         // Linear solve
                         // This code lightly adapted from Kokkos batched examples
                         // Replaces our inverse residual with the actual desired delta_prim
                         KokkosBatched::SerialLU<Algo::LU::Blocked>::invoke(jacobian, tiny);
-                        KokkosBatched::SerialTrsv<Uplo::Lower,Trans::NoTranspose,Diag::NonUnit,Algo::Trsv::Blocked>
+                        KokkosBatched::SerialTrsv<Uplo::Upper,Trans::NoTranspose,Diag::NonUnit,Algo::Trsv::Blocked>
                         ::invoke(alpha, jacobian, delta_prim);
 
                         // Update the guess.  For now lambda == 1, choose on the fly?
-                        PLOOP P_solver(ip) += lambda * delta_prim(ip);
+                        FLOOP P_solver(ip) += lambda * delta_prim(ip);
 
                         calc_residual(G, P_solver, Pi, Ui, Ps, dUdt, dUi, tmp3,
-                                      m_p, m_u, emhd_params, nvar, j, i, gam, dt, residual);
+                                      m_p, m_u, emhd_params, nfvar, j, i, gam, dt, residual);
 
-                        // if (am_rank0 && b == 0 && i == 8 && j == 8 && k == 0) {
-                        //     printf("Variable ordering: rho %d uu %d u1 %d B1 %d q %d dP %d\n",
-                        //             m_p.RHO, m_p.UU, m_p.U1, m_p.B1, m_p.Q, m_p.DP);
-                        //     // JACOBIAN
+                        // if (am_rank0 && b == 0 && i == 11 && j == 11 && k == 0) {
+                        //     // printf("Variable ordering: rho %d uu %d u1 %d B1 %d q %d dP %d\n",
+                        //     //         m_p.RHO, m_p.UU, m_p.U1, m_p.B1, m_p.Q, m_p.DP);
                         //     printf("Final residual: "); PLOOP printf("%g ", residual(ip)); printf("\n");
-                        //     printf("Final delta_prim: "); PLOOP printf("%g ", delta_prim(ip)); printf("\n");
-                        //     printf("Final P_solver: "); PLOOP printf("%g ", P_solver(ip)); printf("\n");
+                        //     // printf("Final delta_prim: "); PLOOP printf("%g ", delta_prim(ip)); printf("\n");
+                        //     // printf("Final P_solver: "); PLOOP printf("%g ", P_solver(ip)); printf("\n");
                         // }
 
                         // Store for maximum/output
                         // I would be tempted to store the whole residual, but it's of variable size
                         norm_all(b, k , j, i) = 0;
-                        PLOOP norm_all(b, k, j, i) += pow(residual(ip), 2);
+                        FLOOP norm_all(b, k, j, i) += pow(residual(ip), 2);
                         norm_all(b, k, j, i) = sqrt(norm_all(b, k, j, i)); // TODO faster to scratch cache & copy?
                     }
                 );
                 member.team_barrier();
 
-                // Copy out P_solver to the existing array
-                // This combo still works if P_solver is aliased to one of the other arrays!
-                PLOOP parthenon::par_for_inner(member, ib.s, ib.e,
-                    [&](const int& i) {
-                        P_solver_all(b, ip, k, j, i) = P_solver_s(ip, i);
-                    }
-                );
+                // Copy only the implicitly-evolved portion of P_solver back out to P_solver_all
+                FLOOP {
+                    parthenon::par_for_inner(member, ib.s, ib.e,
+                        [&](const int& i) {
+                            P_solver_all(b)(ip, k, j, i) = P_solver_s(ip, i);
+                        }
+                    );
+                }
             }
         );
         
@@ -305,13 +353,7 @@ TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
         if (MPIRank0()) fprintf(stdout, "Nonlinear iter %d. Max L2 norm: %g\n", iter, max_norm);
     }
 
-    // Write to Pf
-    pmb0->par_for("write_Pf", block.s, block.e, vb.s, vb.e, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
-        KOKKOS_LAMBDA_MESH_VARS {
-            Pf_all(b)(p, k, j, i) = P_solver_all(b, p, k, j, i);
-        }
-    );
-    Flag(md1, "Implicit Iteration: final");
+    Flag(mc_solver, "Implicit Iteration: final");
 
     return TaskStatus::complete;
 
diff --git a/kharma/implicit/implicit.hpp b/kharma/implicit/implicit.hpp
index c4a90ff2..00c28eaf 100644
--- a/kharma/implicit/implicit.hpp
+++ b/kharma/implicit/implicit.hpp
@@ -48,6 +48,9 @@
 // implicit solver stuff
 using namespace EMHD;
 
+// Version of PLOOP for just implicit ("fluid") variables. Requires an integer "nfvar" in scope
+#define FLOOP for(int ip=0; ip < nfvar; ++ip)
+
 namespace Implicit
 {
 
@@ -62,11 +65,11 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin);
  * @param mdi the fluid state at the beginning of the step
  * @param md0 the initial fluid state for this substep
  * @param dudt the negative flux divergence plus explicit source terms
- * @param md1 the final fluid state
+ * @param mc_solver should contain the initial guess on call; contains the result on return
  * @param dt the timestep (current substep)
  */
 TaskStatus Step(MeshData<Real> *mdi, MeshData<Real> *md0, MeshData<Real> *dudt,
-                MeshData<Real> *md1, const Real& dt);
+                MeshData<Real> *mc_solver, const Real& dt);
 
 /**
  * Calculate the residual generated by the trial primitives P_test
@@ -79,7 +82,7 @@ KOKKOS_INLINE_FUNCTION void calc_residual(const GRCoordinates& G, const Local& P
                                           const Local& Pi, const Local& Ui, const Local& Ps,
                                           const Local& dudt_explicit, const Local& dUi, const Local& tmp, 
                                           const VarMap& m_p, const VarMap& m_u, const EMHD_parameters& emhd_params,
-                                          const int& nvar, const int& j, const int& i,
+                                          const int& nfvar, const int& j, const int& i,
                                           const Real& gam, const double& dt,
                                           Local& residual)
 {
@@ -88,9 +91,14 @@ KOKKOS_INLINE_FUNCTION void calc_residual(const GRCoordinates& G, const Local& P
     // Note this uses the Flux:: call, it needs *all* conserved vars!
     Flux::p_to_u(G, P_test, m_p, emhd_params, gam, j, i, tmp, m_u); // U_test
     // (U_test - Ui)/dt - dudt_explicit ...
-    PLOOP residual(ip) = (tmp(ip) - Ui(ip)) / dt - dudt_explicit(ip);
-    // if (i == 8 && j == 8) {
-    //     printf("Explicit residual: "); PLOOP printf("%g ", residual(ip)); printf("\n");
+    FLOOP residual(ip) = (tmp(ip) - Ui(ip)) / dt - dudt_explicit(ip);
+    // if (i == 11 && j == 11) {
+    //     GReal X[GR_DIM];
+    //     G.coord(0, j, i, Loci::center, X);
+    //     printf("X: "); DLOOP1 printf("%g ", X[mu]); printf("\n");
+    //     printf("U_test: "); PLOOP printf("%g ", tmp(ip)); printf("\n");
+    //     printf("Ui:\t"); PLOOP printf("%g ", Ui(ip)); printf("\n");
+    //     printf("Explicit sources: "); PLOOP printf("%g ", dudt_explicit(ip)); printf("\n");
     // }
 
     if (m_p.Q >= 0) {
@@ -100,19 +108,15 @@ KOKKOS_INLINE_FUNCTION void calc_residual(const GRCoordinates& G, const Local& P
         // ... - 0.5*(dU_new(ip) + dUi(ip)) ...
         residual(m_u.Q) -= 0.5*(dUq + dUi(m_u.Q));
         residual(m_u.DP) -= 0.5*(dUdP + dUi(m_u.DP));
-        // if (i == 8 && j == 8) {
-        //     Real tau = 0, chi_e = 0, nu_e = 0;
-        //     EMHD::set_parameters(G, P_test, m_p, emhd_params, gam, tau, chi_e, nu_e);
-        //     printf("EMHD Params: "); printf("%g %g %g", tau, chi_e, nu_e); printf("\n");
-        //     printf("Implicit sources new: "); printf("%g %g %g %g", P_test(m_p.Q), P_test(m_p.DP), dUq, dUdP); printf("\n");
-        //     printf("Implicit sources residual: "); PLOOP printf("%g ", residual(ip)); printf("\n");
+        // if (i == 11 && j == 11) {
+        //     printf("Implicit sources: "); printf("%g %g", dUq - dUi(m_u.Q), dUdP - dUi(m_u.DP)); printf("\n");
         // }
         EMHD::time_derivative_sources(G, P_test, Pi, Ps, m_p, emhd_params, gam, dt, j, i, dUq, dUdP); // dU_time
         // ... - dU_time(ip)
         residual(m_u.Q) -= dUq;
         residual(m_u.DP) -= dUdP;
-        // if (i == 8 && j == 8) {
-        //     printf("Sources residual: "); PLOOP printf("%g ", residual(ip)); printf("\n");
+        // if (i == 11 && j == 11) {
+        //     printf("Time derivative sources: "); printf("%g %g", dUq, dUdP); printf("\n");
         // }
     }
 }
@@ -129,12 +133,12 @@ KOKKOS_INLINE_FUNCTION void calc_jacobian(const GRCoordinates& G, const Local& P
                                           const Local& dudt_explicit, const Local& dUi,
                                           Local& tmp1, Local& tmp2, Local& tmp3,
                                           const VarMap& m_p, const VarMap& m_u, const EMHD_parameters& emhd_params,
-                                          const int& nvar, const int& j, const int& i,
+                                          const int& nvar, const int& nfvar, const int& j, const int& i,
                                           const Real& jac_delta, const Real& gam, const double& dt,
                                           Local2& jacobian, Local& residual)
 {
     // Calculate residual for Sf->P
-    calc_residual(G, P, Pi, Ui, Ps, dudt_explicit, dUi, tmp3, m_p, m_u, emhd_params, nvar, j, i, gam, dt, residual);
+    calc_residual(G, P, Pi, Ui, Ps, dudt_explicit, dUi, tmp3, m_p, m_u, emhd_params, nfvar, j, i, gam, dt, residual);
 
     // Use one scratchpad as the incremented prims P_delta,
     // one as the new residual residual_delta
@@ -144,7 +148,7 @@ KOKKOS_INLINE_FUNCTION void calc_jacobian(const GRCoordinates& G, const Local& P
     PLOOP P_delta(ip) = P(ip);
 
     // Numerically evaluate the Jacobian
-    for (int col = 0; col < nvar; col++) {
+    for (int col = 0; col < nfvar; col++) {
         // Compute P_delta, differently depending on whether the prims are small compared to eps
         if (abs(P(col)) < (0.5 * jac_delta)) {
             P_delta(col) = P(col) + jac_delta;
@@ -153,11 +157,10 @@ KOKKOS_INLINE_FUNCTION void calc_jacobian(const GRCoordinates& G, const Local& P
         }
 
         // Compute the residual for P_delta, residual_delta
-        calc_residual(G, P_delta, Pi, Ui, Ps, dudt_explicit, dUi, tmp3, m_p, m_u, emhd_params, nvar, j, i, gam, dt, residual_delta);
+        calc_residual(G, P_delta, Pi, Ui, Ps, dudt_explicit, dUi, tmp3, m_p, m_u, emhd_params, nfvar, j, i, gam, dt, residual_delta);
 
         // Compute forward derivatives of each residual vs the primitive col
-        for (int row = 0; row < nvar; row++) {
-            //if (row == m_p.RHO && col == m_p.RHO) 
+        for (int row = 0; row < nfvar; row++) {
             jacobian(row, col) = (residual_delta(row) - residual(row)) / (P_delta(col) - P(col) + SMALL);
         }
 
diff --git a/kharma/kharma.cpp b/kharma/kharma.cpp
index 24968911..9a611e8a 100644
--- a/kharma/kharma.cpp
+++ b/kharma/kharma.cpp
@@ -204,7 +204,8 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput>& pin)
     std::string b_field_solver = pin->GetOrAddString("b_field", "solver", "flux_ct");
     // Enable b_cleanup package if we want periodic cleanups OR are resizing a restart file
     bool b_cleanup = pin->GetOrAddBoolean("b_cleanup", "on", false) ||
-                     pin->GetString("parthenon/job", "problem_id") == "resize_restart";
+                     pin->GetString("parthenon/job", "problem_id") == "resize_restart" ||
+                     pin->GetOrAddBoolean("b_field", "initial_clean", false);
     // TODO enable this iff jcon is in the list of outputs
     bool add_jcon = pin->GetOrAddBoolean("GRMHD", "add_jcon", true);
     bool do_electrons = pin->GetOrAddBoolean("electrons", "on", false);
@@ -218,10 +219,14 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput>& pin)
     if (do_emhd) {
         // Default to implicit step for EMHD
         driver_type = pin->GetOrAddString("driver", "type", "imex");
-        pin->GetOrAddString("driver", "step", "implicit");
     } else {
         driver_type = pin->GetOrAddString("driver", "type", "harm");
     }
+    // Initialize the implicit timestepping package early so we can mark fields to be
+    // updated implicitly vs explicitly
+    if (driver_type == "imex") {
+        packages.Add(Implicit::Initialize(pin.get()));
+    }
 
     // Global variables "package."  Mutable global state Parthenon doesn't keep for us.
     // Always enable.
@@ -229,7 +234,7 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput>& pin)
 
     // Lots of common functions and variables are still in the GRMHD package,
     // always initialize it first among physics stuff
-    packages.Add(GRMHD::Initialize(pin.get()));
+    packages.Add(GRMHD::Initialize(pin.get(), packages));
 
     // We'll also always want the floors package, even if floors are disabled
     packages.Add(Floors::Initialize(pin.get()));
@@ -237,8 +242,6 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput>& pin)
     // B field solvers, to ensure divB == 0.
     if (b_field_solver == "none") {
         // Don't add a B field
-        // Currently this means fields are still allocated, and occasionally read by GRMHD,
-        // but no other operations are performed.
     } else if (b_field_solver == "constraint_damping" || b_field_solver == "b_cd") {
         // Constraint damping, probably only useful for non-GR MHD systems
         packages.Add(B_CD::Initialize(pin.get(), packages));
@@ -255,17 +258,12 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput>& pin)
     // there is some form of B field present/declared.
     bool b_field_exists = !(b_field_solver == "none" && !b_cleanup);
 
-    // Implicit timestepping has a few of its own functions
-    bool implicit_step = pin->GetOrAddString("driver", "step", "explicit") == "implicit";
-    if (driver_type != "harm" && implicit_step) {
-        packages.Add(Implicit::Initialize(pin.get()));
-    }
-
     // Add jcon, so long as there's a field to calculate it from
     if (add_jcon && b_field_exists) {
         packages.Add(Current::Initialize(pin.get()));
     }
 
+    // Electrons are boring but not impossible without a B field
     if (do_electrons) {
         packages.Add(Electrons::Initialize(pin.get(), packages));
     }
diff --git a/kharma/prob/anisotropic_conduction.hpp b/kharma/prob/anisotropic_conduction.hpp
index 28cf0653..e4f0d211 100644
--- a/kharma/prob/anisotropic_conduction.hpp
+++ b/kharma/prob/anisotropic_conduction.hpp
@@ -67,7 +67,7 @@ TaskStatus InitializeAnisotropicConduction(MeshBlockData<Real> *rc, ParameterInp
     IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::entire);
     IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::entire);
     IndexRange kb = pmb->cellbounds.GetBoundsK(IndexDomain::entire);
-    pmb->par_for("emhdmodes_init", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+    pmb->par_for("anisotropic_init", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
         KOKKOS_LAMBDA_3D {
             Real X[GR_DIM];
             G.coord_embed(k, j, i, Loci::center, X);
diff --git a/kharma/prob/bondi.cpp b/kharma/prob/bondi.cpp
index 619a8a5f..9f46ef64 100644
--- a/kharma/prob/bondi.cpp
+++ b/kharma/prob/bondi.cpp
@@ -98,6 +98,7 @@ TaskStatus SetBondi(MeshBlockData<Real> *rc, IndexDomain domain, bool coarse)
     pmb->par_for("bondi_boundary", kb_e.s, kb_e.e, jb_e.s, jb_e.e, ibs, ibe,
         KOKKOS_LAMBDA_3D {
             get_prim_bondi(G, cs, P, m_p, gam, bl, ks, mdot, rs, k, j, i);
+            // TODO all flux
             GRMHD::p_to_u(G, P, m_p, gam, k, j, i, U, m_u);
         }
     );
diff --git a/kharma/prob/bondi.hpp b/kharma/prob/bondi.hpp
index 926f30c0..c4361752 100644
--- a/kharma/prob/bondi.hpp
+++ b/kharma/prob/bondi.hpp
@@ -36,6 +36,7 @@
 #include "decs.hpp"
 
 #include "gr_coordinates.hpp"
+#include "flux_functions.hpp"
 #include "grmhd_functions.hpp"
 #include "pack.hpp"
 #include "prob_common.hpp"
diff --git a/kharma/prob/emhdmodes.hpp b/kharma/prob/emhdmodes.hpp
index 3f0cebf0..ad3b1130 100644
--- a/kharma/prob/emhdmodes.hpp
+++ b/kharma/prob/emhdmodes.hpp
@@ -62,7 +62,12 @@ TaskStatus InitializeEMHDModes(MeshBlockData<Real> *rc, ParameterInput *pin)
 
     const auto& G = pmb->coords;
 
-    const Real amp = pin->GetOrAddReal("emhdmodes", "amp", 1e-4);
+    const Real amp = pin->GetOrAddReal("emhdmodes", "amp", 1e-8);
+
+    const auto& emhd_pars = pmb->packages.Get("EMHD")->AllParams();
+    const EMHD::EMHD_parameters& emhd_params = emhd_pars.Get<EMHD::EMHD_parameters>("emhd_params");
+    const auto& grmhd_pars = pmb->packages.Get("GRMHD")->AllParams();
+    const Real& gam = grmhd_pars.Get<Real>("gamma");
 
     // TODO actually calculate the mode?  Figure something out
     const Real omega_real = pin->GetOrAddReal("emhdmodes", "omega_real", -0.5533585207638141);
@@ -71,20 +76,20 @@ TaskStatus InitializeEMHDModes(MeshBlockData<Real> *rc, ParameterInput *pin)
     // START POSSIBLE ARGS: take all these as parameters in pin?
     // Also note this is 2D only for now
     // Mean state
-    Real rho0 = 1.;
-    Real u0 = 2.;
-    Real u10 = 0.;
-    Real u20 = 0.;
-    Real u30 = 0.;
-    Real B10 = 0.1;
-    Real B20 = 0.3;
-    Real B30 = 0.;
-    Real q0   = 0.;
-    Real delta_p0 = 0.;
+    const Real rho0 = 1.;
+    const Real u0 = 2.;
+    const Real u10 = 0.;
+    const Real u20 = 0.;
+    const Real u30 = 0.;
+    const Real B10 = 0.1;
+    const Real B20 = 0.3;
+    const Real B30 = 0.;
+    const Real q0   = 0.;
+    const Real delta_p0 = 0.;
 
     // Wavevector
-    Real k1 = 2. * M_PI;
-    Real k2 = 4. * M_PI;
+    const Real k1 = 2. * M_PI;
+    const Real k2 = 4. * M_PI;
     // END POSSIBLE ARGS
 
     IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::entire);
@@ -98,28 +103,38 @@ TaskStatus InitializeEMHDModes(MeshBlockData<Real> *rc, ParameterInput *pin)
             const Real sin_phi = sin(k1*X[1] + k2*X[2]);
 
             // Perturbations: no higher-order terms
-            Real drho     = amp * (((-0.518522524082246)*cos_phi) + ((0.1792647678001878)*sin_phi));
-            Real du       = amp * ((0.5516170736393813)*cos_phi);
-            Real du1      = amp * (((0.008463122479547856)*cos_phi) + ((-0.011862022608466367)*sin_phi));
-            Real du2      = amp * (((-0.16175466371870734)*cos_phi) + ((0.034828080823603294)*sin_phi));
-            Real du3      = 0.;
-            Real dB1      = amp * (((-0.05973794979640743)*cos_phi) + ((0.03351707506150924)*sin_phi));
-            Real dB2      = amp * (((0.02986897489820372)*cos_phi) - ((0.016758537530754618)*sin_phi));
-            Real dB3      = 0.;
-            Real dq       = amp * (((0.5233486841539436)*cos_phi) - ((0.04767672501939603)*sin_phi));
-            Real ddelta_p = amp * (((0.2909106062057657)*cos_phi) - ((0.02159452055336572)*sin_phi));
+            const Real drho     = amp * (((-0.518522524082246)*cos_phi) + ((0.1792647678001878)*sin_phi));
+            const Real du       = amp * ((0.5516170736393813)*cos_phi);
+            const Real du1      = amp * (((0.008463122479547856)*cos_phi) + ((-0.011862022608466367)*sin_phi));
+            const Real du2      = amp * (((-0.16175466371870734)*cos_phi) + ((0.034828080823603294)*sin_phi));
+            const Real du3      = 0.;
+            const Real dB1      = amp * (((-0.05973794979640743)*cos_phi) + ((0.03351707506150924)*sin_phi));
+            const Real dB2      = amp * (((0.02986897489820372)*cos_phi) - ((0.016758537530754618)*sin_phi));
+            const Real dB3      = 0.;
+            const Real dq       = amp * (((0.5233486841539436)*cos_phi) - ((0.04767672501939603)*sin_phi));
+            const Real ddelta_p = amp * (((0.2909106062057657)*cos_phi) - ((0.02159452055336572)*sin_phi));
 
             // Initialize primitives
             rho(k, j, i) = rho0 + drho;
             u(k, j, i) = u0 + du;
-            uvec(0, k, j, i) = u10 + du1;
-            uvec(1, k, j, i) = u20 + du2;
-            uvec(2, k, j, i) = u30 + du3;
-            B_P(0, k, j, i) = B10 + dB1;
-            B_P(1, k, j, i) = B20 + dB2;
-            B_P(2, k, j, i) = B30 + dB3;
+            uvec(V1, k, j, i) = u10 + du1;
+            uvec(V2, k, j, i) = u20 + du2;
+            uvec(V3, k, j, i) = u30 + du3;
+            B_P(V1, k, j, i) = B10 + dB1;
+            B_P(V2, k, j, i) = B20 + dB2;
+            B_P(V3, k, j, i) = B30 + dB3;
             q(k, j, i) = q0 + dq;
             dP(k, j, i) = delta_p0 + ddelta_p;
+
+            if (emhd_params.higher_order_terms) {
+                Real tau, chi_e, nu_e;
+                EMHD::set_parameters(G, rho(k, j, i), u(k, j, i), emhd_params, gam, k, j, i, tau, chi_e, nu_e);
+                Real Theta = (gam - 1) * u(k, j, i) / rho(k, j, i);
+                Real q_tilde, dP_tilde;
+                EMHD::convert_q_dP_to_prims(q(k, j, i), dP(k, j, i), rho(k, j, i), Theta, tau, chi_e, nu_e, emhd_params, q_tilde, dP_tilde);
+                q(k, j, i) = q_tilde;
+                dP(k, j, i) = dP_tilde;
+            }
         }
     );
 
diff --git a/kharma/prob/post_initialize.cpp b/kharma/prob/post_initialize.cpp
index 7aac23d0..d4a92ef3 100644
--- a/kharma/prob/post_initialize.cpp
+++ b/kharma/prob/post_initialize.cpp
@@ -49,59 +49,83 @@
 #include "seed_B_ct.hpp"
 #include "seed_B_cd.hpp"
 
-void SyncAllBounds(Mesh *pmesh)
+void SyncAllBounds(ParameterInput *pin, Mesh *pmesh)
 {
-    // Honestly, the easiest way through this sync is:
-    // 1. PtoU everywhere
-    // 2. Sync like a normal step, incl. physical bounds
-    // 3. UtoP everywhere
-    // Luckily we're amortized over the whole sim, so we can
-    // take our time.
-
-    for (auto &pmb : pmesh->block_list) {
-        auto& rc = pmb->meshblock_data.Get();
-        Flux::PtoU(rc.get(), IndexDomain::entire);
-    }
 
-    for (auto &pmb : pmesh->block_list) {
-        auto& rc = pmb->meshblock_data.Get();
-        rc->ClearBoundary(BoundaryCommSubset::mesh_init);
-        rc->StartReceiving(BoundaryCommSubset::mesh_init);
-        rc->SendBoundaryBuffers();
-    }
+    if (pin->GetString("driver", "type") == "imex") {
+        // The imex driver syncs the primitive vars, so we just sync: no PtoU before or FillDerived after
+        for (auto &pmb : pmesh->block_list) {
+            auto& rc = pmb->meshblock_data.Get();
+            rc->ClearBoundary(BoundaryCommSubset::mesh_init);
+            rc->StartReceiving(BoundaryCommSubset::mesh_init);
+            rc->SendBoundaryBuffers();
+        }
+        for (auto &pmb : pmesh->block_list) {
+            auto& rc = pmb->meshblock_data.Get();
+            rc->ReceiveAndSetBoundariesWithWait();
+            rc->ClearBoundary(BoundaryCommSubset::mesh_init);
+            // TODO if amr, presumably re-enable prolongation:
+            //pmb->pbval->ProlongateBoundaries();
+
+            // Physical boundary conditions
+            parthenon::ApplyBoundaryConditions(rc);
+        }
+    } else {
+        // Every other driver type syncs the conserved vars.
+        // Honestly, the easiest way through this sync is:
+        // 1. PtoU everywhere
+        // 2. Sync like a normal step, incl. physical bounds
+        // 3. UtoP everywhere
+        // Luckily we're amortized over the whole sim, so we can
+        // take our time.
+        for (auto &pmb : pmesh->block_list) {
+            auto& rc = pmb->meshblock_data.Get();
+            Flux::PtoU(rc.get(), IndexDomain::entire);
+        }
 
-    for (auto &pmb : pmesh->block_list) {
-        auto& rc = pmb->meshblock_data.Get();
-        rc->ReceiveAndSetBoundariesWithWait();
-        rc->ClearBoundary(BoundaryCommSubset::mesh_init);
-        //pmb->pbval->ProlongateBoundaries();
+        for (auto &pmb : pmesh->block_list) {
+            auto& rc = pmb->meshblock_data.Get();
+            rc->ClearBoundary(BoundaryCommSubset::mesh_init);
+            rc->StartReceiving(BoundaryCommSubset::mesh_init);
+            rc->SendBoundaryBuffers();
+        }
+
+        for (auto &pmb : pmesh->block_list) {
+            auto& rc = pmb->meshblock_data.Get();
+            rc->ReceiveAndSetBoundariesWithWait();
+            rc->ClearBoundary(BoundaryCommSubset::mesh_init);
+            // TODO if amr, presumably re-enable prolongation:
+            //pmb->pbval->ProlongateBoundaries();
 
-        // Fill P again, including ghost zones
-        parthenon::Update::FillDerived(rc.get());
+            // Fill P again, including ghost zones
+            parthenon::Update::FillDerived(rc.get());
 
-        // Physical boundary conditions
-        parthenon::ApplyBoundaryConditions(rc);
+            // Physical boundary conditions
+            parthenon::ApplyBoundaryConditions(rc);
+        }
     }
 }
 
 void KHARMA::SeedAndNormalizeB(ParameterInput *pin, Mesh *pmesh)
 {
+
+
+    // Check which solver we'll be using
+    const bool use_b_flux_ct = pmesh->packages.AllPackages().count("B_FluxCT");
+    const bool use_b_cd = pmesh->packages.AllPackages().count("B_CD");
+
     // Add the field for torus problems as a second pass
     // Preserves P==U and ends with all physical zones fully defined
     if (pin->GetOrAddString("b_field", "type", "none") != "none") {
         // Calculating B has a stencil outside physical zones
         Flag("Extra boundary sync for B");
-        SyncAllBounds(pmesh);
+        SyncAllBounds(pin, pmesh);
 
         // "Legacy" is the much more common normalization:
         // It's the ratio of max values over the domain i.e. max(P) / max(P_B),
         // not necessarily a local min(beta)
         Real beta_calc_legacy = pin->GetOrAddBoolean("b_field", "legacy", true);
 
-        // Use the correct seed function based on field constraint solver
-        const bool use_b_flux_ct = pmesh->packages.AllPackages().count("B_FluxCT");
-        const bool use_b_cd = pmesh->packages.AllPackages().count("B_CD");
-
         Flag("Seeding magnetic field");
         // Seed the magnetic field and find the minimum beta
         Real beta_min = 1.e100, p_max = 0., bsq_max = 0.;
@@ -193,22 +217,28 @@ void KHARMA::SeedAndNormalizeB(ParameterInput *pin, Mesh *pmesh)
             } else {
                 beta_min = MPIMin(beta_min);
             }
-            // divB is implemented over a MeshBlockPack because it is fancy
-            auto md = pmesh->mesh_data.GetOrAdd("base", 0).get();
-            Real divb_max = 0.;
-            if (use_b_flux_ct) {
-                divb_max = B_FluxCT::MaxDivB(md);
-            } else if (use_b_cd) {
-                divb_max = B_CD::MaxDivB(md);
-            }
-            divb_max = MPIMax(divb_max);
             if (MPIRank0()) {
                 cerr << "Beta min post-norm: " << beta_min << endl;
-                cerr << "Max divB post-norm: " << divb_max << endl;
             }
         }
+    }
 
+    if (pin->GetString("b_field", "solver") != "none" && pin->GetInteger("debug", "verbose") > 0) {
+        // Still print divB, even if we're not initializing/normalizing field here
+        auto md = pmesh->mesh_data.GetOrAdd("base", 0).get();
+        Real divb_max = 0.;
+        if (use_b_flux_ct) {
+            divb_max = B_FluxCT::MaxDivB(md);
+        } else if (use_b_cd) {
+            divb_max = B_CD::MaxDivB(md);
+        }
+        divb_max = MPIMax(divb_max);
+        if (MPIRank0()) {
+            cerr << "Starting max divB: " << divb_max << endl;
+        }
     }
+
+
     Flag("Added B Field");
 }
 
@@ -228,28 +258,24 @@ void KHARMA::PostInitialize(ParameterInput *pin, Mesh *pmesh, bool is_restart, b
 
     // Sync to fill the ghost zones
     Flag("Boundary sync");
-    SyncAllBounds(pmesh);
+    SyncAllBounds(pin, pmesh);
 
-    // TODO when (restart/non) do we need this for setting ctop?
+    // Extra cleanup & init to do if restarting
     if (is_restart) {
-
         // Parthenon restored our global data for us, but we don't always want that
         KHARMA::ResetGlobals(pin, pmesh);
+    }
 
-        // If we resized the array, cleanup any field divergence we created
-        if (is_resize) {
-            // Cleanup operates on full single MeshData as there are MPI syncs
-            auto &mbase = pmesh->mesh_data.GetOrAdd("base", 0);
-            // Clean field divergence across the whole grid
-            B_Cleanup::CleanupDivergence(mbase);
-            // Sync to make sure periodic boundaries are set
-            Flag("Boundary sync");
-            SyncAllBounds(pmesh);
-        }
-
-        // TODO anything special for imex driver here?
-        // TODO there was a reconstruction here for filling ctop, but
-        // it should definitely not be necessary as first dt is set with dt_first
+    // If we resized the array, cleanup any field divergence we created
+    // Let the user specify to do this, too
+    if ((is_restart && is_resize) || pin->GetBoolean("b_field", "initial_clean")) {
+        // Cleanup operates on full single MeshData as there are MPI syncs
+        auto &mbase = pmesh->mesh_data.GetOrAdd("base", 0);
+        // Clean field divergence across the whole grid
+        B_Cleanup::CleanupDivergence(mbase);
+        // Sync to make sure periodic boundaries are set
+        Flag("Boundary sync");
+        SyncAllBounds(pin, pmesh);
     }
 
     Flag("Post-initialization finished");
diff --git a/kharma/prob/resize_restart.cpp b/kharma/prob/resize_restart.cpp
index 5823c3fd..0f7d979e 100644
--- a/kharma/prob/resize_restart.cpp
+++ b/kharma/prob/resize_restart.cpp
@@ -44,11 +44,6 @@
 #include <sys/stat.h>
 #include <ctype.h>
 
-// First boundary sync
-void outflow_x1(const GRCoordinates& G, GridVars P, int nghost, int n1, int n2, int n3);
-void polar_x2(const GRCoordinates& G, GridVars P, int nghost, int n1, int n2, int n3);
-void periodic_x3(const GRCoordinates& G, GridVars P, int nghost, int n1, int n2, int n3);
-
 using namespace Kokkos;
 
 // TODO
diff --git a/kharma/reductions/reductions.hpp b/kharma/reductions/reductions.hpp
index 42a02fd6..85aa79f0 100644
--- a/kharma/reductions/reductions.hpp
+++ b/kharma/reductions/reductions.hpp
@@ -120,13 +120,13 @@ Real DomainSum(MeshData<Real> *md);
 // Each of the MAKE_ETC "calls" expands into an implementation of
 // AccretionRate<Type> using the macro we just defined above.
 enum class Mdot : int;
-MAKE_SUM2D_FN(Mdot, KOKKOS_LAMBDA_3D_REDUCE { local_result += -rho_P(k, j, i) * uvec_P(0, k, j, i) * G.dx3v(k) * G.dx2v(j) * G.dx1v(i) * G.gdet(Loci::center, j, i); })
+MAKE_SUM2D_FN(Mdot, KOKKOS_LAMBDA_3D_REDUCE { local_result += -rho_P(k, j, i) * uvec_P(V1, k, j, i) * G.dx3v(k) * G.dx2v(j) * G.gdet(Loci::center, j, i); })
 enum class Edot : int;
-MAKE_SUM2D_FN(Edot, KOKKOS_LAMBDA_3D_REDUCE { local_result += -uvec_U(0, k, j, i) * G.dx3v(k) * G.dx2v(j) * G.dx1v(i); })
+MAKE_SUM2D_FN(Edot, KOKKOS_LAMBDA_3D_REDUCE { local_result += -uvec_U(V1, k, j, i) * G.dx3v(k) * G.dx2v(j); })
 enum class Ldot : int;
-MAKE_SUM2D_FN(Ldot, KOKKOS_LAMBDA_3D_REDUCE { local_result += uvec_U(2, k, j, i) * G.dx3v(k) * G.dx2v(j) * G.dx1v(i); })
+MAKE_SUM2D_FN(Ldot, KOKKOS_LAMBDA_3D_REDUCE { local_result += uvec_U(V3, k, j, i) * G.dx3v(k) * G.dx2v(j); })
 enum class Phi : int;
-MAKE_SUM2D_FN(Phi, KOKKOS_LAMBDA_3D_REDUCE { local_result += 0.5 * fabs(B_U(0, k, j, i)) * G.dx3v(k) * G.dx2v(j); })
+MAKE_SUM2D_FN(Phi, KOKKOS_LAMBDA_3D_REDUCE { local_result += 0.5 * fabs(B_U(V1, k, j, i)) * G.dx3v(k) * G.dx2v(j); })
 
 // Then we can define the same with fluxes.
 // The MAKE_SUM2D_FN macro pulls out pretty much any variable we could need here
@@ -135,7 +135,7 @@ MAKE_SUM2D_FN(Mdot_Flux, KOKKOS_LAMBDA_3D_REDUCE { local_result += -rho_F(k, j,
 enum class Edot_Flux : int;
 MAKE_SUM2D_FN(Edot_Flux, KOKKOS_LAMBDA_3D_REDUCE { local_result += (u_F(k, j, i) - rho_F(k, j, i)) * G.dx3v(k) * G.dx2v(j); })
 enum class Ldot_Flux : int;
-MAKE_SUM2D_FN(Ldot_Flux, KOKKOS_LAMBDA_3D_REDUCE { local_result += uvec_F(2, k, j, i) * G.dx3v(k) * G.dx2v(j); })
+MAKE_SUM2D_FN(Ldot_Flux, KOKKOS_LAMBDA_3D_REDUCE { local_result += uvec_F(V3, k, j, i) * G.dx3v(k) * G.dx2v(j); })
 
 // Finally, we define the reductions in the form Parthenon needs, picking particular
 // variables and zones so that the resulting functions take only MeshData as an argument
@@ -220,17 +220,12 @@ MAKE_SUM3D_FN(EHTLum, (KOKKOS_LAMBDA_3D_REDUCE {
 // only for areas with sig > 1.
 enum class JetLum : int;
 MAKE_SUM3D_FN(JetLum, (KOKKOS_LAMBDA_3D_REDUCE {
-    Real rho = rho_P(k, j, i);
-    Real Pg = (gam - 1.) * u_P(k, j, i);
     FourVectors Dtmp;
     GRMHD::calc_4vecs(G, uvec_P, B_P, k, j, i, Loci::center, Dtmp);
-    Real bsq = dot(Dtmp.bcon, Dtmp.bcov);
-    double sig = bsq / rho_P(k, j, i);
-    if (sig > 1.) {
-        Real uvec_loc[NVEC] = {uvec_P(0, k, j, i), uvec_P(1, k, j, i), uvec_P(2, k, j, i)};
-        Real B_loc[NVEC] = {B_P(0, k, j, i), B_P(1, k, j, i), B_P(2, k, j, i)};
-        Real rho_ut, T[GR_DIM];
-        GRMHD::p_to_u_mhd(G, 0., 0., uvec_loc, B_loc, gam, k, j, i, rho_ut, T); // TODO should this be just GRMHD::calc_tensor?
+    // Only zones with sigma = dot(bcon, bcov) / rho > 1 contribute to the sum
+    if ((dot(Dtmp.bcon, Dtmp.bcov) / rho_P(k, j, i)) > 1.) {
+        Real T[GR_DIM];
+        GRMHD::calc_tensor(rho_P(k, j, i), u_P(k, j, i), (gam - 1.) * u_P(k, j, i), Dtmp, 0, T); // NOTE(review): the call this replaces passed 0., 0. where rho and u now go -- confirm the fluid terms are meant to be included
         local_result += -T[1] * G.dx3v(k) * G.dx2v(j);
     }
 }))
diff --git a/kharma/types.hpp b/kharma/types.hpp
index 35403bc5..a7e81f0a 100644
--- a/kharma/types.hpp
+++ b/kharma/types.hpp
@@ -157,6 +157,7 @@ KOKKOS_INLINE_FUNCTION bool outside(const int& k, const int& j, const int& i,
 
 #if TRACE
 #define PRINTCORNERS 0
+#define PRINTZONE 0
 inline void PrintCorner(MeshBlockData<Real> *rc)
 {
     auto rhop = rc->Get("prims.rho").data.GetHostMirrorAndCopy();
@@ -186,6 +187,20 @@ inline void PrintCorner(MeshBlockData<Real> *rc)
     cerr << endl << endl;
 }
 
+// Debug aid: print all ten primitives of the zone at index (0,11,11) from host copies, space-separated
+inline void PrintZone(MeshBlockData<Real> *rc)
+{
+    auto rhop = rc->Get("prims.rho").data.GetHostMirrorAndCopy();
+    auto up = rc->Get("prims.u").data.GetHostMirrorAndCopy();
+    auto uvecp = rc->Get("prims.uvec").data.GetHostMirrorAndCopy();
+    auto Bp = rc->Get("prims.B").data.GetHostMirrorAndCopy();
+    auto q = rc->Get("prims.q").data.GetHostMirrorAndCopy();
+    auto dP = rc->Get("prims.dP").data.GetHostMirrorAndCopy();
+    cerr << rhop(0,11,11) << " " << up(0,11,11) << " " << uvecp(0, 0,11,11) << " " << uvecp(1, 0,11,11) << " "
+         << uvecp(2, 0,11,11) << " " << Bp(0, 0,11,11) << " " << Bp(1, 0,11,11) << " " << Bp(2, 0,11,11) << " "
+         << q(0,11,11) << " " << dP(0,11,11) << endl;
+}
+
 inline void Flag(std::string label)
 {
 #pragma omp critical
@@ -198,6 +213,7 @@ inline void Flag(MeshBlockData<Real> *rc, std::string label)
 {
     if(MPIRank0()) std::cerr << label << std::endl;
     if(PRINTCORNERS) PrintCorner(rc);
+    if(PRINTZONE) PrintZone(rc);
 }
 }
 
@@ -206,9 +222,10 @@ inline void Flag(MeshData<Real> *md, std::string label)
 #pragma omp critical
 {
     if(MPIRank0()) std::cerr << label << std::endl;
-    if(PRINTCORNERS) {
+    if(PRINTCORNERS || PRINTZONE) {
         auto rc = md->GetBlockData(0).get();
-        PrintCorner(rc);
+        if(PRINTCORNERS) PrintCorner(rc);
+        if(PRINTZONE) PrintZone(rc);
     }
 }
 }
diff --git a/pars/bondi.par b/pars/bondi.par
index 3341de54..3b915149 100644
--- a/pars/bondi.par
+++ b/pars/bondi.par
@@ -35,6 +35,7 @@ tlim = 50.0
 cfl = 0.9
 gamma = 1.666667
 reconstruction = weno5
+implicit = false
 
 <bondi>
 mdot = 1.0
@@ -57,7 +58,6 @@ verbose = 0
 
 <driver>
 type = harm
-step = explicit
 
 <implicit>
 max_nonlinear_iter = 3
diff --git a/pars/emhdmodes.par b/pars/emhdmodes.par
index f20ed348..3a2d16e8 100644
--- a/pars/emhdmodes.par
+++ b/pars/emhdmodes.par
@@ -28,7 +28,7 @@ ix3_bc = periodic
 ox3_bc = periodic
 
 <parthenon/meshblock>
-nx1 = 64
+nx1 = 128
 nx2 = 64
 nx3 = 1
 
@@ -38,21 +38,27 @@ transform = null
 
 <parthenon/time>
 tlim = 2.0
+nlim = -1
 # "RK2" is the only option for implicit solver
 integrator = rk2
-dt_min = 0.0001
+use_dt_light = true
 
 <GRMHD>
-cfl = 0.5
+cfl = 0.9
 gamma = 1.333333
 reconstruction = linear_mc
 
+# Default is implicit B,
+# use this to specify explicit if desired
+<b_field>
+implicit = true
+initial_clean = false
+
 <emhdmodes>
 amp = 1e-4
 
 <floors>
-#disable_floors = true
-rho_min_geom=1e-6
+disable_floors = true
 
 <debug>
 verbose = 1
@@ -69,18 +75,15 @@ tau = 1.0
 conduction_alpha = 1.0
 viscosity_alpha = 1.0
 
-<driver>
-type = imex
-step = implicit
-
 <implicit>
 max_nonlinear_iter = 3
 
 <parthenon/output0>
 file_type = hdf5
-# This is so as to output only the final state
+# Output only final state
 dt = 100.0
-single_precision_output = true
+# Output in double due to low amplitude
+single_precision_output = false
 variables = prims.rho, prims.u, prims.uvec, prims.B, prims.q, prims.dP
 
 <parthenon/output1>
diff --git a/pars/mhdmodes.par b/pars/mhdmodes.par
index 83e22dd4..1674579f 100644
--- a/pars/mhdmodes.par
+++ b/pars/mhdmodes.par
@@ -45,6 +45,10 @@ dt_min = 0.0001
 cfl = 0.9
 gamma = 1.333333
 reconstruction = weno5
+implicit = false
+
+<b_field>
+implicit = false
 
 <mhdmodes>
 nmode = 1
@@ -58,10 +62,6 @@ verbose = 0
 
 <driver>
 type = harm
-step = explicit
-
-<implicit>
-max_nonlinear_iter = 3
 
 <parthenon/output0>
 file_type = hdf5
diff --git a/pars/mhdmodes_emhd.par b/pars/mhdmodes_emhd.par
deleted file mode 100644
index fcd2a285..00000000
--- a/pars/mhdmodes_emhd.par
+++ /dev/null
@@ -1,88 +0,0 @@
-# GRMHD Modes problem
-# Try to propagate several analytically-amenable linear modes of the MHD equations
-
-<parthenon/job>
-problem_id = mhdmodes
-
-<parthenon/mesh>
-refinement = none
-numlevel = 1
-
-nx1 = 64
-x1min = 0.0
-x1max = 1.0
-ix1_bc = periodic
-ox1_bc = periodic
-
-nx2 = 64
-x2min = 0.0
-x2max = 1.0
-ix2_bc = periodic
-ox2_bc = periodic
-
-nx3 = 64
-x3min = 0.0
-x3max = 1.0
-ix3_bc = periodic
-ox3_bc = periodic
-
-<parthenon/meshblock>
-nx1 = 32
-nx2 = 32
-nx3 = 32
-
-<coordinates>
-base = cartesian_minkowski
-transform = null
-
-<parthenon/time>
-# tlim will be overridden depending on the problem
-tlim = 5.0
-integrator = rk2
-dt_min = 0.0001
-
-<GRMHD>
-cfl = 0.9
-gamma = 1.333333
-reconstruction = weno5
-
-<mhdmodes>
-nmode = 1
-dir = 0
-
-<floors>
-disable_floors = true
-
-<debug>
-verbose = 1
-flag_verbose = 1
-extra_checks = 1
-
-<driver>
-type = imex
-step = implicit
-
-# This block must be present and values filled
-# in all EGRMHD simulations
-<emhd>
-on = true
-closure_type = soundspeed
-tau = 1.0
-conduction_alpha = 1.0
-viscosity_alpha = 1.0
-
-<perf>
-pack_comms = false
-
-<parthenon/output0>
-file_type = hdf5
-# This is so as to output only the final state
-dt = 1.0
-single_precision_output = true
-variables = prims.rho, prims.u, prims.uvec, prims.B
-ghost_zones = true
-
-<parthenon/output1>
-file_type = hst
-dt = 0.1
-
diff --git a/pars/mhdmodes_implicit.par b/pars/mhdmodes_implicit.par
index d5b4fd19..04d3bd7e 100644
--- a/pars/mhdmodes_implicit.par
+++ b/pars/mhdmodes_implicit.par
@@ -1,5 +1,8 @@
 # GRMHD Modes problem
-# Try to propagate several analytically-amenable linear modes of the MHD equations
+# This should produce identical output to mhdmodes.par,
+# but uses a semi-implicit solve for stepping forward
+# the fluid variables.
+# The magnetic field is still evolved explicitly by default.
 
 <parthenon/job>
 problem_id = mhdmodes
@@ -39,12 +42,14 @@ transform = null
 # tlim will be overridden depending on the problem
 tlim = 5.0
 integrator = rk2
-dt_min = 0.0001
+dt_min = 0.00001
+use_dt_light = true
 
 <GRMHD>
 cfl = 0.9
 gamma = 1.333333
-reconstruction = weno5
+reconstruction = linear_mc
+implicit = true
 
 <mhdmodes>
 nmode = 1
@@ -60,15 +65,15 @@ extra_checks = 1
 
 <driver>
 type = imex
-step = implicit
 
-<perf>
-pack_comms = false
+<b_field>
+solver = flux_ct
+implicit = false
 
 <parthenon/output0>
 file_type = hdf5
 # This is so as to output only the final state
-dt = 1.0
+dt = 0.1
 single_precision_output = true
 variables = prims.rho, prims.u, prims.uvec, prims.B
 ghost_zones = true
@@ -76,4 +81,3 @@ ghost_zones = true
 <parthenon/output1>
 file_type = hst
 dt = 0.1
-
diff --git a/scripts/compare.py b/scripts/compare.py
index 9e1c84e7..7a323a99 100644
--- a/scripts/compare.py
+++ b/scripts/compare.py
@@ -36,9 +36,9 @@
 dump2file = sys.argv[2]
 imname = sys.argv[3]
 
-dump1 = pyharm.load_dump(dump1file, add_ghosts=GHOSTS)
+dump1 = pyharm.load_dump(dump1file, ghost_zones=GHOSTS)
 #Hopefully this fails for dumps that shouldn't be compared
-dump2 = pyharm.load_dump(dump2file, add_ghosts=GHOSTS)
+dump2 = pyharm.load_dump(dump2file, ghost_zones=GHOSTS)
 
 N1 = dump1['n1']; N2 = dump1['n2']; N3 = dump1['n3']
 
diff --git a/tests/bondi/check.sh b/tests/bondi/check.sh
index 85304def..1988343f 100755
--- a/tests/bondi/check.sh
+++ b/tests/bondi/check.sh
@@ -13,7 +13,7 @@ python check.py $res "in 2D, MKS coordinates" mks || fail=1
 python check.py $res "in 2D, linear recon with MC limiter" linear_mc || fail=1
 python check.py $res "in 2D, linear recon with VL limiter" linear_vl || fail=1
 
-python check.py $res "in 2D, with Imex driver" imex || fail=1
-python check.py $res "in 2D, with implicit stepping" imex_im || fail=1
+#python check.py $res "in 2D, with Imex driver" imex || fail=1
+#python check.py $res "in 2D, with implicit stepping" imex_im || fail=1
 
 exit $fail
diff --git a/tests/bondi/run.sh b/tests/bondi/run.sh
index ac5f4435..95037bc0 100755
--- a/tests/bondi/run.sh
+++ b/tests/bondi/run.sh
@@ -25,5 +25,6 @@ conv_2d eks coordinates/transform=eks
 conv_2d linear_mc GRMHD/reconstruction=linear_mc
 conv_2d linear_vl GRMHD/reconstruction=linear_vl
 # And the GRIM/classic driver
-conv_2d imex driver/type=imex
-conv_2d imex_im "driver/type=imex driver/step=implicit"
+# TODO support implicit w/o B field
+#conv_2d imex "driver/type=imex"
+#conv_2d imex_im "driver/type=imex GRMHD/implicit=true"
diff --git a/tests/emhdmodes/check.py b/tests/emhdmodes/check.py
index c076481f..549db8f9 100644
--- a/tests/emhdmodes/check.py
+++ b/tests/emhdmodes/check.py
@@ -5,12 +5,14 @@
 import matplotlib.pyplot as plt
 import matplotlib as mpl
 
+from pyharm.grid import make_some_grid
+
 if __name__=='__main__':
     outputdir = './'
 
     NVAR = 10
     VARS = ['rho', 'u', 'u1', 'u2', 'u3', 'B1', 'B2', 'B3', 'q', 'deltaP']
-    RES = [16,24,32,48]
+    RES = [int(r) for r in sys.argv[1].split(",")]
 
     # problem params
     var0 = np.zeros(NVAR)
@@ -45,16 +47,15 @@
     # loop over RES
     for r in range(len(RES)):
         # load data
-        dfile = h5py.File(sorted(glob.glob(os.path.join(str(RES[r]), 'dumps', 'dump_000000*.h5')))[-1], 'r')
-        gfile = h5py.File(os.path.join(str(RES[r]), 'dumps', 'grid.h5'), 'r')
+        dfile = h5py.File("emhd_2d_"+str(RES[r])+"_end_"+sys.argv[3]+".h5", 'r')
 
         dump = {}
 
-        amp = dfile['header/problem/amp'][()]
+        amp = float(dfile['header/amp'][()])
         k1  = 2*np.pi
         k2  = 4*np.pi
-        real_omega  = dfile['header/problem/real_omega'][()]
-        imag_omega  = dfile['header/problem/imag_omega'][()]
+        real_omega  = dfile['header/omega_real'][()]
+        imag_omega  = dfile['header/omega_imag'][()]
         t = dfile['t'][()]
 
         dump['RHO'] = dfile['prims'][Ellipsis,0][()]
@@ -68,27 +69,26 @@
         dump['q'] = dfile['prims'][Ellipsis,8][()]
         dump['deltaP'] = dfile['prims'][Ellipsis,9][()]
 
-        grid = {}
-        grid['x'] = gfile['X'][()]
-        grid['y'] = gfile['Y'][()]
+        gridp = {}
+        gridp['n1'] = dfile['header/n1'][()]
+        gridp['n2'] = dfile['header/n2'][()]
+        gridp['n3'] = dfile['header/n3'][()]
+
+        grid = make_some_grid('cartesian', gridp['n1'], gridp['n2'], gridp['n3'])
         cos_phi = np.cos(k1*grid['x'] + k2*grid['y'] + imag_omega*t)
         sin_phi = np.sin(k1*grid['x'] + k2*grid['y'] + imag_omega*t)
 
-        grid['n1'] = dfile['header/n1'][()]
-        grid['n2'] = dfile['header/n2'][()]
-        grid['n3'] = dfile['header/n3'][()]
-
-        gfile.close()
         dfile.close()
 
         # compute analytic result
         var_analytic  = []
-        for i in range(NVAR):    
+        for i in range(NVAR):
             var_analytic.append(var0[i] + ((amp*cos_phi*dvar_cos[i]) + (amp*sin_phi*dvar_sin[i])) * np.exp(real_omega*t))
         var_analytic = np.asarray(var_analytic)
 
         # numerical result
-        var_numerical = np.zeros((NVAR, grid['n1'], grid['n2'], grid['n3']), dtype=float)
+        # TODO 3D, but will need different coeffs too
+        var_numerical = np.zeros((NVAR, grid['n1'], grid['n2']), dtype=float)
         var_numerical[0,Ellipsis] = dump['RHO'] 
         var_numerical[1,Ellipsis] = dump['U'] 
         var_numerical[2,Ellipsis] = dump['U1'] 
@@ -123,6 +123,8 @@
     fig = plt.figure(figsize=(6,6))
     ax = fig.add_subplot(1,1,1)
 
+    fig.suptitle(sys.argv[2])
+
     # loop over prims
     tracker = 0
     for n in range(NVAR):
@@ -134,4 +136,4 @@
     ax.loglog([RES[0], RES[-1]], 100*amp*np.asarray([float(RES[0]), float(RES[-1])])**(-2), color='k', linestyle='dashed', label='$N^{-2}$')
     plt.xscale('log', base=2)
     ax.legend()
-    plt.savefig(os.path.join(outputdir, 'emhd_linear_mode_convergence.png'))
+    plt.savefig(os.path.join(outputdir, "emhd_linear_mode_convergence_"+sys.argv[3]+".png"))
diff --git a/tests/emhdmodes/check.sh b/tests/emhdmodes/check.sh
index c2ba3a77..caea4c53 100755
--- a/tests/emhdmodes/check.sh
+++ b/tests/emhdmodes/check.sh
@@ -5,14 +5,15 @@
 . ~/libs/anaconda3/etc/profile.d/conda.sh
 conda activate pyharm
 
-pyharm-convert *.phdf
+# Very small amplitude by default, preserve double precision
+pyharm-convert --double *.phdf
 
-RES3D="16,24,32,48"
-RES2D="16,24,32,48"
+RES2D="32,64,128,256"
 
 fail=0
 python3 check.py $RES2D "EMHD mode in 2D, WENO5" emhd2d_weno 2d || fail=1
 python3 check.py $RES2D "EMHD mode in 2D, linear/MC reconstruction" emhd2d_mc 2d || fail=1
-python3 check.py $RES2D "EMHD mode in 2D, linear/VL reconstruction" emhd2d_vl 2d || fail=1
+
+python3 check.py $RES2D "EMHD mode in 2D, higher order terms enabled" emhd2d_higher_order || fail=1
 
 exit $fail
diff --git a/tests/emhdmodes/run.sh b/tests/emhdmodes/run.sh
index 39f10922..d8ef243e 100755
--- a/tests/emhdmodes/run.sh
+++ b/tests/emhdmodes/run.sh
@@ -13,13 +13,14 @@ conv_2d() {
       $BASE/run.sh -i $BASE/pars/emhdmodes.par debug/verbose=1 \
                       parthenon/mesh/nx1=$res parthenon/mesh/nx2=$res parthenon/mesh/nx3=1 \
                       parthenon/meshblock/nx1=$half parthenon/meshblock/nx2=$half parthenon/meshblock/nx3=1 $2
-        mv mhdmodes.out0.00000.phdf mhd_2d_${res}_start_${1}.phdf
-        mv mhdmodes.out0.final.phdf mhd_2d_${res}_end_${1}.phdf
+        mv emhdmodes.out0.00000.phdf emhd_2d_${res}_start_${1}.phdf
+        mv emhdmodes.out0.final.phdf emhd_2d_${res}_end_${1}.phdf
     done
 }
 
 # 2D modes use small blocks, could pick up some problems at MPI ranks >> 1
 # Just one default mode
-conv_2d emhd2d_vl "GRMHD/reconstruction=linear_vl"
 conv_2d emhd2d_mc "GRMHD/reconstruction=linear_mc"
 conv_2d emhd2d_weno "GRMHD/reconstruction=weno5"
+# Test that higher-order terms don't mess anything up
+conv_2d emhd2d_higher_order "emhd/higher_order_terms=true"
diff --git a/tests/mhdmodes/check.sh b/tests/mhdmodes/check.sh
index 67cdfd61..ee959ea2 100755
--- a/tests/mhdmodes/check.sh
+++ b/tests/mhdmodes/check.sh
@@ -17,14 +17,19 @@ python3 check.py $RES3D "fast mode in 3D" fast || fail=1
 python3 check.py $RES3D "entropy mode in 3D, linear/MC reconstruction" entropy_mc || fail=1
 python3 check.py $RES3D "entropy mode in 3D, linear/VL reconstruction" entropy_vl || fail=1
 
-python3 check.py $RES3D "slow mode in 3D, classic algo" slow_imex || fail=1
-python3 check.py $RES3D "Alfven mode in 3D, classic algo" alfven_imex || fail=1
-python3 check.py $RES3D "fast mode in 3D, classic algo" fast_imex || fail=1
+python3 check.py $RES3D "slow mode 3D, ImEx Explicit" slow_imex || fail=1
+python3 check.py $RES3D "Alfven mode 3D, ImEx Explicit" alfven_imex || fail=1
+python3 check.py $RES3D "fast mode 3D, ImEx Explicit" fast_imex || fail=1
 
-python3 check.py $RES3D "slow mode in 3D, classic algo" slow_imex_im || fail=1
-python3 check.py $RES3D "Alfven mode in 3D, classic algo" alfven_imex_im || fail=1
-python3 check.py $RES3D "fast mode in 3D, classic algo" fast_imex_im || fail=1
+#python3 check.py $RES3D "slow mode in 3D, ImEx Semi-Implicit" slow_imex_semi || fail=1
+#python3 check.py $RES3D "Alfven mode in 3D, ImEx Semi-Implicit" alfven_imex_semi || fail=1
+#python3 check.py $RES3D "fast mode in 3D, ImEx Semi-Implicit" fast_imex_semi || fail=1
 
+python3 check.py $RES3D "slow mode in 3D, ImEx Implicit" slow_imex_im || fail=1
+python3 check.py $RES3D "Alfven mode in 3D, ImEx Implicit" alfven_imex_im || fail=1
+python3 check.py $RES3D "fast mode in 3D, ImEx Implicit" fast_imex_im || fail=1
+
+# 2D MODES
 #python3 check.py $RES2D "fast mode in 2D, WENO5" fast2d 2d || fail=1
 #python3 check.py $RES2D "fast mode in 2D, linear/MC reconstruction" fast_mc 2d || fail=1
 #python3 check.py $RES2D "fast mode in 2D, linear/VL reconstruction" fast_vl 2d || fail=1
diff --git a/tests/mhdmodes/run.sh b/tests/mhdmodes/run.sh
index 20aee459..4015d5cc 100755
--- a/tests/mhdmodes/run.sh
+++ b/tests/mhdmodes/run.sh
@@ -58,10 +58,14 @@ conv_3d fast mhdmodes/nmode=3
 conv_3d slow_imex   "mhdmodes/nmode=1 driver/type=imex"
 conv_3d alfven_imex "mhdmodes/nmode=2 driver/type=imex"
 conv_3d fast_imex   "mhdmodes/nmode=3 driver/type=imex"
-# And the implicit solver
-conv_3d slow_imex_im   "mhdmodes/nmode=1 driver/type=imex driver/step=implicit implicit/max_nonlinear_iter=3"
-conv_3d alfven_imex_im "mhdmodes/nmode=2 driver/type=imex driver/step=implicit implicit/max_nonlinear_iter=3"
-conv_3d fast_imex_im   "mhdmodes/nmode=3 driver/type=imex driver/step=implicit implicit/max_nonlinear_iter=3"
+# And the semi-implicit solver
+#conv_3d slow_imex_semi   "mhdmodes/nmode=1 driver/type=imex GRMHD/implicit=true"
+#conv_3d alfven_imex_semi "mhdmodes/nmode=2 driver/type=imex GRMHD/implicit=true"
+#conv_3d fast_imex_semi   "mhdmodes/nmode=3 driver/type=imex GRMHD/implicit=true"
+# And the fully-implicit solver
+conv_3d slow_imex_im   "mhdmodes/nmode=1 driver/type=imex GRMHD/implicit=true b_field/implicit=true"
+conv_3d alfven_imex_im "mhdmodes/nmode=2 driver/type=imex GRMHD/implicit=true b_field/implicit=true"
+conv_3d fast_imex_im   "mhdmodes/nmode=3 driver/type=imex GRMHD/implicit=true b_field/implicit=true"
 
 # 2D modes use small blocks, could pick up some problems at MPI ranks >> 1
 # Currently very slow, plus modes are incorrect
diff --git a/tests/tilt_init/check.py b/tests/tilt_init/check.py
index e9771e08..5766a2f1 100755
--- a/tests/tilt_init/check.py
+++ b/tests/tilt_init/check.py
@@ -10,7 +10,7 @@
 import pyharm.plots.plot_dumps as hplt
 
 dumpname = "torus.out0.00000.phdf"
-dump = pyharm.load_dump(dumpname, calc_derived=True)
+dump = pyharm.load_dump(dumpname)
 fig, ax = plt.subplots(1,1,figsize=(7,7))
 hplt.plot_xz(ax, dump, 'log_beta', window=[-200,200,-200,200])
 plt.savefig(dumpname+"_beta.png")

From 455c2041795e0ffef827fcbd28297c3e94c2b30a Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Mon, 21 Mar 2022 13:13:05 -0500
Subject: [PATCH 17/26] Print and allow checking convergence on abs(divB) when
 cleaning B.  Much higher SOR factor, now converges fairly quickly.

---
 kharma/b_cleanup/b_cleanup.cpp  | 73 +++++++++++++++++++++++++++------
 kharma/b_cleanup/b_cleanup.hpp  |  3 +-
 kharma/flux.cpp                 |  4 +-
 kharma/prob/post_initialize.cpp | 23 +++++++----
 kharma/prob/resize.hpp          |  2 +-
 kharma/types.hpp                | 23 ++++++-----
 pars/resize_restart.par         |  9 ++--
 7 files changed, 98 insertions(+), 39 deletions(-)

diff --git a/kharma/b_cleanup/b_cleanup.cpp b/kharma/b_cleanup/b_cleanup.cpp
index 2e1f277f..94fa129d 100644
--- a/kharma/b_cleanup/b_cleanup.cpp
+++ b/kharma/b_cleanup/b_cleanup.cpp
@@ -63,14 +63,19 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     params.Add("extra_checks", extra_checks);
 
     // Solver options
-    // This tolerance corresponds to divB_max ~ 1e-12. TODO use that as the indicator?
-    Real error_tolerance = pin->GetOrAddReal("b_cleanup", "error_tolerance", 1e-10);
-    params.Add("error_tolerance", error_tolerance);
-    Real sor_factor = pin->GetOrAddReal("b_cleanup", "sor_factor", 2./3);
+    // Allow setting tolerance relative to starting value.  Off by default
+    Real rel_tolerance = pin->GetOrAddReal("b_cleanup", "rel_tolerance", 1.);
+    params.Add("rel_tolerance", rel_tolerance);
+    // Instead set absolute tolerance corresponding roughly to max divB on grid
+    // Note this returns divB max about 1 decade greater, i.e. ~1e-14
+    Real abs_tolerance = pin->GetOrAddReal("b_cleanup", "abs_tolerance", 1e-15);
+    params.Add("abs_tolerance", abs_tolerance);
+    // TODO why does this need to be so large?
+    Real sor_factor = pin->GetOrAddReal("b_cleanup", "sor_factor", 200);
     params.Add("sor_factor", sor_factor);
     int max_iterations = pin->GetOrAddInteger("b_cleanup", "max_iterations", 1e8);
     params.Add("max_iterations", max_iterations);
-    int check_interval = pin->GetOrAddInteger("b_cleanup", "check_interval", 1e4);
+    int check_interval = pin->GetOrAddInteger("b_cleanup", "check_interval", 1e3);
     params.Add("check_interval", check_interval);
     bool fail_without_convergence = pin->GetOrAddBoolean("b_cleanup", "fail_without_convergence", true);
     params.Add("fail_without_convergence", fail_without_convergence);
@@ -159,7 +164,8 @@ void CleanupDivergence(std::shared_ptr<MeshData<Real>>& md)
     auto pkg = md->GetMeshPointer()->packages.Get("B_Cleanup");
     auto max_iters = pkg->Param<int>("max_iterations");
     auto check_interval = pkg->Param<int>("check_interval");
-    auto error_tolerance = pkg->Param<Real>("error_tolerance");
+    auto rel_tolerance = pkg->Param<Real>("rel_tolerance");
+    auto abs_tolerance = pkg->Param<Real>("abs_tolerance");
     auto fail_flag = pkg->Param<bool>("fail_without_convergence");
     auto warn_flag = pkg->Param<bool>("warn_without_convergence");
     auto verbose = pkg->Param<int>("verbose");
@@ -199,10 +205,16 @@ void CleanupDivergence(std::shared_ptr<MeshData<Real>>& md)
         cell_centered_bvars::SendBoundaryBuffers(md);
         cell_centered_bvars::ReceiveBoundaryBuffers(md);
         cell_centered_bvars::SetBoundaries(md);
-
         md.get()->ClearBoundary(BoundaryCommSubset::all);
 
+        // And set physical boundaries
+        for (auto &pmb : md->GetMeshPointer()->block_list) {
+            auto& rc = pmb->meshblock_data.Get();
+            parthenon::ApplyBoundaryConditions(rc);
+        }
+
         if (iter % check_interval == 0) {
+            Flag("Iteration:");
             // Calculate the new norm & relative error in eliminating divB
             update_norm.val = 0.;
             B_Cleanup::SumError(md.get(), update_norm.val);
@@ -212,14 +224,18 @@ void CleanupDivergence(std::shared_ptr<MeshData<Real>>& md)
             // B_Cleanup::SumP(md.get(), P_norm.val);
             // P_norm.StartReduce(MPI_SUM);
             // P_norm.CheckReduce();
+            divB_max.val = 0.;
+            MaxError(md.get(), divB_max.val);
+            divB_max.StartReduce(MPI_MAX);
+            divB_max.CheckReduce();
             if (MPIRank0() && verbose > 0) {
-                std::cout << "divB step " << iter << " error is "
-                        << update_norm.val / divB_norm.val << std::endl;
+                std::cout << "divB step " << iter << " total relative error is " << update_norm.val / divB_norm.val
+                        << " Max absolute error is " << divB_max.val << std::endl;
                 // std::cout << "P norm is " << P_norm.val << std::endl;
             }
 
             // Both these values are already MPI reduced, but we want to make sure
-            converged = (update_norm.val / divB_norm.val) < error_tolerance;
+            converged = ((update_norm.val / divB_norm.val) < rel_tolerance) && (divB_max.val < abs_tolerance);
             converged = MPIMin(converged);
         }
 
@@ -308,7 +324,7 @@ TaskStatus InitP(MeshData<Real> *md)
 
 TaskStatus UpdateP(MeshData<Real> *md)
 {
-    Flag(md, "Updating P");
+    //Flag(md, "Updating P");
     auto pmesh = md->GetParentPointer();
     const int ndim = pmesh->ndim;
     const IndexRange ib = md->GetBoundsI(IndexDomain::interior);
@@ -331,8 +347,8 @@ TaskStatus UpdateP(MeshData<Real> *md)
     auto dB = md->PackVariables(std::vector<std::string>{"dB"});
     auto divB = md->PackVariables(std::vector<std::string>{"divB"});
 
-    // TODO Damped Jacobi takes a *lot* of iterations for anything bigger than a toy problem.
-    // We probably need CG
+    // TODO Multigrid for faster than ~N^2 convergence
+    // TODO don't sync all boundaries here, we only need p
 
     // dB = grad(p), defined at cell centers
     // Need a halo one zone *left*, as corner_div will read that.
@@ -401,6 +417,37 @@ TaskStatus SumError(MeshData<Real> *md, Real& reduce_sum)
     return TaskStatus::complete;
 }
 
+TaskStatus MaxError(MeshData<Real> *md, Real& reduce_max)
+{
+    Flag(md, "Max new divB");
+    auto pm = md->GetParentPointer();
+    IndexRange ib = md->GetBoundsI(IndexDomain::interior);
+    IndexRange jb = md->GetBoundsJ(IndexDomain::interior);
+    IndexRange kb = md->GetBoundsK(IndexDomain::interior);
+    auto pmb0 = md->GetBlockData(0)->GetBlockPointer();
+
+    // Get variables
+    auto lap = md->PackVariables(std::vector<std::string>{"lap"});
+    auto divB = md->PackVariables(std::vector<std::string>{"divB"});
+
+    // TODO this can be done as
+    // 1. (K*lap - divB) as here
+    // 2. (div of (B - dB)), simulating the actual result
+    // The latter would require a full/scratch vector temporary, and
+    // setting FillGhost on dB, but the sync is in the right spot
+    Real err_max;
+    pmb0->par_reduce("SumError", 0, lap.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        KOKKOS_LAMBDA_MESH_3D_REDUCE {
+            const double new_err = abs(lap(b, 0, k, j, i) - divB(b, 0, k, j, i));
+            if (new_err > local_result) local_result = new_err;
+        }
+    , Kokkos::Max<Real>(err_max));
+
+    // Parthenon/caller will take care of MPI reduction
+    reduce_max += err_max;
+    return TaskStatus::complete;
+}
+
 TaskStatus SumP(MeshData<Real> *md, Real& reduce_sum)
 {
     Flag(md, "Summing P");
diff --git a/kharma/b_cleanup/b_cleanup.hpp b/kharma/b_cleanup/b_cleanup.hpp
index df7c84d9..e032514c 100644
--- a/kharma/b_cleanup/b_cleanup.hpp
+++ b/kharma/b_cleanup/b_cleanup.hpp
@@ -72,10 +72,11 @@ TaskStatus InitP(MeshData<Real> *md);
 TaskStatus UpdateP(MeshData<Real> *md);
 
 /**
- * Sum the remaining error, that is, the difference del^2 p - divB
+ * Functions to calculate the remaining error, that is, the difference del^2 p - divB
  */
 TaskStatus SumError(MeshData<Real> *du, Real& reduce_sum);
 TaskStatus SumP(MeshData<Real> *md, Real& reduce_sum);
+TaskStatus MaxError(MeshData<Real> *md, Real& reduce_max);
 
 /**
  * Apply B -= grad(P) to subtract divergence from the magnetic field
diff --git a/kharma/flux.cpp b/kharma/flux.cpp
index b1fbc857..568a7c3c 100644
--- a/kharma/flux.cpp
+++ b/kharma/flux.cpp
@@ -43,7 +43,7 @@ using namespace parthenon;
 
 TaskStatus Flux::PtoU(MeshBlockData<Real> *rc, IndexDomain domain)
 {
-    Flag(rc, "Getting conserved fluxes");
+    Flag(rc, "Getting conserved variables");
     // Pointers
     auto pmb = rc->GetBlockPointer();
     // Options
@@ -114,6 +114,6 @@ TaskStatus Flux::PtoU(MeshBlockData<Real> *rc, IndexDomain domain)
         }
     );
 
-    Flag(rc, "Got conserved fluxes");
+    Flag(rc, "Got conserved variables");
     return TaskStatus::complete;
 }
diff --git a/kharma/prob/post_initialize.cpp b/kharma/prob/post_initialize.cpp
index d4a92ef3..562af010 100644
--- a/kharma/prob/post_initialize.cpp
+++ b/kharma/prob/post_initialize.cpp
@@ -52,18 +52,22 @@
 void SyncAllBounds(ParameterInput *pin, Mesh *pmesh)
 {
 
+    // TODO this does syncs per-block.  Correctly afaict,
+    // but they could be done more simply & efficiently per-mesh
+    Flag("Syncing all bounds");
+
     if (pin->GetString("driver", "type") == "imex") {
         // If we're syncing the primitive vars, we just sync
         for (auto &pmb : pmesh->block_list) {
             auto& rc = pmb->meshblock_data.Get();
-            rc->ClearBoundary(BoundaryCommSubset::mesh_init);
-            rc->StartReceiving(BoundaryCommSubset::mesh_init);
+            //rc->ClearBoundary(BoundaryCommSubset::all);
+            rc->StartReceiving(BoundaryCommSubset::all);
             rc->SendBoundaryBuffers();
         }
         for (auto &pmb : pmesh->block_list) {
             auto& rc = pmb->meshblock_data.Get();
             rc->ReceiveAndSetBoundariesWithWait();
-            rc->ClearBoundary(BoundaryCommSubset::mesh_init);
+            rc->ClearBoundary(BoundaryCommSubset::all);
             // TODO if amr...
             //pmb->pbval->ProlongateBoundaries();
 
@@ -85,31 +89,34 @@ void SyncAllBounds(ParameterInput *pin, Mesh *pmesh)
 
         for (auto &pmb : pmesh->block_list) {
             auto& rc = pmb->meshblock_data.Get();
-            rc->ClearBoundary(BoundaryCommSubset::mesh_init);
-            rc->StartReceiving(BoundaryCommSubset::mesh_init);
+            Flag("Block sync send");
+            //rc->ClearBoundary(BoundaryCommSubset::all);
+            rc->StartReceiving(BoundaryCommSubset::all);
             rc->SendBoundaryBuffers();
         }
 
         for (auto &pmb : pmesh->block_list) {
             auto& rc = pmb->meshblock_data.Get();
+            Flag("Block sync receive");
             rc->ReceiveAndSetBoundariesWithWait();
-            rc->ClearBoundary(BoundaryCommSubset::mesh_init);
+            rc->ClearBoundary(BoundaryCommSubset::all);
             // TODO if amr...
             //pmb->pbval->ProlongateBoundaries();
 
+            Flag("Fill Derived");
             // Fill P again, including ghost zones
             parthenon::Update::FillDerived(rc.get());
 
+            Flag("Physical bounds");
             // Physical boundary conditions
             parthenon::ApplyBoundaryConditions(rc);
         }
     }
+    Flag("Sync'd");
 }
 
 void KHARMA::SeedAndNormalizeB(ParameterInput *pin, Mesh *pmesh)
 {
-
-
     // Check which solver we'll be using
     const bool use_b_flux_ct = pmesh->packages.AllPackages().count("B_FluxCT");
     const bool use_b_cd = pmesh->packages.AllPackages().count("B_CD");
diff --git a/kharma/prob/resize.hpp b/kharma/prob/resize.hpp
index 2b0a7e1c..61493292 100644
--- a/kharma/prob/resize.hpp
+++ b/kharma/prob/resize.hpp
@@ -47,7 +47,7 @@
  * divergence, see b_flux_ct for that (as it is divergence-rep dependent)
  */
 
-/*
+/**
  *  translates geodesic coordinates to a grid zone and returns offset
  *  for interpolation purposes. integer index corresponds to the zone
  *  center "below" the desired point and del[i] \in [0,1) returns the
diff --git a/kharma/types.hpp b/kharma/types.hpp
index a7e81f0a..a724755e 100644
--- a/kharma/types.hpp
+++ b/kharma/types.hpp
@@ -168,22 +168,23 @@ inline void PrintCorner(MeshBlockData<Real> *rc)
     auto uc = rc->Get("cons.u").data.GetHostMirrorAndCopy();
     auto uvecc = rc->Get("cons.uvec").data.GetHostMirrorAndCopy();
     auto Bu = rc->Get("cons.B").data.GetHostMirrorAndCopy();
-    auto q = rc->Get("prims.q").data.GetHostMirrorAndCopy();
-    auto dP = rc->Get("prims.dP").data.GetHostMirrorAndCopy();
-    cerr << "q:";
+    auto p = rc->Get("p").data.GetHostMirrorAndCopy();
+    //auto q = rc->Get("prims.q").data.GetHostMirrorAndCopy();
+    //auto dP = rc->Get("prims.dP").data.GetHostMirrorAndCopy();
+    cerr << "p:" << endl;
     for (int j=0; j<8; j++) {
         cout << endl;
         for (int i=0; i<8; i++) {
-            fprintf(stderr, "%.5g\t", q(0, j, i));
-        }
-    }
-    cerr << endl << "dP:";
-    for (int j=0; j<8; j++) {
-        cerr << endl;
-        for (int i=0; i<8; i++) {
-            fprintf(stderr, "%.5g\t", dP(0, j, i));
+            fprintf(stderr, "%.5g\t", p(0, j, i));
         }
     }
+    // cerr << endl << "dP:";
+    // for (int j=0; j<8; j++) {
+    //     cerr << endl;
+    //     for (int i=0; i<8; i++) {
+    //         fprintf(stderr, "%.5g\t", dP(0, j, i));
+    //     }
+    // }
     cerr << endl << endl;
 }
 
diff --git a/pars/resize_restart.par b/pars/resize_restart.par
index aebdcb59..5837ff27 100644
--- a/pars/resize_restart.par
+++ b/pars/resize_restart.par
@@ -27,6 +27,7 @@ r_out = 1000
 tlim = 1.0
 integrator = rk2
 dt_min = 0.00001
+nlim = 1
 
 <GRMHD>
 cfl = 0.9
@@ -39,9 +40,11 @@ use_tf = false
 use_restart_size = false
 
 <b_cleanup>
-error_tolerance = 1e-7
-check_interval = 100
-sor_factor = 15
+rel_tolerance = 1.
+abs_tolerance = 1e-14
+check_interval = 1000
+# See b_cleanup.cpp
+sor_factor = 200
 
 <floors>
 rho_min_geom = 1e-6

From 52f87875b5c6dde44ca30d68161903b790cccb57 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Mon, 21 Mar 2022 20:31:35 -0500
Subject: [PATCH 18/26] Don't compile all of Kokkos-kernels by default

---
 CMakeLists.txt                         |   8 +-
 kharma/CMakeLists.txt                  |   3 +-
 kharma/implicit/KokkosKernels_config.h | 155 +++++++++++++++++++++++++
 kharma/prob/emhdshock.hpp              |   2 +-
 4 files changed, 163 insertions(+), 5 deletions(-)
 create mode 100644 kharma/implicit/KokkosKernels_config.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 61d48916..74343a3a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -54,11 +54,13 @@ add_subdirectory(external/parthenon)
 include_directories(external/parthenon/src)
 # mpark::variant is header only, don't build anything
 include_directories(external/variant/include)
-# Kokkos kernels
-# Ubelievably, this actually needs to be compiled to use headers
-add_subdirectory(external/kokkos-kernels)
+# Kokkos kernels: don't compile them but import all headers
+# Requires KokkosKernels_config.h shipped with KHARMA, YMMV
+#add_subdirectory(external/kokkos-kernels)
 include_directories(external/kokkos-kernels/src)
 include_directories(external/kokkos-kernels/src/batched)
+include_directories(external/kokkos-kernels/src/batched/dense)
+include_directories(external/kokkos-kernels/src/batched/dense/impl)
 
 # KHARMA folder
 add_subdirectory(kharma)
diff --git a/kharma/CMakeLists.txt b/kharma/CMakeLists.txt
index 9ec7e73b..fdeb25bb 100644
--- a/kharma/CMakeLists.txt
+++ b/kharma/CMakeLists.txt
@@ -46,7 +46,8 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/wind)
 add_executable(${EXE_NAME} ${EXE_NAME_SRC})
 
 target_link_libraries(${EXE_NAME} PUBLIC kokkos)
-target_link_libraries(${EXE_NAME} PUBLIC kokkoskernels)
+# We actually only need the header
+#target_link_libraries(${EXE_NAME} PUBLIC kokkoskernels)
 target_link_libraries(${EXE_NAME} PUBLIC parthenon)
 
 # OPTIONS
diff --git a/kharma/implicit/KokkosKernels_config.h b/kharma/implicit/KokkosKernels_config.h
new file mode 100644
index 00000000..982dd935
--- /dev/null
+++ b/kharma/implicit/KokkosKernels_config.h
@@ -0,0 +1,155 @@
+#ifndef KOKKOSKERNELS_CONFIG_H
+#define KOKKOSKERNELS_CONFIG_H
+
+
+/* Define Fortran mangle from Trilinos macro definition */
+#ifndef F77_BLAS_MANGLE                                                                                       
+# define F77_BLAS_MANGLE                                                                     
+#endif 
+
+/* Define if fortran blas 1 function can return complex type */
+/* #undef KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX */
+
+/* Define if building in debug mode */
+/* #undef HAVE_KOKKOSKERNELS_DEBUG */
+
+/* Define this macro if the quadmath TPL is enabled */
+/* #undef HAVE_KOKKOSKERNELS_QUADMATH */
+
+/* Define this macro if the MKL TPL is enabled.  This is different
+   than just linking against the MKL to get the BLAS and LAPACK; it
+   requires (a) header file(s) as well, and may use functions other
+   than just BLAS and LAPACK functions.  */
+/* #undef HAVE_KOKKOSKERNELS_MKL */
+
+
+/* #undef KOKKOSKERNELS_ENABLE_TESTS_AND_PERFSUITE */
+
+/* Define this macro if experimental features of Kokkoskernels are enabled */
+/* #undef HAVE_KOKKOSKERNELS_EXPERIMENTAL */
+
+/* Define this macro if we have SuperLU API version 5 */
+/* #undef HAVE_KOKKOSKERNELS_SUPERLU5_API */
+
+/* Define this macro to disallow instantiations of kernels which are not covered
+ * by ETI */
+/* #undef KOKKOSKERNELS_ETI_ONLY */
+/* Define this macro to only test ETI types */
+#define KOKKOSKERNELS_TEST_ETI_ONLY
+
+/* Whether to build kernels for execution space Kokkos::Cuda */
+#define KOKKOSKERNELS_INST_EXECSPACE_CUDA
+#define KOKKOSKERNELS_INST_MEMSPACE_CUDASPACE
+#define KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE
+/* Whether to build kernels for execution space Kokkos::Experimental::HIP */
+/* #undef KOKKOSKERNELS_INST_EXECSPACE_HIP */
+/* #undef KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE */
+/* Whether to build kernels for execution space Kokkos::Experimental::SYCL */
+/* #undef KOKKOSKERNELS_INST_EXECSPACE_SYCL */
+/* #undef KOKKOSKERNELS_INST_MEMSPACE_SYCLSPACE */
+/* #undef KOKKOSKERNELS_INST_MEMSPACE_SYCLSHAREDSPACE */
+/* Whether to build kernels for execution space Kokkos::Experimental::OpenMPTarget */
+/* #undef KOKKOSKERNELS_INST_EXECSPACE_OPENMPTARGET */
+/* #undef KOKKOSKERNELS_INST_MEMSPACE_OPENMPTARGETSPACE */
+/* Whether to build kernels for execution space Kokkos::OpenMP */
+#define KOKKOSKERNELS_INST_EXECSPACE_OPENMP
+/* Whether to build kernels for execution space Kokkos::Threads */
+/* #undef KOKKOSKERNELS_INST_EXECSPACE_THREADS */
+/* Whether to build kernels for execution space Kokkos::Serial */
+/* #undef KOKKOSKERNELS_INST_EXECSPACE_SERIAL */
+
+/* Whether to build kernels for memory space Kokkos::HostSpace */
+#define KOKKOSKERNELS_INST_MEMSPACE_HOSTSPACE
+
+
+/* Whether to build kernels for scalar type double */
+#define KOKKOSKERNELS_INST_DOUBLE
+/* Whether to build kernels for scalar type float */
+/* #undef KOKKOSKERNELS_INST_FLOAT */
+/* Whether to build kernels for scalar type complex<double> */
+/* #undef KOKKOSKERNELS_INST_COMPLEX_DOUBLE */
+/* Whether to build kernels for scalar type complex<float> */
+/* #undef KOKKOSKERNELS_INST_COMPLEX_FLOAT */
+#if defined KOKKOSKERNELS_INST_COMPLEX_DOUBLE
+#define KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_
+#endif
+#if defined KOKKOSKERNELS_INST_COMPLEX_FLOAT
+#define KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_
+#endif
+
+/* Whether to build kernels for multivectors of LayoutLeft */
+#define KOKKOSKERNELS_INST_LAYOUTLEFT
+/* Whether to build kernels for multivectors of LayoutRight */
+/* #undef KOKKOSKERNELS_INST_LAYOUTRIGHT */
+
+/* Whether to build kernels for ordinal type int */
+#define KOKKOSKERNELS_INST_ORDINAL_INT
+/* Whether to build kernels for ordinal type int64_t */
+/* #undef KOKKOSKERNELS_INST_ORDINAL_INT64_T */
+
+/* Whether to build kernels for offset type int */
+#define KOKKOSKERNELS_INST_OFFSET_INT
+/* Whether to build kernels for offset type size_t */
+#define KOKKOSKERNELS_INST_OFFSET_SIZE_T
+
+/*
+ * Third Party Libraries
+ */
+
+/* BLAS library */
+/* #undef KOKKOSKERNELS_ENABLE_TPL_BLAS */
+/* MKL library */
+/* #undef KOKKOSKERNELS_ENABLE_TPL_MKL */
+/* CUSPARSE */
+/* #undef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE */
+/* CUBLAS */
+/* #undef KOKKOSKERNELS_ENABLE_TPL_CUBLAS */
+/* MAGMA */
+/* #undef KOKKOSKERNELS_ENABLE_TPL_MAGMA */
+/* SuperLU */
+/* #undef KOKKOSKERNELS_ENABLE_TPL_SUPERLU */
+/* #undef KOKKOSKERNELS_ENABLE_TPL_SuperLU */
+/* CHOLMOD */
+/* #undef KOKKOSKERNELS_ENABLE_TPL_CHOLMOD */
+/* CBLAS */
+/* #undef KOKKOSKERNELS_ENABLE_TPL_CBLAS */
+/* LAPACKE */
+/* #undef KOKKOSKERNELS_ENABLE_TPL_LAPACKE */
+/* METIS */
+/* #undef KOKKOSKERNELS_ENABLE_TPL_METIS */
+/* ARMPL */
+/* #undef KOKKOSKERNELS_ENABLE_TPL_ARMPL */
+
+#define KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV
+
+/* if MKL or ARMPL, BLAS is also defined */
+#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) ||\
+    defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL)
+#if !defined(KOKKOSKERNELS_ENABLE_TPL_BLAS)
+#define KOKKOSKERNELS_ENABLE_TPL_BLAS
+#endif
+#endif
+
+#if !defined(KOKKOS_ENABLE_CUDA) \
+  && !defined(KOKKOS_ENABLE_HIP) \
+  && !defined(KOKKOS_ENABLE_SYCL) \
+  && !defined(KOKKOS_ENABLE_OPENMPTARGET)
+#define KOKKOSKERNELS_ENABLE_HOST_ONLY
+#endif
+
+
+/*
+ * "Optimization level" for computational kernels in this subpackage.
+ * The higher the level, the more code variants get generated, and
+ * thus the longer the compile times.  However, more code variants
+ * mean both better performance overall, and more uniform performance
+ * for corner cases.  Must expand to an integer so "#if" comparisons work (1 assumed upstream default).
+ */
+#define KOKKOSLINALG_OPT_LEVEL 1
+
+#ifndef KOKKOSKERNELS_IMPL_COMPILE_LIBRARY
+#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY false
+#endif
+
+
+#endif // KOKKOSKERNELS_CONFIG_H
diff --git a/kharma/prob/emhdshock.hpp b/kharma/prob/emhdshock.hpp
index f33faebf..79e80f42 100644
--- a/kharma/prob/emhdshock.hpp
+++ b/kharma/prob/emhdshock.hpp
@@ -54,7 +54,7 @@ using namespace parthenon;
  * steady state solution. However, they may differ by a translation to the BVP solution.
  * 
  * Therefore, to quantitatively check the EMHD implementation, we prefer the BVP solution as the input
- **/
+ */
 
 TaskStatus InitializeEMHDShock(MeshBlockData<Real> *rc, ParameterInput *pin)
 {

From 32399d35b5c66555386d18938b01d81d9d9ebbb7 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Tue, 22 Mar 2022 15:04:41 -0500
Subject: [PATCH 19/26] Solve field divergence on ghost zones rather than
 setting. Several extra options and little fixes

---
 kharma/b_cleanup/b_cleanup.cpp              | 37 +++++----
 kharma/coordinates/coordinate_embedding.hpp |  6 ++
 kharma/floors/floors.cpp                    |  3 +-
 kharma/kharma.cpp                           | 15 +++-
 kharma/prob/post_initialize.cpp             |  7 +-
 kharma/prob/problem.cpp                     | 33 ++++----
 kharma/prob/resize.hpp                      | 86 ++++++++++++++++-----
 kharma/prob/resize_restart.cpp              | 36 +++++----
 kharma/types.hpp                            | 45 ++++++-----
 pars/emhdmodes.par                          |  2 +-
 pars/resize_restart.par                     | 38 +++++----
 11 files changed, 193 insertions(+), 115 deletions(-)

diff --git a/kharma/b_cleanup/b_cleanup.cpp b/kharma/b_cleanup/b_cleanup.cpp
index 94fa129d..b08c2540 100644
--- a/kharma/b_cleanup/b_cleanup.cpp
+++ b/kharma/b_cleanup/b_cleanup.cpp
@@ -208,10 +208,10 @@ void CleanupDivergence(std::shared_ptr<MeshData<Real>>& md)
         md.get()->ClearBoundary(BoundaryCommSubset::all);
 
         // And set physical boundaries
-        for (auto &pmb : md->GetMeshPointer()->block_list) {
-            auto& rc = pmb->meshblock_data.Get();
-            parthenon::ApplyBoundaryConditions(rc);
-        }
+        // for (auto &pmb : md->GetMeshPointer()->block_list) {
+        //     auto& rc = pmb->meshblock_data.Get();
+        //     parthenon::ApplyBoundaryConditions(rc);
+        // }
 
         if (iter % check_interval == 0) {
             Flag("Iteration:");
@@ -228,14 +228,14 @@ void CleanupDivergence(std::shared_ptr<MeshData<Real>>& md)
             MaxError(md.get(), divB_max.val);
             divB_max.StartReduce(MPI_MAX);
             divB_max.CheckReduce();
-            if (MPIRank0() && verbose > 0) {
+            if (MPIRank0()) {
                 std::cout << "divB step " << iter << " total relative error is " << update_norm.val / divB_norm.val
                         << " Max absolute error is " << divB_max.val << std::endl;
                 // std::cout << "P norm is " << P_norm.val << std::endl;
             }
 
             // Both these values are already MPI reduced, but we want to make sure
-            converged = ((update_norm.val / divB_norm.val) < rel_tolerance) && (divB_max.val < abs_tolerance);
+            converged = (update_norm.val / divB_norm.val < rel_tolerance) && (divB_max.val < abs_tolerance);
             converged = MPIMin(converged);
         }
 
@@ -245,12 +245,12 @@ void CleanupDivergence(std::shared_ptr<MeshData<Real>>& md)
         if (fail_flag) {
             throw std::runtime_error("Failed to converge when cleaning magnetic field divergence!");
         } else if (warn_flag) {
-            cerr << "Failed to converge when cleaning magnetic field divergence!" << endl;
+            cerr << "KHARMA WARNING: Failed to converge when cleaning magnetic field divergence!!!!" << endl;
         }
     }
 
     if (MPIRank0() && verbose > 0) {
-        std::cout << "Applying magnetic field correction!" << std::endl;
+        std::cout << "Applying magnetic field correction" << std::endl;
     }
 
     // Update the magnetic field with one damped Jacobi step
@@ -262,7 +262,7 @@ void CleanupDivergence(std::shared_ptr<MeshData<Real>>& md)
     divB_max.StartReduce(MPI_MAX);
     divB_max.CheckReduce();
 
-    if (MPIRank0() && verbose > 0) {
+    if (MPIRank0()) {
         std::cout << "Final divB max is " << divB_max.val << std::endl;
     }
 
@@ -327,12 +327,15 @@ TaskStatus UpdateP(MeshData<Real> *md)
     //Flag(md, "Updating P");
     auto pmesh = md->GetParentPointer();
     const int ndim = pmesh->ndim;
-    const IndexRange ib = md->GetBoundsI(IndexDomain::interior);
-    const IndexRange jb = md->GetBoundsJ(IndexDomain::interior);
-    const IndexRange kb = md->GetBoundsK(IndexDomain::interior);
-    const IndexRange ib_l = IndexRange{ib.s-1, ib.e};
-    const IndexRange jb_l = (ndim > 1) ? IndexRange{jb.s-1, jb.e} : jb;
-    const IndexRange kb_l = (ndim > 2) ? IndexRange{kb.s-1, kb.e} : kb;
+    const IndexRange ib = md->GetBoundsI(IndexDomain::entire);
+    const IndexRange jb = md->GetBoundsJ(IndexDomain::entire);
+    const IndexRange kb = md->GetBoundsK(IndexDomain::entire);
+    const IndexRange ib_l = IndexRange{ib.s, ib.e-1};
+    const IndexRange jb_l = (ndim > 1) ? IndexRange{jb.s, jb.e-1} : jb;
+    const IndexRange kb_l = (ndim > 2) ? IndexRange{kb.s, kb.e-1} : kb;
+    const IndexRange ib_r = IndexRange{ib.s+1, ib.e-1};
+    const IndexRange jb_r = (ndim > 1) ? IndexRange{jb.s+1, jb.e-1} : jb;
+    const IndexRange kb_r = (ndim > 2) ? IndexRange{kb.s+1, kb.e-1} : kb;
 
     auto pmb0 = md->GetBlockData(0)->GetBlockPointer();
     const int n1 = pmb0->cellbounds.ncellsi(IndexDomain::interior);
@@ -366,12 +369,12 @@ TaskStatus UpdateP(MeshData<Real> *md)
 
     // lap = div(dB), defined at cell corners
     // Then apply a damped Jacobi iteration
-    pmb0->par_for("laplacian_dB", 0, lap.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+    pmb0->par_for("laplacian_dB", 0, lap.GetDim(5) - 1, kb_r.s, kb_r.e, jb_r.s, jb_r.e, ib_r.s, ib_r.e,
         KOKKOS_LAMBDA_MESH_3D {
             const auto& G = lap.GetCoords(b);
             // This is the inverse diagonal element of a fictional a_ij Laplacian operator
             // denoted D^-1 below. Note it's not quite what a_ij might work out to for our "laplacian"
-            const double dt = (-1./6) * G.dx1v(i) * G.dx2v(j) * G.dx3v(k);
+            const double dt = (ndim > 2) ? (-1./6) * G.dx1v(i) * G.dx2v(j) * G.dx3v(k) : (-1./4) * G.dx1v(i) * G.dx2v(j);
             lap(b, 0, k, j, i) = B_FluxCT::corner_div(G, dB, b, k, j, i, ndim > 2);
             // In matrix notation the following would be:
             // x^k+1 = omega*D^-1*(b - (L + U) x^k) + (1-omega)*x^k
diff --git a/kharma/coordinates/coordinate_embedding.hpp b/kharma/coordinates/coordinate_embedding.hpp
index b0685721..6f66b67d 100644
--- a/kharma/coordinates/coordinate_embedding.hpp
+++ b/kharma/coordinates/coordinate_embedding.hpp
@@ -277,6 +277,12 @@ class CoordinateEmbedding {
             Real gdet = invert(&gcov[0][0], &gcon[0][0]);
             return sqrt(fabs(gdet));
         }
+        KOKKOS_INLINE_FUNCTION Real gdet_native(const GReal X[GR_DIM]) const
+        { // Convenience wrapper: metric-determinant term at native coordinates X, using throwaway local metric arrays
+            Real gcov[GR_DIM][GR_DIM], gcon[GR_DIM][GR_DIM];
+            gcov_native(X, gcov); // presumably fills gcov with the covariant metric at X -- confirm
+            return gcon_native(gcov, gcon); // forwards that overload's return; presumably sqrt(|det gcov|) -- TODO confirm
+        }
 
         KOKKOS_INLINE_FUNCTION void conn_native(const GReal X[GR_DIM], const GReal delta, Real conn[GR_DIM][GR_DIM][GR_DIM]) const
         {
diff --git a/kharma/floors/floors.cpp b/kharma/floors/floors.cpp
index cf1d35f3..ac866ef6 100644
--- a/kharma/floors/floors.cpp
+++ b/kharma/floors/floors.cpp
@@ -142,7 +142,8 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin)
 
 TaskStatus PostFillDerivedBlock(MeshBlockData<Real> *rc)
 {
-    if (rc->GetBlockPointer()->packages.Get("Floors")->Param<bool>("disable_floors")) {
+    if (rc->GetBlockPointer()->packages.Get("Floors")->Param<bool>("disable_floors")
+        || !rc->GetBlockPointer()->packages.Get("Globals")->Param<bool>("in_loop")) {
         return TaskStatus::complete;
     } else {
         return ApplyFloors(rc);
diff --git a/kharma/kharma.cpp b/kharma/kharma.cpp
index 9a611e8a..a61091ea 100644
--- a/kharma/kharma.cpp
+++ b/kharma/kharma.cpp
@@ -202,10 +202,17 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput>& pin)
     // Read all package enablements first so we can set their defaults here,
     // before any packages are initialized: thus they can know the full list
     std::string b_field_solver = pin->GetOrAddString("b_field", "solver", "flux_ct");
-    // Enable b_cleanup package if we want periodic cleanups OR are resizing a restart file
-    bool b_cleanup = pin->GetOrAddBoolean("b_cleanup", "on", false) ||
-                     pin->GetString("parthenon/job", "problem_id") == "resize_restart" ||
-                     pin->GetOrAddBoolean("b_field", "initial_clean", false);
+
+    // Enable b_cleanup package if we want it explicitly
+    bool b_cleanup_package = pin->GetOrAddBoolean("b_cleanup", "on", false);
+    // OR if we need it for resizing a dump
+    bool is_resize = pin->GetString("parthenon/job", "problem_id") == "resize_restart";
+    // OR if we want an initial cleanup pass for some other reason
+    bool initial_cleanup = pin->GetOrAddBoolean("b_field", "initial_cleanup", false);
+    // These were separated to make sure that the preference keys are initialized,
+    // since short-circuiting prevented that when they were listed below
+    bool b_cleanup = b_cleanup_package || is_resize || initial_cleanup;
+
     // TODO enable this iff jcon is in the list of outputs
     bool add_jcon = pin->GetOrAddBoolean("GRMHD", "add_jcon", true);
     bool do_electrons = pin->GetOrAddBoolean("electrons", "on", false);
diff --git a/kharma/prob/post_initialize.cpp b/kharma/prob/post_initialize.cpp
index 562af010..c31d7685 100644
--- a/kharma/prob/post_initialize.cpp
+++ b/kharma/prob/post_initialize.cpp
@@ -60,7 +60,7 @@ void SyncAllBounds(ParameterInput *pin, Mesh *pmesh)
         // If we're syncing the primitive vars, we just sync
         for (auto &pmb : pmesh->block_list) {
             auto& rc = pmb->meshblock_data.Get();
-            //rc->ClearBoundary(BoundaryCommSubset::all);
+            rc->ClearBoundary(BoundaryCommSubset::all);
             rc->StartReceiving(BoundaryCommSubset::all);
             rc->SendBoundaryBuffers();
         }
@@ -90,7 +90,7 @@ void SyncAllBounds(ParameterInput *pin, Mesh *pmesh)
         for (auto &pmb : pmesh->block_list) {
             auto& rc = pmb->meshblock_data.Get();
             Flag("Block sync send");
-            //rc->ClearBoundary(BoundaryCommSubset::all);
+            rc->ClearBoundary(BoundaryCommSubset::all);
             rc->StartReceiving(BoundaryCommSubset::all);
             rc->SendBoundaryBuffers();
         }
@@ -275,7 +275,8 @@ void KHARMA::PostInitialize(ParameterInput *pin, Mesh *pmesh, bool is_restart, b
 
     // If we resized the array, cleanup any field divergence we created
     // Let the user specify to do this, too
-    if ((is_restart && is_resize) || pin->GetBoolean("b_field", "initial_clean")) {
+    if ((is_restart && is_resize && !pin->GetOrAddBoolean("resize_restart", "skip_b_cleanup", false))
+        || pin->GetBoolean("b_field", "initial_cleanup")) {
         // Cleanup operates on full single MeshData as there are MPI syncs
         auto &mbase = pmesh->mesh_data.GetOrAdd("base", 0);
         // Clean field divergence across the whole grid
diff --git a/kharma/prob/problem.cpp b/kharma/prob/problem.cpp
index 0081831c..d7989bba 100644
--- a/kharma/prob/problem.cpp
+++ b/kharma/prob/problem.cpp
@@ -113,24 +113,25 @@ void KHARMA::ProblemGenerator(MeshBlock *pmb, ParameterInput *pin)
         throw std::invalid_argument("Invalid or incomplete problem: "+prob);
     }
 
-    // Pertub the internal energy a bit to encourage accretion
-    // option in perturbation->u_jitter
-    // Note this defaults to zero, generally it's controlled via runtime options
-    // But we *definitely* don't want it when restarting
-    if (prob != "resize_restart" && pin->GetOrAddReal("perturbation", "u_jitter", 0.0) > 0.0) {
-        PerturbU(rc.get(), pin);
+    // If we're not restarting, do any grooming of the initial conditions
+    if (prob != "resize_restart") {
+        // Perturb the internal energy a bit to encourage accretion
+        // Note this defaults to zero & is basically turned on only for torii
+        if (pin->GetOrAddReal("perturbation", "u_jitter", 0.0) > 0.0) {
+            PerturbU(rc.get(), pin);
+        }
+
+        // Initialize electron entropies to defaults if enabled
+        if (pmb->packages.AllPackages().count("Electrons")) {
+            Electrons::InitElectrons(rc.get(), pin);
+        }
+
+        // Apply any floors
+        // This is purposefully done even if floors are disabled,
+        // as it is required for consistent initialization
+        Floors::ApplyFloors(rc.get());
     }
 
-    // Initialize electron entropies to defaults if enabled
-    if (pmb->packages.AllPackages().count("Electrons")) {
-        Electrons::InitElectrons(rc.get(), pin);
-    }
-
-    // Apply any floors
-    // This is purposefully done even if floors are disabled,
-    // as it is required for consistent initialization
-    Floors::ApplyFloors(rc.get());
-
     // Fill the conserved variables U,
     // which we'll treat as the independent/fundamental state.
     // P is filled again from this later on
diff --git a/kharma/prob/resize.hpp b/kharma/prob/resize.hpp
index 61493292..416bf809 100644
--- a/kharma/prob/resize.hpp
+++ b/kharma/prob/resize.hpp
@@ -90,13 +90,25 @@ KOKKOS_INLINE_FUNCTION void Xtoijk(const GReal XG[GR_DIM],
     del[3] = (phi   - ((k + 0.5) * dx[3] + startx[3])) / dx[3];
 }
 
+KOKKOS_INLINE_FUNCTION void ijktoX(const GReal startx[GR_DIM], const GReal dx[GR_DIM],
+                                   const int& i, const int& j, const int& k,
+                                   GReal XG[GR_DIM])
+{
+    // Fill XG with the coordinates of the center of zone (i, j, k) on a grid which
+    // starts at startx with spacing dx, hence the half-zone offset. XG[0] is zeroed.
+    XG[0] = 0.;
+    XG[1] = startx[1] + (i + 0.5) * dx[1];
+    XG[2] = startx[2] + (j + 0.5) * dx[2];
+    XG[3] = startx[3] + (k + 0.5) * dx[3];
+}
+
 /**
  * This interpolates a single-array variable 'var' representing a grid of size 'startx' to 'stopx' in
  * native coordinates, returning its value at location X
  */
-KOKKOS_INLINE_FUNCTION Real interp_scalar(const GReal X[GR_DIM],
+KOKKOS_INLINE_FUNCTION Real interp_scalar(const GRCoordinates& G, const GReal X[GR_DIM],
                                           const GReal startx[GR_DIM], const GReal stopx[GR_DIM],
-                                          const GReal dx[GR_DIM], const bool& is_spherical,
+                                          const GReal dx[GR_DIM], const bool& is_spherical, const bool& weight_by_gdet,
                                           const int& n3, const int& n2, const int& n1,
                                           const Real *var)
 {
@@ -108,32 +120,64 @@ KOKKOS_INLINE_FUNCTION Real interp_scalar(const GReal X[GR_DIM],
     Real interp;
     if (is_spherical) {
         // For ghost zones, we treat each boundary differently:
-        // In X1, repeat first & last zones. TODO should be scaled by sqrt(-g). 
-        if (i < 0) i = 0; if (i >= n1-1) i = n1 - 2;
-        // In X2, bounce over the pole. Not probably perfect for rightward interp
-        if (j < 0) j = -j; if (j > n2-2) j = (n2-2) - (j - (n2-2));
+        // In X1, repeat first & last zones.
+        if (i < 0) { i = 0; del[1] = 0; }
+        if (i > n1-2) { i = n1 - 2; del[1] = 1; }
+        // In X2, stop completely at the last zone
+        // Left side of leftmost segment
+        if (j < 0) { j = 0; del[2] = 0; }
+        // Right side of rightmost segment.  Phrased this way to not segfault
+        if (j > n2-2) { j = n2 - 2; del[2] = 1; }
         // k auto-wraps. So do all indices for periodic boxes.
 
-        // interpolate in x1 and x2
-            interp = var[ind_sph(i    , j    , k)]*(1. - del[1])*(1. - del[2]) +
-                    var[ind_sph(i    , j + 1, k)]*(1. - del[1])*del[2] +
-                    var[ind_sph(i + 1, j    , k)]*del[1]*(1. - del[2]) +
-                    var[ind_sph(i + 1, j + 1, k)]*del[1]*del[2];
+        if (weight_by_gdet) {
+            GReal Xtmp[GR_DIM];
+            ijktoX(startx, dx, i, j, k, Xtmp);
+            GReal g_ij = G.coords.gdet_native(Xtmp);
+            ijktoX(startx, dx, i + 1, j, k, Xtmp);
+            GReal g_i1j = G.coords.gdet_native(Xtmp);
+            ijktoX(startx, dx, i, j + 1, k, Xtmp);
+            GReal g_ij1 = G.coords.gdet_native(Xtmp);
+            ijktoX(startx, dx, i + 1, j + 1, k, Xtmp);
+            GReal g_i1j1 = G.coords.gdet_native(Xtmp);
 
-        // then interpolate in x3 if we need
-        if (n3 > 1) {
-            interp = (1. - del[3])*interp +
-                    del[3]*(var[ind_sph(i    , j    , k + 1)]*(1. - del[1])*(1. - del[2]) +
-                            var[ind_sph(i    , j + 1, k + 1)]*(1. - del[1])*del[2] +
-                            var[ind_sph(i + 1, j    , k + 1)]*del[1]*(1. - del[2]) +
-                            var[ind_sph(i + 1, j + 1, k + 1)]*del[1]*del[2]);
+            // interpolate in x1 and x2
+                interp = var[ind_sph(i    , j    , k)]*g_ij*(1. - del[1])*(1. - del[2]) +
+                         var[ind_sph(i    , j + 1, k)]*g_ij1*(1. - del[1])*del[2] +
+                         var[ind_sph(i + 1, j    , k)]*g_i1j*del[1]*(1. - del[2]) +
+                         var[ind_sph(i + 1, j + 1, k)]*g_i1j1*del[1]*del[2];
+
+            // then interpolate in x3 if we need
+            if (n3 > 1) {
+                interp = (1. - del[3])*interp +
+                        del[3]*(var[ind_sph(i    , j    , k + 1)]*g_ij*(1. - del[1])*(1. - del[2]) +
+                                var[ind_sph(i    , j + 1, k + 1)]*g_ij1*(1. - del[1])*del[2] +
+                                var[ind_sph(i + 1, j    , k + 1)]*g_i1j*del[1]*(1. - del[2]) +
+                                var[ind_sph(i + 1, j + 1, k + 1)]*g_i1j1*del[1]*del[2]);
+            }
+            interp /= G.coords.gdet_native(X);
+        } else {
+            // interpolate in x1 and x2
+                interp = var[ind_sph(i    , j    , k)]*(1. - del[1])*(1. - del[2]) +
+                         var[ind_sph(i    , j + 1, k)]*(1. - del[1])*del[2] +
+                         var[ind_sph(i + 1, j    , k)]*del[1]*(1. - del[2]) +
+                         var[ind_sph(i + 1, j + 1, k)]*del[1]*del[2];
+
+            // then interpolate in x3 if we need
+            if (n3 > 1) {
+                interp = (1. - del[3])*interp +
+                        del[3]*(var[ind_sph(i    , j    , k + 1)]*(1. - del[1])*(1. - del[2]) +
+                                var[ind_sph(i    , j + 1, k + 1)]*(1. - del[1])*del[2] +
+                                var[ind_sph(i + 1, j    , k + 1)]*del[1]*(1. - del[2]) +
+                                var[ind_sph(i + 1, j + 1, k + 1)]*del[1]*del[2]);
+            }
         }
     } else {
         // interpolate in x1 and x2
             interp = var[ind_periodic(i    , j    , k)]*(1. - del[1])*(1. - del[2]) +
-                    var[ind_periodic(i    , j + 1, k)]*(1. - del[1])*del[2] +
-                    var[ind_periodic(i + 1, j    , k)]*del[1]*(1. - del[2]) +
-                    var[ind_periodic(i + 1, j + 1, k)]*del[1]*del[2];
+                     var[ind_periodic(i    , j + 1, k)]*(1. - del[1])*del[2] +
+                     var[ind_periodic(i + 1, j    , k)]*del[1]*(1. - del[2]) +
+                     var[ind_periodic(i + 1, j + 1, k)]*del[1]*del[2];
 
         // then interpolate in x3 if we need
         if (n3 > 1) {
diff --git a/kharma/prob/resize_restart.cpp b/kharma/prob/resize_restart.cpp
index 0f7d979e..86886bc0 100644
--- a/kharma/prob/resize_restart.cpp
+++ b/kharma/prob/resize_restart.cpp
@@ -85,19 +85,14 @@ void ReadIharmRestartHeader(std::string fname, std::unique_ptr<ParameterInput>&
     pin->SetInteger("parthenon/mesh", "restart_nx2", n2file);
     pin->SetInteger("parthenon/mesh", "restart_nx3", n3file);
 
-    double gam, cour, t, dt;
+    double gam, cour, t;
     hdf5_read_single_val(&gam, "gam", H5T_IEEE_F64LE);
     hdf5_read_single_val(&cour, "cour", H5T_IEEE_F64LE);
     hdf5_read_single_val(&t, "t", H5T_IEEE_F64LE);
-    hdf5_read_single_val(&dt, "dt", H5T_IEEE_F64LE);
 
     pin->SetReal("GRMHD", "gamma", gam);
-    //pin->SetReal("GRMHD", "cfl", cour);  // TODO use_cour option?
-    // Setting dt here is actually for KHARMA,
-    // which returns this from EstimateTimestep in step 0
-    pin->SetReal("parthenon/time", "dt", dt);
     pin->SetReal("parthenon/time", "start_time", t);
-    // TODO NSTEP, next tdump/tlog, etc? Do KHARMA globals need anything?
+    // TODO NSTEP, next tdump/tlog, etc?
 
     if (hdf5_exists("a")) {
         double a, hslope, Rout;
@@ -176,6 +171,7 @@ TaskStatus ReadIharmRestart(MeshBlockData<Real> *rc, ParameterInput *pin)
 
     auto fname = pin->GetString("resize_restart", "fname"); // Require this, don't guess
     bool use_tf = pin->GetOrAddBoolean("resize_restart", "use_tf", false);
+    bool use_dt = pin->GetOrAddBoolean("resize_restart", "use_dt", true);
     const bool is_spherical = pin->GetBoolean("coordinates", "spherical");
 
     // Size of the file mesh
@@ -201,10 +197,11 @@ TaskStatus ReadIharmRestart(MeshBlockData<Real> *rc, ParameterInput *pin)
         cout << "Restarting from " << fname << ", file version " << version << endl << endl;
     }
 
-    // Get tf here and not when reading the header, since whether we use this
-    // value depends on another parameter, "use_tf," which needs to be initialized
-    double tf;
+    // Get tf/dt here and not when reading the header, since whether we use them
+    // depends on another parameter, "use_tf" & "use_dt" which need to be initialized
+    double tf, dt;
     hdf5_read_single_val(&tf, "tf", H5T_IEEE_F64LE);
+    hdf5_read_single_val(&dt, "dt", H5T_IEEE_F64LE);
 
     // TODO do this better by recording/counting flags in MODEL
     hsize_t nfprim;
@@ -245,13 +242,13 @@ TaskStatus ReadIharmRestart(MeshBlockData<Real> *rc, ParameterInput *pin)
         pin->GetReal("parthenon/mesh", "x3max")};
     // Same here
     const GReal dx[GR_DIM] = {0., (stopx[1] - startx[1])/n1tot,
-                                (stopx[2] - startx[2])/n2tot,
-                                (stopx[3] - startx[3])/n3tot};
+                                  (stopx[2] - startx[2])/n2tot,
+                                  (stopx[3] - startx[3])/n3tot};
 
     const int block_sz = n3tot*n2tot*n1tot;
 
     // Host-side interpolate & copy into the mirror array
-    // TODO Interpolate in native coordinates of restart
+    // TODO Support restart native coordinates != new native coordinates
     // NOTE: KOKKOS USES < not <=!! Therefore the RangePolicy below will seem like it is too big
     Kokkos::parallel_for("copy_restart_state",
         Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::Rank<3>>({ks, js, is}, {ke+1, je+1, ie+1}),
@@ -260,10 +257,10 @@ TaskStatus ReadIharmRestart(MeshBlockData<Real> *rc, ParameterInput *pin)
             GReal X[GR_DIM];
             G.coord(k, j, i, Loci::center, X);
             // Interpolate the value at this location from the global grid
-            rho_host(k, j, i) = interp_scalar(X, startx, stopx, dx, is_spherical, n3tot, n2tot, n1tot, &(ptmp[0*block_sz]));
-            u_host(k, j, i) = interp_scalar(X, startx, stopx, dx, is_spherical, n3tot, n2tot, n1tot, &(ptmp[1*block_sz]));
-            VLOOP uvec_host(v, k, j, i) = interp_scalar(X, startx, stopx, dx, is_spherical, n3tot, n2tot, n1tot, &(ptmp[(2+v)*block_sz]));
-            VLOOP B_host(v, k, j, i) = interp_scalar(X, startx, stopx, dx, is_spherical, n3tot, n2tot, n1tot, &(ptmp[(5+v)*block_sz]));
+            rho_host(k, j, i) = interp_scalar(G, X, startx, stopx, dx, is_spherical, false, n3tot, n2tot, n1tot, &(ptmp[0*block_sz]));
+            u_host(k, j, i) = interp_scalar(G, X, startx, stopx, dx, is_spherical, false, n3tot, n2tot, n1tot, &(ptmp[1*block_sz]));
+            VLOOP uvec_host(v, k, j, i) = interp_scalar(G, X, startx, stopx, dx, is_spherical, false, n3tot, n2tot, n1tot, &(ptmp[(2+v)*block_sz]));
+            VLOOP B_host(v, k, j, i) = interp_scalar(G, X, startx, stopx, dx, is_spherical, false, n3tot, n2tot, n1tot, &(ptmp[(5+v)*block_sz]));
         }
     );
     delete[] ptmp;
@@ -280,6 +277,11 @@ TaskStatus ReadIharmRestart(MeshBlockData<Real> *rc, ParameterInput *pin)
     if (use_tf) {
         pin->SetReal("parthenon/time", "tlim", tf);
     }
+    if (use_dt) {
+        // Setting dt here is actually for KHARMA,
+        // which returns this from EstimateTimestep in step 0
+        pin->SetReal("parthenon/time", "dt", dt);
+    }
 
     return TaskStatus::complete;
 }
diff --git a/kharma/types.hpp b/kharma/types.hpp
index a724755e..0d4d5001 100644
--- a/kharma/types.hpp
+++ b/kharma/types.hpp
@@ -156,7 +156,7 @@ KOKKOS_INLINE_FUNCTION bool outside(const int& k, const int& j, const int& i,
 }
 
 #if TRACE
-#define PRINTCORNERS 0
+#define PRINTCORNERS 1
 #define PRINTZONE 0
 inline void PrintCorner(MeshBlockData<Real> *rc)
 {
@@ -171,20 +171,23 @@ inline void PrintCorner(MeshBlockData<Real> *rc)
     auto p = rc->Get("p").data.GetHostMirrorAndCopy();
     //auto q = rc->Get("prims.q").data.GetHostMirrorAndCopy();
     //auto dP = rc->Get("prims.dP").data.GetHostMirrorAndCopy();
-    cerr << "p:" << endl;
+    const IndexRange ib = rc->GetBoundsI(IndexDomain::interior);
+    const IndexRange jb = rc->GetBoundsJ(IndexDomain::interior);
+    const IndexRange kb = rc->GetBoundsK(IndexDomain::interior);
+    cerr << "p:";
     for (int j=0; j<8; j++) {
-        cout << endl;
+        cerr << endl;
         for (int i=0; i<8; i++) {
-            fprintf(stderr, "%.5g\t", p(0, j, i));
+            fprintf(stderr, "%.5g\t", p(kb.s, j, i));
+        }
+    }
+    cerr << endl << "B1:";
+    for (int j=0; j<8; j++) {
+        cerr << endl;
+        for (int i=0; i<8; i++) {
+            fprintf(stderr, "%.5g\t", Bu(V1, kb.s, j, i));
         }
     }
-    // cerr << endl << "dP:";
-    // for (int j=0; j<8; j++) {
-    //     cerr << endl;
-    //     for (int i=0; i<8; i++) {
-    //         fprintf(stderr, "%.5g\t", dP(0, j, i));
-    //     }
-    // }
     cerr << endl << endl;
 }
 
@@ -212,9 +215,11 @@ inline void Flag(MeshBlockData<Real> *rc, std::string label)
 {
 #pragma omp critical
 {
-    if(MPIRank0()) std::cerr << label << std::endl;
-    if(PRINTCORNERS) PrintCorner(rc);
-    if(PRINTZONE) PrintZone(rc);
+    if(MPIRank0()) {
+        std::cerr << label << std::endl;
+        if(PRINTCORNERS) PrintCorner(rc);
+        if(PRINTZONE) PrintZone(rc);
+    }
 }
 }
 
@@ -222,11 +227,13 @@ inline void Flag(MeshData<Real> *md, std::string label)
 {
 #pragma omp critical
 {
-    if(MPIRank0()) std::cerr << label << std::endl;
-    if(PRINTCORNERS || PRINTZONE) {
-        auto rc = md->GetBlockData(0).get();
-        if(PRINTCORNERS) PrintCorner(rc);
-        if(PRINTZONE) PrintZone(rc);
+    if(MPIRank0()) {
+        std::cerr << label << std::endl;
+        if(PRINTCORNERS || PRINTZONE) {
+            auto rc = md->GetBlockData(0).get();
+            if(PRINTCORNERS) PrintCorner(rc);
+            if(PRINTZONE) PrintZone(rc);
+        }
     }
 }
 }
diff --git a/pars/emhdmodes.par b/pars/emhdmodes.par
index 3a2d16e8..831a5246 100644
--- a/pars/emhdmodes.par
+++ b/pars/emhdmodes.par
@@ -52,7 +52,7 @@ reconstruction = linear_mc
 # use this to specify explicit if desired
 <b_field>
 implicit = true
-initial_clean = false
+initial_cleanup = false
 
 <emhdmodes>
 amp = 1e-4
diff --git a/pars/resize_restart.par b/pars/resize_restart.par
index 5837ff27..dfd7f057 100644
--- a/pars/resize_restart.par
+++ b/pars/resize_restart.par
@@ -1,5 +1,6 @@
-# Restart from an iharm3d snapshot file
-# Very limited for the moment
+# Restart from an iharm3d snapshot file, resizing to specified mesh
+# Note most parameters here will carry through to running after
+# restarting, as iharm3d restart files do not specify much
 
 <parthenon/job>
 problem_id = resize_restart
@@ -7,12 +8,12 @@ problem_id = resize_restart
 <parthenon/mesh>
 refinement = none
 numlevel = 1
-nx1 = 128
+nx1 = 288
 nx2 = 128
 nx3 = 128
 
 <parthenon/meshblock>
-nx1 = 128
+nx1 = 288
 nx2 = 128
 nx3 = 64
 
@@ -24,27 +25,27 @@ hslope = 0.3
 r_out = 1000
 
 <parthenon/time>
-tlim = 1.0
+tlim = 300000
 integrator = rk2
 dt_min = 0.00001
-nlim = 1
 
 <GRMHD>
 cfl = 0.9
 gamma = 1.666667
 
 <resize_restart>
-fname = restart_00000001.h5
+fname = torus.out1.00100.h5
 use_tf = false
-# Ignore meshsize above and use the restart's size
-use_restart_size = false
+use_dt = false
+skip_b_cleanup = false
 
 <b_cleanup>
 rel_tolerance = 1.
-abs_tolerance = 1e-14
-check_interval = 1000
+abs_tolerance = 1.e-14
+check_interval = 100
+max_iterations = 1000000
 # See b_cleanup.cpp
-sor_factor = 200
+sor_factor = 20.3
 
 <floors>
 rho_min_geom = 1e-6
@@ -54,15 +55,20 @@ bsq_over_u_max = 50
 u_over_rho_max = 100
 
 <debug>
-verbose = 2
-flag_verbose = 1
+verbose = 0
+flag_verbose = 2
 extra_checks = 1
 
 <parthenon/output0>
 file_type = hdf5
-dt = 1.0
+dt = 0.0001
 single_precision_output = true
-variables = prims.rho, prims.u, prims.uvec, prims.B
+variables = prims.rho, prims.u, prims.uvec, prims.B, jcon, fflag, pflag
+ghost_zones = true
+
+<parthenon/output1>
+file_type = rst
+dt = 100.0
 ghost_zones = true
 
 <parthenon/output1>

From d7928de169b88ae5e6303763f216f8a03a46fe48 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Wed, 23 Mar 2022 16:07:35 -0500
Subject: [PATCH 20/26] Include all domain zones when computing max divB

---
 kharma/b_flux_ct/b_flux_ct.cpp | 139 ++++++++++++++-------------------
 kharma/b_flux_ct/b_flux_ct.hpp |   2 +
 kharma/types.hpp               |  15 +++-
 pars/resize_restart.par        |   2 +-
 pars/sane.par                  |  12 +--
 5 files changed, 81 insertions(+), 89 deletions(-)

diff --git a/kharma/b_flux_ct/b_flux_ct.cpp b/kharma/b_flux_ct/b_flux_ct.cpp
index a0859234..91550168 100644
--- a/kharma/b_flux_ct/b_flux_ct.cpp
+++ b/kharma/b_flux_ct/b_flux_ct.cpp
@@ -219,9 +219,9 @@ TaskStatus FluxCT(MeshData<Real> *md)
     const IndexRange kb = md->GetBoundsK(IndexDomain::interior);
     const IndexRange block = IndexRange{0, B_F.GetDim(5)-1};
     // One zone halo on the *right only*, except for k in 2D
-    const IndexRange il = IndexRange{ib.s - 3, ib.e + 3};
-    const IndexRange jl = IndexRange{jb.s - 3, jb.e + 3};
-    const IndexRange kl = (ndim > 2) ? IndexRange{kb.s - 3, kb.e + 3} : kb;
+    const IndexRange il = IndexRange{ib.s, ib.e + 1};
+    const IndexRange jl = IndexRange{jb.s, jb.e + 1};
+    const IndexRange kl = (ndim > 2) ? IndexRange{kb.s, kb.e + 1} : kb;
 
     // Declare temporaries
     // TODO make these a true Edge field of B_FluxCT? Could then output, use elsewhere, skip re-declaring
@@ -252,26 +252,7 @@ TaskStatus FluxCT(MeshData<Real> *md)
     // Note that zeroing FX(BX) is *necessary* -- this flux gets filled by GetFlux,
     // And it's necessary to keep track of it for B_CD
     Flag(md, "Calc Fluxes");
-#if FUSE_EMF_KERNELS
-    pmb0->par_for("flux_ct_all", block.s, block.e, kl.s, kl.e, jl.s, jl.e, il.s, il.e,
-        KOKKOS_LAMBDA_MESH_3D {
-            B_F(b).flux(X1DIR, V1, k, j, i) =  0.0;
-            B_F(b).flux(X1DIR, V2, k, j, i) =  0.5 * (emf3(b, k, j, i) + emf3(b, k, j+1, i));
-
-            B_F(b).flux(X2DIR, V1, k, j, i) = -0.5 * (emf3(b, k, j, i) + emf3(b, k, j, i+1));
-            B_F(b).flux(X2DIR, V2, k, j, i) =  0.0;
-
-            if (ndim > 2) {
-                B_F(b).flux(X1DIR, V3, k, j, i) = -0.5 * (emf2(b, k, j, i) + emf2(b, k+1, j, i));
-                B_F(b).flux(X2DIR, V3, k, j, i) =  0.5 * (emf1(b, k, j, i) + emf1(b, k+1, j, i));
 
-                B_F(b).flux(X3DIR, V1, k, j, i) =  0.5 * (emf2(b, k, j, i) + emf2(b, k, j, i+1));
-                B_F(b).flux(X3DIR, V2, k, j, i) = -0.5 * (emf1(b, k, j, i) + emf1(b, k, j+1, i));
-                B_F(b).flux(X3DIR, V3, k, j, i) =  0.0;
-            }
-        }
-    );
-#else
     // Note these each have different domains, eg il vs ib.  The former extends one index farther if appropriate
     pmb0->par_for("flux_ct_1", block.s, block.e, kb.s, kb.e, jb.s, jb.e, il.s, il.e,
         KOKKOS_LAMBDA_MESH_3D {
@@ -296,9 +277,8 @@ TaskStatus FluxCT(MeshData<Real> *md)
             }
         );
     }
-#endif
-    Flag(md, "CT Finished");
 
+    Flag(md, "CT Finished");
     return TaskStatus::complete;
 }
 
@@ -362,38 +342,49 @@ TaskStatus TransportB(MeshData<Real> *md)
 
 double MaxDivB(MeshData<Real> *md)
 {
-    Flag(md, "Calculating divB");
-    // Pointers
+    Flag(md, "Calculating divB Mesh");
     auto pmesh = md->GetMeshPointer();
-    auto pmb0 = md->GetBlockData(0)->GetBlockPointer();
-    // Exit on trivial operations
     const int ndim = pmesh->ndim;
-    if (ndim < 2) return 0.;
 
-    // Pack variables
+    // Packing out here avoids frequent per-mesh packs.  Do we need to?
     auto B_U = md->PackVariables(std::vector<std::string>{"cons.B"});
-    // Get sizes
+
     const IndexRange ib = md->GetBoundsI(IndexDomain::interior);
     const IndexRange jb = md->GetBoundsJ(IndexDomain::interior);
     const IndexRange kb = md->GetBoundsK(IndexDomain::interior);
     const IndexRange block = IndexRange{0, B_U.GetDim(5)-1};
-    // Note this is a stencil-4 (or -8) function, which would involve zones outside the
-    // domain unless we stay off the left edges
-    // So we do the *reverse* of a halo:
-    const IndexRange il = IndexRange{ib.s + 1, ib.e};
-    const IndexRange jl = IndexRange{jb.s + 1, jb.e};
-    const IndexRange kl = (ndim > 2) ? IndexRange{kb.s + 1, kb.e} : kb;
-
-    double max_divb;
-    Kokkos::Max<double> max_reducer(max_divb);
-    pmb0->par_reduce("divB_max", block.s, block.e, kl.s, kl.e, jl.s, jl.e, il.s, il.e,
-        KOKKOS_LAMBDA_MESH_3D_REDUCE {
-            const auto& G = B_U.GetCoords(b);
-            double local_divb = fabs(corner_div(G, B_U, b, k, j, i, ndim > 2));
-            if (local_divb > local_result) local_result = local_divb;
-        }
-    , max_reducer);
 
+    // This is one kernel call per block, because each block will have different bounds.
+    // Could consolidate at the cost of lots of bounds checking.
+    double max_divb = 0.0;
+    for (int b = block.s; b <= block.e; ++b) { // block.e is inclusive: visit the last block too
+        auto pmb = md->GetBlockData(b)->GetBlockPointer().get();
+
+        // Note this is a stencil-4 (or -8) function, which would involve zones outside the
+        // domain unless we stay off the left edges.
+        // However, *inside* the domain we want to catch all corners, including those at 0/N+1
+        // bordering other meshblocks.
+        const int is = IsDomainBound(pmb, BoundaryFace::inner_x1) ? ib.s + 1 : ib.s;
+        const int ie = IsDomainBound(pmb, BoundaryFace::outer_x1) ? ib.e : ib.e + 1;
+        const int js = IsDomainBound(pmb, BoundaryFace::inner_x2) ? jb.s + 1 : jb.s;
+        const int je = IsDomainBound(pmb, BoundaryFace::outer_x2) ? jb.e : jb.e + 1;
+        const int ks = (IsDomainBound(pmb, BoundaryFace::inner_x3) && ndim > 2) ? kb.s + 1 : kb.s;
+        const int ke = (IsDomainBound(pmb, BoundaryFace::outer_x3) || ndim <= 2) ? kb.e : kb.e + 1;
+
+        double max_divb_block;
+        Kokkos::Max<double> max_reducer(max_divb_block);
+        pmb->par_reduce("divB_max", ks, ke, js, je, is, ie,
+            KOKKOS_LAMBDA_3D_REDUCE {
+                const auto& G = B_U.GetCoords(b);
+                const double local_divb = fabs(corner_div(G, B_U, b, k, j, i, ndim > 2));
+                if (local_divb > local_result) local_result = local_divb;
+            }
+        , max_reducer);
+
+        if (max_divb_block > max_divb) max_divb = max_divb_block;
+    }
+
+    Flag("Calculated");
     return max_divb;
 }
 
@@ -409,7 +400,10 @@ TaskStatus PostStepDiagnostics(const SimTime& tm, MeshData<Real> *md)
         Real max_divb = B_FluxCT::MaxDivB(md);
         max_divb = MPIMax(max_divb);
 
-        if(MPIRank0()) cout << "Max DivB: " << max_divb << endl;
+        if(MPIRank0()) {
+            cout << "Max DivB: " << max_divb << endl;
+        }
+
     }
 
     Flag(md, "Printed");
@@ -424,44 +418,27 @@ void FillOutput(MeshBlock *pmb, ParameterInput *pin)
     const int ndim = pmb->pmy_mesh->ndim;
     if (ndim < 2) return;
 
-    // TODO can we call corner_div here?  Extra b=0 out front in addressing zones...
-    GridVars B_U = rc->Get("cons.B").data;
-    GridVars divB = rc->Get("divB").data;
+    auto B_U = rc->PackVariables(std::vector<std::string>{"cons.B"});
+    auto divB = rc->PackVariables(std::vector<std::string>{"divB"});
 
+    // Note this is a stencil-4 (or -8) function, which would involve zones outside the
+    // domain unless we stay off the left edges.
+    // However, *inside* the domain we want to catch all corners, including those at 0/N+1
+    // bordering other meshblocks.
     const IndexRange ib = rc->GetBoundsI(IndexDomain::interior);
     const IndexRange jb = rc->GetBoundsJ(IndexDomain::interior);
     const IndexRange kb = rc->GetBoundsK(IndexDomain::interior);
-    // Note this is a stencil-4 (or -8) function, which would involve zones outside the
-    // domain unless we stay off the left edges
-    // So we do the *reverse* of a halo:
-    const IndexRange il = IndexRange{ib.s + 1, ib.e};
-    const IndexRange jl = IndexRange{jb.s + 1, jb.e};
-    const IndexRange kl = (ndim > 2) ? IndexRange{kb.s + 1, kb.e} : kb;
-
-    const double norm = (ndim > 2) ? 0.25 : 0.5;
-
-    const auto& G = pmb->coords;
-
-    pmb->par_for("divB_output", kl.s, kl.e, jl.s, jl.e, il.s, il.e,
+    const int is = IsDomainBound(pmb, BoundaryFace::inner_x1) ? ib.s + 1 : ib.s;
+    const int ie = IsDomainBound(pmb, BoundaryFace::outer_x1) ? ib.e : ib.e + 1;
+    const int js = IsDomainBound(pmb, BoundaryFace::inner_x2) ? jb.s + 1 : jb.s;
+    const int je = IsDomainBound(pmb, BoundaryFace::outer_x2) ? jb.e : jb.e + 1;
+    const int ks = (IsDomainBound(pmb, BoundaryFace::inner_x3) && ndim > 2) ? kb.s + 1 : kb.s;
+    const int ke = (IsDomainBound(pmb, BoundaryFace::outer_x3) || ndim <= 2) ? kb.e : kb.e + 1;
+
+    pmb->par_for("divB_output", ks, ke, js, je, is, ie,
         KOKKOS_LAMBDA_3D {
-            // 2D divergence, averaging to corners
-            double term1 = B_U(V1, k, j, i)   + B_U(V1, k, j-1, i)
-                         - B_U(V1, k, j, i-1) - B_U(V1, k, j-1, i-1);
-            double term2 = B_U(V2, k, j, i)   + B_U(V2, k, j, i-1)
-                         - B_U(V2, k, j-1, i) - B_U(V2, k, j-1, i-1);
-            double term3 = 0.;
-            if (ndim > 2) {
-                // Average to corners in 3D, add 3rd flux
-                term1 +=  B_U(V1, k-1, j, i)   + B_U(V1, k-1, j-1, i)
-                        - B_U(V1, k-1, j, i-1) - B_U(V1, k-1, j-1, i-1);
-                term2 +=  B_U(V2, k-1, j, i)   + B_U(V2, k-1, j, i-1)
-                        - B_U(V2, k-1, j-1, i) - B_U(V2, k-1, j-1, i-1);
-                term3 =   B_U(V3, k, j, i)     + B_U(V3, k, j-1, i)
-                        + B_U(V3, k, j, i-1)   + B_U(V3, k, j-1, i-1)
-                        - B_U(V3, k-1, j, i)   - B_U(V3, k-1, j-1, i)
-                        - B_U(V3, k-1, j, i-1) - B_U(V3, k-1, j-1, i-1);
-            }
-            divB(k, j, i) = norm*term1/G.dx1v(i) + norm*term2/G.dx2v(j) + norm*term3/G.dx3v(k);
+            const auto& G = B_U.GetCoords();
+            divB(0, k, j, i) = corner_div(G, B_U, 0, k, j, i, ndim > 2);
         }
     );
 
diff --git a/kharma/b_flux_ct/b_flux_ct.hpp b/kharma/b_flux_ct/b_flux_ct.hpp
index bdb516a4..ba4c3a09 100644
--- a/kharma/b_flux_ct/b_flux_ct.hpp
+++ b/kharma/b_flux_ct/b_flux_ct.hpp
@@ -101,6 +101,8 @@ double MaxDivB(MeshData<Real> *md);
 // Version for Parthenon tasking as a reduction
 inline TaskStatus MaxDivBTask(MeshData<Real> *md, double& divb_max)
     { divb_max = MaxDivB(md); return TaskStatus::complete; }
+double MaxDivBBlock(MeshBlockData<Real> *rc);
+// TODO task for MeshBlocks?
 
 /**
  * Clean the magnetic field divergence via successive over-relaxation
diff --git a/kharma/types.hpp b/kharma/types.hpp
index 0d4d5001..15258a01 100644
--- a/kharma/types.hpp
+++ b/kharma/types.hpp
@@ -155,8 +155,21 @@ KOKKOS_INLINE_FUNCTION bool outside(const int& k, const int& j, const int& i,
     return (i < ib.s) || (i > ib.e) || (j < jb.s) || (j > jb.e) || (k < kb.s) || (k > kb.e);
 }
 
+/**
+ * Function for checking boundary flags: is this a domain or internal bound?
+ */
+inline bool IsDomainBound(MeshBlock *pmb, BoundaryFace face)
+{
+    return (pmb->boundary_flag[face] != BoundaryFlag::block &&
+            pmb->boundary_flag[face] != BoundaryFlag::periodic);
+}
+
+/**
+ * Functions for "tracing" execution by printing strings (and optionally state of zones)
+ * at each important function entry/exit
+ */
 #if TRACE
-#define PRINTCORNERS 1
+#define PRINTCORNERS 0
 #define PRINTZONE 0
 inline void PrintCorner(MeshBlockData<Real> *rc)
 {
diff --git a/pars/resize_restart.par b/pars/resize_restart.par
index dfd7f057..7645b468 100644
--- a/pars/resize_restart.par
+++ b/pars/resize_restart.par
@@ -55,7 +55,7 @@ bsq_over_u_max = 50
 u_over_rho_max = 100
 
 <debug>
-verbose = 0
+verbose = 1
 flag_verbose = 2
 extra_checks = 1
 
diff --git a/pars/sane.par b/pars/sane.par
index ec26df8d..8814f710 100644
--- a/pars/sane.par
+++ b/pars/sane.par
@@ -8,15 +8,15 @@ problem_id = torus
 <parthenon/mesh>
 refinement = none
 numlevel = 1
-nx1 = 256
-nx2 = 128
-nx3 = 128
-
-<parthenon/meshblock>
 nx1 = 128
-nx2 = 128
+nx2 = 64
 nx3 = 64
 
+<parthenon/meshblock>
+nx1 = 32
+nx2 = 32
+nx3 = 32
+
 <coordinates>
 base = spherical_ks
 transform = fmks

From facdd5a328741d0ba964a9a6056afb9dbc17a7d0 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Tue, 29 Mar 2022 15:50:51 -0500
Subject: [PATCH 21/26] Keep more accurate ghost zones in KHARMA

Flux-CT on corners bordering two meshblocks must be updated identically
in order to apply the algorithm correctly and maintain zero divergence.
Thus the primitive values they're operating upon must be identical to
machine precision, as well.  KHARMA applies several operations *after*
the variable sync in order to avoid a second synchronization, and these
were not always maintaining the same values between ghost zones vs
their physical counterparts, and thus maintaining the magnetic field
divergence on boundary corners only as well as the ghost zone
inaccuracies allowed.  This commit avoids operations which were
producing differences, under the logic that no gain in stability is
worth the magnetic monopoles it introduces.

The big offender was applying floors to zones which had been fixed up,
which required a U->P solve, an iterative process which did not always
occur identically when performed by different nodes.  One remaining
problem for KHARMA's native mode of operation is adjacent failures,
which do not always occur identically between blocks.
---
 kharma/CMakeLists.txt          |  1 -
 kharma/b_cleanup/b_cleanup.cpp |  6 +--
 kharma/b_flux_ct/b_flux_ct.cpp |  9 ++--
 kharma/b_flux_ct/b_flux_ct.hpp |  8 ++--
 kharma/boundaries.cpp          | 49 ++++++++-------------
 kharma/debug.cpp               | 13 ++++--
 kharma/floors/floors.cpp       |  4 +-
 kharma/floors/floors.hpp       | 51 +++++++++++++++++++---
 kharma/flux.hpp                |  4 +-
 kharma/grmhd/fixup.cpp         | 13 +++---
 kharma/grmhd/grmhd.cpp         |  5 ++-
 kharma/harm_driver.cpp         | 68 ++++++++++++++++++-----------
 kharma/imex_driver.cpp         | 43 ++++++++----------
 kharma/types.hpp               | 21 ++++-----
 pars/sane_divb_2d.par          | 80 ++++++++++++++++++++++++++++++++++
 scripts/quick_divb.py          | 43 ++++++++++++++++++
 16 files changed, 297 insertions(+), 121 deletions(-)
 create mode 100644 pars/sane_divb_2d.par
 create mode 100644 scripts/quick_divb.py

diff --git a/kharma/CMakeLists.txt b/kharma/CMakeLists.txt
index fdeb25bb..b235be91 100644
--- a/kharma/CMakeLists.txt
+++ b/kharma/CMakeLists.txt
@@ -54,7 +54,6 @@ target_link_libraries(${EXE_NAME} PUBLIC parthenon)
 # These are almost universally performance trade-offs
 # TODO is there any way to make compile options less painful in CMake?
 option(FUSE_FLUX_KERNELS "Bundle the usual four flux calculation kernels (floors,R,L,apply) into one" ON)
-option(FUSE_EMF_KERNELS "Bundle the three emf direction kernels into one. Likely won't affect much" ON)
 option(FUSE_FLOOR_KERNELS "Bundle applying the floors and ceilings into one kernel" ON)
 option(FAST_CARTESIAN "Break operation in curved spacetimes to make Cartesian Minkowski space computations faster" OFF)
 if(FUSE_FLUX_KERNELS)
diff --git a/kharma/b_cleanup/b_cleanup.cpp b/kharma/b_cleanup/b_cleanup.cpp
index b08c2540..e2e22f05 100644
--- a/kharma/b_cleanup/b_cleanup.cpp
+++ b/kharma/b_cleanup/b_cleanup.cpp
@@ -68,14 +68,14 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     params.Add("rel_tolerance", rel_tolerance);
     // Instead set absolute tolerance corresponding roughly to max divB on grid
     // Note this returns divB max about 1 decade greater, i.e. ~1e-14
-    Real abs_tolerance = pin->GetOrAddReal("b_cleanup", "abs_tolerance", 1e-15);
+    Real abs_tolerance = pin->GetOrAddReal("b_cleanup", "abs_tolerance", 1e-11);
     params.Add("abs_tolerance", abs_tolerance);
     // TODO why does this need to be so large?
-    Real sor_factor = pin->GetOrAddReal("b_cleanup", "sor_factor", 200);
+    Real sor_factor = pin->GetOrAddReal("b_cleanup", "sor_factor", 10);
     params.Add("sor_factor", sor_factor);
     int max_iterations = pin->GetOrAddInteger("b_cleanup", "max_iterations", 1e8);
     params.Add("max_iterations", max_iterations);
-    int check_interval = pin->GetOrAddInteger("b_cleanup", "check_interval", 1e3);
+    int check_interval = pin->GetOrAddInteger("b_cleanup", "check_interval", 2e2);
     params.Add("check_interval", check_interval);
     bool fail_without_convergence = pin->GetOrAddBoolean("b_cleanup", "fail_without_convergence", true);
     params.Add("fail_without_convergence", fail_without_convergence);
diff --git a/kharma/b_flux_ct/b_flux_ct.cpp b/kharma/b_flux_ct/b_flux_ct.cpp
index 91550168..50bd5e6a 100644
--- a/kharma/b_flux_ct/b_flux_ct.cpp
+++ b/kharma/b_flux_ct/b_flux_ct.cpp
@@ -357,7 +357,7 @@ double MaxDivB(MeshData<Real> *md)
     // This is one kernel call per block, because each block will have different bounds.
     // Could consolidate at the cost of lots of bounds checking.
     double max_divb = 0.0;
-    for (int b = block.s; b < block.e; ++b) {
+    for (int b = block.s; b <= block.e; ++b) {
         auto pmb = md->GetBlockData(b)->GetBlockPointer().get();
 
         // Note this is a stencil-4 (or -8) function, which would involve zones outside the
@@ -388,7 +388,7 @@ double MaxDivB(MeshData<Real> *md)
     return max_divb;
 }
 
-TaskStatus PostStepDiagnostics(const SimTime& tm, MeshData<Real> *md)
+TaskStatus PrintGlobalMaxDivB(MeshData<Real> *md)
 {
     Flag(md, "Printing B field diagnostics");
     auto pmb0 = md->GetBlockData(0)->GetBlockPointer();
@@ -406,13 +406,12 @@ TaskStatus PostStepDiagnostics(const SimTime& tm, MeshData<Real> *md)
 
     }
 
-    Flag(md, "Printed");
+    Flag(md, "Printed B field diagnostics");
     return TaskStatus::complete;
 }
 
 void FillOutput(MeshBlock *pmb, ParameterInput *pin)
 {
-    // TODO define this on meshblock or pack vars
     auto rc = pmb->meshblock_data.Get().get();
     Flag(rc, "Calculating divB for output");
     const int ndim = pmb->pmy_mesh->ndim;
@@ -442,7 +441,7 @@ void FillOutput(MeshBlock *pmb, ParameterInput *pin)
         }
     );
 
-    Flag(rc, "Output");
+    Flag(rc, "Output divB");
 }
 
 } // namespace B_FluxCT
diff --git a/kharma/b_flux_ct/b_flux_ct.hpp b/kharma/b_flux_ct/b_flux_ct.hpp
index ba4c3a09..db11b0f2 100644
--- a/kharma/b_flux_ct/b_flux_ct.hpp
+++ b/kharma/b_flux_ct/b_flux_ct.hpp
@@ -101,8 +101,6 @@ double MaxDivB(MeshData<Real> *md);
 // Version for Parthenon tasking as a reduction
 inline TaskStatus MaxDivBTask(MeshData<Real> *md, double& divb_max)
     { divb_max = MaxDivB(md); return TaskStatus::complete; }
-double MaxDivBBlock(MeshBlockData<Real> *rc);
-// TODO task for MeshBlocks?
 
 /**
  * Clean the magnetic field divergence via successive over-relaxation
@@ -115,7 +113,11 @@ void CleanupDivergence(MeshBlockData<Real> *rc, IndexDomain domain=IndexDomain::
  * Diagnostics printed/computed after each step
  * Currently just max divB
  */
-TaskStatus PostStepDiagnostics(const SimTime& tm, MeshData<Real> *md);
+TaskStatus PrintGlobalMaxDivB(MeshData<Real> *md);
+inline TaskStatus PostStepDiagnostics(const SimTime& tm, MeshData<Real> *md)
+    { return PrintGlobalMaxDivB(md); }
+// Block version; unused now, kept for future fiascos
+TaskStatus PrintMaxBlockDivB(MeshBlockData<Real> *rc, bool prims, std::string tag);
 
 /**
  * Fill fields which are calculated only for output to file
diff --git a/kharma/boundaries.cpp b/kharma/boundaries.cpp
index 7089961f..334fac78 100644
--- a/kharma/boundaries.cpp
+++ b/kharma/boundaries.cpp
@@ -69,11 +69,13 @@ void OutflowX1(std::shared_ptr<MeshBlockData<Real>> &rc, IndexDomain domain, boo
     const bool check_inflow = ((check_inner && domain == IndexDomain::inner_x1)
                             || (check_outer && domain == IndexDomain::outer_x1));
 
+    // q will actually have *both* cons & prims (unless using imex driver)
+    // We'll only need cons.B specifically tho
     PackIndexMap prims_map, cons_map;
     auto P = GRMHD::PackMHDPrims(rc.get(), prims_map, coarse);
     auto q = rc->PackVariables({Metadata::FillGhost}, cons_map, coarse);
     const VarMap m_u(cons_map, true), m_p(prims_map, false);
-    // If we're running imex, q is the *primitive* variables
+    // If we're running imex, q is just the *primitive* variables
     bool prim_ghosts = pmb->packages.Get("GRMHD")->Param<std::string>("driver_type") == "imex";
 
     // KHARMA is very particular about corner boundaries.
@@ -81,7 +83,8 @@ void OutflowX1(std::shared_ptr<MeshBlockData<Real>> &rc, IndexDomain domain, boo
     // Then the polar bound only where outflow is not applied,
     // and periodic bounds only where neither other bound applies.
     // The latter is accomplished regardless of Parthenon's definitions,
-    // since these functions are run after Parthenon's MPI boundary syncs
+    // since these functions are run after Parthenon's MPI boundary syncs &
+    // replace whatever they've done.
     IndexDomain ldomain = IndexDomain::interior;
     int is = bounds.is(ldomain), ie = bounds.ie(ldomain);
     int js = bounds.js(ldomain), je = bounds.je(ldomain);
@@ -108,24 +111,14 @@ void OutflowX1(std::shared_ptr<MeshBlockData<Real>> &rc, IndexDomain domain, boo
     const int ref = ref_tmp;
     const int dir = dir_tmp;
 
-    // This first loop copies all conserved variables into the outer zones
-    // This includes some we will replace below, but it would be harder
-    // to figure out where they were in the pack than just replace them
+    // This first loop copies all variables with the "FillGhost" tag into the outer zones
+    // This includes some we may replace below
     pmb->par_for("OutflowX1", 0, q.GetDim(4) - 1, ks_e, ke_e, js_e, je_e, ibs, ibe,
         KOKKOS_LAMBDA_VARS {
             q(p, k, j, i) = q(p, k, j, ref);
         }
     );
-    if (!prim_ghosts) {
-        // Apply KHARMA boundary to the primitive values
-        // TODO currently this includes B, which we then replace.
-        pmb->par_for("OutflowX1_prims", 0, P.GetDim(4) - 1, ks_e, ke_e, js_e, je_e, ibs, ibe,
-            KOKKOS_LAMBDA_VARS {
-                P(p, k, j, i) = P(p, k, j, ref);
-            }
-        );
-    }
-    // Inflow check, recover U
+    // Inflow check, always applied
     pmb->par_for("OutflowX1_check", ks_e, ke_e, js_e, je_e, ibs, ibe,
         KOKKOS_LAMBDA_3D {
             // Inflow check
@@ -133,15 +126,15 @@ void OutflowX1(std::shared_ptr<MeshBlockData<Real>> &rc, IndexDomain domain, boo
         }
     );
     if (!prim_ghosts) {
-        // Recover U
+        // Normal operation: We copied both prim & con GRMHD variables, but we want to apply
+        // the boundaries based on just the former, so we run P->U
         pmb->par_for("OutflowX1_PtoU", ks_e, ke_e, js_e, je_e, ibs, ibe,
             KOKKOS_LAMBDA_3D {
                 // TODO move these steps into FillDerivedDomain, make a GRMHD::PtoU call the last in that series
                 // Correct primitive B
                 if (m_p.B1 >= 0)
                     VLOOP P(m_p.B1 + v, k, j, i) = q(m_u.B1 + v, k, j, i) / G.gdet(Loci::center, j, i);
-                // Recover conserved vars
-                // TODO all flux
+                // Recover conserved vars.  Must be only GRMHD.
                 GRMHD::p_to_u(G, P, m_p, gam, k, j, i, q, m_u);
             }
         );
@@ -159,6 +152,8 @@ void ReflectX2(std::shared_ptr<MeshBlockData<Real>> &rc, IndexDomain domain, boo
     const auto& G = pmb->coords;
     const Real gam = pmb->packages.Get("GRMHD")->Param<Real>("gamma");
 
+    // q will actually have *both* cons & prims (unless using imex driver)
+    // We'll only need cons.B specifically tho
     PackIndexMap prims_map, cons_map;
     auto P = GRMHD::PackMHDPrims(rc.get(), prims_map, coarse);
     auto q = rc->PackVariables({Metadata::FillGhost}, cons_map, coarse);
@@ -179,7 +174,9 @@ void ReflectX2(std::shared_ptr<MeshBlockData<Real>> &rc, IndexDomain domain, boo
     // So. Parthenon wants us to do our thing over is_e to ie_e
     // BUT if we're at the interior bound on X1, that's gonna blow things up
     // (for reasons unknown, inflow bounds must take precedence)
-    // so we have to be smart
+    // so we have to be smart.
+    // Side note: this *lags* the X1/X2 corner zones by one step, since X1 is applied first.
+    // this is potentially bad
     int ics = (pmb->boundary_flag[BoundaryFace::inner_x1] == BoundaryFlag::user) ? is : is_e;
     int ice = (pmb->boundary_flag[BoundaryFace::outer_x1] == BoundaryFlag::user) ? ie : ie_e;
     //int ics = is_e;
@@ -202,28 +199,20 @@ void ReflectX2(std::shared_ptr<MeshBlockData<Real>> &rc, IndexDomain domain, boo
     const int ref = ref_tmp;
     const int add = add_tmp;
 
+    // This first loop copies all variables with the "FillGhost" tag into the outer zones
+    // This includes some we may replace below
     pmb->par_for("ReflectX2", 0, q.GetDim(4) - 1, ks_e, ke_e, jbs, jbe, ics, ice,
         KOKKOS_LAMBDA_VARS {
             Real reflect = q.VectorComponent(p) == X2DIR ? -1.0 : 1.0;
             q(p, k, j, i) = reflect * q(p, k, (ref + add) + (ref - j), i);
         }
     );
-    // If we're using imex driver, the above is all we need.
     if (!prim_ghosts) {
-        // If we're using the HARM/KHARMA driver, we need to do the primitives
-        // separately after the conserved vars
-        pmb->par_for("ReflectX2_prims", 0, P.GetDim(4) - 1, ks_e, ke_e, jbs, jbe, ics, ice,
-            KOKKOS_LAMBDA_VARS {
-                Real reflect = P.VectorComponent(p) == X2DIR ? -1.0 : 1.0;
-                P(p, k, j, i) = reflect * P(p, k, (ref + add) + (ref - j), i);
-            }
-        );
-        // And we need to fill the corresponding conserved vars
+        // Normal operation: see above
         pmb->par_for("ReflectX2_PtoU", ks_e, ke_e, jbs, jbe, ics, ice,
             KOKKOS_LAMBDA_3D {
                 if (m_p.B1 >= 0)
                     VLOOP P(m_p.B1 + v, k, j, i) = q(m_u.B1 + v, k, j, i) / G.gdet(Loci::center, j, i);
-                // TODO all flux
                 GRMHD::p_to_u(G, P, m_p, gam, k, j, i, q, m_u);
             }
         );
diff --git a/kharma/debug.cpp b/kharma/debug.cpp
index 6510835e..091e3c16 100644
--- a/kharma/debug.cpp
+++ b/kharma/debug.cpp
@@ -164,6 +164,7 @@ int CountPFlags(MeshData<Real> *md, IndexDomain domain, int verbose)
     int n_utsq = 0, n_gamma = 0, n_neg_u = 0, n_neg_rho = 0, n_neg_both = 0;
     auto pmesh = md->GetMeshPointer();
 
+    int block = 0;
     for (auto &pmb : pmesh->block_list) {
         int is = pmb->cellbounds.is(domain), ie = pmb->cellbounds.ie(domain);
         int js = pmb->cellbounds.js(domain), je = pmb->cellbounds.je(domain);
@@ -171,6 +172,8 @@ int CountPFlags(MeshData<Real> *md, IndexDomain domain, int verbose)
         auto& rc = pmb->meshblock_data.Get();
         auto pflag = rc->Get("pflag").data.GetHostMirrorAndCopy();
 
+    // OpenMP causes problems when used separately from Kokkos
+    // TODO make this a kokkos reduction to a View
 //#pragma omp parallel for simd collapse(3) reduction(+:n_cells,n_tot,n_neg_in,n_max_iter,n_utsq,n_gamma,n_neg_u,n_neg_rho,n_neg_both)
         for(int k=ks; k <= ke; ++k)
             for(int j=js; j <= je; ++j)
@@ -187,13 +190,14 @@ int CountPFlags(MeshData<Real> *md, IndexDomain domain, int verbose)
             if (flag == InversionStatus::neg_u) ++n_neg_u;
             if (flag == InversionStatus::neg_rhou) ++n_neg_both;
 
-#if 0 // TODO be able to print pflag contexts
+            // TODO MPI Rank
             if (flag > InversionStatus::success && verbose >= 3) {
-                cout << "Bad inversion (" << flag << ") at i,j,k: " << i << " " << j << " " << k << endl;
-                compare_P_U(pmb->meshblock_data.Get().get(), k, j, i);
+                printf("Bad inversion (%d) at block %d zone %d %d %d\n", flag, block, i, j, k);
+                //compare_P_U(pmb->meshblock_data.Get().get(), k, j, i);
             }
-#endif
         }
+
+        ++block;
     }
 
     n_tot = MPISum(n_tot);
@@ -236,6 +240,7 @@ int CountFFlags(MeshData<Real> *md, IndexDomain domain, int verbose)
         auto& rc = pmb->meshblock_data.Get();
         auto fflag = rc->Get("fflag").data.GetHostMirrorAndCopy();
 
+    // See above re: OpenMP. TODO Kokkosify
 //#pragma omp parallel for simd collapse(3) reduction(+:n_cells,n_tot,n_geom_rho,n_geom_u,n_b_rho,n_b_u,n_temp,n_gamma,n_ktot)
         for(int k=ks; k <= ke; ++k)
             for(int j=js; j <= je; ++j)
diff --git a/kharma/floors/floors.cpp b/kharma/floors/floors.cpp
index ac866ef6..2582bbe0 100644
--- a/kharma/floors/floors.cpp
+++ b/kharma/floors/floors.cpp
@@ -170,11 +170,11 @@ TaskStatus ApplyFloors(MeshBlockData<Real> *rc)
 
     // Apply floors over the same zones we just updated with UtoP
     // This selects the entire zone, but we then require pflag >= 0,
-    // which eliminates marked corner zones
+    // which keeps us from covering completely uninitialized zones
+    // (but still applies to failed UtoP!)
     const IndexRange ib = rc->GetBoundsI(IndexDomain::entire);
     const IndexRange jb = rc->GetBoundsJ(IndexDomain::entire);
     const IndexRange kb = rc->GetBoundsK(IndexDomain::entire);
-
     pmb->par_for("apply_floors", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
         KOKKOS_LAMBDA_3D {
             if (((int) pflag(k, j, i)) >= InversionStatus::success) {
diff --git a/kharma/floors/floors.hpp b/kharma/floors/floors.hpp
index 821f7ff2..cec24209 100644
--- a/kharma/floors/floors.hpp
+++ b/kharma/floors/floors.hpp
@@ -335,18 +335,18 @@ KOKKOS_INLINE_FUNCTION int apply_floors(const GRCoordinates& G, const VariablePa
  * 
  * @return fflag: since no inversion is performed, this just returns a flag representing which geometric floors were hit
  * 
- * LOCKSTEP: Operates on and respects primitives *only*
+ * NOT LOCKSTEP: Operates on and respects primitives *only*
  */
 template<typename Local>
 KOKKOS_INLINE_FUNCTION int apply_geo_floors(const GRCoordinates& G, Local& P, const VarMap& m,
-                                            const Real& gam, const int& k, const int& j, const int& i,
+                                            const Real& gam, const int& j, const int& i,
                                             const Floors::Prescription& floors, const Loci loc=Loci::center)
 {
     // Apply only the geometric floors
     Real rhoflr_geom, uflr_geom;
     if(G.coords.spherical()) {
         GReal Xembed[GR_DIM];
-        G.coord_embed(k, j, i, loc, Xembed);
+        G.coord_embed(0, j, i, loc, Xembed);
         GReal r = Xembed[1];
 
         if (floors.use_r_char) {
@@ -368,8 +368,8 @@ KOKKOS_INLINE_FUNCTION int apply_geo_floors(const GRCoordinates& G, Local& P, co
 #if RECORD_POST_RECON
     // Record all the floors that were hit, using bitflags
     // Record Geometric floor hits
-    fflag |= (rhoflr_geom > P(m.RHO, i)) * HIT_FLOOR_GEOM_RHO_FLUX;
-    fflag |= (uflr_geom > P(m.UU, i)) * HIT_FLOOR_GEOM_U_FLUX;
+    fflag |= (rhoflr_geom > P(m.RHO)) * HIT_FLOOR_GEOM_RHO_FLUX;
+    fflag |= (uflr_geom > P(m.UU)) * HIT_FLOOR_GEOM_U_FLUX;
 #endif
 
     P(m.RHO) += max(0., rhoflr_geom - P(m.RHO));
@@ -378,4 +378,45 @@ KOKKOS_INLINE_FUNCTION int apply_geo_floors(const GRCoordinates& G, Local& P, co
     return fflag;
 }
 
+template<typename Global>
+KOKKOS_INLINE_FUNCTION int apply_geo_floors(const GRCoordinates& G, Global& P, const VarMap& m,
+                                            const Real& gam, const int& k, const int& j, const int& i,
+                                            const Floors::Prescription& floors, const Loci loc=Loci::center)
+{
+    // Apply only the geometric floors
+    Real rhoflr_geom, uflr_geom;
+    if(G.coords.spherical()) {
+        GReal Xembed[GR_DIM];
+        G.coord_embed(k, j, i, loc, Xembed);
+        GReal r = Xembed[1];
+
+        if (floors.use_r_char) {
+            // Steeper floor from iharm3d
+            Real rhoscal = pow(r, -2.) * 1 / (1 + r / floors.r_char);
+            rhoflr_geom = floors.rho_min_geom * rhoscal;
+            uflr_geom = floors.u_min_geom * pow(rhoscal, gam);
+        } else {
+            // Original floors from iharm2d
+            rhoflr_geom = floors.rho_min_geom * pow(r, -1.5);
+            uflr_geom = floors.u_min_geom * pow(r, -2.5); //rhoscal/r as in iharm2d
+        }
+    } else {
+        rhoflr_geom = floors.rho_min_geom;
+        uflr_geom = floors.u_min_geom;
+    }
+
+    int fflag = 0;
+#if RECORD_POST_RECON
+    // Record all the floors that were hit, using bitflags
+    // Record Geometric floor hits
+    fflag |= (rhoflr_geom > P(m.RHO, k, j, i)) * HIT_FLOOR_GEOM_RHO_FLUX;
+    fflag |= (uflr_geom > P(m.UU, k, j, i)) * HIT_FLOOR_GEOM_U_FLUX;
+#endif
+
+    P(m.RHO, k, j, i) += max(0., rhoflr_geom - P(m.RHO, k, j, i));
+    P(m.UU, k, j, i) += max(0., uflr_geom - P(m.UU, k, j, i));
+
+    return fflag;
+}
+
 } // namespace Floors
diff --git a/kharma/flux.hpp b/kharma/flux.hpp
index c78b2ea9..83112d8d 100644
--- a/kharma/flux.hpp
+++ b/kharma/flux.hpp
@@ -211,8 +211,8 @@ inline TaskStatus GetFlux(MeshData<Real> *md)
                     // Apply floors to the *reconstructed* primitives, because without TVD
                     // we have no guarantee they remotely resemble the *centered* primitives
                     if (Recon == ReconstructionType::weno5 && !disable_floors) {
-                        Floors::apply_geo_floors(G, Pl, m_p, gam, k, j, i, floors, loc);
-                        Floors::apply_geo_floors(G, Pr, m_p, gam, k, j, i, floors, loc);
+                        Floors::apply_geo_floors(G, Pl, m_p, gam, j, i, floors, loc);
+                        Floors::apply_geo_floors(G, Pr, m_p, gam, j, i, floors, loc);
                     }
 #if !FUSE_FLUX_KERNELS
                 }
diff --git a/kharma/grmhd/fixup.cpp b/kharma/grmhd/fixup.cpp
index 4b194434..626ac6d0 100644
--- a/kharma/grmhd/fixup.cpp
+++ b/kharma/grmhd/fixup.cpp
@@ -111,7 +111,7 @@ TaskStatus GRMHD::FixUtoP(MeshBlockData<Real> *rc)
                     if (verbose >= 1 && inside(k, j, i, kb_b, jb_b, ib_b)) // If an interior zone...
                         printf("No neighbors were available at %d %d %d!\n", i, j, k);
 #endif
-                    PRIMLOOP P(p, k, j, i) = sum_x[p]/wsum_x;
+                    //PRIMLOOP P(p, k, j, i) = sum_x[p]/wsum_x;
                 } else {
                     PRIMLOOP P(p, k, j, i) = sum[p]/wsum;
                 }
@@ -131,15 +131,18 @@ TaskStatus GRMHD::FixUtoP(MeshBlockData<Real> *rc)
     pmb->par_for("fix_U_to_P_floors", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
         KOKKOS_LAMBDA_3D {
             if (((int) pflag(k, j, i)) > InversionStatus::success) {
+                apply_geo_floors(G, P, m_p, gam, k, j, i, floors);
+
                 // Make sure to keep lockstep
                 // This will only be run for GRMHD, so we can call its p_to_u
                 GRMHD::p_to_u(G, P, m_p, gam, k, j, i, U, m_u);
 
                 // And make sure the fixed values still abide by floors (floors keep lockstep)
-                int fflag_local = 0;
-                fflag_local |= Floors::apply_floors(G, P, m_p, gam, k, j, i, floors, U, m_u);
-                fflag_local |= Floors::apply_ceilings(G, P, m_p, gam, k, j, i, floors, U, m_u);
-                fflag(k, j, i) = fflag_local;
+                // TODO Fluid Frame instead of just geo?
+                // int fflag_local = 0;
+                // fflag_local |= Floors::apply_floors(G, P, m_p, gam, k, j, i, floors, U, m_u);
+                // fflag_local |= Floors::apply_ceilings(G, P, m_p, gam, k, j, i, floors, U, m_u);
+                // fflag(k, j, i) = fflag_local;
             }
         }
     );
diff --git a/kharma/grmhd/grmhd.cpp b/kharma/grmhd/grmhd.cpp
index d77bff48..0124b078 100644
--- a/kharma/grmhd/grmhd.cpp
+++ b/kharma/grmhd/grmhd.cpp
@@ -214,8 +214,11 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
         // and the primitives as "Derived"
         // Primitives are still used for reconstruction, physical boundaries, and output, and are
         // generally the easier to understand quantities
+        // Note especially their ghost zones are also filled! This is less efficient than syncing just
+        // one or the other, but allows the most flexibility for reasons that should be clearer in harm_driver.cpp
         flags_prim = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Derived,
-                                                Metadata::Restart, isPrimitive, isHD, isMHD});
+                                                Metadata::FillGhost, Metadata::Restart,
+                                                isPrimitive, isHD, isMHD});
         // Conserved variables are actualy rho*u^0 & T^0_mu, but are named after the prims for consistency
         // We will rarely need the conserved variables by name, we will mostly be treating them as a group
         flags_cons = std::vector<MetadataFlag>({Metadata::Real, Metadata::Cell, Metadata::Independent,
diff --git a/kharma/harm_driver.cpp b/kharma/harm_driver.cpp
index 0bff7667..97b2a2a5 100644
--- a/kharma/harm_driver.cpp
+++ b/kharma/harm_driver.cpp
@@ -159,13 +159,13 @@ TaskCollection HARMDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
         // Zero any fluxes through the pole or inflow from outflow boundaries
         auto t_fix_flux = tl.AddTask(t_recv_flux, KBoundaries::FixFlux, mc0.get());
 
-        auto t_flux_fixed = t_fix_flux;
+        auto t_flux_ct = t_fix_flux;
         if (use_b_flux_ct) {
             // Fix the conserved fluxes (exclusively B1/2/3) so that they obey divB==0,
             // and there is no B field flux through the pole
-            auto t_flux_ct = tl.AddTask(t_fix_flux, B_FluxCT::TransportB, mc0.get());
-            t_flux_fixed = t_flux_ct;
+            t_flux_ct = tl.AddTask(t_fix_flux, B_FluxCT::TransportB, mc0.get());
         }
+        auto t_flux_fixed = t_flux_ct;
 
         // APPLY FLUXES
         auto t_flux_div = tl.AddTask(t_flux_fixed, Update::FluxDivergence<MeshData<Real>>, mc0.get(), mdudt.get());
@@ -193,6 +193,16 @@ TaskCollection HARMDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
         // apply du/dt to all independent fields in the container
         auto t_update = tl.AddTask(t_avg_data, Update::UpdateIndependentData<MeshData<Real>>, mc0.get(),
                                 mdudt.get(), beta * dt, mc1.get());
+
+        // U_to_P needs a guess in order to converge, so we copy in sc0
+        // (but only the fluid primitives!)  Copying and syncing ensures that solves of the same zone
+        // on adjacent ranks are seeded with the same value, which keeps them (more) similar
+        MetadataFlag isPrimitive = pkgs.at("GRMHD")->Param<MetadataFlag>("PrimitiveFlag");
+        MetadataFlag isHD = pkgs.at("GRMHD")->Param<MetadataFlag>("HDFlag");
+        auto t_copy_prims = tl.AddTask(t_none, Update::WeightedSumData<MetadataFlag, MeshData<Real>>,
+                                    std::vector<MetadataFlag>({isHD, isPrimitive}),
+                                    mc0.get(), mc0.get(), 1.0, 0.0, mc1.get());
+
     }
 
     // MPI/MeshBlock boundary exchange.
@@ -204,32 +214,50 @@ TaskCollection HARMDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
         TaskRegion &tr1 = tc.AddRegion(num_partitions);
         for (int i = 0; i < num_partitions; i++) {
             auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
+            tr1[i].AddTask(t_none,
+                [](MeshData<Real> *mc1){ Flag(mc1, "Parthenon Send Buffers"); return TaskStatus::complete; }
+            , mc1.get());
             tr1[i].AddTask(t_none, cell_centered_bvars::SendBoundaryBuffers, mc1);
         }
         TaskRegion &tr2 = tc.AddRegion(num_partitions);
         for (int i = 0; i < num_partitions; i++) {
             auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
+            tr2[i].AddTask(t_none,
+                [](MeshData<Real> *mc1){ Flag(mc1, "Parthenon Recv Buffers"); return TaskStatus::complete; }
+            , mc1.get());
             tr2[i].AddTask(t_none, cell_centered_bvars::ReceiveBoundaryBuffers, mc1);
         }
         TaskRegion &tr3 = tc.AddRegion(num_partitions);
         for (int i = 0; i < num_partitions; i++) {
             auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
+            tr3[i].AddTask(t_none,
+                [](MeshData<Real> *mc1){ Flag(mc1, "Parthenon Set Boundaries"); return TaskStatus::complete; }
+            , mc1.get());
             tr3[i].AddTask(t_none, cell_centered_bvars::SetBoundaries, mc1);
         }
     } else {
         TaskRegion &tr1 = tc.AddRegion(blocks.size());
         for (int i = 0; i < blocks.size(); i++) {
             auto &sc1 = blocks[i]->meshblock_data.Get(stage_name[stage]);
+            tr1[i].AddTask(t_none,
+                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Parthenon Send Buffers"); return TaskStatus::complete; }
+            , sc1.get());
             tr1[i].AddTask(t_none, &MeshBlockData<Real>::SendBoundaryBuffers, sc1.get());
         }
         TaskRegion &tr2 = tc.AddRegion(blocks.size());
         for (int i = 0; i < blocks.size(); i++) {
             auto &sc1 = blocks[i]->meshblock_data.Get(stage_name[stage]);
+            tr2[i].AddTask(t_none,
+                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Parthenon Recv Buffers"); return TaskStatus::complete; }
+            , sc1.get());
             tr2[i].AddTask(t_none, &MeshBlockData<Real>::ReceiveBoundaryBuffers, sc1.get());
         }
         TaskRegion &tr3 = tc.AddRegion(blocks.size());
         for (int i = 0; i < blocks.size(); i++) {
             auto &sc1 = blocks[i]->meshblock_data.Get(stage_name[stage]);
+            tr3[i].AddTask(t_none,
+                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Parthenon Set Boundaries"); return TaskStatus::complete; }
+            , sc1.get());
             tr3[i].AddTask(t_none, &MeshBlockData<Real>::SetBoundaries, sc1.get());
         }
     }
@@ -257,27 +285,15 @@ TaskCollection HARMDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
         // since they must be applied to the primitive variables rho,u,u1,u2,u3
         // but should apply to conserved forms of everything else.
 
-        // U_to_P needs a guess in order to converge, so we copy in sc0
-        // (but only the fluid primitives!)
-        // TODO move this before the bounds sync, in case we need to exchange U *AND* P for some reason
-        auto t_copy_prims = tl.AddTask(t_prolongBound,
-            [](MeshBlockData<Real> *rc0, MeshBlockData<Real> *rc1)
-            {
-                Flag(rc1, "Copying prims");
-                rc1->Get("prims.rho").data.DeepCopy(rc0->Get("prims.rho").data);
-                rc1->Get("prims.u").data.DeepCopy(rc0->Get("prims.u").data);
-                rc1->Get("prims.uvec").data.DeepCopy(rc0->Get("prims.uvec").data);
-                Flag(rc1, "Copied");
-                return TaskStatus::complete;
-            }, sc0.get(), sc1.get());
-
-        // This call fills the fluid primitive values in all physical zones, that is, including MPI boundaries:
-        // everywhere the conserved variables have been updated so far.
-        // This setup avoids extra boundary synchronization, by updating the primitives identically on different blocks,
-        // instead of explicitly exchanging them.
-        auto t_fill_derived = tl.AddTask(t_copy_prims, Update::FillDerived<MeshBlockData<Real>>, sc1.get());
-
-        // Then fix any inversions which failed.  Floors have been applied already as a part of (Post)FillDerived,
+        // This call fills the fluid primitive values in all physical zones, that is, including MPI boundaries but
+        // not the physical boundaries (which haven't been filled yet!)
+        // This relies on the primitives being calculated identically in MPI boundaries, vs their corresponding
+        // physical zones in the adjacent mesh block.  To ensure this, we seed the solver with the same values
+        // in each case, by synchronizing them along with the conserved values above.
+        auto t_fill_derived = tl.AddTask(t_prolongBound, Update::FillDerived<MeshBlockData<Real>>, sc1.get());
+        // After this call, the floors are applied (with the hook 'PostFillDerived', see floors.cpp)
+
+        // Immediately fix any inversions which failed.  Floors have been applied already as a part of (Post)FillDerived,
         // so fixups performed by averaging zones will return logical results.  Floors are re-applied after fixups
         // Someday this will not be necessary as guaranteed-convergent UtoP schemes exist
         auto t_fix_derived = tl.AddTask(t_fill_derived, GRMHD::FixUtoP, sc1.get());
@@ -286,9 +302,9 @@ TaskCollection HARMDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
         // boundaries.cpp, which apply physical boundary conditions based on the primitive variables of GRHD,
         // and based on the conserved forms for everything else.  Note that because this is called *after*
         // FillDerived (since it needs bulk fluid primitives to apply GRMHD boundaries), this function
-        // must call FillDerived *again*, to update just the ghost zones.
+        // must call FillDerived *again* (for everything except the GRHD variables) to fill P in the ghost zones.
         // This is why KHARMA packages need to implement their "FillDerived" a.k.a. UtoP functions in the form
-        // UtoP(rc, domain, coarse), so that they can be run over just the boundary domains here.
+        // UtoP(rc, domain, coarse): so that they can be run over just the boundary domains here.
         auto t_set_bc = tl.AddTask(t_fix_derived, parthenon::ApplyBoundaryConditions, sc1);
 
         // ADD SOURCES TO PRIMITIVE VARIABLES
diff --git a/kharma/imex_driver.cpp b/kharma/imex_driver.cpp
index 013dba62..b684ed26 100644
--- a/kharma/imex_driver.cpp
+++ b/kharma/imex_driver.cpp
@@ -168,13 +168,13 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
         // Zero any fluxes through the pole or inflow from outflow boundaries
         auto t_fix_flux = tl.AddTask(t_recv_flux, KBoundaries::FixFlux, mc0.get());
 
-        auto t_flux_fixed = t_fix_flux;
+        auto t_flux_ct = t_fix_flux;
         if (use_b_flux_ct) {
             // Fix the conserved fluxes (exclusively B1/2/3) so that they obey divB==0,
             // and there is no B field flux through the pole
-            auto t_flux_ct = tl.AddTask(t_fix_flux, B_FluxCT::TransportB, mc0.get());
-            t_flux_fixed = t_flux_ct;
+            t_flux_ct = tl.AddTask(t_fix_flux, B_FluxCT::TransportB, mc0.get()); // assign: a new 'auto' here would shadow the outer handle
         }
+        auto t_flux_fixed = t_flux_ct;
 
         // APPLY FLUXES
         auto t_flux_div = tl.AddTask(t_none, Update::FluxDivergence<MeshData<Real>>, mc0.get(), mdudt.get());
@@ -257,6 +257,16 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
         auto t_copy_result = tl.AddTask(t_implicit, Update::WeightedSumData<MetadataFlag, MeshData<Real>>, std::vector<MetadataFlag>({}),
                                         mc_solver.get(), mc_solver.get(), 1.0, 0.0, mc1.get());
 
+        // If evolving GRMHD explicitly, U_to_P needs a guess in order to converge, so we copy in mc0
+        auto t_copy_prims = t_none; // stays t_none when GRMHD is evolved implicitly (no copy task is added)
+        if (!pkgs.at("GRMHD")->Param<bool>("implicit")) {
+            MetadataFlag isPrimitive = pkgs.at("GRMHD")->Param<MetadataFlag>("PrimitiveFlag");
+            MetadataFlag isHD = pkgs.at("GRMHD")->Param<MetadataFlag>("HDFlag");
+            t_copy_prims = tl.AddTask(t_none, Update::WeightedSumData<MetadataFlag, MeshData<Real>>,
+                                        std::vector<MetadataFlag>({isHD, isPrimitive}),
+                                        mc0.get(), mc0.get(), 1.0, 0.0, mc1.get());
+        }
+
     }
 
     // Even though we filled some primitive vars 
@@ -267,22 +277,11 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
         auto &sc0 = pmb->meshblock_data.Get(stage_name[stage-1]);
         auto &sc1 = pmb->meshblock_data.Get(stage_name[stage]);
 
-        // Copy primitives to form the guess for GRMHD::UtoP
-        // Only needed if GRMHD vars are being updated explicitly
-        auto t_copy_prims = t_none;
-        if (!pkgs.at("GRMHD")->Param<bool>("implicit")) {
-            MetadataFlag isPrimitive = pkgs.at("GRMHD")->Param<MetadataFlag>("PrimitiveFlag");
-            MetadataFlag isHD = pkgs.at("GRMHD")->Param<MetadataFlag>("HDFlag");
-            auto t_copy_prims = tl.AddTask(t_none, Update::WeightedSumData<MetadataFlag, MeshBlockData<Real>>,
-                                        std::vector<MetadataFlag>({isHD, isPrimitive}),
-                                        sc0.get(), sc0.get(), 1.0, 0.0, sc1.get());
-        }
-
         // Note that floors are applied (to all variables!) immediately after this FillDerived call.
         // However, it is *not* immediately corrected with FixUtoP, but synchronized (including pflags!) first.
         // With an extra ghost zone, this *should* still allow binary-similar evolution between numbers of mesh blocks,
         // but hasn't been tested.
-        auto t_fill_derived = tl.AddTask(t_copy_prims, Update::FillDerived<MeshBlockData<Real>>, sc1.get());
+        auto t_fill_derived = tl.AddTask(t_none, Update::FillDerived<MeshBlockData<Real>>, sc1.get());
     }
 
     // MPI/MeshBlock boundary exchange.
@@ -294,7 +293,7 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
         for (int i = 0; i < num_partitions; i++) {
             auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
             tr1[i].AddTask(t_none,
-                [](MeshData<Real> *mc1){ Flag(mc1, "Boundary 1"); return TaskStatus::complete; }
+                [](MeshData<Real> *mc1){ Flag(mc1, "Parthenon Send Buffers"); return TaskStatus::complete; }
             , mc1.get());
             tr1[i].AddTask(t_none, cell_centered_bvars::SendBoundaryBuffers, mc1);
         }
@@ -302,7 +301,7 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
         for (int i = 0; i < num_partitions; i++) {
             auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
             tr2[i].AddTask(t_none,
-                [](MeshData<Real> *mc1){ Flag(mc1, "Boundary 2"); return TaskStatus::complete; }
+                [](MeshData<Real> *mc1){ Flag(mc1, "Parthenon Recv Buffers"); return TaskStatus::complete; }
             , mc1.get());
             tr2[i].AddTask(t_none, cell_centered_bvars::ReceiveBoundaryBuffers, mc1);
         }
@@ -310,7 +309,7 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
         for (int i = 0; i < num_partitions; i++) {
             auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
             tr3[i].AddTask(t_none,
-                [](MeshData<Real> *mc1){ Flag(mc1, "Boundary 3"); return TaskStatus::complete; }
+                [](MeshData<Real> *mc1){ Flag(mc1, "Parthenon Set Boundaries"); return TaskStatus::complete; }
             , mc1.get());
             tr3[i].AddTask(t_none, cell_centered_bvars::SetBoundaries, mc1);
         }
@@ -319,7 +318,7 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
         for (int i = 0; i < blocks.size(); i++) {
             auto &sc1 = blocks[i]->meshblock_data.Get(stage_name[stage]);
             tr1[i].AddTask(t_none,
-                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Boundary 1"); return TaskStatus::complete; }
+                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Parthenon Send Buffers"); return TaskStatus::complete; }
             , sc1.get());
             tr1[i].AddTask(t_none, &MeshBlockData<Real>::SendBoundaryBuffers, sc1.get());
         }
@@ -327,7 +326,7 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
         for (int i = 0; i < blocks.size(); i++) {
             auto &sc1 = blocks[i]->meshblock_data.Get(stage_name[stage]);
             tr2[i].AddTask(t_none,
-                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Boundary 2"); return TaskStatus::complete; }
+                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Parthenon Recv Buffers"); return TaskStatus::complete; }
             , sc1.get());
             tr2[i].AddTask(t_none, &MeshBlockData<Real>::ReceiveBoundaryBuffers, sc1.get());
         }
@@ -335,7 +334,7 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
         for (int i = 0; i < blocks.size(); i++) {
             auto &sc1 = blocks[i]->meshblock_data.Get(stage_name[stage]);
             tr3[i].AddTask(t_none,
-                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Boundary 3"); return TaskStatus::complete; }
+                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Parthenon Set Boundaries"); return TaskStatus::complete; }
             , sc1.get());
             tr3[i].AddTask(t_none, &MeshBlockData<Real>::SetBoundaries, sc1.get());
         }
@@ -349,10 +348,6 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
         auto &sc0 = pmb->meshblock_data.Get(stage_name[stage-1]);
         auto &sc1 = pmb->meshblock_data.Get(stage_name[stage]);
 
-        auto t_flag = tl.AddTask(t_none,
-                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Copying prims"); return TaskStatus::complete; }
-            , sc1.get());
-
         auto t_clear_comm_flags = tl.AddTask(t_none, &MeshBlockData<Real>::ClearBoundary,
                                         sc1.get(), BoundaryCommSubset::all);
 
diff --git a/kharma/types.hpp b/kharma/types.hpp
index 15258a01..31b33bef 100644
--- a/kharma/types.hpp
+++ b/kharma/types.hpp
@@ -169,7 +169,7 @@ inline bool IsDomainBound(MeshBlock *pmb, BoundaryFace face)
  * at each important function entry/exit
  */
 #if TRACE
-#define PRINTCORNERS 0
+#define PRINTCORNERS 1
 #define PRINTZONE 0
 inline void PrintCorner(MeshBlockData<Real> *rc)
 {
@@ -181,7 +181,8 @@ inline void PrintCorner(MeshBlockData<Real> *rc)
     auto uc = rc->Get("cons.u").data.GetHostMirrorAndCopy();
     auto uvecc = rc->Get("cons.uvec").data.GetHostMirrorAndCopy();
     auto Bu = rc->Get("cons.B").data.GetHostMirrorAndCopy();
-    auto p = rc->Get("p").data.GetHostMirrorAndCopy();
+    //auto p = rc->Get("p").data.GetHostMirrorAndCopy();
+    auto pflag = rc->Get("pflag").data.GetHostMirrorAndCopy();
     //auto q = rc->Get("prims.q").data.GetHostMirrorAndCopy();
     //auto dP = rc->Get("prims.dP").data.GetHostMirrorAndCopy();
     const IndexRange ib = rc->GetBoundsI(IndexDomain::interior);
@@ -191,16 +192,16 @@ inline void PrintCorner(MeshBlockData<Real> *rc)
     for (int j=0; j<8; j++) {
         cerr << endl;
         for (int i=0; i<8; i++) {
-            fprintf(stderr, "%.5g\t", p(kb.s, j, i));
-        }
-    }
-    cerr << endl << "B1:";
-    for (int j=0; j<8; j++) {
-        cerr << endl;
-        for (int i=0; i<8; i++) {
-            fprintf(stderr, "%.5g\t", Bu(V1, kb.s, j, i));
+            fprintf(stderr, "%.5g\t", pflag(kb.s, j, i));
         }
     }
+    // cerr << endl << "B1:";
+    // for (int j=0; j<8; j++) {
+    //     cerr << endl;
+    //     for (int i=0; i<8; i++) {
+    //         fprintf(stderr, "%.5g\t", Bu(V1, kb.s, j, i));
+    //     }
+    // }
     cerr << endl << endl;
 }
 
diff --git a/pars/sane_divb_2d.par b/pars/sane_divb_2d.par
new file mode 100644
index 00000000..542ba8d6
--- /dev/null
+++ b/pars/sane_divb_2d.par
@@ -0,0 +1,80 @@
+# SANE model mirroring the simulation library
+# Quite small to run for more than 10kM: Fishbone-Moncrief (F-M) torus with rin = 6M, rmax = 12M
+# Overall simulation size 1000M (r_out)
+
+<parthenon/job>
+problem_id = torus
+
+<parthenon/mesh>
+refinement = none
+numlevel = 1
+nx1 = 128
+nx2 = 128
+nx3 = 1
+
+<parthenon/meshblock>
+nx1 = 128
+nx2 = 64
+nx3 = 1
+
+<coordinates>
+base = spherical_ks
+transform = fmks
+r_out = 1000
+a = 0.9375
+hslope = 0.3
+mks_smooth = 0.5
+poly_xt = 0.82
+poly_alpha = 14.0
+
+<parthenon/time>
+tlim = 3000.0
+nlim = -1
+
+<driver>
+type = harm
+
+<GRMHD>
+cfl = 0.9
+gamma = 1.666667
+reconstruction = weno5
+implicit = false
+
+<torus>
+rin = 6.0
+rmax = 12.0
+
+<perturbation>
+u_jitter = 0.04
+
+<b_field>
+type = sane
+beta_min = 100.
+implicit = false
+
+<floors>
+rho_min_geom = 1e-6
+u_min_geom = 1e-8
+bsq_over_rho_max = 100
+u_over_rho_max = 2
+frame = fluid
+
+<debug>
+archive_parameters = false
+verbose = 1
+extra_checks = 1
+flag_verbose = 0
+
+<parthenon/output0>
+file_type = hdf5
+dt = 100.0
+variables = prims.rho, prims.u, prims.uvec, prims.B, jcon, fflag, pflag, divB
+
+<parthenon/output1>
+file_type = rst
+dt = 100.0
+ghost_zones = true
+
+<parthenon/output2>
+file_type = hst
+dt = 0.1
diff --git a/scripts/quick_divb.py b/scripts/quick_divb.py
new file mode 100644
index 00000000..ad872af5
--- /dev/null
+++ b/scripts/quick_divb.py
@@ -0,0 +1,43 @@
+import numpy as np
+import h5py
+# Quick check: print the largest |divB| found in one dump file, and the index where it occurs.
+from pyharm.grid import make_some_grid
+from pyharm.defs import Loci, Slices
+# NOTE(review): file name, the 5:8 slice of 'p' (presumably B1..B3) and the grid size are hard-coded; confirm per run.
+with h5py.File("torus.out1.00100.h5", "r") as f:  # the slice is read into memory, then the file is closed
+    B = f['p'][5:8,:,:,:].transpose(0,3,2,1)
+
+G = make_some_grid('fmks', 288, 128, 128, 0.9375)
+gdet = G.gdet[Loci.CENT.value]
+s = Slices(ng=1)
+
+divB = np.abs(0.25 * (
+        B[0][s.b, s.b, s.b] * gdet[s.b, s.b, :]
+        + B[0][s.b, s.l1, s.b] * gdet[s.b, s.l1, :]
+        + B[0][s.b, s.b, s.l1] * gdet[s.b, s.b, :]
+        + B[0][s.b, s.l1, s.l1] * gdet[s.b, s.l1, :]
+        - B[0][s.l1, s.b, s.b] * gdet[s.l1, s.b, :]
+        - B[0][s.l1, s.l1, s.b] * gdet[s.l1, s.l1, :]
+        - B[0][s.l1, s.b, s.l1] * gdet[s.l1, s.b, :]
+        - B[0][s.l1, s.l1, s.l1] * gdet[s.l1, s.l1, :]
+        ) / G.dx[1] + 0.25 * (
+        B[1][s.b, s.b, s.b] * gdet[s.b, s.b, :]
+        + B[1][s.l1, s.b, s.b] * gdet[s.l1, s.b, :]
+        + B[1][s.b, s.b, s.l1] * gdet[s.b, s.b, :]
+        + B[1][s.l1, s.b, s.l1] * gdet[s.l1, s.b, :]
+        - B[1][s.b, s.l1, s.b] * gdet[s.b, s.l1, :]
+        - B[1][s.l1, s.l1, s.b] * gdet[s.l1, s.l1, :]
+        - B[1][s.b, s.l1, s.l1] * gdet[s.b, s.l1, :]
+        - B[1][s.l1, s.l1, s.l1] * gdet[s.l1, s.l1, :]
+        ) / G.dx[2] + 0.25 * (
+        B[2][s.b, s.b, s.b] * gdet[s.b, s.b, :]
+        + B[2][s.b, s.l1, s.b] * gdet[s.b, s.l1, :]
+        + B[2][s.l1, s.b, s.b] * gdet[s.l1, s.b, :]
+        + B[2][s.l1, s.l1, s.b] * gdet[s.l1, s.l1, :]
+        - B[2][s.b, s.b, s.l1] * gdet[s.b, s.b, :]
+        - B[2][s.b, s.l1, s.l1] * gdet[s.b, s.l1, :]
+        - B[2][s.l1, s.b, s.l1] * gdet[s.l1, s.b, :]
+        - B[2][s.l1, s.l1, s.l1] * gdet[s.l1, s.l1, :]
+        ) / G.dx[3])
+
+print("Max divB ", np.max(divB), " at ", np.unravel_index(np.argmax(divB, axis=None), divB.shape))

From f60a39960b1aaeceeb4085528b14f7c9d8bb7321 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Wed, 30 Mar 2022 15:24:31 -0500
Subject: [PATCH 22/26] Optional second boundary synchronization in KHARMA, for
 testing meshblock-breakdown dependency.

---
 kharma/grmhd/grmhd.cpp |  4 +++
 kharma/harm_driver.cpp | 63 +++++++-------------------------------
 kharma/harm_driver.hpp | 68 ++++++++++++++++++++++++++++++++++++++++++
 kharma/imex_driver.cpp | 58 +++++------------------------------
 kharma/kharma.cpp      |  1 +
 5 files changed, 91 insertions(+), 103 deletions(-)

diff --git a/kharma/grmhd/grmhd.cpp b/kharma/grmhd/grmhd.cpp
index 0124b078..7072a929 100644
--- a/kharma/grmhd/grmhd.cpp
+++ b/kharma/grmhd/grmhd.cpp
@@ -177,6 +177,10 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin, Packages_t pack
     // together.  Useful if # MeshBlocks is > # MPI ranks
     bool pack_comms = pin->GetOrAddBoolean("perf", "pack_comms", true);
     params.Add("pack_comms", pack_comms);
+    // Synchronize boundary variables twice.  Ensures KHARMA is agnostic to the breakdown
+    // of meshblocks, at the cost of twice the MPI overhead, for potentially much worse strong scaling.
+    bool two_sync = pin->GetOrAddBoolean("perf", "two_sync", false);
+    params.Add("two_sync", two_sync);
 
     // Adaptive mesh refinement options
     // Only active if "refinement" and "numlevel" parameters allow
diff --git a/kharma/harm_driver.cpp b/kharma/harm_driver.cpp
index 97b2a2a5..1eecca0c 100644
--- a/kharma/harm_driver.cpp
+++ b/kharma/harm_driver.cpp
@@ -207,60 +207,10 @@ TaskCollection HARMDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
 
     // MPI/MeshBlock boundary exchange.
     // Optionally "packed" to send all data in one call (num_partitions defaults to 1)
-    // TODO do these all need to be sequential?  What are the specifics here?
+    // Recall this syncs conserved vars *and* primitive vars to seed UtoP correctly
     const auto &pack_comms =
         blocks[0]->packages.Get("GRMHD")->Param<bool>("pack_comms");
-    if (pack_comms) {
-        TaskRegion &tr1 = tc.AddRegion(num_partitions);
-        for (int i = 0; i < num_partitions; i++) {
-            auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
-            tr1[i].AddTask(t_none,
-                [](MeshData<Real> *mc1){ Flag(mc1, "Parthenon Send Buffers"); return TaskStatus::complete; }
-            , mc1.get());
-            tr1[i].AddTask(t_none, cell_centered_bvars::SendBoundaryBuffers, mc1);
-        }
-        TaskRegion &tr2 = tc.AddRegion(num_partitions);
-        for (int i = 0; i < num_partitions; i++) {
-            auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
-            tr2[i].AddTask(t_none,
-                [](MeshData<Real> *mc1){ Flag(mc1, "Parthenon Recv Buffers"); return TaskStatus::complete; }
-            , mc1.get());
-            tr2[i].AddTask(t_none, cell_centered_bvars::ReceiveBoundaryBuffers, mc1);
-        }
-        TaskRegion &tr3 = tc.AddRegion(num_partitions);
-        for (int i = 0; i < num_partitions; i++) {
-            auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
-            tr3[i].AddTask(t_none,
-                [](MeshData<Real> *mc1){ Flag(mc1, "Parthenon Set Boundaries"); return TaskStatus::complete; }
-            , mc1.get());
-            tr3[i].AddTask(t_none, cell_centered_bvars::SetBoundaries, mc1);
-        }
-    } else {
-        TaskRegion &tr1 = tc.AddRegion(blocks.size());
-        for (int i = 0; i < blocks.size(); i++) {
-            auto &sc1 = blocks[i]->meshblock_data.Get(stage_name[stage]);
-            tr1[i].AddTask(t_none,
-                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Parthenon Send Buffers"); return TaskStatus::complete; }
-            , sc1.get());
-            tr1[i].AddTask(t_none, &MeshBlockData<Real>::SendBoundaryBuffers, sc1.get());
-        }
-        TaskRegion &tr2 = tc.AddRegion(blocks.size());
-        for (int i = 0; i < blocks.size(); i++) {
-            auto &sc1 = blocks[i]->meshblock_data.Get(stage_name[stage]);
-            tr2[i].AddTask(t_none,
-                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Parthenon Recv Buffers"); return TaskStatus::complete; }
-            , sc1.get());
-            tr2[i].AddTask(t_none, &MeshBlockData<Real>::ReceiveBoundaryBuffers, sc1.get());
-        }
-        TaskRegion &tr3 = tc.AddRegion(blocks.size());
-        for (int i = 0; i < blocks.size(); i++) {
-            auto &sc1 = blocks[i]->meshblock_data.Get(stage_name[stage]);
-            tr3[i].AddTask(t_none,
-                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Parthenon Set Boundaries"); return TaskStatus::complete; }
-            , sc1.get());
-            tr3[i].AddTask(t_none, &MeshBlockData<Real>::SetBoundaries, sc1.get());
-        }
-    }
+    AddBoundarySync(tc, pmesh, blocks, integrator.get(), stage, pack_comms);
 
     // Async Region: Fill primitive values, apply physical boundary conditions,
     // add any source terms which require the full primitives->primitives step
@@ -334,5 +284,14 @@ TaskCollection HARMDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
         }
     }
 
+    // Second boundary sync:
+    // ensure that primitive variables in ghost zones are *exactly*
+    // identical to their physical counterparts, now that they have been
+    // modified on each rank.
+    const auto &two_sync = pkgs.at("GRMHD")->Param<bool>("two_sync");
+    if (two_sync) {
+        AddBoundarySync(tc, pmesh, blocks, integrator.get(), stage, pack_comms);
+    }
+
     return tc;
 }
diff --git a/kharma/harm_driver.hpp b/kharma/harm_driver.hpp
index 609949f9..a95eae10 100644
--- a/kharma/harm_driver.hpp
+++ b/kharma/harm_driver.hpp
@@ -37,6 +37,8 @@
 
 #include <parthenon/parthenon.hpp>
 
+#include "types.hpp"
+
 using namespace parthenon;
 
 /**
@@ -66,3 +68,69 @@ class HARMDriver : public MultiStageDriver {
         // Global solves need a reduction point
         AllReduce<Real> update_norm;
 };
+
+/**
+ * Append one MPI/MeshBlock boundary exchange to tc: three consecutive TaskRegions which send buffers,
+ * receive buffers, and set boundaries for the container stage_name[stage]. A Flag() task is added beside each.
+ * @param pack_comms if true, add one task list per MeshData partition (cell_centered_bvars:: functions);
+ *                   otherwise add one task list per MeshBlock (MeshBlockData<Real> member functions).
+ * TODO could make member of a HARMDriver/ImExDriver superclass?  Do the three phases all need to be sequential?
+ */
+inline void AddBoundarySync(TaskCollection &tc, Mesh *pmesh, BlockList_t &blocks, StagedIntegrator *integrator, int stage, bool pack_comms=false)
+{
+    TaskID t_none(0);
+    const int num_partitions = pmesh->DefaultNumPartitions();
+    const int num_blocks = static_cast<int>(blocks.size()); // hoisted so the loops compare int with int
+    const auto &stage_name = integrator->stage_name; // bind by reference: the names are only read here
+    if (pack_comms) {
+        TaskRegion &tr1 = tc.AddRegion(num_partitions);
+        for (int i = 0; i < num_partitions; i++) {
+            auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
+            tr1[i].AddTask(t_none,
+                [](MeshData<Real> *mc1){ Flag(mc1, "Parthenon Send Buffers"); return TaskStatus::complete; }
+            , mc1.get());
+            tr1[i].AddTask(t_none, cell_centered_bvars::SendBoundaryBuffers, mc1);
+        }
+        TaskRegion &tr2 = tc.AddRegion(num_partitions);
+        for (int i = 0; i < num_partitions; i++) {
+            auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
+            tr2[i].AddTask(t_none,
+                [](MeshData<Real> *mc1){ Flag(mc1, "Parthenon Recv Buffers"); return TaskStatus::complete; }
+            , mc1.get());
+            tr2[i].AddTask(t_none, cell_centered_bvars::ReceiveBoundaryBuffers, mc1);
+        }
+        TaskRegion &tr3 = tc.AddRegion(num_partitions);
+        for (int i = 0; i < num_partitions; i++) {
+            auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
+            tr3[i].AddTask(t_none,
+                [](MeshData<Real> *mc1){ Flag(mc1, "Parthenon Set Boundaries"); return TaskStatus::complete; }
+            , mc1.get());
+            tr3[i].AddTask(t_none, cell_centered_bvars::SetBoundaries, mc1);
+        }
+    } else {
+        TaskRegion &tr1 = tc.AddRegion(num_blocks);
+        for (int i = 0; i < num_blocks; i++) {
+            auto &sc1 = blocks[i]->meshblock_data.Get(stage_name[stage]);
+            tr1[i].AddTask(t_none,
+                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Parthenon Send Buffers"); return TaskStatus::complete; }
+            , sc1.get());
+            tr1[i].AddTask(t_none, &MeshBlockData<Real>::SendBoundaryBuffers, sc1.get());
+        }
+        TaskRegion &tr2 = tc.AddRegion(num_blocks);
+        for (int i = 0; i < num_blocks; i++) {
+            auto &sc1 = blocks[i]->meshblock_data.Get(stage_name[stage]);
+            tr2[i].AddTask(t_none,
+                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Parthenon Recv Buffers"); return TaskStatus::complete; }
+            , sc1.get());
+            tr2[i].AddTask(t_none, &MeshBlockData<Real>::ReceiveBoundaryBuffers, sc1.get());
+        }
+        TaskRegion &tr3 = tc.AddRegion(num_blocks);
+        for (int i = 0; i < num_blocks; i++) {
+            auto &sc1 = blocks[i]->meshblock_data.Get(stage_name[stage]);
+            tr3[i].AddTask(t_none,
+                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Parthenon Set Boundaries"); return TaskStatus::complete; }
+            , sc1.get());
+            tr3[i].AddTask(t_none, &MeshBlockData<Real>::SetBoundaries, sc1.get());
+        }
+    }
+}
diff --git a/kharma/imex_driver.cpp b/kharma/imex_driver.cpp
index b684ed26..b2e81c33 100644
--- a/kharma/imex_driver.cpp
+++ b/kharma/imex_driver.cpp
@@ -52,6 +52,7 @@
 #include "debug.hpp"
 #include "fixup.hpp"
 #include "flux.hpp"
+#include "harm_driver.hpp"
 #include "resize_restart.hpp"
 #include "implicit.hpp"
 #include "source.hpp"
@@ -288,57 +289,7 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
     // Optionally "packed" to send all data in one call (num_partitions defaults to 1)
     // Note that in this driver, this block syncs *primitive* variables, not conserved
     const auto &pack_comms = pkgs.at("GRMHD")->Param<bool>("pack_comms");
-    if (pack_comms) {
-        TaskRegion &tr1 = tc.AddRegion(num_partitions);
-        for (int i = 0; i < num_partitions; i++) {
-            auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
-            tr1[i].AddTask(t_none,
-                [](MeshData<Real> *mc1){ Flag(mc1, "Parthenon Send Buffers"); return TaskStatus::complete; }
-            , mc1.get());
-            tr1[i].AddTask(t_none, cell_centered_bvars::SendBoundaryBuffers, mc1);
-        }
-        TaskRegion &tr2 = tc.AddRegion(num_partitions);
-        for (int i = 0; i < num_partitions; i++) {
-            auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
-            tr2[i].AddTask(t_none,
-                [](MeshData<Real> *mc1){ Flag(mc1, "Parthenon Recv Buffers"); return TaskStatus::complete; }
-            , mc1.get());
-            tr2[i].AddTask(t_none, cell_centered_bvars::ReceiveBoundaryBuffers, mc1);
-        }
-        TaskRegion &tr3 = tc.AddRegion(num_partitions);
-        for (int i = 0; i < num_partitions; i++) {
-            auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
-            tr3[i].AddTask(t_none,
-                [](MeshData<Real> *mc1){ Flag(mc1, "Parthenon Set Boundaries"); return TaskStatus::complete; }
-            , mc1.get());
-            tr3[i].AddTask(t_none, cell_centered_bvars::SetBoundaries, mc1);
-        }
-    } else {
-        TaskRegion &tr1 = tc.AddRegion(blocks.size());
-        for (int i = 0; i < blocks.size(); i++) {
-            auto &sc1 = blocks[i]->meshblock_data.Get(stage_name[stage]);
-            tr1[i].AddTask(t_none,
-                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Parthenon Send Buffers"); return TaskStatus::complete; }
-            , sc1.get());
-            tr1[i].AddTask(t_none, &MeshBlockData<Real>::SendBoundaryBuffers, sc1.get());
-        }
-        TaskRegion &tr2 = tc.AddRegion(blocks.size());
-        for (int i = 0; i < blocks.size(); i++) {
-            auto &sc1 = blocks[i]->meshblock_data.Get(stage_name[stage]);
-            tr2[i].AddTask(t_none,
-                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Parthenon Recv Buffers"); return TaskStatus::complete; }
-            , sc1.get());
-            tr2[i].AddTask(t_none, &MeshBlockData<Real>::ReceiveBoundaryBuffers, sc1.get());
-        }
-        TaskRegion &tr3 = tc.AddRegion(blocks.size());
-        for (int i = 0; i < blocks.size(); i++) {
-            auto &sc1 = blocks[i]->meshblock_data.Get(stage_name[stage]);
-            tr3[i].AddTask(t_none,
-                [](MeshBlockData<Real> *rc1){ Flag(rc1, "Parthenon Set Boundaries"); return TaskStatus::complete; }
-            , sc1.get());
-            tr3[i].AddTask(t_none, &MeshBlockData<Real>::SetBoundaries, sc1.get());
-        }
-    }
+    AddBoundarySync(tc, pmesh, blocks, integrator.get(), stage, pack_comms);
 
     // Async Region: Any post-sync tasks.  Fixups, timestep & AMR things.
     TaskRegion &async_region2 = tc.AddRegion(blocks.size());
@@ -390,6 +341,11 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
             }
         }
     }
+    
+    const auto &two_sync = pkgs.at("GRMHD")->Param<bool>("two_sync");
+    if (two_sync) {
+        AddBoundarySync(tc, pmesh, blocks, integrator.get(), stage, pack_comms);
+    }
 
     return tc;
 }
diff --git a/kharma/kharma.cpp b/kharma/kharma.cpp
index a61091ea..d2a4cc57 100644
--- a/kharma/kharma.cpp
+++ b/kharma/kharma.cpp
@@ -158,6 +158,7 @@ void KHARMA::FixParameters(std::unique_ptr<ParameterInput>& pin)
         pin->SetReal("parthenon/mesh", "x1min", SMALL);
         pin->SetReal("parthenon/mesh", "x1max", Rout);
     }
+    
 
     // Assumption: if we're in a spherical system...
     if (cb == "spherical_ks" || cb == "spherical_bl" || cb == "spherical_minkowski") {

From 55066f62fc2e53113a146abb77cd3f91d783b6ee Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Wed, 30 Mar 2022 16:09:28 -0500
Subject: [PATCH 23/26] Fix two-sync by setting all the extra recv/clear flags
 not part of the core sync process.

---
 kharma/harm_driver.cpp | 24 ++++++++++++++++++++++++
 kharma/imex_driver.cpp | 30 +++++++++++++++++++++++++++++-
 kharma/types.hpp       |  2 +-
 3 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/kharma/harm_driver.cpp b/kharma/harm_driver.cpp
index 1eecca0c..e5b32e84 100644
--- a/kharma/harm_driver.cpp
+++ b/kharma/harm_driver.cpp
@@ -290,7 +290,31 @@ TaskCollection HARMDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
     // modified on each rank.
     const auto &two_sync = pkgs.at("GRMHD")->Param<bool>("two_sync");
     if (two_sync) {
+        TaskRegion &single_tasklist_per_pack_region = tc.AddRegion(num_partitions);
+        for (int i = 0; i < num_partitions; i++) {
+            auto &tl = single_tasklist_per_pack_region[i];
+            auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
+
+            auto t_start_recv = tl.AddTask(t_none, &MeshData<Real>::StartReceiving, mc1.get(),
+                                        BoundaryCommSubset::all);
+        }
+
         AddBoundarySync(tc, pmesh, blocks, integrator.get(), stage, pack_comms);
+
+        TaskRegion &async_region = tc.AddRegion(blocks.size());
+        for (int i = 0; i < blocks.size(); i++) {
+            auto &pmb = blocks[i];
+            auto &tl = async_region[i];
+            auto &sc1 = pmb->meshblock_data.Get(stage_name[stage]);
+
+            auto t_clear_comm_flags = tl.AddTask(t_none, &MeshBlockData<Real>::ClearBoundary,
+                                            sc1.get(), BoundaryCommSubset::all);
+
+            auto t_prolongBound = t_clear_comm_flags;
+            if (pmesh->multilevel) {
+                t_prolongBound = tl.AddTask(t_clear_comm_flags, ProlongateBoundaries, sc1);
+            }
+        }
     }
 
     return tc;
diff --git a/kharma/imex_driver.cpp b/kharma/imex_driver.cpp
index b2e81c33..f3bb72d5 100644
--- a/kharma/imex_driver.cpp
+++ b/kharma/imex_driver.cpp
@@ -341,10 +341,38 @@ TaskCollection ImexDriver::MakeTaskCollection(BlockList_t &blocks, int stage)
             }
         }
     }
-    
+
+    // Second boundary sync:
+    // ensure that primitive variables in ghost zones are *exactly*
+    // identical to their physical counterparts, now that they have been
+    // modified on each rank.
     const auto &two_sync = pkgs.at("GRMHD")->Param<bool>("two_sync");
     if (two_sync) {
+        TaskRegion &single_tasklist_per_pack_region = tc.AddRegion(num_partitions);
+        for (int i = 0; i < num_partitions; i++) {
+            auto &tl = single_tasklist_per_pack_region[i];
+            auto &mc1 = pmesh->mesh_data.GetOrAdd(stage_name[stage], i);
+
+            auto t_start_recv = tl.AddTask(t_none, &MeshData<Real>::StartReceiving, mc1.get(),
+                                        BoundaryCommSubset::all);
+        }
+
         AddBoundarySync(tc, pmesh, blocks, integrator.get(), stage, pack_comms);
+
+        TaskRegion &async_region = tc.AddRegion(blocks.size());
+        for (int i = 0; i < blocks.size(); i++) {
+            auto &pmb = blocks[i];
+            auto &tl = async_region[i];
+            auto &sc1 = pmb->meshblock_data.Get(stage_name[stage]);
+
+            auto t_clear_comm_flags = tl.AddTask(t_none, &MeshBlockData<Real>::ClearBoundary,
+                                            sc1.get(), BoundaryCommSubset::all);
+
+            auto t_prolongBound = t_clear_comm_flags;
+            if (pmesh->multilevel) {
+                t_prolongBound = tl.AddTask(t_clear_comm_flags, ProlongateBoundaries, sc1);
+            }
+        }
     }
 
     return tc;
diff --git a/kharma/types.hpp b/kharma/types.hpp
index 31b33bef..3e2b6cb8 100644
--- a/kharma/types.hpp
+++ b/kharma/types.hpp
@@ -169,7 +169,7 @@ inline bool IsDomainBound(MeshBlock *pmb, BoundaryFace face)
  * at each important function entry/exit
  */
 #if TRACE
-#define PRINTCORNERS 1
+#define PRINTCORNERS 0
 #define PRINTZONE 0
 inline void PrintCorner(MeshBlockData<Real> *rc)
 {

From 40fd91ead39812d65e439af2d6470da12dafbe0b Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Wed, 30 Mar 2022 17:57:57 -0500
Subject: [PATCH 24/26] Streamline & document CMakeLists for easier compile
 changes, update SyncAllBounds for sync changes, default to cleaning B when
 starting tilted disks

---
 CMakeLists.txt                  | 22 +++++-----
 kharma/b_cleanup/b_cleanup.cpp  |  5 ++-
 kharma/boundaries.cpp           | 71 ++++++++++++++++++++++++++++++++
 kharma/boundaries.hpp           | 48 +++++++++++++++++++++-
 kharma/prob/post_initialize.cpp | 72 ++-------------------------------
 pars/sane_tilt.par              |  9 ++++-
 6 files changed, 145 insertions(+), 82 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 74343a3a..9c9f39ce 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,15 +13,11 @@ set(CMAKE_CXX_STANDARD 14)
 #set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std=c++17")
 #set(PARTHENON_ENABLE_CPP17 ON CACHE BOOL "KHARMA Override")
 
-#foreach(path ${CMAKE_PREFIX_PATH})
-#  include_directories(${path})
-#endforeach()
-
 # Parthenon options
 set(PARTHENON_DISABLE_EXAMPLES ON CACHE BOOL "KHARMA Override")
 set(PARTHENON_LINT_DEFAULT OFF CACHE BOOL "KHARMA Override")
-# To use old Summit built-in HDF5
-set(PARTHENON_DISABLE_HDF5_COMPRESSION ON CACHE BOOL "KHARMA Override")
+# Attempt HDF5 compression, requires recent/standard HDF5. YMMV
+set(PARTHENON_DISABLE_HDF5_COMPRESSION OFF CACHE BOOL "KHARMA Override")
 
 # Parthenon internal build options
 set(BUILD_TESTING OFF CACHE BOOL "KHARMA Override")
@@ -44,23 +40,29 @@ set(KokkosKernels_ENABLE_TPL_CUBLAS OFF CACHE BOOL "KHARMA Override")
 # Parthenon says it doesn't need MPI.  It just *strongly prefers* it, and so do we.
 # Builds without MPI have pretty limited support, you can usually find distribution 
 # packages or other ways to install it on personal machines without too much work.
-# Check out oneAPI or NVHPC for software distributions that include easily-usable MPI modules
+# Check out oneAPI or NVHPC for software distributions that include easily-usable,
+# fast MPI modules
+# If you really want to disable MPI, set this to ON and comment the next two lines
+set(PARTHENON_DISABLE_MPI OFF CACHE BOOL "KHARMA Override")
 find_package(MPI REQUIRED)
 include_directories(SYSTEM ${MPI_INCLUDE_PATH})
+
+# OpenMP is strictly required
 find_package(OpenMP REQUIRED)
 
-# TODO don't build parthenon unit tests etc just the library
+# Build Parthenon
 add_subdirectory(external/parthenon)
 include_directories(external/parthenon/src)
 # mpark::variant is header only, don't build anything
 include_directories(external/variant/include)
-# Kokkos kernels: don't compile them but import all headers
+# Kokkos kernels: don't build them (very slow), just import all headers
 # Requires KokkosKernels_config.h shipped with KHARMA, YMMV
+# In case of issues, uncomment the following line to build them
 #add_subdirectory(external/kokkos-kernels)
 include_directories(external/kokkos-kernels/src)
 include_directories(external/kokkos-kernels/src/batched)
 include_directories(external/kokkos-kernels/src/batched/dense)
 include_directories(external/kokkos-kernels/src/batched/dense/impl)
 
-# KHARMA folder
+# Finally, build KHARMA
 add_subdirectory(kharma)
diff --git a/kharma/b_cleanup/b_cleanup.cpp b/kharma/b_cleanup/b_cleanup.cpp
index e2e22f05..570fe4f4 100644
--- a/kharma/b_cleanup/b_cleanup.cpp
+++ b/kharma/b_cleanup/b_cleanup.cpp
@@ -171,7 +171,10 @@ void CleanupDivergence(std::shared_ptr<MeshData<Real>>& md)
     auto verbose = pkg->Param<int>("verbose");
 
     if (MPIRank0() && verbose > 0) {
-        std::cout << "Cleaning divB" << std::endl;
+        std::cout << "Cleaning divB to relative tolerance " << rel_tolerance;
+        std::cout << " and absolute tolerance " << abs_tolerance << std::endl;
+        if (warn_flag) std::cout << "Warning on failure to converge." << std::endl;
+        if (fail_flag) std::cout << "Erroring on failure to converge." << std::endl;
     }
 
     // Calculate existing divB max & sum for checking relative error later
diff --git a/kharma/boundaries.cpp b/kharma/boundaries.cpp
index 334fac78..5849f36c 100644
--- a/kharma/boundaries.cpp
+++ b/kharma/boundaries.cpp
@@ -37,6 +37,7 @@
 #include "boundaries.hpp"
 
 #include "kharma.hpp"
+#include "flux.hpp"
 #include "flux_functions.hpp"
 #include "grmhd_functions.hpp"
 #include "pack.hpp"
@@ -343,3 +344,73 @@ TaskStatus KBoundaries::FixFlux(MeshData<Real> *md)
     Flag("Fixed fluxes");
     return TaskStatus::complete;
 }
+
+void KBoundaries::SyncAllBounds(ParameterInput *pin, Mesh *pmesh)
+{
+
+    // TODO this does syncs per-block.  Correctly afaict,
+    // but they could be done more simply & efficiently per-mesh
+    Flag("Syncing all bounds");
+
+    if (pin->GetString("driver", "type") == "imex") {
+        // If we're syncing the primitive vars, we just sync
+        for (auto &pmb : pmesh->block_list) {
+            auto& rc = pmb->meshblock_data.Get();
+            rc->ClearBoundary(BoundaryCommSubset::all);
+            rc->StartReceiving(BoundaryCommSubset::all);
+            rc->SendBoundaryBuffers();
+        }
+        for (auto &pmb : pmesh->block_list) {
+            auto& rc = pmb->meshblock_data.Get();
+            rc->ReceiveAndSetBoundariesWithWait();
+            rc->ClearBoundary(BoundaryCommSubset::all);
+            // TODO if amr...
+            //pmb->pbval->ProlongateBoundaries();
+
+            Flag("Physical bounds");
+            // Physical boundary conditions
+            parthenon::ApplyBoundaryConditions(rc);
+        }
+    } else {
+        // If we're syncing the conserved vars...
+        // Honestly, the easiest way through this sync is:
+        // 1. PtoU everywhere
+        // 2. Sync like a normal step, incl. physical bounds
+        // 3. UtoP everywhere
+        // Luckily we're amortized over the whole sim, so we can
+        // take our time.
+        for (auto &pmb : pmesh->block_list) {
+            auto& rc = pmb->meshblock_data.Get();
+            Flux::PtoU(rc.get(), IndexDomain::entire);
+        }
+
+        for (auto &pmb : pmesh->block_list) {
+            auto& rc = pmb->meshblock_data.Get();
+            Flag("Block sync send");
+            rc->ClearBoundary(BoundaryCommSubset::all);
+            rc->StartReceiving(BoundaryCommSubset::all);
+            rc->SendBoundaryBuffers();
+        }
+
+        for (auto &pmb : pmesh->block_list) {
+            auto& rc = pmb->meshblock_data.Get();
+            Flag("Block sync receive");
+            rc->ReceiveAndSetBoundariesWithWait();
+            rc->ClearBoundary(BoundaryCommSubset::all);
+            // TODO if amr...
+            //pmb->pbval->ProlongateBoundaries();
+
+            Flag("Fill Derived");
+            // Fill P again, including ghost zones
+            // But, since we sync'd GRHD primitives already,
+            // leave those off by calling *Domain like in a normal
+            // boundary sync
+            KHARMA::FillDerivedDomain(rc, IndexDomain::entire, false);
+
+            Flag("Physical bounds");
+            // Physical boundary conditions
+            parthenon::ApplyBoundaryConditions(rc);
+        }
+    }
+    Flag("Sync'd");
+}
diff --git a/kharma/boundaries.hpp b/kharma/boundaries.hpp
index a94b5a3e..c7e17d6c 100644
--- a/kharma/boundaries.hpp
+++ b/kharma/boundaries.hpp
@@ -1,4 +1,36 @@
-//
+/* 
+ *  File: boundaries.hpp
+ *  
+ *  BSD 3-Clause License
+ *  
+ *  Copyright (c) 2020, AFD Group at UIUC
+ *  All rights reserved.
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright notice, this
+ *     list of conditions and the following disclaimer.
+ *  
+ *  2. Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *  
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
 #pragma once
 
 #include "decs.hpp"
@@ -6,6 +38,14 @@
 #include "bondi.hpp"
 #include "grmhd_functions.hpp"
 
+/**
+ * Any functions related to KHARMA's treatment of boundary conditions.
+ * These largely build on/fill in Parthenon's boundary functions,
+ * which KHARMA uses to handle all MPI & periodic boundaries.
+ * 
+ * Thus this Namespace is for outflow, reflecting, and problem-specific
+ * bounds, which KHARMA has to handle separately from Parthenon.
+ */
 namespace KBoundaries {
 
 /**
@@ -28,6 +68,12 @@ void OuterX2(std::shared_ptr<MeshBlockData<Real>> &rc, bool coarse);
  */
 TaskStatus FixFlux(MeshData<Real> *rc);
 
+/**
+ * Single call to sync all boundary conditions.
+ * Used anytime boundary sync is needed outside the usual loop of steps.
+ */
+void SyncAllBounds(ParameterInput *pin, Mesh *pmesh);
+
 /**
  * Check for flow into simulation and reset velocity to eliminate it
  * TODO does Parthenon do something like this for outflow bounds already?
diff --git a/kharma/prob/post_initialize.cpp b/kharma/prob/post_initialize.cpp
index c31d7685..44375bae 100644
--- a/kharma/prob/post_initialize.cpp
+++ b/kharma/prob/post_initialize.cpp
@@ -49,72 +49,6 @@
 #include "seed_B_ct.hpp"
 #include "seed_B_cd.hpp"
 
-void SyncAllBounds(ParameterInput *pin, Mesh *pmesh)
-{
-
-    // TODO this does syncs per-block.  Correctly afaict,
-    // but they could be done more simply & efficiently per-mesh
-    Flag("Syncing all bounds");
-
-    if (pin->GetString("driver", "type") == "imex") {
-        // If we're syncing the primitive vars, we just sync
-        for (auto &pmb : pmesh->block_list) {
-            auto& rc = pmb->meshblock_data.Get();
-            rc->ClearBoundary(BoundaryCommSubset::all);
-            rc->StartReceiving(BoundaryCommSubset::all);
-            rc->SendBoundaryBuffers();
-        }
-        for (auto &pmb : pmesh->block_list) {
-            auto& rc = pmb->meshblock_data.Get();
-            rc->ReceiveAndSetBoundariesWithWait();
-            rc->ClearBoundary(BoundaryCommSubset::all);
-            // TODO if amr...
-            //pmb->pbval->ProlongateBoundaries();
-
-            // Physical boundary conditions
-            parthenon::ApplyBoundaryConditions(rc);
-        }
-    } else {
-        // If we're syncing the conserved vars...
-        // Honestly, the easiest way through this sync is:
-        // 1. PtoU everywhere
-        // 2. Sync like a normal step, incl. physical bounds
-        // 3. UtoP everywhere
-        // Luckily we're amortized over the whole sim, so we can
-        // take our time.
-        for (auto &pmb : pmesh->block_list) {
-            auto& rc = pmb->meshblock_data.Get();
-            Flux::PtoU(rc.get(), IndexDomain::entire);
-        }
-
-        for (auto &pmb : pmesh->block_list) {
-            auto& rc = pmb->meshblock_data.Get();
-            Flag("Block sync send");
-            rc->ClearBoundary(BoundaryCommSubset::all);
-            rc->StartReceiving(BoundaryCommSubset::all);
-            rc->SendBoundaryBuffers();
-        }
-
-        for (auto &pmb : pmesh->block_list) {
-            auto& rc = pmb->meshblock_data.Get();
-            Flag("Block sync receive");
-            rc->ReceiveAndSetBoundariesWithWait();
-            rc->ClearBoundary(BoundaryCommSubset::all);
-            // TODO if amr...
-            //pmb->pbval->ProlongateBoundaries();
-
-            Flag("Fill Derived");
-            // Fill P again, including ghost zones
-            parthenon::Update::FillDerived(rc.get());
-
-            Flag("Physical bounds");
-            // Physical boundary conditions
-            parthenon::ApplyBoundaryConditions(rc);
-        }
-    }
-    Flag("Sync'd");
-}
-
 void KHARMA::SeedAndNormalizeB(ParameterInput *pin, Mesh *pmesh)
 {
     // Check which solver we'll be using
@@ -126,7 +60,7 @@ void KHARMA::SeedAndNormalizeB(ParameterInput *pin, Mesh *pmesh)
     if (pin->GetOrAddString("b_field", "type", "none") != "none") {
         // Calculating B has a stencil outside physical zones
         Flag("Extra boundary sync for B");
-        SyncAllBounds(pin, pmesh);
+        KBoundaries::SyncAllBounds(pin, pmesh);
 
         // "Legacy" is the much more common normalization:
         // It's the ratio of max values over the domain i.e. max(P) / max(P_B),
@@ -265,7 +199,7 @@ void KHARMA::PostInitialize(ParameterInput *pin, Mesh *pmesh, bool is_restart, b
 
     // Sync to fill the ghost zones
     Flag("Boundary sync");
-    SyncAllBounds(pin, pmesh);
+    KBoundaries::SyncAllBounds(pin, pmesh);
 
     // Extra cleanup & init to do if restarting
     if (is_restart) {
@@ -283,7 +217,7 @@ void KHARMA::PostInitialize(ParameterInput *pin, Mesh *pmesh, bool is_restart, b
         B_Cleanup::CleanupDivergence(mbase);
         // Sync to make sure periodic boundaries are set
         Flag("Boundary sync");
-        SyncAllBounds(pin, pmesh);
+        KBoundaries::SyncAllBounds(pin, pmesh);
     }
 
     Flag("Post-initialization finished");
diff --git a/pars/sane_tilt.par b/pars/sane_tilt.par
index 3a3bbec0..547abaca 100644
--- a/pars/sane_tilt.par
+++ b/pars/sane_tilt.par
@@ -32,10 +32,13 @@ cfl = 0.9
 gamma = 1.666667
 reconstruction = weno5
 
+<driver>
+type = harm
+
 <torus>
 rin = 8.0
 rmax = 16.0
-tilt = 20
+tilt = 30
 
 <perturbation>
 u_jitter = 0.0
@@ -43,6 +46,10 @@ u_jitter = 0.0
 <b_field>
 type = sane
 beta_min = 100
+initial_cleanup = true
+
+<b_cleanup>
+abs_tolerance = 1e-10
 
 <floors>
 rho_min_geom = 1e-5

From 8fdd603a391b5cc2d4d6ba7559d1cd5dc207a8c8 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Wed, 30 Mar 2022 20:14:28 -0500
Subject: [PATCH 25/26] Field cleanup improvements in boundary synchronization

---
 kharma/b_cleanup/b_cleanup.cpp  | 32 +++++++++++++++-----------------
 kharma/boundaries.cpp           | 20 ++++++++++++--------
 kharma/boundaries.hpp           |  2 +-
 kharma/implicit/implicit.hpp    |  2 +-
 kharma/kharma.cpp               |  4 ++--
 kharma/prob/post_initialize.cpp | 10 ++++++----
 pars/sane.par                   |  1 +
 7 files changed, 38 insertions(+), 33 deletions(-)

diff --git a/kharma/b_cleanup/b_cleanup.cpp b/kharma/b_cleanup/b_cleanup.cpp
index 570fe4f4..edad9a1b 100644
--- a/kharma/b_cleanup/b_cleanup.cpp
+++ b/kharma/b_cleanup/b_cleanup.cpp
@@ -39,6 +39,7 @@
 // For a bunch of utility functions
 #include "b_flux_ct.hpp"
 
+#include "boundaries.hpp"
 #include "decs.hpp"
 #include "grmhd.hpp"
 #include "kharma.hpp"
@@ -161,7 +162,8 @@ void CleanupDivergence(std::shared_ptr<MeshData<Real>>& md)
     AllReduce<Real> update_norm, divB_norm, divB_max;
     AllReduce<Real> P_norm;
 
-    auto pkg = md->GetMeshPointer()->packages.Get("B_Cleanup");
+    auto pmesh = md->GetMeshPointer();
+    auto pkg = pmesh->packages.Get("B_Cleanup");
     auto max_iters = pkg->Param<int>("max_iterations");
     auto check_interval = pkg->Param<int>("check_interval");
     auto rel_tolerance = pkg->Param<Real>("rel_tolerance");
@@ -169,12 +171,13 @@ void CleanupDivergence(std::shared_ptr<MeshData<Real>>& md)
     auto fail_flag = pkg->Param<bool>("fail_without_convergence");
     auto warn_flag = pkg->Param<bool>("warn_without_convergence");
     auto verbose = pkg->Param<int>("verbose");
+    bool sync_prims = pmesh->packages.Get("GRMHD")->Param<std::string>("driver_type") == "imex";
 
     if (MPIRank0() && verbose > 0) {
         std::cout << "Cleaning divB to relative tolerance " << rel_tolerance;
         std::cout << " and absolute tolerance " << abs_tolerance << std::endl;
-        if (warn_flag) std::cout << "Warning on failure to converge." << std::endl;
-        if (fail_flag) std::cout << "Erroring on failure to converge." << std::endl;
+        if (warn_flag) std::cout << "Convergence failure will produce a warning." << std::endl;
+        if (fail_flag) std::cout << "Convergence failure will produce an error." << std::endl;
     }
 
     // Calculate existing divB max & sum for checking relative error later
@@ -191,6 +194,11 @@ void CleanupDivergence(std::shared_ptr<MeshData<Real>>& md)
     if (MPIRank0() && verbose > 0) {
         std::cout << "Starting divB max is " << divB_max.val << " and sum is " << divB_norm.val << std::endl;
     }
+    // These two aren't *strictly* comparable, but we're unlikely to do any good if this is true
+    if (divB_max.val < abs_tolerance) {
+        std::cout << "Starting divB is within tolerance, exiting." << std::endl;
+        return;
+    }
 
     // set P = divB as guess
     B_Cleanup::InitP(md.get());
@@ -198,23 +206,12 @@ void CleanupDivergence(std::shared_ptr<MeshData<Real>>& md)
     bool converged = false;
     int iter = 0;
     while ( (!converged) && (iter < max_iters) ) {
-        // Start syncing bounds
-        md.get()->StartReceiving(BoundaryCommSubset::all);
-
         // Update our guess at the potential 
         B_Cleanup::UpdateP(md.get());
 
         // Boundary sync. We really only need p syncd here...
-        cell_centered_bvars::SendBoundaryBuffers(md);
-        cell_centered_bvars::ReceiveBoundaryBuffers(md);
-        cell_centered_bvars::SetBoundaries(md);
-        md.get()->ClearBoundary(BoundaryCommSubset::all);
-
-        // And set physical boundaries
-        // for (auto &pmb : md->GetMeshPointer()->block_list) {
-        //     auto& rc = pmb->meshblock_data.Get();
-        //     parthenon::ApplyBoundaryConditions(rc);
-        // }
+        // Last option prevents updating physical boundaries, which we want to *solve* instead
+        KBoundaries::SyncAllBounds(pmesh, sync_prims, false);
 
         if (iter % check_interval == 0) {
             Flag("Iteration:");
@@ -327,7 +324,7 @@ TaskStatus InitP(MeshData<Real> *md)
 
 TaskStatus UpdateP(MeshData<Real> *md)
 {
-    //Flag(md, "Updating P");
+    Flag(md, "Updating P");
     auto pmesh = md->GetParentPointer();
     const int ndim = pmesh->ndim;
     const IndexRange ib = md->GetBoundsI(IndexDomain::entire);
@@ -390,6 +387,7 @@ TaskStatus UpdateP(MeshData<Real> *md)
         }
     );
 
+    Flag("Updated");
     return TaskStatus::complete;
 }
 
diff --git a/kharma/boundaries.cpp b/kharma/boundaries.cpp
index 5849f36c..98a44c53 100644
--- a/kharma/boundaries.cpp
+++ b/kharma/boundaries.cpp
@@ -345,14 +345,14 @@ TaskStatus KBoundaries::FixFlux(MeshData<Real> *md)
     return TaskStatus::complete;
 }
 
-void KBoundaries::SyncAllBounds(ParameterInput *pin, Mesh *pmesh)
+void KBoundaries::SyncAllBounds(Mesh *pmesh, bool sync_prims, bool sync_phys)
 {
 
     // TODO this does syncs per-block.  Correctly afaict,
     // but they could be done more simply & efficiently per-mesh
     Flag("Syncing all bounds");
 
-    if (pin->GetString("driver", "type") == "imex") {
+    if (sync_prims) {
         // If we're syncing the primitive vars, we just sync
         for (auto &pmb : pmesh->block_list) {
             auto& rc = pmb->meshblock_data.Get();
@@ -367,9 +367,11 @@ void KBoundaries::SyncAllBounds(ParameterInput *pin, Mesh *pmesh)
             // TODO if amr...
             //pmb->pbval->ProlongateBoundaries();
 
-            Flag("Physical bounds");
-            // Physical boundary conditions
-            parthenon::ApplyBoundaryConditions(rc);
+            if (sync_phys) {
+                Flag("Physical bounds");
+                // Physical boundary conditions
+                parthenon::ApplyBoundaryConditions(rc);
+            }
         }
     } else {
         // If we're syncing the conserved vars...
@@ -407,9 +409,11 @@ void KBoundaries::SyncAllBounds(ParameterInput *pin, Mesh *pmesh)
             // boundary sync
             KHARMA::FillDerivedDomain(rc, IndexDomain::entire, false);
 
-            Flag("Physical bounds");
-            // Physical boundary conditions
-            parthenon::ApplyBoundaryConditions(rc);
+            if (sync_phys) {
+                Flag("Physical bounds");
+                // Physical boundary conditions
+                parthenon::ApplyBoundaryConditions(rc);
+            }
         }
     }
     Flag("Sync'd");
diff --git a/kharma/boundaries.hpp b/kharma/boundaries.hpp
index c7e17d6c..0f091861 100644
--- a/kharma/boundaries.hpp
+++ b/kharma/boundaries.hpp
@@ -72,7 +72,7 @@ TaskStatus FixFlux(MeshData<Real> *rc);
  * Single call to sync all boundary conditions.
  * Used anytime boundary sync is needed outside the usual loop of steps.
  */
-void SyncAllBounds(ParameterInput *pin, Mesh *pmesh);
+void SyncAllBounds(Mesh *pmesh, bool sync_prims, bool sync_phys=true);
 
 /**
  * Check for flow into simulation and reset velocity to eliminate it
diff --git a/kharma/implicit/implicit.hpp b/kharma/implicit/implicit.hpp
index 00c28eaf..53d28575 100644
--- a/kharma/implicit/implicit.hpp
+++ b/kharma/implicit/implicit.hpp
@@ -137,7 +137,7 @@ KOKKOS_INLINE_FUNCTION void calc_jacobian(const GRCoordinates& G, const Local& P
                                           const Real& jac_delta, const Real& gam, const double& dt,
                                           Local2& jacobian, Local& residual)
 {
-    // Calculate residual for Sf->P
+    // Calculate residual of P
     calc_residual(G, P, Pi, Ui, Ps, dudt_explicit, dUi, tmp3, m_p, m_u, emhd_params, nfvar, j, i, gam, dt, residual);
 
     // Use one scratchpad as the incremented prims P_delta,
diff --git a/kharma/kharma.cpp b/kharma/kharma.cpp
index d2a4cc57..c8b2071b 100644
--- a/kharma/kharma.cpp
+++ b/kharma/kharma.cpp
@@ -221,8 +221,8 @@ Packages_t KHARMA::ProcessPackages(std::unique_ptr<ParameterInput>& pin)
     bool do_emhd = pin->GetOrAddBoolean("emhd", "on", false);
     bool do_wind = pin->GetOrAddBoolean("wind", "on", false);
 
-    // Set the default driver way up here so packages know how to flag
-    // prims vs cons (imex stepper syncs prims, but packages have to mark them that way)
+    // Set the default driver all the way up here, so packages know how to flag
+    // prims vs cons (imex stepper syncs prims, but it's the packages' job to mark them)
     std::string driver_type;
     if (do_emhd) {
         // Default to implicit step for EMHD
diff --git a/kharma/prob/post_initialize.cpp b/kharma/prob/post_initialize.cpp
index 44375bae..9c25d5d3 100644
--- a/kharma/prob/post_initialize.cpp
+++ b/kharma/prob/post_initialize.cpp
@@ -54,13 +54,14 @@ void KHARMA::SeedAndNormalizeB(ParameterInput *pin, Mesh *pmesh)
     // Check which solver we'll be using
     const bool use_b_flux_ct = pmesh->packages.AllPackages().count("B_FluxCT");
     const bool use_b_cd = pmesh->packages.AllPackages().count("B_CD");
+    bool sync_prims = pin->GetString("driver", "type") == "imex";
 
     // Add the field for torus problems as a second pass
     // Preserves P==U and ends with all physical zones fully defined
     if (pin->GetOrAddString("b_field", "type", "none") != "none") {
         // Calculating B has a stencil outside physical zones
         Flag("Extra boundary sync for B");
-        KBoundaries::SyncAllBounds(pin, pmesh);
+        KBoundaries::SyncAllBounds(pmesh, sync_prims);
 
         // "Legacy" is the much more common normalization:
         // It's the ratio of max values over the domain i.e. max(P) / max(P_B),
@@ -197,9 +198,10 @@ void KHARMA::PostInitialize(ParameterInput *pin, Mesh *pmesh, bool is_restart, b
         }
     }
 
-    // Sync to fill the ghost zones
+    // Sync to fill the ghost zones: prims for ImExDriver, everything for HARMDriver
     Flag("Boundary sync");
-    KBoundaries::SyncAllBounds(pin, pmesh);
+    bool sync_prims = pin->GetString("driver", "type") == "imex";
+    KBoundaries::SyncAllBounds(pmesh, sync_prims);
 
     // Extra cleanup & init to do if restarting
     if (is_restart) {
@@ -217,7 +219,7 @@ void KHARMA::PostInitialize(ParameterInput *pin, Mesh *pmesh, bool is_restart, b
         B_Cleanup::CleanupDivergence(mbase);
         // Sync to make sure periodic boundaries are set
         Flag("Boundary sync");
-        KBoundaries::SyncAllBounds(pin, pmesh);
+        KBoundaries::SyncAllBounds(pmesh, sync_prims);
     }
 
     Flag("Post-initialization finished");
diff --git a/pars/sane.par b/pars/sane.par
index 8814f710..76a44cf8 100644
--- a/pars/sane.par
+++ b/pars/sane.par
@@ -46,6 +46,7 @@ u_jitter = 0.04
 <b_field>
 type = sane
 beta_min = 100.
+initial_cleanup = true
 
 <floors>
 rho_min_geom = 1e-6

From ef6e85c54db8c1eebed5d4af2950aab7d3eedf72 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Wed, 30 Mar 2022 22:55:43 -0500
Subject: [PATCH 26/26] Properly wait/sync MPI reductions in b_cleanup

---
 kharma/b_cleanup/b_cleanup.cpp  | 39 ++++++++++++++++++++-------------
 kharma/boundaries.cpp           |  8 +++++--
 kharma/prob/post_initialize.cpp |  7 ++++--
 3 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/kharma/b_cleanup/b_cleanup.cpp b/kharma/b_cleanup/b_cleanup.cpp
index edad9a1b..8388d508 100644
--- a/kharma/b_cleanup/b_cleanup.cpp
+++ b/kharma/b_cleanup/b_cleanup.cpp
@@ -159,8 +159,7 @@ void CleanupDivergence(std::shared_ptr<MeshData<Real>>& md)
 {
     Flag(md.get(), "Cleaning up divB");
     // Local Allreduce values since we're just calling things
-    AllReduce<Real> update_norm, divB_norm, divB_max;
-    AllReduce<Real> P_norm;
+    AllReduce<Real> update_norm, divB_norm, divB_max, P_norm;
 
     auto pmesh = md->GetMeshPointer();
     auto pkg = pmesh->packages.Get("B_Cleanup");
@@ -184,12 +183,14 @@ void CleanupDivergence(std::shared_ptr<MeshData<Real>>& md)
     divB_max.val = 0.;
     B_FluxCT::MaxDivBTask(md.get(), divB_max.val);
     divB_max.StartReduce(MPI_MAX);
-    divB_max.CheckReduce();
 
     divB_norm.val = 0.;
     B_Cleanup::CalcSumDivB(md.get(), divB_norm.val);
     divB_norm.StartReduce(MPI_SUM);
-    divB_norm.CheckReduce();
+
+    // Wait on results
+    while (divB_max.CheckReduce() == TaskStatus::incomplete);
+    while (divB_norm.CheckReduce() == TaskStatus::incomplete);
 
     if (MPIRank0() && verbose > 0) {
         std::cout << "Starting divB max is " << divB_max.val << " and sum is " << divB_norm.val << std::endl;
@@ -200,12 +201,18 @@ void CleanupDivergence(std::shared_ptr<MeshData<Real>>& md)
         return;
     }
 
+    // TODO Unmark everything but P as FillGhost, for efficiency. Re-mark before last sync
+    // auto vars = rc->GetVariablesByFlag(std::vector<MetadataFlag>({isImplicit, flag}), true).labels();
+    // for (auto& var : vars) {
+    //     // etc
+    // }
+
     // set P = divB as guess
     B_Cleanup::InitP(md.get());
 
-    bool converged = false;
+    bool is_converged = false;
     int iter = 0;
-    while ( (!converged) && (iter < max_iters) ) {
+    while ( (!is_converged) && (iter < max_iters) ) {
         // Update our guess at the potential 
         B_Cleanup::UpdateP(md.get());
 
@@ -219,24 +226,24 @@ void CleanupDivergence(std::shared_ptr<MeshData<Real>>& md)
             update_norm.val = 0.;
             B_Cleanup::SumError(md.get(), update_norm.val);
             update_norm.StartReduce(MPI_SUM);
-            update_norm.CheckReduce();
             // P_norm.val = 0.;
             // B_Cleanup::SumP(md.get(), P_norm.val);
             // P_norm.StartReduce(MPI_SUM);
-            // P_norm.CheckReduce();
             divB_max.val = 0.;
             MaxError(md.get(), divB_max.val);
             divB_max.StartReduce(MPI_MAX);
-            divB_max.CheckReduce();
+            // Wait on both reductions to move on
+            while (update_norm.CheckReduce() == TaskStatus::incomplete);
+            //while (P_norm.CheckReduce() == TaskStatus::incomplete);
+            while (divB_max.CheckReduce() == TaskStatus::incomplete);
             if (MPIRank0()) {
                 std::cout << "divB step " << iter << " total relative error is " << update_norm.val / divB_norm.val
                         << " Max absolute error is " << divB_max.val << std::endl;
                 // std::cout << "P norm is " << P_norm.val << std::endl;
             }
 
-            // Both these values are already MPI reduced, but we want to make sure
-            converged = (update_norm.val / divB_norm.val < rel_tolerance) && (divB_max.val < abs_tolerance);
-            converged = MPIMin(converged);
+            // This behaves identically on ranks, unless we've broken a fundamental assumption
+            is_converged = (update_norm.val / divB_norm.val < rel_tolerance) && (divB_max.val < abs_tolerance);
         }
 
         iter++;
@@ -253,14 +260,16 @@ void CleanupDivergence(std::shared_ptr<MeshData<Real>>& md)
         std::cout << "Applying magnetic field correction" << std::endl;
     }
 
-    // Update the magnetic field with one damped Jacobi step
+    // Update the magnetic field on physical zones using our solution
     B_Cleanup::ApplyP(md.get());
+    // Synchronize to update ghost zones
+    KBoundaries::SyncAllBounds(pmesh, sync_prims);
 
-    // Recalculate divB max to reassure
+    // Recalculate divB max for one last check
     divB_max.val = 0.;
     B_FluxCT::MaxDivBTask(md.get(), divB_max.val);
     divB_max.StartReduce(MPI_MAX);
-    divB_max.CheckReduce();
+    while (divB_max.CheckReduce() == TaskStatus::incomplete);
 
     if (MPIRank0()) {
         std::cout << "Final divB max is " << divB_max.val << std::endl;
diff --git a/kharma/boundaries.cpp b/kharma/boundaries.cpp
index 98a44c53..4a59411e 100644
--- a/kharma/boundaries.cpp
+++ b/kharma/boundaries.cpp
@@ -347,8 +347,7 @@ TaskStatus KBoundaries::FixFlux(MeshData<Real> *md)
 
 void KBoundaries::SyncAllBounds(Mesh *pmesh, bool sync_prims, bool sync_phys)
 {
-
-    // TODO this does syncs per-block.  Correctly afaict,
+    // TODO this does syncs per-block.  Correctly and without race conditions afaict,
     // but they could be done more simply & efficiently per-mesh
     Flag("Syncing all bounds");
 
@@ -356,17 +355,21 @@ void KBoundaries::SyncAllBounds(Mesh *pmesh, bool sync_prims, bool sync_phys)
         // If we're syncing the primitive vars, we just sync
         for (auto &pmb : pmesh->block_list) {
             auto& rc = pmb->meshblock_data.Get();
+            Flag("Block sync send");
             rc->ClearBoundary(BoundaryCommSubset::all);
             rc->StartReceiving(BoundaryCommSubset::all);
             rc->SendBoundaryBuffers();
         }
         for (auto &pmb : pmesh->block_list) {
             auto& rc = pmb->meshblock_data.Get();
+            Flag("Block sync receive");
             rc->ReceiveAndSetBoundariesWithWait();
             rc->ClearBoundary(BoundaryCommSubset::all);
             // TODO if amr...
             //pmb->pbval->ProlongateBoundaries();
 
+            Flux::PtoU(rc.get());
+
             if (sync_phys) {
                 Flag("Physical bounds");
                 // Physical boundary conditions
@@ -383,6 +386,7 @@ void KBoundaries::SyncAllBounds(Mesh *pmesh, bool sync_prims, bool sync_phys)
         // take our time.
         for (auto &pmb : pmesh->block_list) {
             auto& rc = pmb->meshblock_data.Get();
+            Flag("Block PtoU");
             Flux::PtoU(rc.get(), IndexDomain::entire);
         }
 
diff --git a/kharma/prob/post_initialize.cpp b/kharma/prob/post_initialize.cpp
index 9c25d5d3..e9254fdb 100644
--- a/kharma/prob/post_initialize.cpp
+++ b/kharma/prob/post_initialize.cpp
@@ -165,9 +165,12 @@ void KHARMA::SeedAndNormalizeB(ParameterInput *pin, Mesh *pmesh)
         }
     }
 
-    if (pin->GetString("b_field", "solver") != "none" && pin->GetInteger("debug", "verbose") > 0) {
-        // Still print divB, even if we're not initializing/normalizing field here
+    if (pin->GetString("b_field", "solver") != "none") {
         auto md = pmesh->mesh_data.GetOrAdd("base", 0).get();
+        // Synchronize our seeded field (incl. primitives) before we print out what divB it has
+        KBoundaries::SyncAllBounds(pmesh, sync_prims);
+
+        // Still print divB, even if we're not initializing/normalizing field here
         Real divb_max = 0.;
         if (use_b_flux_ct) {
             divb_max = B_FluxCT::MaxDivB(md);