From b7cde8fa207fd0095d33f30ba3d8d4f9e38eba8b Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Fri, 8 Dec 2023 15:31:31 -0600
Subject: [PATCH 01/11] Fix compiling without MPI

---
 external/parthenon | 2 +-
 kharma/decs.hpp    | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/external/parthenon b/external/parthenon
index 665aedf0..8b12c121 160000
--- a/external/parthenon
+++ b/external/parthenon
@@ -1 +1 @@
-Subproject commit 665aedf0bf816d6894d474a2e742fd7b84d4fd6f
+Subproject commit 8b12c121442271c50258c05e8fabefc15ea4ecf8
diff --git a/kharma/decs.hpp b/kharma/decs.hpp
index edaa296f..ee86102d 100644
--- a/kharma/decs.hpp
+++ b/kharma/decs.hpp
@@ -155,7 +155,11 @@ inline int MPIRank()
 }
 inline int MPIBarrier()
 {
+#if MPI_PARALLEL
     return MPI_Barrier(MPI_COMM_WORLD);
+#else
+    return 0;
+#endif
 }
 
 // A few generic "NDArray" overloads for readability.

From 14c98520ab9fbbab272ed73a27a6527e90d51111 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Fri, 8 Dec 2023 16:05:37 -0600
Subject: [PATCH 02/11] Properly support MPI/No MPI option. CMake cleanup.

---
 CMakeLists.txt        |  5 +----
 kharma/CMakeLists.txt | 22 ++++++++++++----------
 kharma/decs.hpp       |  2 +-
 3 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c3b61010..5819fe04 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -38,6 +38,7 @@ set(Kokkos_ENABLE_CUDA_CONSTEXPR ON CACHE BOOL "KHARMA Override")
 set(Kokkos_ENABLE_HWLOC OFF CACHE BOOL "KHARMA Override") # Possible speed improvement?
 set(Kokkos_ENABLE_AGGRESSIVE_VECTORIZATION ON CACHE BOOL "KHARMA Override")
 
+# For including the current git revision in the exe
 include(GetGitRevisionDescription)
 get_git_head_revision(GIT_REFSPEC GIT_SHA1)
 git_describe_working_tree(GIT_VERSION --tags)
@@ -53,10 +54,6 @@ else()
   include_directories(SYSTEM ${MPI_INCLUDE_PATH})
 endif()
 
-# OpenMP is usually used host-side.  We're letting Parthenon/Kokkos
-# find it though, as sometimes we require disabling it fully
-#find_package(OpenMP REQUIRED)
-
 # Build Parthenon
 add_subdirectory(external/parthenon)
 include_directories(external/parthenon/src)
diff --git a/kharma/CMakeLists.txt b/kharma/CMakeLists.txt
index aec167cc..614ff4b1 100644
--- a/kharma/CMakeLists.txt
+++ b/kharma/CMakeLists.txt
@@ -92,11 +92,16 @@ else()
 endif()
 
 # OPTIONS
-# These are almost universally performance trade-offs
+# These are almost universally performance trade-offs,
+# or disabling features that need dependencies.
 # TODO is there any way to make compile options less painful in CMake?
 option(FUSE_FLUX_KERNELS "Bundle the usual four flux calculation kernels (floors,R,L,apply) into one" ON)
 option(FUSE_FLOOR_KERNELS "Bundle applying the floors and ceilings into one kernel" ON)
 option(FAST_CARTESIAN "Break operation in curved spacetimes to make Cartesian Minkowski space computations faster" OFF)
+option(KHARMA_DISABLE_IMPLICIT "Disable the implicit solver, which requires bundled kokkos-kernels. Default false" OFF)
+option(KHARMA_DISABLE_CLEANUP "Disable the magnetic field cleanup module, which requires recent Parthenon. Default false" OFF)
+option(KHARMA_TRACE "Compile with tracing: print entry and exit of important functions. Default false" OFF)
+
 if(FUSE_FLUX_KERNELS)
     target_compile_definitions(${EXE_NAME} PUBLIC FUSE_FLUX_KERNELS=1)
 else()
@@ -108,13 +113,11 @@ else()
     target_compile_definitions(${EXE_NAME} PUBLIC FUSE_FLOOR_KERNELS=0)
 endif()
 if(FAST_CARTESIAN)
+    message("Compiling for Cartesian coordinates only. GRMHD will be disabled!")
     target_compile_definitions(${EXE_NAME} PUBLIC FAST_CARTESIAN=1)
 else()
     target_compile_definitions(${EXE_NAME} PUBLIC FAST_CARTESIAN=0)
 endif()
-option(KHARMA_DISABLE_IMPLICIT "Disable the implicit solver, which requires bundled kokkos-kernels. Default false" OFF)
-option(KHARMA_DISABLE_CLEANUP "Disable the magnetic field cleanup module, which requires recent Parthenon. Default false" OFF)
-option(KHARMA_TRACE "Compile with tracing: print entry and exit of important functions" OFF)
 if(KHARMA_DISABLE_IMPLICIT)
     message("Compiling without the implicit solver.  Extended GRMHD will be disabled!")
     target_compile_definitions(${EXE_NAME} PUBLIC DISABLE_IMPLICIT=1)
@@ -134,14 +137,13 @@ if(KHARMA_TRACE)
 else()
     target_compile_definitions(${EXE_NAME} PUBLIC TRACE=0)
 endif()
-option(KHARMA_DISABLE_IMPLICIT "Compile the implicit solver, requiring kokkos-kernels. Default true" OFF)
-option(KHARMA_TRACE "Compile with tracing: print entry and exit of important functions" OFF)
-if(KHARMA_DISABLE_IMPLICIT)
-    message("Compiling without the implicit solver.  Extended GRMHD will be disabled!")
-    target_compile_definitions(${EXE_NAME} PUBLIC ENABLE_IMPLICIT=0)
+if(KHARMA_DISABLE_MPI)
+    message("Compiling without MPI!")
+    target_compile_definitions(${EXE_NAME} PUBLIC ENABLE_MPI=0)
 else()
-    target_compile_definitions(${EXE_NAME} PUBLIC ENABLE_IMPLICIT=1)
+    target_compile_definitions(${EXE_NAME} PUBLIC ENABLE_MPI=1)
 endif()
+
 # FLAGS
 if(CMAKE_BUILD_TYPE)
     if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
diff --git a/kharma/decs.hpp b/kharma/decs.hpp
index ee86102d..6795d020 100644
--- a/kharma/decs.hpp
+++ b/kharma/decs.hpp
@@ -155,7 +155,7 @@ inline int MPIRank()
 }
 inline int MPIBarrier()
 {
-#if MPI_PARALLEL
+#if ENABLE_MPI
     return MPI_Barrier(MPI_COMM_WORLD);
 #else
     return 0;

From e6556c23ade2d0b9118b2a4ecc205e6a4e1558f6 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Mon, 11 Dec 2023 13:56:37 -0700
Subject: [PATCH 03/11] Update Chicoma compile configuration

---
 machines/chicoma.sh      | 41 ++++++++++++++++++++++++----------------
 scripts/batch/chicoma.sb | 24 +++++++++++++++++++++++
 2 files changed, 49 insertions(+), 16 deletions(-)
 create mode 100755 scripts/batch/chicoma.sb

diff --git a/machines/chicoma.sh b/machines/chicoma.sh
index 7e01fd52..65ff72db 100644
--- a/machines/chicoma.sh
+++ b/machines/chicoma.sh
@@ -1,48 +1,57 @@
 # LANL Machines: HPC and IC
 
 # Chicoma
+if [[ "$HOST" == "ch-fe"* && "$ARGS" == *"cuda"* ]]; then
+  echo "MUST BE COMPILED ON A GPU NODE!"
+  exit 1 # this file is sourced: abort the calling script with a failure status
+fi
 if [[ "$HOST" == "ch-fe"* || "$HOST" == "nid00"* ]]; then
   HOST_ARCH="ZEN2"
   NPROC=64
 
   # Cray environments get confused easy
   # Make things as simple as possible
-  # TODO version with Cray wrappers?
+  # TODO support the other PrgEnvs: currently only NVHPC works
   module purge
   export CRAY_CPU_TARGET="x86-64"
   if [[ "$ARGS" == *"cuda"* ]]; then
     DEVICE_ARCH="AMPERE80"
     if [[ "$ARGS" == *"gnu"* ]]; then
-      module load PrgEnv-gnu
+      module load PrgEnv-gnu cudatoolkit
+    elif [[ "$ARGS" == *"gcc"* ]]; then
+      module load PrgEnv-nvhpc gcc
+      C_NATIVE=gcc
+      CXX_NATIVE=g++
     elif [[ "$ARGS" == *"intel"* ]]; then
       module load PrgEnv-intel
     elif [[ "$ARGS" == *"nvc++"* ]]; then
       module load PrgEnv-nvhpc
-      EXTRA_FLAGS="-DCMAKE_CUDA_COMPILER=$HOME/bin/nvc++-wrapper -DCMAKE_CUDA_COMPILER_ID=NVHPC -DCMAKE_CUDA_COMPILER_VERSION=11.6 $EXTRA_FLAGS"
+      EXTRA_FLAGS="-DCMAKE_CUDA_COMPILER=$SOURCE_DIR/bin/nvc++-wrapper -DCMAKE_CUDA_COMPILER_ID=NVHPC -DCMAKE_CUDA_COMPILER_VERSION=11.6 $EXTRA_FLAGS"
+      C_NATIVE=nvc
+      CXX_NATIVE=nvc++
     else
       module load PrgEnv-nvhpc
     fi
-    module load cpe-cuda cuda craype-accel-nvidia80
+    module load craype-accel-nvidia80
     # GPU runtime opts
-    MPI_NUM_PROCS=4
     MPI_EXTRA_ARGS="--cpu-bind=mask_cpu:0x0*16,0x1*16,0x2*16,0x3*16 $SOURCE_DIR/bin/select_gpu_chicoma"
     unset OMP_NUM_THREADS
     unset OMP_PROC_BIND
     unset OMP_PLACES
+
+    # Sometimes device-side buffers don't work
+    # if [conditions]
+    EXTRA_FLAGS="-DPARTHENON_ENABLE_HOST_COMM_BUFFERS=ON $EXTRA_FLAGS"
+    #else
+    #export MPICH_GPU_SUPPORT_ENABLED=1
+
   else
     module load PrgEnv-aocc
+    MPI_EXTRA_ARGS="--cpus-per-task=2"
   fi
-  # Use your own HDF5, Chicoma's is old
-  #module load cray-hdf5-parallel
-  module load cmake
-  # System HDF5 can't use compression
-  EXTRA_FLAGS="-DPARTHENON_DISABLE_HDF5_COMPRESSION=ON $EXTRA_FLAGS"
-  # Parthenon crashes with device buffers on some Nvidia machines...
-  # if [conditions]
-  EXTRA_FLAGS="-DPARTHENON_ENABLE_HOST_COMM_BUFFERS=ON $EXTRA_FLAGS"
-  #else
-  #export MPICH_GPU_SUPPORT_ENABLED=1
+  module load cmake cray-hdf5-parallel
 
   # Runtime opts
-  MPI_EXE=srun
+  MPI_EXE="srun"
+  MPI_NUM_PROCS=""
 fi
diff --git a/scripts/batch/chicoma.sb b/scripts/batch/chicoma.sb
new file mode 100755
index 00000000..a405634f
--- /dev/null
+++ b/scripts/batch/chicoma.sb
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Admin stuff
+#SBATCH -A t23_eh_variability_g
+#SBATCH -J KHARMA
+#SBATCH -t 12:00:00
+#SBATCH -N 1
+#SBATCH -o "out-%j.txt"
+
+# Partition
+##SBATCH -p gpu_debug --reservation gpu_debug --qos debug
+#SBATCH -p gpu
+
+# Node options: full, all CPU
+# Note we could do 32 if HT is faster
+#SBATCH --tasks-per-node=4
+#SBATCH --cpus-per-task=16
+#SBATCH --exclusive
+#SBATCH --mem=0
+
+# All options/environment are taken care of in run.sh now
+# I should do that on more machines...
+# Set KHARMA_DIR in the submitting environment to use a different checkout
+KHARMA_DIR="${KHARMA_DIR:-$HOME/Code/kharma}"
+"$KHARMA_DIR/run.sh" -t 11:50:00 "$@"

From 21e44e706776326d3184e95b20cc0918eccf6897 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Mon, 11 Dec 2023 14:34:22 -0700
Subject: [PATCH 04/11] Update Darwin configuration for host buffers

---
 machines/darwin.sh | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/machines/darwin.sh b/machines/darwin.sh
index 8559346e..c5114273 100644
--- a/machines/darwin.sh
+++ b/machines/darwin.sh
@@ -5,33 +5,24 @@
 
 if [[ ($HOSTNAME == "cn"* || $HOSTNAME == "darwin"*) &&
       ("$PWD" == "/projects/jacamar-ci"* || "$PWD" == "/vast"*) ]]; then
-  #module purge # This messes things up on ARM nodes
+  module purge
   module load cmake
 
   # Where we're going, we don't need system libraries
   ARGS="$ARGS hdf5"
 
-  # Help Darwin find the right modules in automated jobs
-  if [[ "$ARGS" == *"cuda"* && "$ARGS" == *"arm-"* ]]; then
-    export MODULEPATH="/projects/darwin-nv/modulefiles/rhel8/aarch64:/projects/darwin-nv/modulefiles/rhel8/aarch64"
-  fi
-
   # Load compiler...
-  if [[ "$ARGS" == *"gcc12"* ]]; then
-    module load gcc/12.2.0 openmpi
-    C_NATIVE=gcc
-    CXX_NATIVE=g++
-  elif [[ "$ARGS" == *"gcc10"* ]]; then
-    module load gcc/10.4.0 openmpi
+  if [[ "$ARGS" == *"gcc10"* ]]; then
+    module load gcc/10.4.0
     C_NATIVE=gcc
     CXX_NATIVE=g++
   elif [[ "$ARGS" == *"gcc"* ]]; then
     # Default GCC
-    #module load gcc/13.1.0 openmpi
+    module load gcc/12.1.0
     C_NATIVE=gcc
     CXX_NATIVE=g++
   elif [[ "$ARGS" == *"aocc"* ]]; then
-    module load aocc openmpi
+    module load aocc
     C_NATIVE=clang
     CXX_NATIVE=clang++
   elif [[ "$ARGS" == *"nvhpc"* ]]; then
@@ -42,20 +33,20 @@ if [[ ($HOSTNAME == "cn"* || $HOSTNAME == "darwin"*) &&
     export NVHPC_CUDA_HOME="$CUDA_HOME"
     unset CUDA_HOME
   elif [[ "$ARGS" == *"icc"* ]]; then
-    module load intel-classic/2021.3.0 openmpi
+    module load intel-classic/2021.3.0
     C_NATIVE=icc
     CXX_NATIVE=icpc
   else
     # Default: NVHPC if cuda else IntelLLVM
     if [[ "$ARGS" == *"cuda"* ]]; then
-      module load nvhpc openmpi
+      module load nvhpc
       C_NATIVE="nvc"
       CXX_NATIVE="nvc++"
       # New NVHPC doesn't like CUDA_HOME
       export NVHPC_CUDA_HOME="$CUDA_HOME"
       unset CUDA_HOME
     else
-      module load intel openmpi
+      module load intel
       C_NATIVE=icx
       CXX_NATIVE=icpx
     fi
@@ -63,13 +54,17 @@ if [[ ($HOSTNAME == "cn"* || $HOSTNAME == "darwin"*) &&
 
   # ...any accelerator libraries...
   if [[ "$ARGS" == *"cuda"* ]]; then
-    module load cuda/12.0.0
+    module load cuda/12.0.0 nvhpc
+    PREFIX_PATH=$NVHPC_ROOT
+    EXTRA_FLAGS="-DPARTHENON_ENABLE_HOST_COMM_BUFFERS=ON $EXTRA_FLAGS"
   elif [[ "$ARGS" == *"hip"* ]]; then
     module load rocm/5.4.3 #openmpi/5.0.0rc11-gcc_13.1.0
     source ~/libs/env.sh
     C_NATIVE=hipcc
     CXX_NATIVE=hipcc
     export CXXFLAGS="-fopenmp $CXXFLAGS"
+  else
+    module load openmpi
   fi
 
   # ...and set architecture

From d24b51fe598971fc0cfb2dcbdba3fbe516acf15d Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Mon, 11 Dec 2023 14:37:22 -0700
Subject: [PATCH 05/11] Add some old comments better describing NOF floors

---
 kharma/floors/floors_functions.hpp | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/kharma/floors/floors_functions.hpp b/kharma/floors/floors_functions.hpp
index 7e3b4bba..6a13be32 100644
--- a/kharma/floors/floors_functions.hpp
+++ b/kharma/floors/floors_functions.hpp
@@ -260,31 +260,32 @@ KOKKOS_INLINE_FUNCTION int apply_floors(const GRCoordinates& G, const VariablePa
             // Update the conserved variables
             Flux::p_to_u(G, P, m_p, emhd_params, gam, k, j, i, U, m_u, loc);
         } else {
-            // Add the material in the normal observer frame, by:
-            // Adding the floors to the primitive variables
+            // Add the material in the normal observer frame.
+            // 1. Calculate how much material we're adding.
+            // This is an estimate, as it's what we'd have to do in fluid frame
             const Real rho_add    = m::max(0., rhoflr_max - rho);
             const Real u_add      = m::max(0., uflr_max - u);
             const Real uvec[NVEC] = {0}, B[NVEC] = {0};
 
-            // Calculating the corresponding conserved variables
+            // 2. Calculate the increase in conserved mass/energy corresponding to the new material.
             Real rho_ut, T[GR_DIM];
             GRMHD::p_to_u_mhd(G, rho_add, u_add, uvec, B, gam, k, j, i, rho_ut, T, loc);
 
-            // Add new conserved mass/energy to the current "conserved" state,
-            // and to the local primitives as a guess
+            // 3. Add new conserved mass/energy to the current "conserved" state.
+            // Also add to the local primitives as a guess
             P(m_p.RHO, k, j, i) += rho_add;
             P(m_p.UU, k, j, i)  += u_add;
             // Add any velocity here
             U(m_u.RHO, k, j, i) += rho_ut;
-            U(m_u.UU, k, j, i)  += T[0]; // Note this shouldn't be a single loop: m_u.U1 != m_u.UU + 1 necessarily
+            U(m_u.UU, k, j, i)  += T[0]; // Note that m_u.U1 != m_u.UU + 1 necessarily
             U(m_u.U1, k, j, i)  += T[1];
             U(m_u.U2, k, j, i)  += T[2];
             U(m_u.U3, k, j, i)  += T[3];
             
             // Recover primitive variables from conserved versions
-            // TODO selector here when we get more
+            // TODO selector here when we get more options
             Inverter::Status pflag = Inverter::u_to_p<Inverter::Type::onedw>(G, U, m_u, gam, k, j, i, P, m_p, loc);
-            // If that fails, we've effectively already applied the floors in fluid-frame to the prims,
+            // 4. If the inversion fails, we've effectively already applied the floors in fluid-frame to the prims,
             // so we just formalize that
             if (Inverter::failed(pflag)) {
                 Flux::p_to_u(G, P, m_p, emhd_params, gam, k, j, i, U, m_u, loc);

From eafdfbb21a91953331f24e60d8b6506e8d873f20 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Mon, 11 Dec 2023 15:27:17 -0700
Subject: [PATCH 06/11] Attempt to fix Darwin config again

---
 machines/darwin.sh    | 12 +++++++++---
 scripts/ci/darwin.yml |  2 +-
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/machines/darwin.sh b/machines/darwin.sh
index c5114273..14d6e7fb 100644
--- a/machines/darwin.sh
+++ b/machines/darwin.sh
@@ -11,7 +11,7 @@ if [[ ($HOSTNAME == "cn"* || $HOSTNAME == "darwin"*) &&
   # Where we're going, we don't need system libraries
   ARGS="$ARGS hdf5"
 
-  # Load compiler...
+  # 1. Load compiler stack
   if [[ "$ARGS" == *"gcc10"* ]]; then
     module load gcc/10.4.0
     C_NATIVE=gcc
@@ -52,7 +52,7 @@ if [[ ($HOSTNAME == "cn"* || $HOSTNAME == "darwin"*) &&
     fi
   fi
 
-  # ...any accelerator libraries...
+  # 2. Load accelerator libraries
   if [[ "$ARGS" == *"cuda"* ]]; then
     module load cuda/12.0.0 nvhpc
     PREFIX_PATH=$NVHPC_ROOT
@@ -64,10 +64,16 @@ if [[ ($HOSTNAME == "cn"* || $HOSTNAME == "darwin"*) &&
     CXX_NATIVE=hipcc
     export CXXFLAGS="-fopenmp $CXXFLAGS"
   else
+    # load OpenMPI for CPU builds...
     module load openmpi
   fi
 
-  # ...and set architecture
+  # ... or if we force it (CI)
+  if [[ "$ARGS" == *"ompi"* ]]; then
+    module load openmpi
+  fi
+
+  # 3. Set architecture
   # These are orthogonal to above, so long as the hardware
   # supports the paradigm
   # Note this also specifies cores to use for compiling
diff --git a/scripts/ci/darwin.yml b/scripts/ci/darwin.yml
index c0cd1ccf..bc0b5de9 100644
--- a/scripts/ci/darwin.yml
+++ b/scripts/ci/darwin.yml
@@ -57,7 +57,7 @@ build:
     - echo "Skipping pyharm install in build."
   script:
     - export PREFIX_PATH=$PWD/external/hdf5
-    - ./make.sh clean cuda hdf5 volta
+    - ./make.sh clean cuda hdf5 volta ompi
   artifacts:
     paths:
       - kharma.*

From 4d941ab28117fff0eb831db8bb14829442fae34c Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Wed, 13 Dec 2023 07:19:58 -0600
Subject: [PATCH 07/11] Parse ~/.config/kharma.sh if present instead of machine
 file, remove bp.sh

---
 Dockerfile     |  11 -----
 machines/bp.sh | 131 -------------------------------------------------
 make.sh        |  26 +++++-----
 3 files changed, 14 insertions(+), 154 deletions(-)
 delete mode 100644 Dockerfile
 delete mode 100644 machines/bp.sh

diff --git a/Dockerfile b/Dockerfile
deleted file mode 100644
index 34495d5b..00000000
--- a/Dockerfile
+++ /dev/null
@@ -1,11 +0,0 @@
-FROM nvcr.io/nvidia/nvhpc:22.9-devel-cuda_multi-rockylinux8
-
-# NVIDIA container has PowerTools, EPEL, dev tools installed
-
-COPY . /app/
-
-ENV PREFIX_PATH="/app/external/hdf5" DEVICE_ARCH=VOLTA70 C_NATIVE=nvc CXX_NATIVE=nvc++
-
-RUN cd /app && bash -ic './make.sh clean cuda hdf5'
-
-CMD /app/kharma.cuda
diff --git a/machines/bp.sh b/machines/bp.sh
deleted file mode 100644
index df4e4ca5..00000000
--- a/machines/bp.sh
+++ /dev/null
@@ -1,131 +0,0 @@
-
-# BP's machines
-
-if [[ $HOST == "pn2400633"* ]]; then
-  export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH"
-  PREFIX_PATH=/opt/homebrew/
-  C_NATIVE=/opt/homebrew/bin/gcc-13
-  CXX_NATIVE=/opt/homebrew/bin/g++-13
-  CXXFLAGS="-Wl,-ld_classic"
-fi
-
-if [[ $HOST == "cheshire"* ]]; then
-  HOST_ARCH="HSW"
-  DEVICE_ARCH="PASCAL61"
-  export OMP_NUM_THREADS=24
-
-  if [[ "$ARGS" == *"cuda"* ]]; then
-    # NVHPC. Compiler is chosen automatically now
-    module load nvhpc
-    NPROC=8 # so much memory
-  else
-    if [[ "$ARGS" == *"gcc"* ]]; then
-      # GCC
-      module load mpi/mpich-x86_64
-      C_NATIVE=gcc
-      CXX_NATIVE=g++
-    else
-      # Intel oneAPI
-      module load compiler mpi/2021
-    fi
-    NPROC=24
-  fi
-  # Even CPU kharma is unkillable without this
-  MPI_EXE=mpirun
-fi
-
-if [[ $HOST == "toolbox"* || $HOST == "nvhpc"* ]]; then
-  METAL_HOSTNAME=$(cat ~/.config/hostname)
-fi
-
-if [[ $METAL_HOSTNAME == "fermium" ]]; then
-  HOST_ARCH="AMDAVX"
-  # We patch Kokkos to make this gfx1101==rx7800xt
-  DEVICE_ARCH="AMD_GFX1100"
-  # MPI & Kokkos separately dislike running the bin alone
-  #MPI_EXE=mpirun
-  NPROC=24
-
-  if [[ "$ARGS" == *"hip"* ]]; then
-    # AMD for GPUs (this will be run in container, no modules)
-    C_NATIVE=hipcc
-    CXX_NATIVE=hipcc
-  else
-    # AMD for CPUs
-    module load aocc-compiler-4.1.0 mpi
-    CXX_NATIVE=clang++
-    C_NATIVE=clang
-  fi
-fi
-
-if [[ $METAL_HOSTNAME == "ferrum" ]]; then
-  HOST_ARCH="HSW"
-  NPROC=6
-
-  if [[ "$ARGS" == *"gcc"* ]]; then
-    module load mpi/mpich-x86_64
-    C_NATIVE="gcc"
-    CXX_NATIVE="g++"
-  elif [[ "$ARGS" == *"icc"* ]]; then
-    # Intel compiler
-    module purge
-    module load compiler mpi
-    C_NATIVE="icc"
-    CXX_NATIVE="icpc"
-  else
-    # Intel SYCL implementation "DPC++"
-    module purge
-    module load compiler mpi
-    C_NATIVE="icx"
-    CXX_NATIVE="icpx"
-  fi
-fi
-
-if [[ $HOST == "cinnabar"* ]]; then
-  # All my MPI stacks can use this as the call
-  MPI_EXE=mpirun
-
-  module purge # Handle modules inside this script
-  HOST_ARCH="HSW" # This won't change
-  DEVICE_ARCH="TURING75"
-  NPROC=56
-
-  # Runtime
-  MPI_NUM_PROCS=1
-
-  # TODO container:
-  # module swap nvhpc-hpcx nvhpc
-
-  if [[ "$ARGS" == *"cuda"* ]]; then
-    # Runtime
-    MPI_EXTRA_ARGS="--map-by ppr:1:numa:pe=14"
-
-    # System CUDA path
-    EXTRA_FLAGS="-DCUDAToolkit_INCLUDE_DIR=/usr/include/cuda $EXTRA_FLAGS"
-
-    # Switch between g++/NVC++:
-    if [[ "$ARGS" == *"gcc"* ]]; then
-      module load mpi/mpich-x86_64 nvhpc-nompi
-      C_NATIVE="gcc"
-      CXX_NATIVE="g++"
-    else
-      module load nvhpc/23.7
-      PREFIX_PATH="$HOME/libs/hdf5-nvhpc"
-      C_NATIVE="nvc"
-      CXX_NATIVE="nvc++"
-      #export CXXFLAGS="-mp"
-    fi
-  else
-    if [[ "$ARGS" == *"gcc"* ]]; then
-      # GCC
-      module load mpi/mpich-x86_64
-      C_NATIVE="gcc"
-      CXX_NATIVE="g++"
-    else
-      # Intel by default
-      module load compiler mpi
-      C_NATIVE="icx"
-      CXX_NATIVE="icpx"
-    fi
-  fi
-fi
diff --git a/make.sh b/make.sh
index 71b241fa..2fa93b7e 100755
--- a/make.sh
+++ b/make.sh
@@ -29,25 +29,27 @@
 # Set in environment or override in machine file
 NPROC=${NPROC:-8}
 
-### Load machine-specific configurations ###
+### Load basic stuff ###
 HOST=$(hostname -f)
 if [ -z $HOST ]; then
   HOST=$(hostname)
 fi
 ARGS="$*"
 SOURCE_DIR=$(dirname "$(readlink -f "$0")")
-for machine in machines/*.sh
-do
-  source $machine
-done
 
-# If we haven't special-cased already, guess an architecture
-# This only works with newer Kokkos, it's always best to
-# specify HOST_ARCH in a machine file once you know it.
-if [[ -z "$HOST_ARCH" ]]; then
-  HOST_ARCH="NATIVE"
+# A user config in ~/.config/kharma.sh replaces the bundled machines/*.sh files
+if [ -f "$HOME/.config/kharma.sh" ]; then
+  source "$HOME/.config/kharma.sh"
+else
+  for machine in "$SOURCE_DIR"/machines/*.sh
+  do
+    source "$machine"
+  done
+fi
-EXTRA_FLAGS="-DKokkos_ARCH_${HOST_ARCH}=ON $EXTRA_FLAGS"
+
+# Default to compiling for the host architecture
+# Always better to specify, though, for cross-compile/older Kokkos support
+EXTRA_FLAGS="-DKokkos_ARCH_${HOST_ARCH:-NATIVE}=ON $EXTRA_FLAGS"
 
 # Kokkos does *not* support compiling for multiple devices!
 # But if they ever do, you can separate a list of DEVICE_ARCH
@@ -73,10 +75,10 @@ fi
 
 ### Enivoronment Prep ###
 if [[ "$(which python3 2>/dev/null)" == *"conda"* ]]; then
-  echo
   echo "make.sh note:"
   echo "It looks like you have Anaconda loaded."
   echo "This is usually okay, but double-check the line 'Found MPI_CXX:' below!"
+  echo
 fi
 # Save arguments if we've changed them
 # Used in run.sh for loading the same modules/etc.

From 525149a255625090d752b84b8125f1a9806a882f Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Thu, 14 Dec 2023 15:05:18 -0700
Subject: [PATCH 08/11] Bump parthenon to fix SMR on small grids

---
 external/parthenon | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/parthenon b/external/parthenon
index 665aedf0..efad02d3 160000
--- a/external/parthenon
+++ b/external/parthenon
@@ -1 +1 @@
-Subproject commit 665aedf0bf816d6894d474a2e742fd7b84d4fd6f
+Subproject commit efad02d395d29b441001100efa5bb3e5831fb19d

From e045255ffd2a93518527b244eb749e2c6852ca73 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Thu, 14 Dec 2023 15:19:20 -0700
Subject: [PATCH 09/11] git merge weirdness

---
 external/parthenon | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/parthenon b/external/parthenon
index efad02d3..4489b595 160000
--- a/external/parthenon
+++ b/external/parthenon
@@ -1 +1 @@
-Subproject commit efad02d395d29b441001100efa5bb3e5831fb19d
+Subproject commit 4489b595ed68cfe79629320854315b5986f9c1f0

From 2864ff9075ea7392a94395a6c036efef0dcda7c0 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprathr2@illinois.edu>
Date: Fri, 15 Dec 2023 12:30:33 -0600
Subject: [PATCH 10/11] Readme, comments, overdue parfile updates

---
 README.md                              | 25 +++------
 kharma/boundaries/boundaries.cpp       |  5 +-
 pars/smr/orszag_tang_refined_small.par | 76 ++++++++++++++++++++++++++
 pars/smr/sane3d_refined.par            | 19 +++----
 run.sh                                 | 18 +++---
 5 files changed, 107 insertions(+), 36 deletions(-)
 create mode 100644 pars/smr/orszag_tang_refined_small.par

diff --git a/README.md b/README.md
index fd768d76..be89ab59 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # KHARMA
 KHARMA is an implementation of the HARM scheme for gerneral relativistic magnetohydrodynamics (GRMHD) in C++.  It is based on the Parthenon AMR infrastructure, using Kokkos for parallelism and GPU support.  It is composed of modular "packages," which in theory make it easy to add or swap components representing different physical processes.
 
-The project is capable of the same GRMHD functions found in e.g. [iharm3d](https://github.com/AFD-Illinois/iharm3d). Support for adaptive mesh refinement is planned, but not yet working for runs involving magnetic field transport.
+KHARMA is capable of closely matching other HARM implementations, e.g. [iharm3d](https://github.com/AFD-Illinois/iharm3d). However, it also extends the scheme with additional options for magnetic field transport, reconstruction, etc.  Notably, it implements a split face-centered CT scheme, allowing static and adaptive mesh refinement.
 
 ## Prerequisites
 KHARMA requires that the system have a C++17-compliant compiler, MPI, and parallel HDF5.  All other dependencies are included as submodules, and can be checked out with `git` by running
@@ -18,29 +18,22 @@ Old submodules are a common cause of compile errors!
 ## Compiling
 On directly supported systems, or systems with standard install locations, you may be able to run:
 ```bash
-./make.sh clean
+./make.sh clean [cuda hip sycl]
 ```
-And (possibly) the following to compile for GPU with CUDA:
-```bash
-./make.sh clean cuda
-```
-after a *successful* compile, subsequent invocations can omit `clean`.
+After a *successful* compile, subsequent invocations can omit `clean`.  If this command fails on supported machines (those with a file in `machines/`), please open an issue.  Broken builds aren't uncommon, as HPC machines change software all the time.
 
-If (when) these fail, take a look at the [wiki page](https://github.com/AFD-Illinois/kharma/wiki/Building-KHARMA), and the `make.sh` source code.  At worst this should involve running something like
-```bash
-PREFIX_PATH="/absolute/path/to/phdf5;/absolute/path/to/cuda" HOST_ARCH=CPUVER DEVICE_ARCH=GPUVER ./make.sh clean cuda
-```
-Where `CPUVER` and `GPUVER` are the strings used by Kokkos to denote a particular architecture & set of compile flags (Note `make.sh` needs only the portion of the flag *after* `Kokkos_ARCH_`).
+If running KHARMA on a new machine (or repairing the build on an old one), take a look at the [wiki page](https://github.com/AFD-Illinois/kharma/wiki/Building-KHARMA) describing the build system.
 
 ## Running
 Run a particular problem with e.g.
 ```bash
-$ ./kharma.host -i pars/orszag_tang.par
+$ ./kharma.host -i pars/tests/orszag_tang.par
 ```
+Note that *all* options are set at runtime.  The single KHARMA binary can run any of the parameter files in `pars/`, and indeed this is checked as a part of the regression tests.  You can still disable some sub-systems manually at compile time, in which case the problems that depend on them will crash.
 
-KHARMA benefits from certain runtime environment variables and CPU pinning, included in a short wrapper script `run.sh`.  Note that some MPI implementations require that KHARMA be run using `mpirun`, even for a single process, and may cause errors or hangs otherwise.
+KHARMA benefits from certain runtime environment variables and CPU pinning, included in a short wrapper script `run.sh`.  This script is provided mostly as an optional convenience, and an example of how to construct your own batch scripts for running KHARMA in production.  Other example batch scripts are in the `scripts/batch/` folder.
 
-Except for performance tuning, KHARMA has no compile time parameters: all of the parameters specifying a simulation are listed in the input "deck" `problem_name.par`.  Several sample inputs corresponding to standard tests and astrophysical systems are included in `pars/`.  Further information can be found on the [wiki page](https://github.com/AFD-Illinois/kharma/wiki/Running-KHARMA).
+Further information can be found on the [wiki page](https://github.com/AFD-Illinois/kharma/wiki/Running-KHARMA).
 
 ## Hacking
 KHARMA has some preliminary documentation for developers, hosted in its GitHub [wiki](https://github.com/AFD-Illinois/kharma/wiki).
@@ -50,4 +43,4 @@ KHARMA is made available under the BSD 3-clause license included in each file an
 
 This repository also carries a substantial portion of the [Kokkos Kernels](https://github.com/kokkos/kokkos-kernels), in the directory `kharma/implicit/kokkos-kernels-pivoted`, which is provided under the license included in that directory.
 
-Submodules of this repository, [Parthenon](https://github.com/parthenon-hpc-lab/parthenon) and [mpark::variant](https://github.com/mpark/variant) are made available under their own licenses.
\ No newline at end of file
+Submodules of this repository are subject to their own licenses.
diff --git a/kharma/boundaries/boundaries.cpp b/kharma/boundaries/boundaries.cpp
index 8068fc46..d84426b6 100644
--- a/kharma/boundaries/boundaries.cpp
+++ b/kharma/boundaries/boundaries.cpp
@@ -475,7 +475,7 @@ TaskStatus KBoundaries::FixFlux(MeshData<Real> *md)
 
             if (bdir > ndim) continue;
 
-            // Set ranges based
+            // Set ranges for entire width.  Probably not needed for fluxes but won't hurt
             IndexRange ib = ibe, jb = jbe, kb = kbe;
             // Range for inner_x1 bounds is first face only, etc.
             if (bdir == 1) {
@@ -520,7 +520,8 @@ TaskStatus KBoundaries::FixFlux(MeshData<Real> *md)
                         "zero_flux_" + bname, 0, F.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.s, ib.s, ib.e,
                         KOKKOS_LAMBDA(const int &p, const int &k, const int &j, const int &i) {
                             F.flux(bdir, p, k, j, i) = 0.;
-                        });
+                        }
+                    );
                 }
             }
         }
diff --git a/pars/smr/orszag_tang_refined_small.par b/pars/smr/orszag_tang_refined_small.par
new file mode 100644
index 00000000..6f516862
--- /dev/null
+++ b/pars/smr/orszag_tang_refined_small.par
@@ -0,0 +1,76 @@
+# An Orszag-Tang vortex designed to trigger
+# an old Parthenon bug, or any bugs related
+# to SMR interacting with periodic boundaries
+
+<parthenon/job>
+problem_id = orszag_tang
+
+<parthenon/mesh>
+refinement = static
+numlevel = 2
+nx1 = 64
+x1min = -3.141592653589793
+x1max = 3.141592653589793
+
+nx2 = 64
+x2min = -3.141592653589793
+x2max = 3.141592653589793
+
+nx3 = 1
+x3min = -0.01
+x3max = 0.01
+
+<parthenon/meshblock>
+nx1 = 32
+nx2 = 32
+nx3 = 1
+
+<parthenon/static_refinement0>
+x1min = -3.14
+x1max = -3.1
+x2min = -3.14
+x2max = -3.1
+level = 1
+
+# Set boring box coordinates. Explanations in bondi.par
+<coordinates>
+base = cartesian_minkowski
+transform = null
+
+<parthenon/time>
+# tlim will be overridden depending on the problem
+tlim = 100.0
+integrator = rk2
+
+<GRMHD>
+cfl = 0.9
+gamma = 1.666667
+
+<b_field>
+solver = face_ct
+#type = wave
+
+<floors>
+# Disable all floor applications in this problem
+disable_floors = true
+
+<debug>
+verbose = 1
+extra_checks = 2
+flag_verbose = 0
+
+<driver>
+type = kharma
+reconstruction = weno5
+
+# Primary HDF5 output enabled in most problems
+<parthenon/output0>
+file_type = hdf5
+dt = 1.0
+variables = prims, divB
+
+# Text file with statistics (e.g. fluxes, floors hit)
+<parthenon/output1>
+file_type = hst
+dt = 0.1
+
diff --git a/pars/smr/sane3d_refined.par b/pars/smr/sane3d_refined.par
index b6ea072d..e9bc0811 100644
--- a/pars/smr/sane3d_refined.par
+++ b/pars/smr/sane3d_refined.par
@@ -27,8 +27,8 @@ level = 1
 
 <coordinates>
 base = spherical_ks
-transform = mks
-r_out = 20
+transform = fmks
+r_out = 100
 a = 0.9375
 
 <parthenon/time>
@@ -69,16 +69,13 @@ bsq_over_rho_max = 100
 
 <parthenon/output0>
 file_type = hdf5
-dt = 0.0
+dt = 5.0
 single_precision_output = true
-variables = prims, divB
-#ghost_zones = true
-
-# Can't until face field output is enabled
-#<parthenon/output1>
-#file_type = rst
-#dt = 100.0
-#ghost_zones = true
+variables = prims, divB, fflag, pflag
+
+<parthenon/output1>
+file_type = rst
+dt = 100.0
 
 <parthenon/output2>
 file_type = hst
diff --git a/run.sh b/run.sh
index 275f5968..7e333daf 100755
--- a/run.sh
+++ b/run.sh
@@ -33,8 +33,8 @@ elif [ -f $KHARMA_DIR/kharma.hip ]; then
 elif [ -f $KHARMA_DIR/kharma.host ]; then
   EXE_NAME=kharma.host
   # Enable OpenMP to use all threads only where not counterproductive
-  export OMP_PROC_BIND=${OMP_PROC_BIND:-spread}
-  export OMP_PLACES=${OMP_PLACES:-threads}
+  #export OMP_PROC_BIND=${OMP_PROC_BIND:-spread}
+  #export OMP_PLACES=${OMP_PLACES:-threads}
   # Force a number of OpenMP threads if it doesn't autodetect
   #export OMP_NUM_THREADS=${OMP_NUM_THREADS:-28}
 else
@@ -42,11 +42,6 @@ else
   exit
 fi
 
-# Optionally use the Kokkos tools to profile kernels
-#export KOKKOS_TOOLS_LIBS=$KHARMA_DIR/../kokkos-tools/kp_kernel_timer.so
-#export KOKKOS_TOOLS_LIBS=$KHARMA_DIR/../kokkos-tools/kp_nvprof_cnnector.so
-#export KOKKOS_TOOLS_LIBS=$KHARMA_DIR/../kokkos-tools/kp_kernel_logger.so
-
 # Load environment from the same files as the compile process
 HOST=$(hostname -f)
 ARGS=$(cat $KHARMA_DIR/make_args)
@@ -60,6 +55,15 @@ if [[ "$1" == "trace" ]]; then
   export KOKKOS_TOOLS_LIBS=$KHARMA_DIR/../kokkos-tools/kp_kernel_logger.so
   shift
 fi
+if [[ "$1" == "prof" ]]; then
+  export KOKKOS_TOOLS_LIBS=$KHARMA_DIR/../kokkos-tools/kp_kernel_timer.so
+  shift
+fi
+if [[ "$1" == "nvprof" ]]; then
+  export KOKKOS_TOOLS_LIBS=$KHARMA_DIR/../kokkos-tools/kp_nvprof_connector.so
+  shift
+fi
+
 # Override MPI_NUM_PROCS at user option "-n"
 # and OMP_NUM_THREADS at option "-nt"
 if [[ "$1" == "-n" ]]; then

From ed68b68bfa091d60cdd2063a83fdc793301ca091 Mon Sep 17 00:00:00 2001
From: Ben Prather <bprather@lanl.gov>
Date: Fri, 15 Dec 2023 13:22:58 -0700
Subject: [PATCH 11/11] Cleanup, make 2D SMR sane beefier

---
 .gitignore                  |  1 +
 pars/smr/sane2d_refined.par | 24 ++++++++++++------------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/.gitignore b/.gitignore
index 0cf1683b..40dc7d35 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,6 +24,7 @@ kharma_parsed_*.par
 log_*.txt
 bondi_analytic_*.txt
 atmosphere_soln_*.txt
+shock_soln_*.txt
 
 # Editor documents
 .project
diff --git a/pars/smr/sane2d_refined.par b/pars/smr/sane2d_refined.par
index fb410403..22eade4d 100644
--- a/pars/smr/sane2d_refined.par
+++ b/pars/smr/sane2d_refined.par
@@ -7,26 +7,26 @@ problem_id = torus
 <parthenon/mesh>
 refinement = static
 numlevel = 2
-nx1 = 192
-nx2 = 192
+nx1 = 512
+nx2 = 640
 nx3 = 1
 
 <parthenon/meshblock>
-nx1 = 64
-nx2 = 64
+nx1 = 256
+nx2 = 128
 nx3 = 1
 
 <parthenon/static_refinement0>
 x1min = 1.0
 x1max = 3.0
-x2min = 1.57
-x2max = 1.57
-level = 1
+x2min = 0.49
+x2max = 0.51
+level = 2
 
 <coordinates>
 base = spherical_ks
-transform = eks
-r_out = 50
+transform = fmks
+r_out = 500
 a = 0.9375
 
 <parthenon/time>
@@ -62,12 +62,12 @@ beta_min = 100.
 <floors>
 rho_min_geom = 1e-6
 u_min_geom = 1e-8
-u_over_rho_max = 100
-bsq_over_rho_max = 100
+#u_over_rho_max = 100
+bsq_over_rho_max = 500
 
 <parthenon/output0>
 file_type = hdf5
-dt = 10.0
+dt = 5.0
 single_precision_output = true
 variables = prims.rho, prims.u, prims.uvec, prims.B, divB