diff --git a/.gitignore b/.gitignore index 24f794a4..35923df0 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,9 @@ driver/examples/comm 20*-*-*-*-*-*.json *.pkl +# example outputs +examples/mpi/output + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/examples/NDSL/03_orchestration_basics.ipynb b/examples/NDSL/03_orchestration_basics.ipynb index eea24154..01a77dd8 100644 --- a/examples/NDSL/03_orchestration_basics.ipynb +++ b/examples/NDSL/03_orchestration_basics.ipynb @@ -37,7 +37,7 @@ ")\n", "from ndsl.constants import X_DIM, Y_DIM, Z_DIM\n", "from ndsl.dsl.typing import FloatField, Float\n", - "from ndsl.boilerplate import get_factories_single_tile_orchestrated_cpu" + "from ndsl.boilerplate import get_factories_single_tile_orchestrated" ] }, { @@ -126,7 +126,7 @@ " tile_size = (3, 3, 3)\n", "\n", " # Setup\n", - " stencil_factory, qty_factory = get_factories_single_tile_orchestrated_cpu(\n", + " stencil_factory, qty_factory = get_factories_single_tile_orchestrated(\n", " nx=tile_size[0],\n", " ny=tile_size[1],\n", " nz=tile_size[2],\n", diff --git a/examples/mpi/.gitignore b/examples/mpi/.gitignore deleted file mode 100644 index 53752db2..00000000 --- a/examples/mpi/.gitignore +++ /dev/null @@ -1 +0,0 @@ -output diff --git a/ndsl/__init__.py b/ndsl/__init__.py index a2f771cd..5ec303de 100644 --- a/ndsl/__init__.py +++ b/ndsl/__init__.py @@ -10,7 +10,7 @@ from .dsl.dace.utils import ( ArrayReport, DaCeProgress, - MaxBandwithBenchmarkProgram, + MaxBandwidthBenchmarkProgram, StorageReport, ) from .dsl.dace.wrapped_halo_exchange import WrappedHaloUpdater diff --git a/ndsl/boilerplate.py b/ndsl/boilerplate.py index f22e0b9f..a777cd82 100644 --- a/ndsl/boilerplate.py +++ b/ndsl/boilerplate.py @@ -16,6 +16,7 @@ TileCommunicator, TilePartitioner, ) +from ndsl.optional_imports import cupy as cp def _get_factories( @@ -74,13 +75,15 @@ def _get_factories( grid_indexing = GridIndexing.from_sizer_and_communicator(sizer, comm) stencil_factory = StencilFactory(config=stencil_config, grid_indexing=grid_indexing) - quantity_factory = QuantityFactory(sizer, np) + quantity_factory = QuantityFactory( + sizer, cp if stencil_config.is_gpu_backend else np + ) return stencil_factory, quantity_factory -def get_factories_single_tile_orchestrated_cpu( - nx, ny, nz, nhalo +def get_factories_single_tile_orchestrated( + nx, ny, nz, nhalo, on_cpu: bool = True ) -> Tuple[StencilFactory, QuantityFactory]: """Build a Stencil & Quantity factory for orchestrated CPU, on a single tile topology.""" return _get_factories( @@ -88,22 +91,21 @@ def get_factories_single_tile_orchestrated_cpu( ny=ny, nz=nz, nhalo=nhalo, - backend="dace:cpu", + backend="dace:cpu" if on_cpu else "dace:gpu", orchestration=DaCeOrchestration.BuildAndRun, topology="tile", ) -def get_factories_single_tile_numpy( - nx, ny, nz, nhalo +def get_factories_single_tile( + nx, ny, nz, nhalo, backend: str = "numpy" ) -> Tuple[StencilFactory, QuantityFactory]: - """Build a Stencil & Quantity factory for Numpy, on a single tile topology.""" return _get_factories( nx=nx, ny=ny, nz=nz, nhalo=nhalo, - backend="numpy", + backend=backend, orchestration=DaCeOrchestration.Python, topology="tile", ) diff --git a/ndsl/comm/boundary.py b/ndsl/comm/boundary.py index 020798c6..f6016f6e 100644 --- a/ndsl/comm/boundary.py +++ b/ndsl/comm/boundary.py @@ -28,7 +28,7 @@ def send_view(self, quantity: Quantity, n_points: int): return self._view(quantity, n_points, interior=True) def recv_view(self, quantity: Quantity, n_points: int): - """Return a 
sliced view of points which should be recieved at this boundary. + """Return a sliced view of points which should be received at this boundary. Args: quantity: quantity for which to return a slice @@ -37,7 +37,7 @@ def recv_view(self, quantity: Quantity, n_points: int): return self._view(quantity, n_points, interior=False) def send_slice(self, specification: QuantityHaloSpec) -> Tuple[slice]: - """Return the index slices which shoud be sent at this boundary. + """Return the index slices which should be sent at this boundary. Args: specification: data specifications for the halo. Including shape diff --git a/ndsl/comm/caching_comm.py b/ndsl/comm/caching_comm.py index 36587d73..42f92ea2 100644 --- a/ndsl/comm/caching_comm.py +++ b/ndsl/comm/caching_comm.py @@ -5,7 +5,7 @@ import numpy as np -from ndsl.comm.comm_abc import Comm, Request +from ndsl.comm.comm_abc import Comm, ReductionOperator, Request T = TypeVar("T") @@ -147,9 +147,12 @@ def Split(self, color, key) -> "CachingCommReader": new_data = self._data.get_split() return CachingCommReader(data=new_data) - def allreduce(self, sendobj, op=None) -> Any: + def allreduce(self, sendobj, op: Optional[ReductionOperator] = None) -> Any: return self._data.get_generic_obj() + def Allreduce(self, sendobj, recvobj, op: ReductionOperator) -> Any: + raise NotImplementedError("CachingCommReader.Allreduce") + @classmethod def load(cls, file: BinaryIO) -> "CachingCommReader": data = CachingCommData.load(file) @@ -229,7 +232,10 @@ def Split(self, color, key) -> "CachingCommWriter": def dump(self, file: BinaryIO): self._data.dump(file) - def allreduce(self, sendobj, op=None) -> Any: + def allreduce(self, sendobj, op: Optional[ReductionOperator] = None) -> Any: result = self._comm.allreduce(sendobj, op) self._data.generic_obj_buffers.append(copy.deepcopy(result)) return result + + def Allreduce(self, sendobj, recvobj, op: ReductionOperator) -> Any: + raise NotImplementedError("CachingCommWriter.Allreduce") diff --git a/ndsl/comm/comm_abc.py b/ndsl/comm/comm_abc.py index 77f56586..45596f1e 100644 --- a/ndsl/comm/comm_abc.py +++ b/ndsl/comm/comm_abc.py @@ -1,10 +1,30 @@ import abc +import enum from typing import List, Optional, TypeVar T = TypeVar("T") +@enum.unique +class ReductionOperator(enum.Enum): + OP_NULL = enum.auto() + MAX = enum.auto() + MIN = enum.auto() + SUM = enum.auto() + PROD = enum.auto() + LAND = enum.auto() + BAND = enum.auto() + LOR = enum.auto() + BOR = enum.auto() + LXOR = enum.auto() + BXOR = enum.auto() + MAXLOC = enum.auto() + MINLOC = enum.auto() + REPLACE = enum.auto() + NO_OP = enum.auto() + + class Request(abc.ABC): @abc.abstractmethod def wait(self): @@ -69,5 +89,12 @@ def Split(self, color, key) -> "Comm": ... @abc.abstractmethod - def allreduce(self, sendobj: T, op=None) -> T: + def allreduce(self, sendobj: T, op: Optional[ReductionOperator] = None) -> T: + ... + + @abc.abstractmethod + def Allreduce(self, sendobj: T, recvobj: T, op: ReductionOperator) -> T: + ... + + def Allreduce_inplace(self, obj: T, op: ReductionOperator) -> T: ... 
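
A minimal usage sketch (not part of the patch) of the reduction API introduced above, assuming an MPI launch; `communicator` (a `CubedSphereCommunicator` built on an `MPIComm`) and `q` (a `Quantity`) are hypothetical names used only for illustration:

    from ndsl.comm.comm_abc import ReductionOperator

    # Global reduction of a Quantity across ranks; a new Quantity is returned
    # when no output_quantity is passed (see Communicator.all_reduce below).
    q_sum = communicator.all_reduce(q, ReductionOperator.SUM)

    # Element-wise, in-place reduction; with MPIComm this is backed by
    # MPI Allreduce using MPI.IN_PLACE.
    communicator.all_reduce_per_element_in_place(q, ReductionOperator.MAX)
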
diff --git a/ndsl/comm/communicator.py b/ndsl/comm/communicator.py index ff270df5..014142da 100644 --- a/ndsl/comm/communicator.py +++ b/ndsl/comm/communicator.py @@ -6,6 +6,8 @@ import ndsl.constants as constants from ndsl.buffer import array_buffer, device_synchronize, recv_buffer, send_buffer from ndsl.comm.boundary import Boundary +from ndsl.comm.comm_abc import Comm as CommABC +from ndsl.comm.comm_abc import ReductionOperator from ndsl.comm.partitioner import CubedSpherePartitioner, Partitioner, TilePartitioner from ndsl.halo.updater import HaloUpdater, HaloUpdateRequest, VectorInterfaceHaloUpdater from ndsl.performance.timer import NullTimer, Timer @@ -44,7 +46,11 @@ def to_numpy(array, dtype=None) -> np.ndarray: class Communicator(abc.ABC): def __init__( - self, comm, partitioner, force_cpu: bool = False, timer: Optional[Timer] = None + self, + comm: CommABC, + partitioner, + force_cpu: bool = False, + timer: Optional[Timer] = None, ): self.comm = comm self.partitioner: Partitioner = partitioner @@ -61,7 +67,7 @@ def tile(self) -> "TileCommunicator": @abc.abstractmethod def from_layout( cls, - comm, + comm: CommABC, layout: Tuple[int, int], force_cpu: bool = False, timer: Optional[Timer] = None, @@ -93,17 +99,63 @@ def _device_synchronize(): # this is a method so we can profile it separately from other device syncs device_synchronize() + def _create_all_reduce_quantity( + self, input_metadata: QuantityMetadata, input_data + ) -> Quantity: + """Create a Quantity for all_reduce data and metadata""" + all_reduce_quantity = Quantity( + input_data, + dims=input_metadata.dims, + units=input_metadata.units, + origin=input_metadata.origin, + extent=input_metadata.extent, + gt4py_backend=input_metadata.gt4py_backend, + allow_mismatch_float_precision=False, + ) + return all_reduce_quantity + + def all_reduce( + self, + input_quantity: Quantity, + op: ReductionOperator, + output_quantity: Quantity = None, + ): + reduced_quantity_data = self.comm.allreduce(input_quantity.data, op) + if output_quantity is None: + all_reduce_quantity = self._create_all_reduce_quantity( + input_quantity.metadata, reduced_quantity_data + ) + return all_reduce_quantity + else: + if output_quantity.data.shape != input_quantity.data.shape: + raise TypeError("Shapes not matching") + + input_quantity.metadata.duplicate_metadata(output_quantity.metadata) + + output_quantity.data = reduced_quantity_data + + def all_reduce_per_element( + self, + input_quantity: Quantity, + output_quantity: Quantity, + op: ReductionOperator, + ): + self.comm.Allreduce(input_quantity.data, output_quantity.data, op) + + def all_reduce_per_element_in_place( + self, quantity: Quantity, op: ReductionOperator + ): + self.comm.Allreduce_inplace(quantity.data, op) + def _Scatter(self, numpy_module, sendbuf, recvbuf, **kwargs): - with send_buffer(numpy_module.zeros, sendbuf) as send, recv_buffer( - numpy_module.zeros, recvbuf - ) as recv: - self.comm.Scatter(send, recv, **kwargs) + with send_buffer(numpy_module.zeros, sendbuf) as send: + with recv_buffer(numpy_module.zeros, recvbuf) as recv: + self.comm.Scatter(send, recv, **kwargs) def _Gather(self, numpy_module, sendbuf, recvbuf, **kwargs): - with send_buffer(numpy_module.zeros, sendbuf) as send, recv_buffer( - numpy_module.zeros, recvbuf - ) as recv: - self.comm.Gather(send, recv, **kwargs) + with send_buffer(numpy_module.zeros, sendbuf) as send: + with recv_buffer(numpy_module.zeros, recvbuf) as recv: + self.comm.Gather(send, recv, **kwargs) def scatter( self, @@ -252,7 +304,7 @@ def 
gather_state(self, send_state=None, recv_state=None, transfer_type=None): Args: send_state: the model state to be sent containing the subtile data - recv_state: the pre-allocated state in which to recieve the full tile + recv_state: the pre-allocated state in which to receive the full tile state. Only variables which are scattered will be written to. Returns: recv_state: on the root rank, the state containing the entire tile @@ -288,7 +340,7 @@ def scatter_state(self, send_state=None, recv_state=None): Args: send_state: the model state to be sent containing the entire tile, required only from the root rank - recv_state: the pre-allocated state in which to recieve the scattered + recv_state: the pre-allocated state in which to receive the scattered state. Only variables which are scattered will be written to. Returns: rank_state: the state corresponding to this rank's subdomain @@ -709,7 +761,7 @@ class CubedSphereCommunicator(Communicator): def __init__( self, - comm, + comm: CommABC, partitioner: CubedSpherePartitioner, force_cpu: bool = False, timer: Optional[Timer] = None, @@ -722,6 +774,11 @@ def __init__( force_cpu: Force all communication to go through central memory. timer: Time communication operations. """ + if not issubclass(type(comm), CommABC): + raise TypeError( + "Communicator needs to be instantiated with communication subsystem" + f" derived from `comm_abc.Comm`, got {type(comm)}." + ) if comm.Get_size() != partitioner.total_ranks: raise ValueError( f"was given a partitioner for {partitioner.total_ranks} ranks but a " diff --git a/ndsl/comm/local_comm.py b/ndsl/comm/local_comm.py index 5ebfb47d..1ae10177 100644 --- a/ndsl/comm/local_comm.py +++ b/ndsl/comm/local_comm.py @@ -189,8 +189,14 @@ def Split(self, color, key): self._split_comms[color].append(new_comm) return new_comm - def allreduce(self, sendobj, op=None) -> Any: + def allreduce(self, sendobj, op=None, recvobj=None) -> Any: raise NotImplementedError( - "sendrecv fundamentally cannot be written for LocalComm, " + "allreduce fundamentally cannot be written for LocalComm, " + "as it requires synchronicity" + ) + + def Allreduce(self, sendobj, recvobj, op) -> Any: + raise NotImplementedError( + "Allreduce fundamentally cannot be written for LocalComm, " "as it requires synchronicity" ) diff --git a/ndsl/comm/mpi.py b/ndsl/comm/mpi.py index 6f47c791..6b3ff17f 100644 --- a/ndsl/comm/mpi.py +++ b/ndsl/comm/mpi.py @@ -1,10 +1,11 @@ try: + import mpi4py from mpi4py import MPI except ImportError: MPI = None -from typing import List, Optional, TypeVar, cast +from typing import Dict, List, Optional, TypeVar, cast -from ndsl.comm.comm_abc import Comm, Request +from ndsl.comm.comm_abc import Comm, ReductionOperator, Request from ndsl.logging import ndsl_log @@ -12,6 +13,24 @@ class MPIComm(Comm): + _op_mapping: Dict[ReductionOperator, mpi4py.MPI.Op] = { + ReductionOperator.OP_NULL: mpi4py.MPI.OP_NULL, + ReductionOperator.MAX: mpi4py.MPI.MAX, + ReductionOperator.MIN: mpi4py.MPI.MIN, + ReductionOperator.SUM: mpi4py.MPI.SUM, + ReductionOperator.PROD: mpi4py.MPI.PROD, + ReductionOperator.LAND: mpi4py.MPI.LAND, + ReductionOperator.BAND: mpi4py.MPI.BAND, + ReductionOperator.LOR: mpi4py.MPI.LOR, + ReductionOperator.BOR: mpi4py.MPI.BOR, + ReductionOperator.LXOR: mpi4py.MPI.LXOR, + ReductionOperator.BXOR: mpi4py.MPI.BXOR, + ReductionOperator.MAXLOC: mpi4py.MPI.MAXLOC, + ReductionOperator.MINLOC: mpi4py.MPI.MINLOC, + ReductionOperator.REPLACE: mpi4py.MPI.REPLACE, + ReductionOperator.NO_OP: mpi4py.MPI.NO_OP, + } + def __init__(self): if 
MPI is None: raise RuntimeError("MPI not available") @@ -72,8 +91,22 @@ def Split(self, color, key) -> "Comm": ) return self._comm.Split(color, key) - def allreduce(self, sendobj: T, op=None) -> T: + def allreduce(self, sendobj: T, op: Optional[ReductionOperator] = None) -> T: ndsl_log.debug( "allreduce on rank %s with operator %s", self._comm.Get_rank(), op ) - return self._comm.allreduce(sendobj, op) + return self._comm.allreduce(sendobj, self._op_mapping[op]) + + def Allreduce(self, sendobj_or_inplace: T, recvobj: T, op: ReductionOperator) -> T: + ndsl_log.debug( + "Allreduce on rank %s with operator %s", self._comm.Get_rank(), op + ) + return self._comm.Allreduce(sendobj_or_inplace, recvobj, self._op_mapping[op]) + + def Allreduce_inplace(self, recvobj: T, op: ReductionOperator) -> T: + ndsl_log.debug( + "Allreduce (in place) on rank %s with operator %s", + self._comm.Get_rank(), + op, + ) + return self._comm.Allreduce(mpi4py.MPI.IN_PLACE, recvobj, self._op_mapping[op]) diff --git a/ndsl/comm/null_comm.py b/ndsl/comm/null_comm.py index 7e0c07fa..5ca92359 100644 --- a/ndsl/comm/null_comm.py +++ b/ndsl/comm/null_comm.py @@ -1,7 +1,7 @@ import copy -from typing import Any, Mapping +from typing import Any, Mapping, Optional -from ndsl.comm.comm_abc import Comm, Request +from ndsl.comm.comm_abc import Comm, ReductionOperator, Request class NullAsyncResult(Request): @@ -91,5 +91,9 @@ def Split(self, color, key): self._split_comms[color].append(new_comm) return new_comm - def allreduce(self, sendobj, op=None) -> Any: + def allreduce(self, sendobj, op: Optional[ReductionOperator] = None) -> Any: return self._fill_value + + def Allreduce(self, sendobj, recvobj, op: ReductionOperator) -> Any: + recvobj = sendobj + return recvobj diff --git a/ndsl/constants.py b/ndsl/constants.py index 0f7d55da..b7e91b18 100644 --- a/ndsl/constants.py +++ b/ndsl/constants.py @@ -1,19 +1,22 @@ import os from enum import Enum +import numpy as np + +from ndsl.dsl.typing import Float from ndsl.logging import ndsl_log -# The FV3GFS model ships with two sets of constants, one used in the GFS physics +# The FV3GFS model ships with two sets of constants, one used in the UFS physics # package and the other used for the Dycore. 
Their difference are small but significant # In addition the GSFC's GEOS model as its own variables class ConstantVersions(Enum): GFDL = "GFDL" # NOAA's FV3 dynamical core constants (original port) - GFS = "GFS" # Constant as defined in NOAA GFS - GEOS = "GEOS" # Constant as defined in GEOS v13 + UFS = "UFS" # Constant as defined in NOAA's UFS + GEOS = "GEOS" # Constant as defined in GEOS v11.4.2 -CONST_VERSION_AS_STR = os.environ.get("PACE_CONSTANTS", "GFS") +CONST_VERSION_AS_STR = os.environ.get("PACE_CONSTANTS", "UFS") try: CONST_VERSION = ConstantVersions[CONST_VERSION_AS_STR] @@ -66,7 +69,7 @@ class ConstantVersions(Enum): if CONST_VERSION == ConstantVersions.GEOS: # 'qlcd' is exchanged in GEOS NQ = 9 -elif CONST_VERSION == ConstantVersions.GFS or CONST_VERSION == ConstantVersions.GFDL: +elif CONST_VERSION == ConstantVersions.UFS or CONST_VERSION == ConstantVersions.GFDL: NQ = 8 else: raise RuntimeError("Constant selector failed, bad code.") @@ -75,69 +78,73 @@ class ConstantVersions(Enum): # Physical constants ##################### if CONST_VERSION == ConstantVersions.GEOS: - RADIUS = 6.371e6 - PI = 3.14159265358979323846 - OMEGA = 2.0 * PI / 86164.0 - GRAV = 9.80665 - RGRAV = 1.0 / GRAV - RDGAS = 8314.47 / 28.965 - RVGAS = 8314.47 / 18.015 - HLV = 2.4665e6 - HLF = 3.3370e5 - KAPPA = RDGAS / (3.5 * RDGAS) + RADIUS = Float(6.371e6) # Radius of the Earth [m] + PI_8 = np.float64(3.14159265358979323846) + PI = Float(PI_8) + OMEGA = Float(2.0) * PI / Float(86164.0) # Rotation of the earth + GRAV = Float(9.80665) # Acceleration due to gravity [m/s^2].04 + RGRAV = Float(1.0) / GRAV # Inverse of gravitational acceleration + RDGAS = Float(8314.47) / Float( + 28.965 + ) # Gas constant for dry air [J/kg/deg] ~287.04 + RVGAS = Float(8314.47) / Float(18.015) # Gas constant for water vapor [J/kg/deg] + HLV = Float(2.4665e6) # Latent heat of evaporation [J/kg] + HLF = Float(3.3370e5) # Latent heat of fusion [J/kg] ~3.34e5 + KAPPA = RDGAS / (Float(3.5) * RDGAS) # Specific heat capacity of dry air at CP_AIR = RDGAS / KAPPA - TFREEZE = 273.15 - SAT_ADJUST_THRESHOLD = 1.0e-6 -elif CONST_VERSION == ConstantVersions.GFS: - RADIUS = 6.3712e6 # Radius of the Earth [m] - PI = 3.1415926535897931 - OMEGA = 7.2921e-5 # Rotation of the earth - GRAV = 9.80665 # Acceleration due to gravity [m/s^2].04 - RGRAV = 1.0 / GRAV # Inverse of gravitational acceleration - RDGAS = 287.05 # Gas constant for dry air [J/kg/deg] # 287.04 - RVGAS = 461.50 # Gas constant for water vapor [J/kg/deg] - HLV = 2.5e6 # Latent heat of evaporation [J/kg] - HLF = 3.3358e5 # Latent heat of fusion [J/kg] # 3.34e5 - CP_AIR = 1004.6 + TFREEZE = Float(273.16) # Freezing temperature of fresh water [K] + SAT_ADJUST_THRESHOLD = Float(1.0e-6) +elif CONST_VERSION == ConstantVersions.UFS: + RADIUS = Float(6.3712e6) # Radius of the Earth [m] + PI = Float(3.1415926535897931) + OMEGA = Float(7.2921e-5) # Rotation of the earth + GRAV = Float(9.80665) # Acceleration due to gravity [m/s^2].04 + RGRAV = Float(1.0 / GRAV) # Inverse of gravitational acceleration + RDGAS = Float(287.05) # Gas constant for dry air [J/kg/deg] ~287.04 + RVGAS = Float(461.50) # Gas constant for water vapor [J/kg/deg] + HLV = Float(2.5e6) # Latent heat of evaporation [J/kg] + HLF = Float(3.3358e5) # Latent heat of fusion [J/kg] ~3.34e5 + CP_AIR = Float(1004.6) KAPPA = RDGAS / CP_AIR # Specific heat capacity of dry air at - TFREEZE = 273.15 - SAT_ADJUST_THRESHOLD = 1.0e-8 + TFREEZE = Float(273.15) # Freezing temperature of fresh water [K] + SAT_ADJUST_THRESHOLD = Float(1.0e-8) elif 
CONST_VERSION == ConstantVersions.GFDL: - RADIUS = 6371.0e3 # Radius of the Earth [m] #6371.0e3 - PI = 3.14159265358979323846 # 3.14159265358979323846 - OMEGA = 7.292e-5 # Rotation of the earth # 7.292e-5 - GRAV = 9.80 # Acceleration due to gravity [m/s^2].04 - RGRAV = 1.0 / GRAV # Inverse of gravitational acceleration - RDGAS = 287.04 # Gas constant for dry air [J/kg/deg] # 287.04 - RVGAS = 461.50 # Gas constant for water vapor [J/kg/deg] - HLV = 2.500e6 # Latent heat of evaporation [J/kg] - HLF = 3.34e5 # Latent heat of fusion [J/kg] # 3.34e5 - KAPPA = 2.0 / 7.0 + RADIUS = Float(6371.0e3) # Radius of the Earth [m] #6371.0e3 + PI = Float(3.14159265358979323846) # 3.14159265358979323846 + OMEGA = Float(7.292e-5) # Rotation of the earth # 7.292e-5 + GRAV = Float(9.80) # Acceleration due to gravity [m/s^2].04 + RGRAV = Float(1.0) / GRAV # Inverse of gravitational acceleration + RDGAS = Float(287.04) # Gas constant for dry air [J/kg/deg] ~287.04 + RVGAS = Float(461.50) # Gas constant for water vapor [J/kg/deg] + HLV = Float(2.500e6) # Latent heat of evaporation [J/kg] + HLF = Float(3.34e5) # Latent heat of fusion [J/kg] ~3.34e5 + KAPPA = Float(2.0) / Float(7.0) CP_AIR = RDGAS / KAPPA # Specific heat capacity of dry air at - TFREEZE = 273.16 # Freezing temperature of fresh water [K] - SAT_ADJUST_THRESHOLD = 1.0e-8 + TFREEZE = Float(273.16) # Freezing temperature of fresh water [K] + SAT_ADJUST_THRESHOLD = Float(1.0e-8) else: raise RuntimeError("Constant selector failed, bad code.") -DZ_MIN = 2.0 +SECONDS_PER_DAY = Float(86400.0) +DZ_MIN = Float(2.0) CV_AIR = CP_AIR - RDGAS # Heat capacity of dry air at constant volume RDG = -RDGAS / GRAV -CNST_0P20 = 0.2 +CNST_0P20 = Float(0.2) K1K = RDGAS / CV_AIR -CNST_0P20 = 0.2 -CV_VAP = 3.0 * RVGAS # Heat capacity of water vapor at constant volume -ZVIR = RVGAS / RDGAS - 1 # con_fvirt in Fortran physics -C_ICE = 1972.0 # Heat capacity of ice at -15 degrees Celsius -C_ICE_0 = 2106.0 # Heat capacity of ice at 0 degrees Celsius -C_LIQ = 4.1855e3 # Heat capacity of water at 15 degrees Celsius -CP_VAP = 4.0 * RVGAS # Heat capacity of water vapor at constant pressure -TICE = 273.16 # Freezing temperature +CNST_0P20 = Float(0.2) +CV_VAP = Float(3.0) * RVGAS # Heat capacity of water vapor at constant volume +ZVIR = RVGAS / RDGAS - Float(1) # con_fvirt in Fortran physics +C_ICE = Float(1972.0) # Heat capacity of ice at -15 degrees Celsius +C_ICE_0 = Float(2106.0) # Heat capacity of ice at 0 degrees Celsius +C_LIQ = Float(4.1855e3) # Heat capacity of water at 15 degrees Celsius +CP_VAP = Float(4.0) * RVGAS # Heat capacity of water vapor at constant pressure +TICE = Float(273.16) # Freezing temperature DC_ICE = C_LIQ - C_ICE # Isobaric heating / cooling DC_VAP = CP_VAP - C_LIQ # Isobaric heating / cooling D2ICE = DC_VAP + DC_ICE # Isobaric heating / cooling LI0 = HLF - DC_ICE * TICE EPS = RDGAS / RVGAS -EPSM1 = EPS - 1.0 +EPSM1 = EPS - Float(1.0) LV0 = ( HLV - DC_VAP * TICE ) # 3.13905782e6, evaporation latent heat coefficient at 0 degrees Kelvin @@ -147,10 +154,10 @@ class ConstantVersions(Enum): LI2 = ( LV0 + LI00 ) # 2.86799816e6, sublimation latent heat coefficient at 0 degrees Kelvin -E00 = 611.21 # Saturation vapor pressure at 0 degrees Celsius (Pa) -PSAT = 610.78 # Saturation vapor pressure at H2O 3pt (Pa) -T_WFR = TICE - 40.0 # homogeneous freezing temperature -TICE0 = TICE - 0.01 -T_MIN = 178.0 # Minimum temperature to freeze-dry all water vapor -T_SAT_MIN = TICE - 160.0 -LAT2 = (HLV + HLF) ** 2 # used in bigg mechanism +E00 = Float(611.21) # Saturation 
vapor pressure at 0 degrees Celsius (Pa) +PSAT = Float(610.78) # Saturation vapor pressure at H2O 3pt (Pa) +T_WFR = TICE - Float(40.0) # homogeneous freezing temperature +TICE0 = TICE - Float(0.01) +T_MIN = Float(178.0) # Minimum temperature to freeze-dry all water vapor +T_SAT_MIN = TICE - Float(160.0) +LAT2 = np.power((HLV + HLF), 2, dtype=Float) # used in bigg mechanism diff --git a/ndsl/dsl/__init__.py b/ndsl/dsl/__init__.py index ed44420a..b7034e07 100644 --- a/ndsl/dsl/__init__.py +++ b/ndsl/dsl/__init__.py @@ -9,5 +9,3 @@ gt4py.cartesian.config.cache_settings["dir_name"] = os.environ.get( "GT_CACHE_DIR_NAME", f".gt_cache_{MPI.COMM_WORLD.Get_rank():06}" ) - -__version__ = "0.2.0" diff --git a/ndsl/dsl/dace/dace_config.py b/ndsl/dsl/dace/dace_config.py index 7f1c1477..5129dac8 100644 --- a/ndsl/dsl/dace/dace_config.py +++ b/ndsl/dsl/dace/dace_config.py @@ -65,7 +65,7 @@ def _determine_compiling_ranks( 6 7 8 3 4 5 0 1 2 - Using the partitionner we find mapping of the given layout + Using the partitioner we find mapping of the given layout to all of those. For example on 4x4 layout 12 13 14 15 8 9 10 11 @@ -217,7 +217,7 @@ def __init__( # Block size/thread count is defaulted to an average value for recent # hardware (Pascal and upward). The problem of setting an optimized # block/thread is both hardware and problem dependant. Fine tuners - # available in DaCe should be relied on for futher tuning of this value. + # available in DaCe should be relied on for further tuning of this value. dace.config.Config.set( "compiler", "cuda", "default_block_size", value="64,8,1" ) diff --git a/ndsl/dsl/dace/orchestration.py b/ndsl/dsl/dace/orchestration.py index b5417cec..767610c3 100644 --- a/ndsl/dsl/dace/orchestration.py +++ b/ndsl/dsl/dace/orchestration.py @@ -40,12 +40,12 @@ cp = None -def dace_inhibitor(func: Callable): +def dace_inhibitor(func: Callable) -> Callable: """Triggers callback generation wrapping `func` while doing DaCe parsing.""" return func -def _upload_to_device(host_data: List[Any]): +def _upload_to_device(host_data: List[Any]) -> None: """Make sure any ndarrays gets uploaded to the device This will raise an assertion if cupy is not installed. 
@@ -60,22 +60,11 @@ def _download_results_from_dace( config: DaceConfig, dace_result: Optional[List[Any]], args: List[Any] ): """Move all data from DaCe memory space to GT4Py""" - gt4py_results = None - if dace_result is not None: - if config.is_gpu_backend(): - gt4py_results = [ - gt4py.storage.from_array( - r, - backend=config.get_backend(), - ) - for r in dace_result - ] - else: - gt4py_results = [ - gt4py.storage.from_array(r, backend=config.get_backend()) - for r in dace_result - ] - return gt4py_results + if dace_result is None: + return None + + backend = config.get_backend() + return [gt4py.storage.from_array(result, backend=backend) for result in dace_result] def _to_gpu(sdfg: dace.SDFG): @@ -99,8 +88,8 @@ def _to_gpu(sdfg: dace.SDFG): else: arr.storage = dace.StorageType.GPU_Global - # All maps will be scedule on GPU - for mapentry, state in topmaps: + # All maps will be schedule on GPU + for mapentry, _state in topmaps: mapentry.schedule = dace.ScheduleType.GPU_Device # Deactivate OpenMP sections @@ -108,25 +97,30 @@ def _to_gpu(sdfg: dace.SDFG): sd.openmp_sections = False -def _simplify(sdfg: dace.SDFG, validate=True, verbose=False): +def _simplify( + sdfg: dace.SDFG, + *, + validate: bool = True, + validate_all: bool = False, + verbose: bool = False, +): """Override of sdfg.simplify to skip failing transformation per https://github.com/spcl/dace/issues/1328 """ return SimplifyPass( validate=validate, + validate_all=validate_all, verbose=verbose, skip=["ConstantPropagation"], ).apply_pass(sdfg, {}) def _build_sdfg( - daceprog: DaceProgram, sdfg: dace.SDFG, config: DaceConfig, args, kwargs + dace_program: DaceProgram, sdfg: dace.SDFG, config: DaceConfig, args, kwargs ): """Build the .so out of the SDFG on the top tile ranks only""" - if DEACTIVATE_DISTRIBUTED_DACE_COMPILE: - is_compiling = True - else: - is_compiling = config.do_compile + is_compiling = True if DEACTIVATE_DISTRIBUTED_DACE_COMPILE else config.do_compile + if is_compiling: # Make the transients array persistents if config.is_gpu_backend(): @@ -136,18 +130,18 @@ def _build_sdfg( # Upload args to device _upload_to_device(list(args) + list(kwargs.values())) else: - for sd, _aname, arr in sdfg.arrays_recursive(): + for _sd, _aname, arr in sdfg.arrays_recursive(): if arr.shape == (1,): arr.storage = DaceStorageType.Register make_transients_persistent(sdfg=sdfg, device=DaceDeviceType.CPU) # Build non-constants & non-transients from the sdfg_kwargs - sdfg_kwargs = daceprog._create_sdfg_args(sdfg, args, kwargs) - for k in daceprog.constant_args: + sdfg_kwargs = dace_program._create_sdfg_args(sdfg, args, kwargs) + for k in dace_program.constant_args: if k in sdfg_kwargs: del sdfg_kwargs[k] sdfg_kwargs = {k: v for k, v in sdfg_kwargs.items() if v is not None} - for k, tup in daceprog.resolver.closure_arrays.items(): + for k, tup in dace_program.resolver.closure_arrays.items(): if k in sdfg_kwargs and tup[1].transient: del sdfg_kwargs[k] @@ -204,13 +198,16 @@ def _build_sdfg( # On BuildAndRun: all ranks sync, then load the SDFG from # the expected path (made available by build). # We use a "FrozenCompiledSDFG" to minimize re-entry cost at call time + + mode = config.get_orchestrate() # DEV NOTE: we explicitly use MPI.COMM_WORLD here because it is # a true multi-machine sync, outside of our own communicator class. 
- if config.get_orchestrate() == DaCeOrchestration.Build: + if mode == DaCeOrchestration.Build: MPI.COMM_WORLD.Barrier() # Protect against early exist which kill SLURM jobs ndsl_log.info(f"{DaCeProgress.default_prefix(config)} Build only, exiting.") exit(0) - elif config.get_orchestrate() == DaCeOrchestration.BuildAndRun: + + if mode == DaCeOrchestration.BuildAndRun: if not is_compiling: ndsl_log.info( f"{DaCeProgress.default_prefix(config)} Rank is not compiling." @@ -219,52 +216,50 @@ def _build_sdfg( MPI.COMM_WORLD.Barrier() with DaCeProgress(config, "Loading"): - sdfg_path = get_sdfg_path(daceprog.name, config, override_run_only=True) - csdfg, _ = daceprog.load_precompiled_sdfg(sdfg_path, *args, **kwargs) - config.loaded_precompiled_SDFG[daceprog] = FrozenCompiledSDFG( - daceprog, csdfg, args, kwargs + sdfg_path = get_sdfg_path(dace_program.name, config, override_run_only=True) + compiledSDFG, _ = dace_program.load_precompiled_sdfg( + sdfg_path, *args, **kwargs + ) + config.loaded_precompiled_SDFG[dace_program] = FrozenCompiledSDFG( + dace_program, compiledSDFG, args, kwargs ) - return _call_sdfg(daceprog, sdfg, config, args, kwargs) + return _call_sdfg(dace_program, sdfg, config, args, kwargs) def _call_sdfg( - daceprog: DaceProgram, sdfg: dace.SDFG, config: DaceConfig, args, kwargs + dace_program: DaceProgram, sdfg: dace.SDFG, config: DaceConfig, args, kwargs ): """Dispatch the SDFG execution and/or build""" # Pre-compiled SDFG code path does away with any data checks and # cached the marshalling - leading to almost direct C call # DaceProgram performs argument transformation & checks for a cost ~200ms # of overhead - if daceprog in config.loaded_precompiled_SDFG: + if dace_program in config.loaded_precompiled_SDFG: with DaCeProgress(config, "Run"): if config.is_gpu_backend(): _upload_to_device(list(args) + list(kwargs.values())) - res = config.loaded_precompiled_SDFG[daceprog]() + res = config.loaded_precompiled_SDFG[dace_program]() res = _download_results_from_dace( config, res, list(args) + list(kwargs.values()) ) return res + + mode = config.get_orchestrate() + if mode in [DaCeOrchestration.Build, DaCeOrchestration.BuildAndRun]: + ndsl_log.info("Building DaCe orchestration") + return _build_sdfg(dace_program, sdfg, config, args, kwargs) + + if mode == DaCeOrchestration.Run: + # We should never hit this, it should be caught by the + # loaded_precompiled_SDFG check above + raise RuntimeError("Unexpected call - pre-compiled SDFG failed to load") else: - if ( - config.get_orchestrate() == DaCeOrchestration.Build - or config.get_orchestrate() == DaCeOrchestration.BuildAndRun - ): - ndsl_log.info("Building DaCe orchestration") - res = _build_sdfg(daceprog, sdfg, config, args, kwargs) - elif config.get_orchestrate() == DaCeOrchestration.Run: - # We should never hit this, it should be caught by the - # loaded_precompiled_SDFG check above - raise RuntimeError("Unexpected call - csdfg didn't get caught") - else: - raise NotImplementedError( - f"Mode {config.get_orchestrate()} unimplemented at call time" - ) - return res + raise NotImplementedError(f"Mode '{mode}' unimplemented at call time") def _parse_sdfg( - daceprog: DaceProgram, + dace_program: DaceProgram, config: DaceConfig, *args, **kwargs, @@ -273,45 +268,46 @@ def _parse_sdfg( Either parses, load a .sdfg or load .so (as a compiled sdfg) Attributes: - daceprog: the DaceProgram carrying reference to the original method/function + dace_program: the DaceProgram carrying reference to the original method/function config: the DaceConfig 
configuration for this execution """ # Check cache for already loaded SDFG - if daceprog in config.loaded_precompiled_SDFG: - return config.loaded_precompiled_SDFG[daceprog] + if dace_program in config.loaded_precompiled_SDFG: + return config.loaded_precompiled_SDFG[dace_program] # Build expected path - sdfg_path = get_sdfg_path(daceprog.name, config) + sdfg_path = get_sdfg_path(dace_program.name, config) if sdfg_path is None: - if DEACTIVATE_DISTRIBUTED_DACE_COMPILE: - is_compiling = True - else: - is_compiling = config.do_compile + is_compiling = ( + True if DEACTIVATE_DISTRIBUTED_DACE_COMPILE else config.do_compile + ) + if not is_compiling: # We can not parse the SDFG since we will load the proper # compiled SDFG from the compiling rank return None - with DaCeProgress(config, f"Parse code of {daceprog.name} to SDFG"): - sdfg = daceprog.to_sdfg( + + with DaCeProgress(config, f"Parse code of {dace_program.name} to SDFG"): + sdfg = dace_program.to_sdfg( *args, - **daceprog.__sdfg_closure__(), + **dace_program.__sdfg_closure__(), **kwargs, save=False, simplify=False, ) return sdfg - else: - if os.path.isfile(sdfg_path): - with DaCeProgress(config, "Load .sdfg"): - sdfg, _ = daceprog.load_sdfg(sdfg_path, *args, **kwargs) - return sdfg - else: - with DaCeProgress(config, "Load precompiled .sdfg (.so)"): - csdfg, _ = daceprog.load_precompiled_sdfg(sdfg_path, *args, **kwargs) - config.loaded_precompiled_SDFG[daceprog] = FrozenCompiledSDFG( - daceprog, csdfg, args, kwargs - ) - return csdfg + + if os.path.isfile(sdfg_path): + with DaCeProgress(config, "Load .sdfg"): + sdfg, _ = dace_program.load_sdfg(sdfg_path, *args, **kwargs) + return sdfg + + with DaCeProgress(config, "Load precompiled .sdfg (.so)"): + compiledSDFG, _ = dace_program.load_precompiled_sdfg(sdfg_path, *args, **kwargs) + config.loaded_precompiled_SDFG[dace_program] = FrozenCompiledSDFG( + dace_program, compiledSDFG, args, kwargs + ) + return compiledSDFG class _LazyComputepathFunction(SDFGConvertible): @@ -448,7 +444,7 @@ def orchestrate( ): """ Orchestrate a method of an object with DaCe. - The method object is patched in place, replacing the orignal Callable with + The method object is patched in place, replacing the original Callable with a wrapper that will trigger orchestration at call time. If the model configuration doesn't demand orchestration, this won't do anything. @@ -463,72 +459,71 @@ def orchestrate( dace_compiletime_args = [] if config.is_dace_orchestrated(): - if hasattr(obj, method_to_orchestrate): - func = type.__getattribute__(type(obj), method_to_orchestrate) - - # Flag argument as dace.constant - for argument in dace_compiletime_args: - func.__annotations__[argument] = DaceCompiletime - - # Build DaCe orchestrated wrapper - # This is a JIT object, e.g. DaCe compilation will happen on call - wrapped = _LazyComputepathMethod(func, config).__get__(obj) - - if method_to_orchestrate == "__call__": - # Grab the function from the type of the child class - # Dev note: we need to use type for dunder call because: - # a = A() - # a() - # resolved to: type(a).__call__(a) - # therefore patching the instance call (e.g a.__call__) is not enough. - # We could patch the type(self), ergo the class itself - # but that would patch _every_ instance of A. - # What we can do is patch the instance.__class__ with a local made class - # in order to keep each instance with it's own patch. - # - # Re: type:ignore - # Mypy is unhappy about dynamic class name and the devs (per github - # issues discussion) is to make a plugin. 
Too much work -> ignore mypy - - class _(type(obj)): # type: ignore - __qualname__ = f"{type(obj).__qualname__}_patched" - __name__ = f"{type(obj).__name__}_patched" - - def __call__(self, *arg, **kwarg): - return wrapped(*arg, **kwarg) - - def __sdfg__(self, *args, **kwargs): - return wrapped.__sdfg__(*args, **kwargs) - - def __sdfg_closure__(self, reevaluate=None): - return wrapped.__sdfg_closure__(reevaluate) - - def __sdfg_signature__(self): - return wrapped.__sdfg_signature__() - - def closure_resolver( - self, constant_args, given_args, parent_closure=None - ): - return wrapped.closure_resolver( - constant_args, given_args, parent_closure - ) - - # We keep the original class type name to not perturb - # the workflows that uses it to build relevant info (path, hash...) - previous_cls_name = type(obj).__name__ - obj.__class__ = _ - type(obj).__name__ = previous_cls_name - else: - # For regular attribute - we can just patch as usual - setattr(obj, method_to_orchestrate, wrapped) - - else: + if not hasattr(obj, method_to_orchestrate): raise RuntimeError( f"Could not orchestrate, " f"{type(obj).__name__}.{method_to_orchestrate} " "does not exists" ) + func = type.__getattribute__(type(obj), method_to_orchestrate) + + # Flag argument as dace.constant + for argument in dace_compiletime_args: + func.__annotations__[argument] = DaceCompiletime + + # Build DaCe orchestrated wrapper + # This is a JIT object, e.g. DaCe compilation will happen on call + wrapped = _LazyComputepathMethod(func, config).__get__(obj) + + if method_to_orchestrate == "__call__": + # Grab the function from the type of the child class + # Dev note: we need to use type for dunder call because: + # a = A() + # a() + # resolved to: type(a).__call__(a) + # therefore patching the instance call (e.g a.__call__) is not enough. + # We could patch the type(self), ergo the class itself + # but that would patch _every_ instance of A. + # What we can do is patch the instance.__class__ with a local made class + # in order to keep each instance with it's own patch. + # + # Re: type:ignore + # Mypy is unhappy about dynamic class name and the devs (per github + # issues discussion) is to make a plugin. Too much work -> ignore mypy + + class _(type(obj)): # type: ignore + __qualname__ = f"{type(obj).__qualname__}_patched" + __name__ = f"{type(obj).__name__}_patched" + + def __call__(self, *arg, **kwarg): + return wrapped(*arg, **kwarg) + + def __sdfg__(self, *args, **kwargs): + return wrapped.__sdfg__(*args, **kwargs) + + def __sdfg_closure__(self, reevaluate=None): + return wrapped.__sdfg_closure__(reevaluate) + + def __sdfg_signature__(self): + return wrapped.__sdfg_signature__() + + def closure_resolver( + self, constant_args, given_args, parent_closure=None + ): + return wrapped.closure_resolver( + constant_args, given_args, parent_closure + ) + + # We keep the original class type name to not perturb + # the workflows that uses it to build relevant info (path, hash...) 
+ previous_cls_name = type(obj).__name__ + obj.__class__ = _ + type(obj).__name__ = previous_cls_name + else: + # For regular attribute - we can just patch as usual + setattr(obj, method_to_orchestrate, wrapped) + def orchestrate_function( config: DaceConfig = None, @@ -553,9 +548,6 @@ def _wrapper(*args, **kwargs): func.__annotations__[argument] = DaceCompiletime return _LazyComputepathFunction(func, config) - if config.is_dace_orchestrated(): - return _wrapper(func) - else: - return func + return _wrapper(func) if config.is_dace_orchestrated() else func return _decorator diff --git a/ndsl/dsl/dace/utils.py b/ndsl/dsl/dace/utils.py index 29e3211d..05fa5754 100644 --- a/ndsl/dsl/dace/utils.py +++ b/ndsl/dsl/dace/utils.py @@ -15,26 +15,22 @@ from ndsl.optional_imports import cupy as cp -# ---------------------------------------------------------- -# Rough timer & log for major operations of DaCe build stack -# ---------------------------------------------------------- class DaCeProgress: - """Timer and log to track build progress""" + """Rough timer & log for major operations of DaCe build stack.""" - def __init__(self, config: DaceConfig, label: str): + def __init__(self, config: DaceConfig, label: str) -> None: self.prefix = DaCeProgress.default_prefix(config) - self.prefix = f"[{config.get_orchestrate()}]" self.label = label @classmethod def default_prefix(cls, config: DaceConfig) -> str: return f"[{config.get_orchestrate()}]" - def __enter__(self): + def __enter__(self) -> None: ndsl_log.debug(f"{self.prefix} {self.label}...") self.start = time.time() - def __exit__(self, _type, _val, _traceback): + def __exit__(self, _type, _val, _traceback) -> None: elapsed = time.time() - self.start ndsl_log.debug(f"{self.prefix} {self.label}...{elapsed}s.") @@ -81,7 +77,7 @@ def memory_static_analysis( """Analysis an SDFG for memory pressure. The results split memory by type (dace.StorageType) and account for - allocated, unreferenced and top lovel (e.g. top-most SDFG) memory + allocated, unreferenced and top level (e.g. 
top-most SDFG) memory """ # We report all allocation type allocations: Dict[dace.StorageType, StorageReport] = {} @@ -92,7 +88,7 @@ def memory_static_analysis( array_size_in_bytes = arr.total_size * arr.dtype.bytes ref = _is_ref(sd, aname) - # Transient in maps (refrence and not referenced) + # Transient in maps (reference and not referenced) if sd is not sdfg and arr.transient: if arr.pool: allocations[arr.storage].in_pooled_in_bytes += array_size_in_bytes @@ -111,7 +107,7 @@ def memory_static_analysis( else: allocations[arr.storage].unreferenced_in_bytes += array_size_in_bytes - # SDFG-level memory (refrence, not referenced and pooled) + # SDFG-level memory (reference, not referenced and pooled) elif sd is sdfg: if arr.pool: allocations[arr.storage].in_pooled_in_bytes += array_size_in_bytes @@ -137,7 +133,7 @@ def memory_static_analysis( def report_memory_static_analysis( sdfg: dace.sdfg.SDFG, allocations: Dict[dace.StorageType, StorageReport], - detail_report=False, + detail_report: bool = False, ) -> str: """Create a human readable report form the memory analysis results""" report = f"{sdfg.name}:\n" @@ -145,14 +141,14 @@ def report_memory_static_analysis( alloc_in_mb = float(allocs.referenced_in_bytes / (1024 * 1024)) unref_alloc_in_mb = float(allocs.unreferenced_in_bytes / (1024 * 1024)) in_pooled_in_mb = float(allocs.in_pooled_in_bytes / (1024 * 1024)) - toplvlalloc_in_mb = float(allocs.top_level_in_bytes / (1024 * 1024)) - if alloc_in_mb or toplvlalloc_in_mb > 0: + top_level_alloc_in_mb = float(allocs.top_level_in_bytes / (1024 * 1024)) + if alloc_in_mb or top_level_alloc_in_mb > 0: report += ( f"{storage}:\n" f" Alloc ref {alloc_in_mb:.2f} mb\n" f" Alloc unref {unref_alloc_in_mb:.2f} mb\n" f" Pooled {in_pooled_in_mb:.2f} mb\n" - f" Top lvl alloc: {toplvlalloc_in_mb:.2f}mb\n" + f" Top lvl alloc: {top_level_alloc_in_mb:.2f}mb\n" ) if detail_report: report += "\n" @@ -172,7 +168,9 @@ def report_memory_static_analysis( return report -def memory_static_analysis_from_path(sdfg_path: str, detail_report=False) -> str: +def memory_static_analysis_from_path( + sdfg_path: str, detail_report: bool = False +) -> str: """Open a SDFG and report the memory analysis""" sdfg = dace.SDFG.from_file(sdfg_path) return report_memory_static_analysis( @@ -183,53 +181,55 @@ def memory_static_analysis_from_path(sdfg_path: str, detail_report=False) -> str # ---------------------------------------------------------- -# Theoritical bandwith from SDFG +# Theoretical bandwidth from SDFG # ---------------------------------------------------------- -def copy_defn(q_in: FloatField, q_out: FloatField): +def copy_kernel(q_in: FloatField, q_out: FloatField) -> None: with computation(PARALLEL), interval(...): q_in = q_out -class MaxBandwithBenchmarkProgram: +class MaxBandwidthBenchmarkProgram: def __init__(self, size, backend) -> None: from ndsl.dsl.dace.orchestration import DaCeOrchestration, orchestrate - dconfig = DaceConfig(None, backend, orchestration=DaCeOrchestration.BuildAndRun) + dace_config = DaceConfig( + None, backend, orchestration=DaCeOrchestration.BuildAndRun + ) c = CompilationConfig(backend=backend) - s = StencilConfig(dace_config=dconfig, compilation_config=c) + s = StencilConfig(dace_config=dace_config, compilation_config=c) self.copy_stencil = FrozenStencil( - func=copy_defn, + func=copy_kernel, origin=(0, 0, 0), domain=size, stencil_config=s, ) - orchestrate(obj=self, config=dconfig) + orchestrate(obj=self, config=dace_config) - def __call__(self, A, B, n: int): + def __call__(self, A, B, n: int) 
-> None: for i in dace.nounroll(range(n)): self.copy_stencil(A, B) def kernel_theoretical_timing( sdfg: dace.sdfg.SDFG, - hardware_bw_in_GB_s=None, - backend=None, + hardware_bw_in_GB_s: Optional[float] = None, + backend: Optional[str] = None, ) -> Dict[str, float]: """Compute a lower timing bound for kernels with the following hypothesis: - Performance is memory bound, e.g. arithmetic intensity isn't counted - Hardware bandwidth comes from a GT4Py/DaCe test rather than a spec sheet for - for higher accuracy. Best is to run a copy_stencils on a full domain + for higher accuracy. Best is to run a copy_stencil on a full domain - Memory pressure is mostly in read/write from global memory, inner scalar & shared memory is not counted towards memory movement. """ - if not hardware_bw_in_GB_s: + if hardware_bw_in_GB_s is None: size = np.array(sdfg.arrays["__g_self__w"].shape) print( - f"Calculating experimental hardware bandwith on {size}" + f"Calculating experimental hardware bandwidth on {size}" f" arrays at {Float} precision..." ) - bench = MaxBandwithBenchmarkProgram(size, backend) + bench = MaxBandwidthBenchmarkProgram(size, backend) if backend == "dace:gpu": A = cp.ones(size, dtype=Float) B = cp.ones(size, dtype=Float) @@ -248,13 +248,19 @@ def kernel_theoretical_timing( bench(A, B, n) dt.append((time.time() - s) / n) memory_size_in_b = np.prod(size) * np.dtype(Float).itemsize * 8 - bandwidth_in_bytes_s = memory_size_in_b / np.median(dt) - print( - f"Hardware bandwith computed: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s" - ) - else: - bandwidth_in_bytes_s = hardware_bw_in_GB_s * 1024 * 1024 * 1024 - print(f"Given hardware bandwith: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s") + measured_bandwidth_in_bytes_s = memory_size_in_b / np.median(dt) + + bandwidth_in_bytes_s = ( + measured_bandwidth_in_bytes_s + if hardware_bw_in_GB_s is None + else hardware_bw_in_GB_s * 1024 * 1024 * 1024 + ) + label = ( + "Hardware bandwidth computed" + if hardware_bw_in_GB_s + else "Given hardware bandwidth" + ) + print(f"{label}: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s") allmaps = [ (me, state) @@ -307,12 +313,6 @@ def kernel_theoretical_timing( except TypeError: pass - # Bad expansion - if not isinstance(newresult_in_us, sympy.core.numbers.Float) and not isinstance( - newresult_in_us, float - ): - continue - result[node.label] = float(newresult_in_us) return result diff --git a/ndsl/dsl/gt4py_utils.py b/ndsl/dsl/gt4py_utils.py index f1facfe4..31c60ca7 100644 --- a/ndsl/dsl/gt4py_utils.py +++ b/ndsl/dsl/gt4py_utils.py @@ -53,11 +53,14 @@ def wrapper(*args, **kwargs) -> Any: def _mask_to_dimensions( mask: Tuple[bool, ...], shape: Sequence[int] ) -> List[Union[str, int]]: - assert len(mask) == 3 + assert len(mask) >= 3 dimensions: List[Union[str, int]] = [] for i, axis in enumerate(("I", "J", "K")): if mask[i]: dimensions.append(axis) + if len(mask) > 3: + for i in range(3, len(mask)): + dimensions.append(str(shape[i])) offset = int(sum(mask)) dimensions.extend(shape[offset:]) return dimensions @@ -65,7 +68,7 @@ def _mask_to_dimensions( def _translate_origin(origin: Sequence[int], mask: Tuple[bool, ...]) -> Sequence[int]: if len(origin) == int(sum(mask)): - # Correct length. Assumedd to be correctly specified. + # Correct length. Assumed to be correctly specified. 
return origin assert len(mask) == 3 @@ -154,6 +157,8 @@ def make_storage_data( data = _make_storage_data_2d( data, shape, start, dummy, axis, read_only, backend=backend ) + elif n_dims >= 4: + data = _make_storage_data_Nd(data, shape, start, backend=backend) else: data = _make_storage_data_3d(data, shape, start, backend=backend) @@ -257,6 +262,21 @@ def _make_storage_data_3d( return buffer +def _make_storage_data_Nd( + data: Field, + shape: Tuple[int, ...], + start: Tuple[int, ...] = None, + *, + backend: str, +) -> Field: + if start is None: + start = tuple([0] * data.ndim) + buffer = zeros(shape, backend=backend) + idx = tuple([slice(start[i], start[i] + data.shape[i]) for i in range(len(start))]) + buffer[idx] = asarray(data, type(buffer)) + return buffer + + def make_storage_from_shape( shape: Tuple[int, ...], origin: Tuple[int, ...] = origin, @@ -310,6 +330,7 @@ def make_storage_dict( axis: int = 2, *, backend: str, + dtype: DTypes = Float, ) -> Dict[str, "Field"]: assert names is not None, "for 4d variable storages, specify a list of names" if shape is None: @@ -324,6 +345,7 @@ def make_storage_dict( dummy=dummy, axis=axis, backend=backend, + dtype=dtype, ) return data_dict diff --git a/ndsl/dsl/stencil.py b/ndsl/dsl/stencil.py index 6a70b4d8..5e917e66 100644 --- a/ndsl/dsl/stencil.py +++ b/ndsl/dsl/stencil.py @@ -737,6 +737,9 @@ def axis_offsets( "local_js": gtscript.J[0] + self.jsc - origin[1], "j_end": j_end, "local_je": gtscript.J[-1] + self.jec - origin[1] - domain[1] + 1, + "k_start": origin[2] if len(origin) > 2 else 0, + "k_end": (origin[2] if len(origin) > 2 else 0) + + (domain[2] - 1 if len(domain) > 2 else 0), } def get_origin_domain( diff --git a/ndsl/dsl/typing.py b/ndsl/dsl/typing.py index 5eab76ef..b3fa72d8 100644 --- a/ndsl/dsl/typing.py +++ b/ndsl/dsl/typing.py @@ -46,7 +46,6 @@ def global_set_floating_point_precision(): NotImplementedError( f"{precision_in_bit} bit precision not implemented or tested" ) - return None # Default float and int types @@ -55,10 +54,20 @@ def global_set_floating_point_precision(): Bool = np.bool_ FloatField = Field[gtscript.IJK, Float] +FloatField64 = Field[gtscript.IJK, np.float64] +FloatField32 = Field[gtscript.IJK, np.float32] FloatFieldI = Field[gtscript.I, Float] +FloatFieldI64 = Field[gtscript.I, np.float64] +FloatFieldI32 = Field[gtscript.I, np.float32] FloatFieldJ = Field[gtscript.J, Float] +FloatFieldJ64 = Field[gtscript.J, np.float64] +FloatFieldJ32 = Field[gtscript.J, np.float32] FloatFieldIJ = Field[gtscript.IJ, Float] +FloatFieldIJ64 = Field[gtscript.IJ, np.float64] +FloatFieldIJ32 = Field[gtscript.IJ, np.float32] FloatFieldK = Field[gtscript.K, Float] +FloatFieldK64 = Field[gtscript.K, np.float64] +FloatFieldK32 = Field[gtscript.K, np.float32] IntField = Field[gtscript.IJK, Int] IntFieldIJ = Field[gtscript.IJ, Int] IntFieldK = Field[gtscript.K, Int] @@ -80,3 +89,14 @@ def cast_to_index3d(val: Tuple[int, ...]) -> Index3D: if len(val) != 3: raise ValueError(f"expected 3d index, received {val}") return cast(Index3D, val) + + +def is_float(dtype: type): + """Expected floating point type""" + return dtype in [ + Float, + float, + np.float16, + np.float32, + np.float64, + ] diff --git a/ndsl/grid/generation.py b/ndsl/grid/generation.py index 75437272..f77e2cd2 100644 --- a/ndsl/grid/generation.py +++ b/ndsl/grid/generation.py @@ -5,6 +5,7 @@ import numpy as np +from ndsl.comm.comm_abc import ReductionOperator from ndsl.comm.communicator import Communicator from ndsl.constants import ( N_HALO_DEFAULT, @@ -686,7 +687,7 @@ def 
ptop(self) -> Quantity: @property def ec1(self) -> Quantity: """ - cartesian components of the local unit vetcor + cartesian components of the local unit vector in the x-direction at the cell centers 3d array whose last dimension is length 3 and indicates cartesian x/y/z value """ @@ -697,8 +698,8 @@ def ec1(self) -> Quantity: @property def ec2(self) -> Quantity: """ - cartesian components of the local unit vetcor - in the y-direation at the cell centers + cartesian components of the local unit vector + in the y-direction at the cell centers 3d array whose last dimension is length 3 and indicates cartesian x/y/z value """ if self._ec2 is None: @@ -708,8 +709,8 @@ def ec2(self) -> Quantity: @property def ew1(self) -> Quantity: """ - cartesian components of the local unit vetcor - in the x-direation at the left/right cell edges + cartesian components of the local unit vector + in the x-direction at the left/right cell edges 3d array whose last dimension is length 3 and indicates cartesian x/y/z value """ if self._ew1 is None: @@ -719,8 +720,8 @@ def ew1(self) -> Quantity: @property def ew2(self) -> Quantity: """ - cartesian components of the local unit vetcor - in the y-direation at the left/right cell edges + cartesian components of the local unit vector + in the y-direction at the left/right cell edges 3d array whose last dimension is length 3 and indicates cartesian x/y/z value """ if self._ew2 is None: @@ -3428,7 +3429,11 @@ def _reduce_global_area_minmaxes(self): max_area = self._np.max(self.area.data[3:-4, 3:-4])[()] min_area_c = self._np.min(self.area_c.data[3:-4, 3:-4])[()] max_area_c = self._np.max(self.area_c.data[3:-4, 3:-4])[()] - self._da_min = float(self._comm.comm.allreduce(min_area, min)) - self._da_max = float(self._comm.comm.allreduce(max_area, max)) - self._da_min_c = float(self._comm.comm.allreduce(min_area_c, min)) - self._da_max_c = float(self._comm.comm.allreduce(max_area_c, max)) + self._da_min = float(self._comm.comm.allreduce(min_area, ReductionOperator.MIN)) + self._da_max = float(self._comm.comm.allreduce(max_area, ReductionOperator.MAX)) + self._da_min_c = float( + self._comm.comm.allreduce(min_area_c, ReductionOperator.MIN) + ) + self._da_max_c = float( + self._comm.comm.allreduce(max_area_c, ReductionOperator.MAX) + ) diff --git a/ndsl/grid/global_setup.py b/ndsl/grid/global_setup.py index a0237ec6..60bd3c3b 100644 --- a/ndsl/grid/global_setup.py +++ b/ndsl/grid/global_setup.py @@ -21,8 +21,8 @@ def gnomonic_grid(grid_type: int, lon, lat, np): args: grid_type: type of grid to apply - lon: longitute array with dimensions [x, y] - lat: latitude array with dimensionos [x, y] + lon: longitude array with dimensions [x, y] + lat: latitude array with dimensions [x, y] """ _check_shapes(lon, lat) if grid_type == 0: @@ -142,7 +142,7 @@ def global_mirror_grid( y1, grid_global[ng + npx - (i + 1), ng + npy - (j + 1), 1, nreg] ) - # force dateline/greenwich-meridion consistency + # force dateline/greenwich-meridian consistency if npx % 2 != 0: if i == (npx - 1) // 2: grid_global[ng + i, ng + j, 0, nreg] = 0.0 diff --git a/ndsl/grid/gnomonic.py b/ndsl/grid/gnomonic.py index 778d9064..0fc421ef 100644 --- a/ndsl/grid/gnomonic.py +++ b/ndsl/grid/gnomonic.py @@ -595,8 +595,8 @@ def get_rectangle_area(p1, p2, p3, p4, radius, np): counterclockwise order, return an array of spherical rectangle areas. NOTE, this is not the exact same order of operations as the Fortran code This results in some errors in the last digit, but the spherical_angle - is an exact match. 
The errors in the last digit multipled out by the radius - end up causing relative errors larger than 1e-14, but still wtihin 1e-12. + is an exact match. The errors in the last digit multiplied out by the radius + end up causing relative errors larger than 1e-14, but still within 1e-12. """ total_angle = spherical_angle(p2, p3, p1, np) for ( @@ -702,7 +702,7 @@ def spherical_cos(p_center, p2, p3, np): def get_unit_vector_direction(p1, p2, np): """ - Returms the unit vector pointing from a set of lonlat points p1 to lonlat points p2 + Returns the unit vector pointing from a set of lonlat points p1 to lonlat points p2 """ xyz1 = lon_lat_to_xyz(p1[:, :, 0], p1[:, :, 1], np) xyz2 = lon_lat_to_xyz(p2[:, :, 0], p2[:, :, 1], np) diff --git a/ndsl/grid/helper.py b/ndsl/grid/helper.py index 745dce37..6cb6a374 100644 --- a/ndsl/grid/helper.py +++ b/ndsl/grid/helper.py @@ -284,10 +284,20 @@ class AngleGridData: sin_sg2: Quantity sin_sg3: Quantity sin_sg4: Quantity + sin_sg5: Quantity + sin_sg6: Quantity + sin_sg7: Quantity + sin_sg8: Quantity + sin_sg9: Quantity cos_sg1: Quantity cos_sg2: Quantity cos_sg3: Quantity cos_sg4: Quantity + cos_sg5: Quantity + cos_sg6: Quantity + cos_sg7: Quantity + cos_sg8: Quantity + cos_sg9: Quantity @classmethod def new_from_metric_terms(cls, metric_terms: MetricTerms) -> "AngleGridData": @@ -296,10 +306,20 @@ def new_from_metric_terms(cls, metric_terms: MetricTerms) -> "AngleGridData": sin_sg2=metric_terms.sin_sg2, sin_sg3=metric_terms.sin_sg3, sin_sg4=metric_terms.sin_sg4, + sin_sg5=metric_terms.sin_sg5, + sin_sg6=metric_terms.sin_sg6, + sin_sg7=metric_terms.sin_sg7, + sin_sg8=metric_terms.sin_sg8, + sin_sg9=metric_terms.sin_sg9, cos_sg1=metric_terms.cos_sg1, cos_sg2=metric_terms.cos_sg2, cos_sg3=metric_terms.cos_sg3, cos_sg4=metric_terms.cos_sg4, + cos_sg5=metric_terms.cos_sg5, + cos_sg6=metric_terms.cos_sg6, + cos_sg7=metric_terms.cos_sg7, + cos_sg8=metric_terms.cos_sg8, + cos_sg9=metric_terms.cos_sg9, ) @@ -618,6 +638,26 @@ def sin_sg3(self): def sin_sg4(self): return self._angle_data.sin_sg4 + @property + def sin_sg5(self): + return self._angle_data.sin_sg5 + + @property + def sin_sg6(self): + return self._angle_data.sin_sg6 + + @property + def sin_sg7(self): + return self._angle_data.sin_sg7 + + @property + def sin_sg8(self): + return self._angle_data.sin_sg8 + + @property + def sin_sg9(self): + return self._angle_data.sin_sg9 + @property def cos_sg1(self): return self._angle_data.cos_sg1 @@ -634,6 +674,26 @@ def cos_sg3(self): def cos_sg4(self): return self._angle_data.cos_sg4 + @property + def cos_sg5(self): + return self._angle_data.cos_sg5 + + @property + def cos_sg6(self): + return self._angle_data.cos_sg6 + + @property + def cos_sg7(self): + return self._angle_data.cos_sg7 + + @property + def cos_sg8(self): + return self._angle_data.cos_sg8 + + @property + def cos_sg9(self): + return self._angle_data.cos_sg9 + @dataclasses.dataclass(frozen=True) class DriverGridData: diff --git a/ndsl/grid/mirror.py b/ndsl/grid/mirror.py index cf547bdf..3f3a4b6b 100644 --- a/ndsl/grid/mirror.py +++ b/ndsl/grid/mirror.py @@ -63,7 +63,7 @@ def mirror_grid( y1, mirror_data["local"][i, j, 1] ) - # force dateline/greenwich-meridion consistency + # force dateline/greenwich-meridian consistency if npx % 2 != 0: if x_center_tile and i == ng + i_mid: mirror_data["local"][i, j, 0] = 0.0 diff --git a/ndsl/grid/stretch_transformation.py b/ndsl/grid/stretch_transformation.py index 5604a110..9a481ab9 100644 --- a/ndsl/grid/stretch_transformation.py +++ 
b/ndsl/grid/stretch_transformation.py @@ -21,9 +21,9 @@ def direct_transform( """ The direct_transform subroutine from fv_grid_utils.F90. Takes in latitude and longitude in radians. - Shrinks tile 6 by stretch factor in area to increse resolution locally. + Shrinks tile 6 by stretch factor in area to increase resolution locally. Then performs translation of all tiles so that the now-smaller tile 6 is - centeres on lon_target, lat_target. + centered on lon_target, lat_target. Args: lon (in) in radians diff --git a/ndsl/halo/data_transformer.py b/ndsl/halo/data_transformer.py index cb50cc12..9f9ab2f6 100644 --- a/ndsl/halo/data_transformer.py +++ b/ndsl/halo/data_transformer.py @@ -239,7 +239,7 @@ def get( """Construct a module from a numpy-like module. Args: - np_module: numpy-like module to determin child transformer type. + np_module: numpy-like module to determine child transformer type. exchange_descriptors_x: list of memory information describing an exchange. Used for scalar data and the x-component of vectors. exchange_descriptors_y: list of memory information describing an exchange. diff --git a/ndsl/halo/updater.py b/ndsl/halo/updater.py index 665d0b95..76f7608f 100644 --- a/ndsl/halo/updater.py +++ b/ndsl/halo/updater.py @@ -38,7 +38,7 @@ class HaloUpdater: - update and start/wait trigger the halo exchange - the class creates a "pattern" of exchange that can fit any memory given to do/start - - temporary references to the Quanitites are held between start and wait + - temporary references to the Quantities are held between start and wait """ def __init__( @@ -106,7 +106,7 @@ def from_scalar_specifications( numpy_like_module: module implementing numpy API specifications: data specifications to exchange, including number of halo points - boundaries: informations on the exchange boundaries. + boundaries: information on the exchange boundaries. tag: network tag (to differentiate messaging) for this node. optional_timer: timing of operations. @@ -161,7 +161,7 @@ def from_vector_specifications( Length must match y specifications. specifications_y: specifications to exchange along the y axis. Length must match x specifications. - boundaries: informations on the exchange boundaries. + boundaries: information on the exchange boundaries. tag: network tag (to differentiate messaging) for this node. optional_timer: timing of operations. @@ -210,7 +210,7 @@ def update( quantities_x: List[Quantity], quantities_y: Optional[List[Quantity]] = None, ): - """Exhange the data and blocks until finished.""" + """Exchange the data and blocks until finished.""" self.start(quantities_x, quantities_y) self.wait() @@ -283,7 +283,7 @@ def wait(self): for recv_req in self._recv_requests: recv_req.wait() - # Unpack buffers (updated by MPI with neighbouring halos) + # Unpack buffers (updated by MPI with neighboring halos) # to proper quantities with self._timer.clock("unpack"): for buffer in self._transformers.values(): diff --git a/ndsl/io.py b/ndsl/io.py index 9d9f9149..b07248bb 100644 --- a/ndsl/io.py +++ b/ndsl/io.py @@ -44,7 +44,7 @@ def write_state(state: dict, filename: str) -> None: def _extract_time(value: xr.DataArray) -> cftime.datetime: - """Exctract time value from read-in state.""" + """Extract time value from read-in state.""" if value.ndim > 0: raise ValueError( "State must be representative of a single scalar time. " f"Got {value}." 
diff --git a/ndsl/namelist.py b/ndsl/namelist.py index 205954ca..304d9160 100644 --- a/ndsl/namelist.py +++ b/ndsl/namelist.py @@ -42,7 +42,7 @@ class NamelistDefaults: qi_gen = 1.82e-6 # max cloud ice generation during remapping step qi_lim = 1.0 # cloud ice limiter to prevent large ice build up qi0_max = 1.0e-4 # max cloud ice value (by other sources) - rad_snow = True # consider snow in cloud fraciton calculation + rad_snow = True # consider snow in cloud fraction calculation rad_rain = True # consider rain in cloud fraction calculation rad_graupel = True # consider graupel in cloud fraction calculation tintqs = False # use temperature in the saturation mixing in PDF @@ -110,7 +110,7 @@ class NamelistDefaults: z_slope_liq = True # Use linear mono slope for autoconversions tice = 273.16 # set tice = 165. to turn off ice - phase phys (kessler emulator) alin = 842.0 # "a" in lin1983 - clin = 4.8 # "c" in lin 1983, 4.8 -- > 6. (to ehance ql -- > qs) + clin = 4.8 # "c" in lin 1983, 4.8 -- > 6. (to enhance ql -- > qs) isatmedmf = 0 # which version of satmedmfvdif to use dspheat = False # flag for tke dissipative heating xkzm_h = 1.0 # background vertical diffusion for heat q over ocean @@ -123,10 +123,10 @@ class NamelistDefaults: xkzm_lim = 0.01 # background vertical diffusion limit xkzminv = 0.15 # diffusivity in inversion layers xkgdx = 25.0e3 # background vertical diffusion threshold - rlmn = 30.0 # lower-limter on asymtotic mixing length in satmedmfdiff - rlmx = 300.0 # upper-limter on asymtotic mixing length in satmedmfdiff + rlmn = 30.0 # lower-limiter on asymtotic mixing length in satmedmfdiff + rlmx = 300.0 # upper-limiter on asymtotic mixing length in satmedmfdiff do_dk_hb19 = False # flag for using hb19 background diff formula in satmedmfdiff - cap_k0_land = False # flag for applying limter on background diff in inversion layer over land in satmedmfdiff + cap_k0_land = False # flag for applying limiter on background diff in inversion layer over land in satmedmfdiff @classmethod def as_dict(cls): diff --git a/ndsl/quantity/__init__.py b/ndsl/quantity/__init__.py new file mode 100644 index 00000000..fce0cf63 --- /dev/null +++ b/ndsl/quantity/__init__.py @@ -0,0 +1,9 @@ +from ndsl.quantity.metadata import QuantityHaloSpec, QuantityMetadata +from ndsl.quantity.quantity import Quantity + + +__all__ = [ + "Quantity", + "QuantityMetadata", + "QuantityHaloSpec", +] diff --git a/ndsl/quantity/bounds.py b/ndsl/quantity/bounds.py new file mode 100644 index 00000000..419cff22 --- /dev/null +++ b/ndsl/quantity/bounds.py @@ -0,0 +1,190 @@ +from typing import Sequence, Tuple, Union + +import numpy as np + +import ndsl.constants as constants +from ndsl.comm._boundary_utils import bound_default_slice, shift_boundary_slice_tuple + + +class BoundaryArrayView: + def __init__(self, data, boundary_type, dims, origin, extent): + self._data = data + self._boundary_type = boundary_type + self._dims = dims + self._origin = origin + self._extent = extent + + def __getitem__(self, index): + if len(self._origin) == 0: + if isinstance(index, tuple) and len(index) > 0: + raise IndexError("more than one index given for a zero-dimension array") + elif isinstance(index, slice) and index != slice(None, None, None): + raise IndexError("cannot slice a zero-dimension array") + else: + return self._data # array[()] does not return an ndarray + else: + return self._data[self._get_array_index(index)] + + def __setitem__(self, index, value): + self._data[self._get_array_index(index)] = value + + def _get_array_index(self, 
index): + if isinstance(index, list): + index = tuple(index) + if not isinstance(index, tuple): + index = (index,) + if len(index) > len(self._dims): + raise IndexError( + f"{len(index)} is too many indices for a " + f"{len(self._dims)}-dimensional quantity" + ) + if len(index) < len(self._dims): + index = index + (slice(None, None),) * (len(self._dims) - len(index)) + return shift_boundary_slice_tuple( + self._dims, self._origin, self._extent, self._boundary_type, index + ) + + def sel(self, **kwargs: Union[slice, int]) -> np.ndarray: + """Convenience method to perform indexing using dimension names + without knowing dimension order. + + Args: + **kwargs: slice/index to retrieve for a given dimension name + + Returns: + view_selection: an ndarray-like selection of the given indices + on `self.view` + """ + return self[tuple(kwargs.get(dim, slice(None, None)) for dim in self._dims)] + + +class BoundedArrayView: + """ + A container of objects which provide indexing relative to corners and edges + of the computational domain for convenience. + + Default start and end indices for all dimensions are modified to be the + start and end of the compute domain. When using edge and corner attributes, it is + recommended to explicitly write start and end offsets to avoid confusion. + + Indexing on the object itself (view[:]) is offset by the origin, and default + start and end indices are modified to be the start and end of the compute domain. + + For corner attributes e.g. `northwest`, modified indexing is done for the two + axes according to the edges which make up the corner. In other words, indexing + is offset relative to the intersection of the two edges which make the corner. + + For `interior`, start indices of the horizontal dimensions are relative to the + origin, and end indices are relative to the origin + extent. For example, + view.interior[0:0, 0:0, :] would retrieve the entire compute domain for an x/y/z + array, while view.interior[-1:1, -1:1, :] would also include one halo point. 
+ """ + + def __init__( + self, array, dims: Sequence[str], origin: Sequence[int], extent: Sequence[int] + ): + self._data = array + self._dims = tuple(dims) + self._origin = tuple(origin) + self._extent = tuple(extent) + self._northwest = BoundaryArrayView( + array, constants.NORTHWEST, dims, origin, extent + ) + self._northeast = BoundaryArrayView( + array, constants.NORTHEAST, dims, origin, extent + ) + self._southwest = BoundaryArrayView( + array, constants.SOUTHWEST, dims, origin, extent + ) + self._southeast = BoundaryArrayView( + array, constants.SOUTHEAST, dims, origin, extent + ) + self._interior = BoundaryArrayView( + array, constants.INTERIOR, dims, origin, extent + ) + + @property + def origin(self) -> Tuple[int, ...]: + """the start of the computational domain""" + return self._origin + + @property + def extent(self) -> Tuple[int, ...]: + """the shape of the computational domain""" + return self._extent + + def __getitem__(self, index): + if len(self.origin) == 0: + if isinstance(index, tuple) and len(index) > 0: + raise IndexError("more than one index given for a zero-dimension array") + elif isinstance(index, slice) and index != slice(None, None, None): + raise IndexError("cannot slice a zero-dimension array") + else: + return self._data # array[()] does not return an ndarray + else: + return self._data[self._get_compute_index(index)] + + def __setitem__(self, index, value): + self._data[self._get_compute_index(index)] = value + + def _get_compute_index(self, index): + if not isinstance(index, (tuple, list)): + index = (index,) + if len(index) > len(self._dims): + raise IndexError( + f"{len(index)} is too many indices for a " + f"{len(self._dims)}-dimensional quantity" + ) + index = _fill_index(index, len(self._data.shape)) + shifted_index = [] + for entry, origin, extent in zip(index, self.origin, self.extent): + if isinstance(entry, slice): + shifted_slice = _shift_slice(entry, origin, extent) + shifted_index.append( + bound_default_slice(shifted_slice, origin, origin + extent) + ) + elif entry is None: + shifted_index.append(entry) + else: + shifted_index.append(entry + origin) + return tuple(shifted_index) + + @property + def northwest(self) -> BoundaryArrayView: + return self._northwest + + @property + def northeast(self) -> BoundaryArrayView: + return self._northeast + + @property + def southwest(self) -> BoundaryArrayView: + return self._southwest + + @property + def southeast(self) -> BoundaryArrayView: + return self._southeast + + @property + def interior(self) -> BoundaryArrayView: + return self._interior + + +def _fill_index(index, length): + return tuple(index) + (slice(None, None, None),) * (length - len(index)) + + +def _shift_slice(slice_in, shift, extent): + start = _shift_index(slice_in.start, shift, extent) + stop = _shift_index(slice_in.stop, shift, extent) + return slice(start, stop, slice_in.step) + + +def _shift_index(current_value, shift, extent): + if current_value is None: + new_value = None + else: + new_value = current_value + shift + if new_value < 0: + new_value = extent + new_value + return new_value diff --git a/ndsl/quantity/metadata.py b/ndsl/quantity/metadata.py new file mode 100644 index 00000000..d7ddba0f --- /dev/null +++ b/ndsl/quantity/metadata.py @@ -0,0 +1,70 @@ +import dataclasses +from typing import Any, Dict, Tuple, Union + +import numpy as np + +from ndsl.optional_imports import cupy +from ndsl.types import NumpyModule + + +if cupy is None: + import numpy as cupy + + +@dataclasses.dataclass +class QuantityMetadata: + origin: 
Tuple[int, ...] + "the start of the computational domain" + extent: Tuple[int, ...] + "the shape of the computational domain" + dims: Tuple[str, ...] + "names of each dimension" + units: str + "units of the quantity" + data_type: type + "ndarray-like type used to store the data" + dtype: type + "dtype of the data in the ndarray-like object" + gt4py_backend: Union[str, None] = None + "backend to use for gt4py storages" + + @property + def dim_lengths(self) -> Dict[str, int]: + """mapping of dimension names to their lengths""" + return dict(zip(self.dims, self.extent)) + + @property + def np(self) -> NumpyModule: + """numpy-like module used to interact with the data""" + if issubclass(self.data_type, cupy.ndarray): + return cupy + elif issubclass(self.data_type, np.ndarray): + return np + else: + raise TypeError( + f"quantity underlying data is of unexpected type {self.data_type}" + ) + + def duplicate_metadata(self, metadata_copy): + metadata_copy.origin = self.origin + metadata_copy.extent = self.extent + metadata_copy.dims = self.dims + metadata_copy.units = self.units + metadata_copy.data_type = self.data_type + metadata_copy.dtype = self.dtype + metadata_copy.gt4py_backend = self.gt4py_backend + + +@dataclasses.dataclass +class QuantityHaloSpec: + """Describe the memory to be exchanged, including size of the halo.""" + + n_points: int + strides: Tuple[int] + itemsize: int + shape: Tuple[int] + origin: Tuple[int, ...] + extent: Tuple[int, ...] + dims: Tuple[str, ...] + numpy_module: NumpyModule + dtype: Any diff --git a/ndsl/quantity.py b/ndsl/quantity/quantity.py similarity index 59% rename from ndsl/quantity.py rename to ndsl/quantity/quantity.py index b95a9aad..c88ba140 100644 --- a/ndsl/quantity.py +++ b/ndsl/quantity/quantity.py @@ -1,274 +1,21 @@ -import dataclasses import warnings -from typing import Any, Dict, Iterable, Optional, Sequence, Tuple, Union, cast +from typing import Any, Iterable, Optional, Sequence, Tuple, Union, cast import matplotlib.pyplot as plt import numpy as np import ndsl.constants as constants -from ndsl.comm._boundary_utils import bound_default_slice, shift_boundary_slice_tuple -from ndsl.dsl.typing import Float +from ndsl.dsl.typing import Float, is_float from ndsl.optional_imports import cupy, dace, gt4py from ndsl.optional_imports import xarray as xr +from ndsl.quantity.bounds import BoundedArrayView +from ndsl.quantity.metadata import QuantityHaloSpec, QuantityMetadata from ndsl.types import NumpyModule if cupy is None: import numpy as cupy -__all__ = ["Quantity", "QuantityMetadata"] - - -@dataclasses.dataclass -class QuantityMetadata: - origin: Tuple[int, ...] - "the start of the computational domain" - extent: Tuple[int, ...] - "the shape of the computational domain" - dims: Tuple[str, ...] 
- "names of each dimension" - units: str - "units of the quantity" - data_type: type - "ndarray-like type used to store the data" - dtype: type - "dtype of the data in the ndarray-like object" - gt4py_backend: Union[str, None] = None - "backend to use for gt4py storages" - - @property - def dim_lengths(self) -> Dict[str, int]: - """mapping of dimension names to their lengths""" - return dict(zip(self.dims, self.extent)) - - @property - def np(self) -> NumpyModule: - """numpy-like module used to interact with the data""" - if issubclass(self.data_type, cupy.ndarray): - return cupy - elif issubclass(self.data_type, np.ndarray): - return np - else: - raise TypeError( - f"quantity underlying data is of unexpected type {self.data_type}" - ) - - -@dataclasses.dataclass -class QuantityHaloSpec: - """Describe the memory to be exchanged, including size of the halo.""" - - n_points: int - strides: Tuple[int] - itemsize: int - shape: Tuple[int] - origin: Tuple[int, ...] - extent: Tuple[int, ...] - dims: Tuple[str, ...] - numpy_module: NumpyModule - dtype: Any - - -class BoundaryArrayView: - def __init__(self, data, boundary_type, dims, origin, extent): - self._data = data - self._boundary_type = boundary_type - self._dims = dims - self._origin = origin - self._extent = extent - - def __getitem__(self, index): - if len(self._origin) == 0: - if isinstance(index, tuple) and len(index) > 0: - raise IndexError("more than one index given for a zero-dimension array") - elif isinstance(index, slice) and index != slice(None, None, None): - raise IndexError("cannot slice a zero-dimension array") - else: - return self._data # array[()] does not return an ndarray - else: - return self._data[self._get_array_index(index)] - - def __setitem__(self, index, value): - self._data[self._get_array_index(index)] = value - - def _get_array_index(self, index): - if isinstance(index, list): - index = tuple(index) - if not isinstance(index, tuple): - index = (index,) - if len(index) > len(self._dims): - raise IndexError( - f"{len(index)} is too many indices for a " - f"{len(self._dims)}-dimensional quantity" - ) - if len(index) < len(self._dims): - index = index + (slice(None, None),) * (len(self._dims) - len(index)) - return shift_boundary_slice_tuple( - self._dims, self._origin, self._extent, self._boundary_type, index - ) - - def sel(self, **kwargs: Union[slice, int]) -> np.ndarray: - """Convenience method to perform indexing using dimension names - without knowing dimension order. - - Args: - **kwargs: slice/index to retrieve for a given dimension name - - Returns: - view_selection: an ndarray-like selection of the given indices - on `self.view` - """ - return self[tuple(kwargs.get(dim, slice(None, None)) for dim in self._dims)] - - -class BoundedArrayView: - """ - A container of objects which provide indexing relative to corners and edges - of the computational domain for convenience. - - Default start and end indices for all dimensions are modified to be the - start and end of the compute domain. When using edge and corner attributes, it is - recommended to explicitly write start and end offsets to avoid confusion. - - Indexing on the object itself (view[:]) is offset by the origin, and default - start and end indices are modified to be the start and end of the compute domain. - - For corner attributes e.g. `northwest`, modified indexing is done for the two - axes according to the edges which make up the corner. In other words, indexing - is offset relative to the intersection of the two edges which make the corner. 
- - For `interior`, start indices of the horizontal dimensions are relative to the - origin, and end indices are relative to the origin + extent. For example, - view.interior[0:0, 0:0, :] would retrieve the entire compute domain for an x/y/z - array, while view.interior[-1:1, -1:1, :] would also include one halo point. - """ - - def __init__( - self, array, dims: Sequence[str], origin: Sequence[int], extent: Sequence[int] - ): - self._data = array - self._dims = tuple(dims) - self._origin = tuple(origin) - self._extent = tuple(extent) - self._northwest = BoundaryArrayView( - array, constants.NORTHWEST, dims, origin, extent - ) - self._northeast = BoundaryArrayView( - array, constants.NORTHEAST, dims, origin, extent - ) - self._southwest = BoundaryArrayView( - array, constants.SOUTHWEST, dims, origin, extent - ) - self._southeast = BoundaryArrayView( - array, constants.SOUTHEAST, dims, origin, extent - ) - self._interior = BoundaryArrayView( - array, constants.INTERIOR, dims, origin, extent - ) - - @property - def origin(self) -> Tuple[int, ...]: - """the start of the computational domain""" - return self._origin - - @property - def extent(self) -> Tuple[int, ...]: - """the shape of the computational domain""" - return self._extent - - def __getitem__(self, index): - if len(self.origin) == 0: - if isinstance(index, tuple) and len(index) > 0: - raise IndexError("more than one index given for a zero-dimension array") - elif isinstance(index, slice) and index != slice(None, None, None): - raise IndexError("cannot slice a zero-dimension array") - else: - return self._data # array[()] does not return an ndarray - else: - return self._data[self._get_compute_index(index)] - - def __setitem__(self, index, value): - self._data[self._get_compute_index(index)] = value - - def _get_compute_index(self, index): - if not isinstance(index, (tuple, list)): - index = (index,) - if len(index) > len(self._dims): - raise IndexError( - f"{len(index)} is too many indices for a " - f"{len(self._dims)}-dimensional quantity" - ) - index = fill_index(index, len(self._data.shape)) - shifted_index = [] - for entry, origin, extent in zip(index, self.origin, self.extent): - if isinstance(entry, slice): - shifted_slice = shift_slice(entry, origin, extent) - shifted_index.append( - bound_default_slice(shifted_slice, origin, origin + extent) - ) - elif entry is None: - shifted_index.append(entry) - else: - shifted_index.append(entry + origin) - return tuple(shifted_index) - - @property - def northwest(self) -> BoundaryArrayView: - return self._northwest - - @property - def northeast(self) -> BoundaryArrayView: - return self._northeast - - @property - def southwest(self) -> BoundaryArrayView: - return self._southwest - - @property - def southeast(self) -> BoundaryArrayView: - return self._southeast - - @property - def interior(self) -> BoundaryArrayView: - return self._interior - - -def ensure_int_tuple(arg, arg_name): - return_list = [] - for item in arg: - try: - return_list.append(int(item)) - except ValueError: - raise TypeError( - f"tuple arg {arg_name}={arg} contains item {item} of " - f"unexpected type {type(item)}" - ) - return tuple(return_list) - - -def _validate_quantity_property_lengths(shape, dims, origin, extent): - n_dims = len(shape) - for var, desc in ( - (dims, "dimension names"), - (origin, "origins"), - (extent, "extents"), - ): - if len(var) != n_dims: - raise ValueError( - f"received {len(var)} {desc} for {n_dims} dimensions: {var}" - ) - - -def _is_float(dtype): - """Expected floating point type for 
Pace""" - return ( - dtype == Float - or dtype == float - or dtype == np.float32 - or dtype == np.float64 - or dtype == np.float16 - ) - class Quantity: """ @@ -302,7 +49,7 @@ def __init__( if ( not allow_mismatch_float_precision - and _is_float(data.dtype) + and is_float(data.dtype) and data.dtype != Float ): raise ValueError( @@ -362,8 +109,8 @@ def __init__( _validate_quantity_property_lengths(data.shape, dims, origin, extent) self._metadata = QuantityMetadata( - origin=ensure_int_tuple(origin, "origin"), - extent=ensure_int_tuple(extent, "extent"), + origin=_ensure_int_tuple(origin, "origin"), + extent=_ensure_int_tuple(extent, "extent"), dims=tuple(dims), units=units, data_type=type(self._data), @@ -492,6 +239,11 @@ def data(self) -> Union[np.ndarray, cupy.ndarray]: """the underlying array of data""" return self._data + @data.setter + def data(self, inputData): + if type(inputData) in [np.ndarray, cupy.ndarray]: + self._data = inputData + @property def origin(self) -> Tuple[int, ...]: """the start of the computational domain""" @@ -584,10 +336,10 @@ def transpose( transpose_order = [self.dims.index(dim) for dim in target_dims] transposed = Quantity( self.np.transpose(self.data, transpose_order), # type: ignore[attr-defined] - dims=transpose_sequence(self.dims, transpose_order), + dims=_transpose_sequence(self.dims, transpose_order), units=self.units, - origin=transpose_sequence(self.origin, transpose_order), - extent=transpose_sequence(self.extent, transpose_order), + origin=_transpose_sequence(self.origin, transpose_order), + extent=_transpose_sequence(self.extent, transpose_order), gt4py_backend=self.gt4py_backend, allow_mismatch_float_precision=allow_mismatch_float_precision, ) @@ -611,7 +363,7 @@ def plot_k_level(self, k_index=0): plt.show() -def transpose_sequence(sequence, order): +def _transpose_sequence(sequence, order): return sequence.__class__(sequence[i] for i in order) @@ -641,21 +393,27 @@ def _collapse_dims(target_dims, dims): return return_list -def fill_index(index, length): - return tuple(index) + (slice(None, None, None),) * (length - len(index)) - - -def shift_slice(slice_in, shift, extent): - start = shift_index(slice_in.start, shift, extent) - stop = shift_index(slice_in.stop, shift, extent) - return slice(start, stop, slice_in.step) +def _validate_quantity_property_lengths(shape, dims, origin, extent): + n_dims = len(shape) + for var, desc in ( + (dims, "dimension names"), + (origin, "origins"), + (extent, "extents"), + ): + if len(var) != n_dims: + raise ValueError( + f"received {len(var)} {desc} for {n_dims} dimensions: {var}" + ) -def shift_index(current_value, shift, extent): - if current_value is None: - new_value = None - else: - new_value = current_value + shift - if new_value < 0: - new_value = extent + new_value - return new_value +def _ensure_int_tuple(arg, arg_name): + return_list = [] + for item in arg: + try: + return_list.append(int(item)) + except ValueError: + raise TypeError( + f"tuple arg {arg_name}={arg} contains item {item} of " + f"unexpected type {type(item)}" + ) + return tuple(return_list) diff --git a/ndsl/stencils/__init__.py b/ndsl/stencils/__init__.py index cbbd3f82..5115a28e 100644 --- a/ndsl/stencils/__init__.py +++ b/ndsl/stencils/__init__.py @@ -1,4 +1 @@ from .corners import CopyCorners, CopyCornersXY, FillCornersBGrid - - -__version__ = "0.2.0" diff --git a/ndsl/stencils/c2l_ord.py b/ndsl/stencils/c2l_ord.py index 67f2b5a1..39194232 100644 --- a/ndsl/stencils/c2l_ord.py +++ b/ndsl/stencils/c2l_ord.py @@ -227,7 +227,7 @@ def 
__init__( ) # TODO: - # To break the depedency to pyFV3 we allow ourselves to not have a type + # To break the dependency to pyFV3 we allow ourselves to not have a type # hint around state and we check for u and v to make sure we don't # have bad input. # This entire code should be retired when WrappedHaloUpdater is no longer diff --git a/ndsl/stencils/corners.py b/ndsl/stencils/corners.py index 5eb7767a..18c1f6c1 100644 --- a/ndsl/stencils/corners.py +++ b/ndsl/stencils/corners.py @@ -54,7 +54,7 @@ def __init__(self, direction: str, stencil_factory: StencilFactory) -> None: def __call__(self, field: FloatField): """ Fills cell quantity field using corners from itself and multipliers - in the dirction specified initialization of the instance of this class. + in the direction specified initialization of the instance of this class. """ self._copy_corners(field, field) diff --git a/ndsl/stencils/testing/README.md b/ndsl/stencils/testing/README.md index f8b0f04f..098dd489 100644 --- a/ndsl/stencils/testing/README.md +++ b/ndsl/stencils/testing/README.md @@ -6,7 +6,7 @@ First, make sure you have followed the instruction in the top level [README](../ The unit and regression tests of pace require data generated from the Fortran reference implementation which has to be downloaded from a Google Cloud Platform storage bucket. Since the bucket is setup as "requester pays", you need a valid GCP account to download the test data. -First, make sure you have configured the authentication with user credientials and configured Docker with the following commands: +First, make sure you have configured the authentication with user credentials and configured Docker with the following commands: ```shell gcloud auth login @@ -22,11 +22,11 @@ cd $(git rev-parse --show-toplevel)/physics make get_test_data ``` -If you do not have a GCP account, there is an option to download basic test data from a public FTP server and you can skip the GCP authentication step above. To download test data from the FTP server, use `make USE_FTP=yes get_test_data` instead and this will avoid fetching from a GCP storage bucket. You will need a valid in stallation of the `lftp` command. +If you do not have a GCP account, there is an option to download basic test data from a public FTP server and you can skip the GCP authentication step above. To download test data from the FTP server, use `make USE_FTP=yes get_test_data` instead and this will avoid fetching from a GCP storage bucket. You will need a valid installation of the `lftp` command. ## Running the tests (manually) -There are two ways to run the tests, manually by explicitly invoking `pytest` or autmatically using make targets. The former can be used both inside the Docker container as well as for a bare-metal installation and will be described here. +There are two ways to run the tests, manually by explicitly invoking `pytest` or automatically using make targets. The former can be used both inside the Docker container as well as for a bare-metal installation and will be described here. First enter the container and navigate to the pace directory: @@ -40,7 +40,7 @@ Note that by entering the container with the `make dev` command, volumes for cod There are two sets of tests. The "sequential tests" test components which do not require MPI-parallelism. The "parallel tests" can only within an MPI environment. 
-To run the sequential and parallel tests for the dynmical core (fv3core), you can execute the following commands (these take a bit of time): +To run the sequential and parallel tests for the dynamical core (fv3core), you can execute the following commands (these take a bit of time): ```shell pytest -v -s --data_path=/pace/fv3core/test_data/8.1.1/c12_6ranks_standard/dycore/ ./fv3core/tests @@ -79,8 +79,8 @@ DEV=y make physics_savepoint_tests_mpi ## Test failure Test are running for each gridpoint of the domain, unless the Translate class for the test specifically restricts it. -Upon failure, the test will drop a `netCDF` faile in a `./.translate-errors` directory and named `translate-TestCase(-Rank).nc` containing input, computed output, reference and errors. +Upon failure, the test will drop a `netCDF` file in a `./.translate-errors` directory and named `translate-TestCase(-Rank).nc` containing input, computed output, reference and errors. ## Environment variables -- `PACE_TEST_N_THRESHOLD_SAMPLES`: Upon failure the system will try to pertub the output in an attempt to check for numerical instability. This means re-running the test for N samples. Default is `10`, `0` or less turns this feature off. +- `PACE_TEST_N_THRESHOLD_SAMPLES`: Upon failure the system will try to perturb the output in an attempt to check for numerical instability. This means re-running the test for N samples. Default is `10`, `0` or less turns this feature off. diff --git a/ndsl/stencils/testing/conftest.py b/ndsl/stencils/testing/conftest.py index af5bb6a6..2ed22fee 100644 --- a/ndsl/stencils/testing/conftest.py +++ b/ndsl/stencils/testing/conftest.py @@ -13,7 +13,7 @@ CubedSphereCommunicator, TileCommunicator, ) -from ndsl.comm.mpi import MPI +from ndsl.comm.mpi import MPI, MPIComm from ndsl.comm.partitioner import CubedSpherePartitioner, TilePartitioner from ndsl.dsl.dace.dace_config import DaceConfig from ndsl.namelist import Namelist @@ -47,6 +47,9 @@ def pytest_addoption(parser): parser.addoption( "--which_rank", action="store", help="Restrict test to a single rank" ) + parser.addoption( + "--which_savepoint", action="store", help="Restrict test to a single savepoint" + ) parser.addoption( "--data_path", action="store", @@ -204,6 +207,11 @@ def get_ranks(metafunc, layout): return [int(only_rank)] +def get_savepoint_restriction(metafunc): + svpt = metafunc.config.getoption("which_savepoint") + return int(svpt) if svpt else None + + def get_namelist(namelist_filename): return Namelist.from_f90nml(f90nml.read(namelist_filename)) @@ -226,11 +234,13 @@ def sequential_savepoint_cases(metafunc, data_path, namelist_filename, *, backen namelist = get_namelist(namelist_filename) stencil_config = get_config(backend, None) ranks = get_ranks(metafunc, namelist.layout) + savepoint_to_replay = get_savepoint_restriction(metafunc) grid_mode = metafunc.config.getoption("grid") topology_mode = metafunc.config.getoption("topology") return _savepoint_cases( savepoint_names, ranks, + savepoint_to_replay, stencil_config, namelist, backend, @@ -243,6 +253,7 @@ def sequential_savepoint_cases(metafunc, data_path, namelist_filename, *, backen def _savepoint_cases( savepoint_names, ranks, + savepoint_to_replay, stencil_config, namelist: Namelist, backend: str, @@ -289,7 +300,11 @@ def _savepoint_cases( n_calls = xr.open_dataset( os.path.join(data_path, f"{test_name}-In.nc") ).dims["savepoint"] - for i_call in range(n_calls): + if savepoint_to_replay is not None: + savepoint_iterator = range(savepoint_to_replay, savepoint_to_replay + 1) + 
else: + savepoint_iterator = range(n_calls) + for i_call in savepoint_iterator: return_list.append( SavepointCase( savepoint_name=test_name, @@ -308,7 +323,7 @@ def compute_grid_data(grid, namelist, backend, layout, topology_mode): npx=namelist.npx, npy=namelist.npy, npz=namelist.npz, - communicator=get_communicator(MPI.COMM_WORLD, layout, topology_mode), + communicator=get_communicator(MPIComm(), layout, topology_mode), backend=backend, ) @@ -322,9 +337,11 @@ def parallel_savepoint_cases( stencil_config = get_config(backend, communicator) savepoint_names = get_parallel_savepoint_names(metafunc, data_path) grid_mode = metafunc.config.getoption("grid") + savepoint_to_replay = get_savepoint_restriction(metafunc) return _savepoint_cases( savepoint_names, [mpi_rank], + savepoint_to_replay, stencil_config, namelist, backend, @@ -360,13 +377,12 @@ def generate_parallel_stencil_tests(metafunc, *, backend: str): metafunc.config ) # get MPI environment - comm = MPI.COMM_WORLD - mpi_rank = comm.Get_rank() + comm = MPIComm() savepoint_cases = parallel_savepoint_cases( metafunc, data_path, namelist_filename, - mpi_rank, + comm.Get_rank(), backend=backend, comm=comm, ) @@ -376,7 +392,7 @@ def generate_parallel_stencil_tests(metafunc, *, backend: str): def get_communicator(comm, layout, topology_mode): - if (MPI.COMM_WORLD.Get_size() > 1) and (topology_mode == "cubed-sphere"): + if (comm.Get_size() > 1) and (topology_mode == "cubed-sphere"): partitioner = CubedSpherePartitioner(TilePartitioner(layout)) communicator = CubedSphereCommunicator(comm, partitioner) else: diff --git a/ndsl/stencils/testing/grid.py b/ndsl/stencils/testing/grid.py index 273a0f3d..fbee10a5 100644 --- a/ndsl/stencils/testing/grid.py +++ b/ndsl/stencils/testing/grid.py @@ -734,6 +734,31 @@ def grid_data(self) -> "GridData": dims=GridDefinitions.sin_sg4.dims, units=GridDefinitions.sin_sg4.units, ), + sin_sg5=self.quantity_factory.from_array( + data=self.sin_sg5, + dims=GridDefinitions.sin_sg5.dims, + units=GridDefinitions.sin_sg5.units, + ), + sin_sg6=self.quantity_factory.from_array( + data=self.sin_sg6, + dims=GridDefinitions.sin_sg6.dims, + units=GridDefinitions.sin_sg6.units, + ), + sin_sg7=self.quantity_factory.from_array( + data=self.sin_sg7, + dims=GridDefinitions.sin_sg7.dims, + units=GridDefinitions.sin_sg7.units, + ), + sin_sg8=self.quantity_factory.from_array( + data=self.sin_sg8, + dims=GridDefinitions.sin_sg8.dims, + units=GridDefinitions.sin_sg8.units, + ), + sin_sg9=self.quantity_factory.from_array( + data=self.sin_sg9, + dims=GridDefinitions.sin_sg9.dims, + units=GridDefinitions.sin_sg9.units, + ), cos_sg1=self.quantity_factory.from_array( data=self.cos_sg1, dims=GridDefinitions.cos_sg1.dims, @@ -754,6 +779,31 @@ def grid_data(self) -> "GridData": dims=GridDefinitions.cos_sg4.dims, units=GridDefinitions.cos_sg4.units, ), + cos_sg5=self.quantity_factory.from_array( + data=self.cos_sg5, + dims=GridDefinitions.cos_sg5.dims, + units=GridDefinitions.cos_sg5.units, + ), + cos_sg6=self.quantity_factory.from_array( + data=self.cos_sg6, + dims=GridDefinitions.cos_sg6.dims, + units=GridDefinitions.cos_sg6.units, + ), + cos_sg7=self.quantity_factory.from_array( + data=self.cos_sg7, + dims=GridDefinitions.cos_sg7.dims, + units=GridDefinitions.cos_sg7.units, + ), + cos_sg8=self.quantity_factory.from_array( + data=self.cos_sg8, + dims=GridDefinitions.cos_sg8.dims, + units=GridDefinitions.cos_sg8.units, + ), + cos_sg9=self.quantity_factory.from_array( + data=self.cos_sg9, + dims=GridDefinitions.cos_sg9.dims, + 
units=GridDefinitions.cos_sg9.units, + ), ) self._grid_data = GridData( horizontal_data=horizontal, diff --git a/ndsl/stencils/testing/savepoint.py b/ndsl/stencils/testing/savepoint.py index 77d71917..7571befb 100644 --- a/ndsl/stencils/testing/savepoint.py +++ b/ndsl/stencils/testing/savepoint.py @@ -45,6 +45,15 @@ class SavepointCase: def __str__(self): return f"{self.savepoint_name}-rank={self.rank}-call={self.i_call}" + @property + def exists(self) -> bool: + return ( + xr.open_dataset( + os.path.join(self.data_dir, f"{self.savepoint_name}-In.nc") + ).sizes["rank"] + > self.rank + ) + @property def ds_in(self) -> xr.Dataset: return ( diff --git a/ndsl/stencils/testing/serialbox_to_netcdf.py b/ndsl/stencils/testing/serialbox_to_netcdf.py index a29139c5..46996dbe 100644 --- a/ndsl/stencils/testing/serialbox_to_netcdf.py +++ b/ndsl/stencils/testing/serialbox_to_netcdf.py @@ -62,7 +62,7 @@ def get_serializer(data_path: str, rank: int, data_name: Optional[str] = None): name = data_name else: name = f"Generator_rank{rank}" - return serialbox.Serializer(serialbox.OpenModeKind.Read, data_path, name) + return serialbox.Serializer(serialbox.OpenModeKind.Read, data_path, name) # type: ignore def main( @@ -81,22 +81,13 @@ def main( if namelist_filename_out != namelist_filename_in: shutil.copyfile(os.path.join(data_path, "input.nml"), namelist_filename_out) namelist = f90nml.read(namelist_filename_out) - if namelist["fv_core_nml"]["grid_type"] <= 3: - total_ranks = ( - 6 - * namelist["fv_core_nml"]["layout"][0] - * namelist["fv_core_nml"]["layout"][1] - ) + fv_core_nml: Dict[str, Any] = namelist["fv_core_nml"] # type: ignore + if fv_core_nml["grid_type"] <= 3: + total_ranks = 6 * fv_core_nml["layout"][0] * fv_core_nml["layout"][1] else: - total_ranks = ( - namelist["fv_core_nml"]["layout"][0] * namelist["fv_core_nml"]["layout"][1] - ) - nx = int( - (namelist["fv_core_nml"]["npx"] - 1) / (namelist["fv_core_nml"]["layout"][0]) - ) - ny = int( - (namelist["fv_core_nml"]["npy"] - 1) / (namelist["fv_core_nml"]["layout"][1]) - ) + total_ranks = fv_core_nml["layout"][0] * fv_core_nml["layout"][1] + nx = int((fv_core_nml["npx"] - 1) / (fv_core_nml["layout"][0])) + ny = int((fv_core_nml["npy"] - 1) / (fv_core_nml["layout"][1])) # all ranks have the same names, just look at first one serializer_0 = get_serializer(data_path, rank=0, data_name=data_name) @@ -109,6 +100,7 @@ def main( serializer_0.get_savepoint(savepoint_name)[0] ) ) + print(f"Exporting {savepoint_name}") serializer_list = [] for rank in range(total_ranks): serializer = get_serializer(data_path, rank, data_name) @@ -149,7 +141,27 @@ def main( if n_savepoints > 0: encoding = {} for varname in set(names_list).difference(["rank"]): + # Check that all ranks have the same size. If not, aggregate and + # feedback on one rank + collapse_all_ranks = False data_shape = list(rank_list[0][varname][0].shape) + print(f" Exporting {varname} - {data_shape}") + for rank in range(total_ranks): + this_shape = list(rank_list[rank][varname][0].shape) + if data_shape != this_shape: + if len(data_shape) != 1: + raise ValueError( + "Arrays have different dimensions. " + f"E.g. rank 0 is {data_shape} " + f"and rank {rank} is {this_shape} " + ) + else: + print( + f"... different shape for {varname} across ranks, collapsing in on rank." 
+ ) + collapse_all_ranks = True + break + if savepoint_name in [ "FVDynamics-In", "FVDynamics-Out", @@ -173,22 +185,53 @@ def main( data_vars[varname] = get_data( data_shape, total_ranks, n_savepoints, rank_list, varname ) + elif collapse_all_ranks: + data_vars[varname] = get_data_collapse_all_ranks( + total_ranks, n_savepoints, rank_list, varname + ) else: data_vars[varname] = get_data( data_shape, total_ranks, n_savepoints, rank_list, varname ) if len(data_shape) > 2: encoding[varname] = {"zlib": True, "complevel": 1} + dataset = xr.Dataset(data_vars=data_vars) dataset.to_netcdf( os.path.join(output_path, f"{savepoint_name}.nc"), encoding=encoding ) +def get_data_collapse_all_ranks(total_ranks, n_savepoints, output_list, varname): + if total_ranks <= 0: + return xr.DataArray([], dims=[]) + # Build array shape - we assume there is only 1 axis + K_shape = 0 + for rank in range(total_ranks): + assert len(output_list[rank][varname][0].shape) == 1 + K_shape = K_shape + output_list[rank][varname][0].shape[0] + + array = np.full( + [n_savepoints, 1] + [K_shape], + fill_value=np.nan, + dtype=output_list[0][varname][0].dtype, + ) + data = xr.DataArray(array, dims=["savepoint", "rank", f"dim_{varname}"]) + last_size = 0 + for rank in range(total_ranks): + for i_savepoint in range(n_savepoints): + rank_data = output_list[rank][varname][i_savepoint] + rank_data_size = rank_data.shape[0] + data[i_savepoint, 0, last_size : last_size + rank_data_size] = rank_data[:] + last_size += rank_data_size + + return data + + def get_data(data_shape, total_ranks, n_savepoints, output_list, varname): - # Read in dtype if total_ranks <= 0: - return + return xr.DataArray([], dims=[]) + # Read in dtype varname_dtype = output_list[0][varname][0].dtype # Build data array array = np.full( diff --git a/ndsl/stencils/testing/test_translate.py b/ndsl/stencils/testing/test_translate.py index db8e6047..70480c16 100644 --- a/ndsl/stencils/testing/test_translate.py +++ b/ndsl/stencils/testing/test_translate.py @@ -8,21 +8,21 @@ import ndsl.dsl.gt4py_utils as gt_utils from ndsl.comm.communicator import CubedSphereCommunicator, TileCommunicator -from ndsl.comm.mpi import MPI +from ndsl.comm.mpi import MPI, MPIComm from ndsl.comm.partitioner import CubedSpherePartitioner, TilePartitioner from ndsl.dsl.dace.dace_config import DaceConfig from ndsl.dsl.stencil import CompilationConfig, StencilConfig from ndsl.quantity import Quantity from ndsl.restart._legacy_restart import RESTART_PROPERTIES from ndsl.stencils.testing.savepoint import SavepointCase, dataset_to_dict -from ndsl.testing.comparison import LegacyMetric, MultiModalFloatMetric +from ndsl.testing.comparison import BaseMetric, LegacyMetric, MultiModalFloatMetric from ndsl.testing.perturbation import perturb # this only matters for manually-added print statements np.set_printoptions(threshold=4096) -OUTDIR = "./.translate-errors" +OUTDIR = "./.translate-outputs" GPU_MAX_ERR = 1e-10 GPU_NEAR_ZERO = 1e-15 @@ -75,6 +75,16 @@ def process_override(threshold_overrides, testobj, test_name, backend): raise TypeError( "ignore_near_zero_errors is either a list or a dict" ) + if "multimodal" in match: + parsed_multimodal = match["multimodal"] + if "absolute_eps" in parsed_multimodal: + testobj.mmr_absolute_eps = float(parsed_multimodal["absolute_eps"]) + if "relative_fraction" in parsed_multimodal: + testobj.mmr_relative_fraction = float( + parsed_multimodal["relative_fraction"] + ) + if "ulp_threshold" in parsed_multimodal: + testobj.mmr_ulp =
float(parsed_multimodal["ulp_threshold"]) if "skip_test" in match: testobj.skip_test = bool(match["skip_test"]) elif len(matches) > 1: @@ -167,6 +177,8 @@ def test_sequential_savepoint( ) if case.testobj.skip_test: return + if not case.exists: + pytest.skip(f"Data at rank {case.rank} does not exists") input_data = dataset_to_dict(case.ds_in) input_names = ( case.testobj.serialnames(case.testobj.in_vars["data_vars"]) @@ -185,9 +197,15 @@ def test_sequential_savepoint( passing_names: List[str] = [] all_ref_data = dataset_to_dict(case.ds_out) ref_data_out = {} + results = {} + + # Assign metrics and report on terminal any failures for varname in case.testobj.serialnames(case.testobj.out_vars): ignore_near_zero = case.testobj.ignore_near_zero_errors.get(varname, False) - ref_data = all_ref_data[varname] + try: + ref_data = all_ref_data[varname] + except KeyError: + raise KeyError(f'Output "{varname}" couldn\'t be found in output data') if hasattr(case.testobj, "subset_output"): ref_data = case.testobj.subset_output(varname, ref_data) with subtests.test(varname=varname): @@ -197,9 +215,9 @@ def test_sequential_savepoint( metric = MultiModalFloatMetric( reference_values=ref_data, computed_values=output_data, - eps=case.testobj.max_error, - ignore_near_zero_errors=ignore_near_zero, - near_zero=case.testobj.near_zero, + absolute_eps_override=case.testobj.mmr_absolute_eps, + relative_fraction_override=case.testobj.mmr_relative_fraction, + ulp_override=case.testobj.mmr_ulp, ) else: metric = LegacyMetric( @@ -209,16 +227,14 @@ def test_sequential_savepoint( ignore_near_zero_errors=ignore_near_zero, near_zero=case.testobj.near_zero, ) + results[varname] = metric if not metric.check: - os.makedirs(OUTDIR, exist_ok=True) - log_filename = os.path.join( - OUTDIR, - f"details-{case.savepoint_name}-{varname}-rank{case.rank}.log", - ) - metric.report(log_filename) pytest.fail(str(metric), pytrace=False) passing_names.append(failing_names.pop()) ref_data_out[varname] = [ref_data] + + # Reporting & data save + _report_results(case.savepoint_name, results) if len(failing_names) > 0: get_thresholds(case.testobj, input_data=original_input_data) os.makedirs(OUTDIR, exist_ok=True) @@ -288,18 +304,19 @@ def test_parallel_savepoint( multimodal_metric, xy_indices=True, ): - if MPI.COMM_WORLD.Get_size() % 6 != 0: + mpi_comm = MPIComm() + if mpi_comm.Get_size() % 6 != 0: layout = ( - int(MPI.COMM_WORLD.Get_size() ** 0.5), - int(MPI.COMM_WORLD.Get_size() ** 0.5), + int(mpi_comm.Get_size() ** 0.5), + int(mpi_comm.Get_size() ** 0.5), ) - communicator = get_tile_communicator(MPI.COMM_WORLD, layout) + communicator = get_tile_communicator(mpi_comm, layout) else: layout = ( - int((MPI.COMM_WORLD.Get_size() // 6) ** 0.5), - int((MPI.COMM_WORLD.Get_size() // 6) ** 0.5), + int((mpi_comm.Get_size() // 6) ** 0.5), + int((mpi_comm.Get_size() // 6) ** 0.5), ) - communicator = get_communicator(MPI.COMM_WORLD, layout) + communicator = get_communicator(mpi_comm, layout) if case.testobj is None: pytest.xfail( f"no translate object available for savepoint {case.savepoint_name}" @@ -323,6 +340,8 @@ def test_parallel_savepoint( return if (grid == "compute") and not case.testobj.compute_grid_option: pytest.xfail(f"Grid compute option not used for test {case.savepoint_name}") + if not case.exists: + pytest.skip(f"Data at rank {case.rank} does not exists") input_data = dataset_to_dict(case.ds_in) # run python version of functionality output = case.testobj.compute_parallel(input_data, communicator) @@ -332,6 +351,9 @@ def 
test_parallel_savepoint( passing_names = [] ref_data: Dict[str, Any] = {} all_ref_data = dataset_to_dict(case.ds_out) + results = {} + + # Assign metrics and report on terminal any failures for varname in out_vars: ref_data[varname] = [] new_ref_data = all_ref_data[varname] @@ -358,14 +380,13 @@ def test_parallel_savepoint( ignore_near_zero_errors=ignore_near_zero, near_zero=case.testobj.near_zero, ) + results[varname] = metric if not metric.check: - os.makedirs(OUTDIR, exist_ok=True) - log_filename = os.path.join( - OUTDIR, f"details-{case.savepoint_name}-{varname}.log" - ) - metric.report(log_filename) pytest.fail(str(metric), pytrace=False) passing_names.append(failing_names.pop()) + + # Reporting & data save + _report_results(case.savepoint_name, results) if len(failing_names) > 0: os.makedirs(OUTDIR, exist_ok=True) nct_filename = os.path.join( @@ -393,6 +414,20 @@ def test_parallel_savepoint( pytest.fail("No tests passed") +def _report_results(savepoint_name: str, results: Dict[str, BaseMetric]) -> None: + os.makedirs(OUTDIR, exist_ok=True) + + # Summary + with open(f"{OUTDIR}/summary-{savepoint_name}.log", "w") as f: + for varname, metric in results.items(): + f.write(f"{varname}: {metric.one_line_report()}\n") + + # Detailed log + for varname, metric in results.items(): + log_filename = os.path.join(OUTDIR, f"details-{savepoint_name}-{varname}.log") + metric.report(log_filename) + + def save_netcdf( testobj, # first list over rank, second list over savepoint diff --git a/ndsl/stencils/testing/translate.py b/ndsl/stencils/testing/translate.py index 3d9d20f9..e3fc8845 100644 --- a/ndsl/stencils/testing/translate.py +++ b/ndsl/stencils/testing/translate.py @@ -5,7 +5,7 @@ import ndsl.dsl.gt4py_utils as utils from ndsl.dsl.stencil import StencilFactory -from ndsl.dsl.typing import Field, Float # noqa: F401 +from ndsl.dsl.typing import Field, Float, Int # noqa: F401 from ndsl.quantity import Quantity from ndsl.stencils.testing.grid import Grid # type: ignore @@ -32,7 +32,7 @@ def pad_field_in_j(field, nj: int, backend: str): def as_numpy( - value: Union[Dict[str, Any], Quantity, np.ndarray] + value: Union[Dict[str, Any], Quantity, np.ndarray], ) -> Union[np.ndarray, Dict[str, np.ndarray]]: def _convert(value: Union[Quantity, np.ndarray]) -> np.ndarray: if isinstance(value, Quantity): @@ -53,6 +53,9 @@ def _convert(value: Union[Quantity, np.ndarray]) -> np.ndarray: class TranslateFortranData2Py: max_error = 1e-14 near_zero = 1e-18 + mmr_absolute_eps = -1 + mmr_relative_fraction = -1 + mmr_ulp = -1 def __init__(self, grid, stencil_factory: StencilFactory, origin=utils.origin): self.origin = origin @@ -123,8 +126,12 @@ def make_storage_data( axis=axis, names=names_4d, backend=self.stencil_factory.backend, + dtype=array.dtype, ) else: + if len(array.shape) == 4: + start = (int(istart), int(jstart), int(kstart), 0) # type: ignore + use_shape.append(array.shape[-1]) return utils.make_storage_data( array, tuple(use_shape), @@ -134,6 +141,7 @@ def make_storage_data( axis=axis, read_only=read_only, backend=self.stencil_factory.backend, + dtype=array.dtype, ) def storage_vars(self): @@ -159,16 +167,16 @@ def collect_start_indices(self, datashape, varinfo): kstart = self.get_index_from_info(varinfo, "kstart", 0) return istart, jstart, kstart - def make_storage_data_input_vars(self, inputs, storage_vars=None): + def make_storage_data_input_vars(self, inputs, storage_vars=None, dict_4d=True): inputs_in = {**inputs} inputs_out = {} if storage_vars is None: storage_vars = self.storage_vars() for p in 
self.in_vars["parameters"]: - if type(inputs_in[p]) in [np.int64, np.int32]: + if type(inputs_in[p]) in [int, np.int64, np.int32]: inputs_out[p] = int(inputs_in[p]) elif type(inputs_in[p]) is bool: - inputs_out[p] == inputs_in[p] + inputs_out[p] = inputs_in[p] else: inputs_out[p] = Float(inputs_in[p]) for d, info in storage_vars.items(): @@ -185,7 +193,7 @@ def make_storage_data_input_vars(self, inputs, storage_vars=None): ) names_4d = None - if len(inputs_in[serialname].shape) == 4: + if (len(inputs_in[serialname].shape) == 4) and dict_4d: names_4d = info.get("names_4d", utils.tracer_variables) dummy_axes = info.get("dummy_axes", None) diff --git a/ndsl/testing/README.md b/ndsl/testing/README.md new file mode 100644 index 00000000..9154956c --- /dev/null +++ b/ndsl/testing/README.md @@ -0,0 +1,111 @@ +# Translate test + +## Summary + +NDSL exposes a "Translate" test system which allows automatic numerical regression testing against pre-defined sets of NetCDF files. + +To write a translate test, derive from `TranslateFortranData2Py`. The system works by matching the name of the Translate class to the name of the data files, e.g.: + +- if `TranslateNAME` is the name of the translate class +- then the data files should be named `NAME-In.nc` for the inputs and `NAME-Out.nc` for the outputs that will be checked. + +The tests run via the `pytest` harness and can be triggered with the `pytest` command. + +Options are: + +- --insert-assert-print: Print statements that would be substituted for insert_assert(), instead of writing to files +- --insert-assert-fail: Fail tests which include one or more insert_assert() calls +- --backend=BACKEND: Backend to execute the test with; only one can be given. +- --which_modules=WHICH_MODULES: Whitelist of modules to run. Only the part after Translate, e.g. in TranslateXYZ it'd be XYZ +- --skip_modules=SKIP_MODULES: Blacklist of modules to not run. Only the part after Translate, e.g. in TranslateXYZ it'd be XYZ +- --which_rank=WHICH_RANK: Restrict test to a single rank +- --data_path=DATA_PATH: Path of NetCDF inputs and outputs. Naming pattern needs to be XYZ-In and XYZ-Out for a test class named TranslateXYZ +- --threshold_overrides_file=THRESHOLD_OVERRIDES_FILE: Path to a yaml file overriding the default error thresholds with custom values. +- --print_failures: Print the failure details. Defaults to True. +- --failure_stride=FAILURE_STRIDE: How many indices of failures to print, from worst to best. Defaults to 1. +- --grid=GRID: Grid loading mode. "file" looks for "Grid-Info.nc", "compute" does the same but recomputes MetricTerms, "default" creates a simple grid with no metric terms. Defaults to "file". +- --topology=TOPOLOGY: Topology of the grid. "cubed-sphere" means a 6-faced grid, "doubly-periodic" means a single-tile grid. Defaults to "cubed-sphere". +- --multimodal_metric: Use the multi-modal float metric. Defaults to False. + +More `pytest` options are available via `pytest --help`. + +## Metrics + +There are three possible outcomes for a test in `pytest`: FAIL, PASS and XFAIL (expected fail). To reach PASS status, the reference output data contained in `NAME-Out.nc` is compared to the computed data via the `TranslateNAME` test. Because this system was developed to port Fortran numerics to many targets (mostly C, but also Python, and CPU/GPU), we can't rely on bit-for-bit comparison and have developed a couple of metrics. + +### Legacy metric + +The legacy metric was used throughout the development of the dynamical core and microphysics scheme at 64-bit precision.
It tries to capture differences over both large and small amplitude values with a single formula: $`|computed-reference|/reference`$, where `reference` has been purged of 0. +NaN values are considered no-pass. +To pass, the metric has to be lower than `1e-14`; any value lower than `1e-18` is considered a pass by default. The pass threshold can be overridden (see below). + +### Multi-modal metric + +Moving to mixed-precision code, the legacy metric didn't give enough flexibility to account for 32-bit precision errors that could accumulate. Another metric was built with the intent of breaking the one-size-fits-all concept and giving back flexibility. The metric is a combination of three differences: + +- _Absolute Difference_ ($`|computed-reference|`$): passes if below an absolute epsilon (defaults: `1e-13` for 64-bit data, `1e-10` for 32-bit data). +- _Relative Difference_ (absolute difference divided by the larger of the two values): passes if below a relative fraction (default `1e-6`, i.e. 0.0001%). +- _ULP Difference_ (absolute difference expressed in units in the last place): passes if at most `1.0` ULP. + +A point passes if computed and reference are both NaN, or if any of the three differences passes. + +## Overrides + +Default thresholds can be overridden per test via the yaml file passed with `--threshold_overrides_file`. The override yaml file should have one of the following formats: + +### One near zero value for all variables + +```Stencil_name: + - backend: <backend> + max_error: <value> + near_zero: <value> + ignore_near_zero_errors: + - <var1> + - <var2> + - ... +``` + +### Variable specific near zero value + +```Stencil_name: + - backend: <backend> + max_error: <value> + ignore_near_zero_errors: + <var1>: <value> + <var2>: <value> + ... +``` + +### [optional] Global near zero value for remaining fields + +```Stencil_name: + - backend: <backend> + max_error: <value> + ignore_near_zero_errors: + <var1>: <value> + <var2>: <value> + all_other_near_zero: <global_value> + ... +``` + +where fields other than `var1` and `var2` will use `global_value`. + +### Multimodal overrides + +```Stencil_name: + - backend: <backend> + multimodal: + absolute_eps: <value> + relative_fraction: <value> + ulp_threshold: <value> +``` diff --git a/ndsl/testing/comparison.py b/ndsl/testing/comparison.py index 9e2d1d59..5bce02dd 100644 --- a/ndsl/testing/comparison.py +++ b/ndsl/testing/comparison.py @@ -23,6 +23,9 @@ def __repr__(self) -> str: def report(self, file_path: Optional[str] = None) -> List[str]: ... + def one_line_report(self) -> str: + ... + class LegacyMetric(BaseMetric): """Legacy (AI2) metric used for original FV3 port.
@@ -65,7 +68,7 @@ def _compute_errors( self._calculated_metric = np.logical_xor(self.computed, self.references) else: raise TypeError( - f"recieved data with unexpected dtype {self.references.dtype}" + f"received data with unexpected dtype {self.references.dtype}" ) success = np.logical_or( np.logical_and(np.isnan(self.computed), np.isnan(self.references)), @@ -91,13 +94,16 @@ def _compute_errors( ) return success - def report(self, file_path: Optional[str] = None) -> List[str]: - report = [] + def one_line_report(self) -> str: if self.check: - report.append("✅ No numerical differences") + return "✅ No numerical differences" else: - report.append("❌ Numerical failures") + return "❌ Numerical failures" + def report(self, file_path: Optional[str] = None) -> List[str]: + report = [] + report.append(self.one_line_report()) + if not self.check: found_indices = np.logical_not(self.success).nonzero() computed_failures = self.computed[found_indices] reference_failures = self.references[found_indices] @@ -109,7 +115,7 @@ def report(self, file_path: Optional[str] = None) -> List[str]: abs_errs = [] details = [ "All failures:", - "Index Computed Reference Absloute E Metric E", + "Index Computed Reference Absolute E Metric E", ] for b in range(bad_indices_count): full_index = tuple([f[b] for f in found_indices]) @@ -166,6 +172,21 @@ def __repr__(self) -> str: return "\n".join(report) +class _Metric: + def __init__(self, value): + self._value: float = value + self.is_default = True + + @property + def value(self) -> float: + return self._value + + @value.setter + def value(self, _value: float): + self._value = _value + self.is_default = False + + class MultiModalFloatMetric(BaseMetric): """Combination of absolute, relative & ULP comparison for floats @@ -174,19 +195,21 @@ class MultiModalFloatMetric(BaseMetric): floating errors. 
ULP is used to clear noise (ULP<=1.0 passes) - Absolute errors for large amplitute + Absolute errors for large amplitude """ - _f32_absolute_eps = 1e-10 - _f64_absolute_eps = 1e-13 - relative_fraction = 0.000001 # 0.0001% - ulp_threshold = 1.0 + _f32_absolute_eps = _Metric(1e-10) + _f64_absolute_eps = _Metric(1e-13) + relative_fraction = _Metric(0.000001) # 0.0001% + ulp_threshold = _Metric(1.0) def __init__( self, reference_values: np.ndarray, computed_values: np.ndarray, - eps: float, + absolute_eps_override: float = -1, + relative_fraction_override: float = -1, + ulp_override: float = -1, **kwargs, ): super().__init__(reference_values, computed_values) @@ -197,10 +220,18 @@ def __init__( self.ulp_distance = np.empty_like(self.references) self.ulp_distance_metric = np.empty_like(self.references, dtype=np.bool_) - if self.references.dtype is (np.float32, np.int32): - self.absolute_eps = max(eps, self._f32_absolute_eps) + if self.references.dtype in [np.float32, np.int32]: + self.absolute_eps = _Metric(self._f32_absolute_eps.value) else: - self.absolute_eps = max(eps, self._f64_absolute_eps) + self.absolute_eps = _Metric(self._f64_absolute_eps.value) + + # Assign overrides if needed + if absolute_eps_override > self.absolute_eps.value: + self.absolute_eps.value = absolute_eps_override + if relative_fraction_override > self.relative_fraction.value: + self.relative_fraction.value = relative_fraction_override + if ulp_override > self.ulp_threshold.value: + self.ulp_threshold.value = ulp_override self.success = self._compute_all_metrics() self.check = np.all(self.success) @@ -214,44 +245,67 @@ def _compute_all_metrics( ) # Absolute distance self.absolute_distance = np.absolute(self.computed - self.references) - self.absolute_distance_metric = self.absolute_distance < self.absolute_eps + self.absolute_distance_metric = ( + self.absolute_distance < self.absolute_eps.value + ) # Relative distance (in pct) self.relative_distance = np.divide(self.absolute_distance, max_values) self.relative_distance_metric = ( - self.absolute_distance < self.relative_fraction * max_values + self.absolute_distance < self.relative_fraction.value * max_values ) # ULP distance self.ulp_distance = np.divide( self.absolute_distance, np.spacing(max_values) ) - self.ulp_distance_metric = self.ulp_distance <= self.ulp_threshold - - # Combine all distances into sucess or failure - # Success = no NANs & ( abs or rel or ulp ) - naninf_success = not np.logical_and( + self.ulp_distance_metric = self.ulp_distance <= self.ulp_threshold.value + + # Combine all distances into success or failure + # Success = + # - no unexpected NANs (e.g. 
NaN in the ref MUST BE in computation) OR + # - absolute distance pass OR + # - relative distance pass OR + # - ulp distance pass + naninf_success = np.logical_and( np.isnan(self.computed), np.isnan(self.references) - ).all() + ) metric_success = np.logical_or( self.relative_distance_metric, self.absolute_distance_metric ) metric_success = np.logical_or(metric_success, self.ulp_distance_metric) - success = np.logical_and(naninf_success, metric_success) + success = np.logical_or(naninf_success, metric_success) return success elif self.references.dtype in (np.bool_, bool): success = np.logical_xor(self.computed, self.references) return success else: raise TypeError( - f"recieved data with unexpected dtype {self.references.dtype}" + f"received data with unexpected dtype {self.references.dtype}" ) - def report(self, file_path: Optional[str] = None) -> List[str]: - report = [] - if self.check: - report.append("✅ No numerical differences") + def _has_override(self) -> bool: + return ( + not self.relative_fraction.is_default + or not self.absolute_eps.is_default + or not self.ulp_threshold.is_default + ) + + def one_line_report(self) -> str: + metric_thresholds = f"{'🔶 ' if not self.absolute_eps.is_default else '' }Absolute E(<{self.absolute_eps.value:.2e}) " + metric_thresholds += f"{'🔶 ' if not self.relative_fraction.is_default else '' }Relative E(<{self.relative_fraction.value * 100:.2e}%) " + metric_thresholds += f"{'🔶 ' if not self.ulp_threshold.is_default else '' }ULP E(<{self.ulp_threshold.value})" + if self.check and self._has_override(): + return f"🔶 No numerical differences with threshold override - metric: {metric_thresholds}" + elif self.check: + return f"✅ No numerical differences - metric: {metric_thresholds}" else: - report.append("❌ Numerical failures") + failed_indices = len(np.logical_not(self.success).nonzero()[0]) + all_indices = len(self.references.flatten()) + return f"❌ Numerical failures: {failed_indices}/{all_indices} failed - metric: {metric_thresholds}" + def report(self, file_path: Optional[str] = None) -> List[str]: + report = [] + report.append(self.one_line_report()) + if not self.check: found_indices = np.logical_not(self.success).nonzero() # List all errors to terminal and file bad_indices_count = len(found_indices[0]) @@ -260,18 +314,23 @@ def report(self, file_path: Optional[str] = None) -> List[str]: report = [ f"All failures ({bad_indices_count}/{full_count}) ({failures_pct}%),\n", f"Index Computed Reference " - f"Absolute E(<{self.absolute_eps:.2e}) " - f"Relative E(<{self.relative_fraction * 100:.2e}%) " - f"ULP E(<{self.ulp_threshold})", + f"{'🔶 ' if not self.absolute_eps.is_default else '' }Absolute E(<{self.absolute_eps.value:.2e}) " + f"{'🔶 ' if not self.relative_fraction.is_default else '' }Relative E(<{self.relative_fraction.value * 100:.2e}%) " + f"{'🔶 ' if not self.ulp_threshold.is_default else '' }ULP E(<{self.ulp_threshold.value})", ] # Summary and worst result for iBad in range(bad_indices_count): fi = tuple([f[iBad] for f in found_indices]) + ulp_dist = ( + self.ulp_distance[fi] + if np.isnan(self.ulp_distance[fi]) + else int(self.ulp_distance[fi]) + ) report.append( f"{str(fi)} {self.computed[fi]:.16e} {self.references[fi]:.16e} " f"{self.absolute_distance[fi]:.2e} {'✅' if self.absolute_distance_metric[fi] else '❌'} " f"{self.relative_distance[fi] * 100:.2e} {'✅' if self.relative_distance_metric[fi] else '❌'} " - f"{int(self.ulp_distance[fi]):02} {'✅' if self.ulp_distance_metric[fi] else '❌'} " + f"{ulp_dist:02} {'✅' if 
self.ulp_distance_metric[fi] else '❌'} " ) if file_path: diff --git a/ndsl/utils.py b/ndsl/utils.py index 0d22330c..36316680 100644 --- a/ndsl/utils.py +++ b/ndsl/utils.py @@ -90,7 +90,7 @@ def safe_mpi_allocate( """Make sure the allocation use an allocator that works with MPI For G2G transfer, MPICH requires the allocation to not be done - with managedmemory. Since we can't know what state `cupy` is in + with managed memory. Since we can't know what state `cupy` is in with switch for the default pooled allocator. If allocator comes from cupy, it must be cupy.empty or cupy.zeros. diff --git a/setup.py b/setup.py index 03dcbf0f..0b19d38e 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ def local_pkg(name: str, relative_path: str) -> str: "xarray", "f90nml>=1.1.0", "fsspec", - "netcdf4==1.7.0", + "netcdf4==1.7.1", "scipy", # restart capacities only "h5netcdf", # for xarray "dask", # for xarray @@ -57,7 +57,7 @@ def local_pkg(name: str, relative_path: str) -> str: packages=find_namespace_packages(include=["ndsl", "ndsl.*"]), include_package_data=True, url="https://github.com/NOAA-GFDL/NDSL", - version="2024.09.00", + version="2025.01.00", zip_safe=False, entry_points={ "console_scripts": [ diff --git a/tests/dsl/test_caches.py b/tests/dsl/test_caches.py index 893fb89d..768238e2 100644 --- a/tests/dsl/test_caches.py +++ b/tests/dsl/test_caches.py @@ -60,7 +60,7 @@ def _build_stencil(backend, orchestrated: DaCeOrchestration): return built_stencil, grid_indexing, stencil_config -class OrchestratedProgam: +class OrchestratedProgram: def __init__(self, backend, orchestration): self.stencil, grid_indexing, stencil_config = _build_stencil( backend, orchestration @@ -92,7 +92,7 @@ def test_relocatability_orchestration(backend): working_dir = str(os.getcwd()) # Compile on default - p0 = OrchestratedProgam(backend, DaCeOrchestration.BuildAndRun) + p0 = OrchestratedProgram(backend, DaCeOrchestration.BuildAndRun) p0() assert os.path.exists( f"{working_dir}/.gt_cache_FV3_A/dacecache/" @@ -105,7 +105,7 @@ def test_relocatability_orchestration(backend): custom_path = f"{working_dir}/.my_cache_path" gt_config.cache_settings["root_path"] = custom_path - p1 = OrchestratedProgam(backend, DaCeOrchestration.BuildAndRun) + p1 = OrchestratedProgram(backend, DaCeOrchestration.BuildAndRun) p1() assert os.path.exists( f"{custom_path}/.gt_cache_FV3_A/dacecache/" @@ -119,14 +119,14 @@ def test_relocatability_orchestration(backend): relocated_path = f"{working_dir}/.my_relocated_cache_path" shutil.copytree(custom_path, relocated_path, dirs_exist_ok=True) gt_config.cache_settings["root_path"] = relocated_path - p2 = OrchestratedProgam(backend, DaCeOrchestration.Run) + p2 = OrchestratedProgram(backend, DaCeOrchestration.Run) p2() # Generate a file exists error to check for bad path bogus_path = "./nope/notatall/nothappening" gt_config.cache_settings["root_path"] = bogus_path with pytest.raises(RuntimeError): - OrchestratedProgam(backend, DaCeOrchestration.Run) + OrchestratedProgram(backend, DaCeOrchestration.Run) # Restore cache settings gt_config.cache_settings["root_path"] = original_root_directory @@ -156,7 +156,7 @@ def test_relocatability(backend: str): backend_sanitized = backend.replace(":", "") # Compile on default - p0 = OrchestratedProgam(backend, DaCeOrchestration.Python) + p0 = OrchestratedProgram(backend, DaCeOrchestration.Python) p0() assert os.path.exists( f"./.gt_cache_000000/py38_1013/{backend_sanitized}/test_caches/_stencil/" @@ -166,7 +166,7 @@ def test_relocatability(backend: str): custom_path = 
"./.my_cache_path" gt_config.cache_settings["root_path"] = custom_path - p1 = OrchestratedProgam(backend, DaCeOrchestration.Python) + p1 = OrchestratedProgram(backend, DaCeOrchestration.Python) p1() assert os.path.exists( f"{custom_path}/.gt_cache_000000/py38_1013/{backend_sanitized}" @@ -178,7 +178,7 @@ def test_relocatability(backend: str): relocated_path = "./.my_relocated_cache_path" shutil.copytree("./.gt_cache_000000", relocated_path, dirs_exist_ok=True) gt_config.cache_settings["root_path"] = relocated_path - p2 = OrchestratedProgam(backend, DaCeOrchestration.Python) + p2 = OrchestratedProgram(backend, DaCeOrchestration.Python) p2() assert os.path.exists( f"{relocated_path}/.gt_cache_000000/py38_1013/{backend_sanitized}" diff --git a/tests/dsl/test_compilation_config.py b/tests/dsl/test_compilation_config.py index 62049d91..fa323b06 100644 --- a/tests/dsl/test_compilation_config.py +++ b/tests/dsl/test_compilation_config.py @@ -7,6 +7,7 @@ CompilationConfig, CubedSphereCommunicator, CubedSpherePartitioner, + NullComm, RunMode, TilePartitioner, ) @@ -33,8 +34,7 @@ def test_check_communicator_valid( partitioner = CubedSpherePartitioner( TilePartitioner((int(sqrt(size / 6)), int((sqrt(size / 6))))) ) - comm = unittest.mock.MagicMock() - comm.Get_size.return_value = size + comm = NullComm(rank=0, total_ranks=size) cubed_sphere_comm = CubedSphereCommunicator(comm, partitioner) config = CompilationConfig( run_mode=run_mode, use_minimal_caching=use_minimal_caching @@ -52,8 +52,7 @@ def test_check_communicator_invalid( nx: int, ny: int, use_minimal_caching: bool, run_mode: RunMode ): partitioner = CubedSpherePartitioner(TilePartitioner((nx, ny))) - comm = unittest.mock.MagicMock() - comm.Get_size.return_value = nx * ny * 6 + comm = NullComm(rank=0, total_ranks=nx * ny * 6) cubed_sphere_comm = CubedSphereCommunicator(comm, partitioner) config = CompilationConfig( run_mode=run_mode, use_minimal_caching=use_minimal_caching @@ -91,9 +90,7 @@ def test_get_decomposition_info_from_comm( partitioner = CubedSpherePartitioner( TilePartitioner((int(sqrt(size / 6)), int(sqrt(size / 6)))) ) - comm = unittest.mock.MagicMock() - comm.Get_rank.return_value = rank - comm.Get_size.return_value = size + comm = NullComm(rank=rank, total_ranks=size) cubed_sphere_comm = CubedSphereCommunicator(comm, partitioner) config = CompilationConfig(use_minimal_caching=True, run_mode=RunMode.Run) ( @@ -133,8 +130,7 @@ def test_determine_compiling_equivalent( TilePartitioner((sqrt(size / 6), sqrt(size / 6))) ) comm = unittest.mock.MagicMock() - comm.Get_rank.return_value = rank - comm.Get_size.return_value = size + comm = NullComm(rank=rank, total_ranks=size) cubed_sphere_comm = CubedSphereCommunicator(comm, partitioner) assert ( config.determine_compiling_equivalent(rank, cubed_sphere_comm.partitioner) diff --git a/tests/dsl/test_stencil_factory.py b/tests/dsl/test_stencil_factory.py index ac189ad8..2af1218d 100644 --- a/tests/dsl/test_stencil_factory.py +++ b/tests/dsl/test_stencil_factory.py @@ -107,6 +107,23 @@ def test_get_stencils_with_varied_bounds_and_regions(backend: str): np.testing.assert_array_equal(q_orig.data, q_ref.data) +def test_stencil_vertical_bounds(backend: str): + factory = get_stencil_factory(backend) + origins = [(3, 3, 0), (2, 2, 1)] + domains = [(1, 1, 3), (2, 2, 4)] + stencils = get_stencils_with_varied_bounds( + add_1_in_region_stencil, + origins, + domains, + stencil_factory=factory, + ) + + assert "k_start" in stencils[0].externals and stencils[0].externals["k_start"] == 0 + assert "k_end" in 
stencils[0].externals and stencils[0].externals["k_end"] == 2 + assert "k_start" in stencils[1].externals and stencils[1].externals["k_start"] == 1 + assert "k_end" in stencils[1].externals and stencils[1].externals["k_end"] == 4 + + @pytest.mark.parametrize("enabled", [True, False]) def test_stencil_factory_numpy_comparison_from_dims_halo(enabled: bool): backend = "numpy" diff --git a/tests/grid/test_eta.py b/tests/grid/test_eta.py index ab0539f8..1acd9c6d 100755 --- a/tests/grid/test_eta.py +++ b/tests/grid/test_eta.py @@ -22,8 +22,8 @@ values are read-in and stored properly. In addition, this test checks to ensure that the function set_hybrid_pressure_coefficients -fail as expected if the computed eta values -vary non-mononitically and if the eta_file +fails as expected if the computed eta values +vary non-monotonically and if the eta_file is not provided. """ @@ -171,7 +171,7 @@ def test_set_hybrid_pressure_coefficients_not_mono(): eta_file is specified in test_config_not_mono.yaml file and the ak and bk values in the eta_file have been changed nonsensically to result in - erronenous eta values. + erroneous eta values. """ working_dir = str(os.getcwd()) @@ -216,7 +216,7 @@ def test_set_hybrid_pressure_coefficients_not_mono(): if os.path.isfile(out_eta_file): os.remove(out_eta_file) if str(error) == "ETA values are not monotonically increasing": - pytest.xfail("testing eta values are not monotomincally increasing") + pytest.xfail("testing eta values are not monotonically increasing") else: pytest.fail( "ERROR in testing eta values not are not monotonically increasing" diff --git a/tests/mpi/test_mpi_all_reduce_sum.py b/tests/mpi/test_mpi_all_reduce_sum.py new file mode 100644 index 00000000..bec096dd --- /dev/null +++ b/tests/mpi/test_mpi_all_reduce_sum.py @@ -0,0 +1,149 @@ +import numpy as np +import pytest + +from ndsl import ( + CubedSphereCommunicator, + CubedSpherePartitioner, + Quantity, + TilePartitioner, +) +from ndsl.comm.comm_abc import ReductionOperator +from ndsl.comm.mpi import MPIComm +from ndsl.dsl.typing import Float +from tests.mpi.mpi_comm import MPI + + +@pytest.fixture +def layout(): + if MPI is not None: + size = MPI.COMM_WORLD.Get_size() + ranks_per_tile = size // 6 + ranks_per_edge = int(ranks_per_tile ** 0.5) + return (ranks_per_edge, ranks_per_edge) + else: + return (1, 1) + + +@pytest.fixture(params=[0.1, 1.0]) +def edge_interior_ratio(request): + return request.param + + +@pytest.fixture +def tile_partitioner(layout, edge_interior_ratio: float): + return TilePartitioner(layout, edge_interior_ratio=edge_interior_ratio) + + +@pytest.fixture +def cube_partitioner(tile_partitioner): + return CubedSpherePartitioner(tile_partitioner) + + +@pytest.fixture() +def communicator(cube_partitioner): + return CubedSphereCommunicator( + comm=MPIComm(), + partitioner=cube_partitioner, + ) + + +@pytest.mark.skipif( + MPI is None, reason="mpi4py is not available or pytest was not run in parallel" +) +def test_all_reduce(communicator): + backends = ["dace:cpu", "gt:cpu_kfirst", "numpy"] + + for backend in backends: + base_array = np.array([i for i in range(5)], dtype=Float) + + testQuantity_1D = Quantity( + data=base_array, + dims=["K"], + units="Some 1D unit", + gt4py_backend=backend, + ) + + base_array = np.array([i for i in range(5 * 5)], dtype=Float) + base_array = base_array.reshape(5, 5) + + testQuantity_2D = Quantity( + data=base_array, + dims=["I", "J"], + units="Some 2D unit", + gt4py_backend=backend, + ) + + base_array = np.array([i for i in range(5 * 5 * 5)], 
dtype=Float) + base_array = base_array.reshape(5, 5, 5) + + testQuantity_3D = Quantity( + data=base_array, + dims=["I", "J", "K"], + units="Some 3D unit", + gt4py_backend=backend, + ) + + global_sum_q = communicator.all_reduce(testQuantity_1D, ReductionOperator.SUM) + assert global_sum_q.metadata == testQuantity_1D.metadata + assert (global_sum_q.data == (testQuantity_1D.data * communicator.size)).all() + + global_sum_q = communicator.all_reduce(testQuantity_2D, ReductionOperator.SUM) + assert global_sum_q.metadata == testQuantity_2D.metadata + assert (global_sum_q.data == (testQuantity_2D.data * communicator.size)).all() + + global_sum_q = communicator.all_reduce(testQuantity_3D, ReductionOperator.SUM) + assert global_sum_q.metadata == testQuantity_3D.metadata + assert (global_sum_q.data == (testQuantity_3D.data * communicator.size)).all() + + base_array = np.array([i for i in range(5)], dtype=Float) + testQuantity_1D_out = Quantity( + data=base_array, + dims=["K"], + units="New 1D unit", + gt4py_backend=backend, + origin=(8,), + extent=(7,), + ) + + base_array = np.array([i for i in range(5 * 5)], dtype=Float) + base_array = base_array.reshape(5, 5) + + testQuantity_2D_out = Quantity( + data=base_array, + dims=["I", "J"], + units="Some 2D unit", + gt4py_backend=backend, + ) + + base_array = np.array([i for i in range(5 * 5 * 5)], dtype=Float) + base_array = base_array.reshape(5, 5, 5) + + testQuantity_3D_out = Quantity( + data=base_array, + dims=["I", "J", "K"], + units="Some 3D unit", + gt4py_backend=backend, + ) + communicator.all_reduce( + testQuantity_1D, ReductionOperator.SUM, testQuantity_1D_out + ) + assert testQuantity_1D_out.metadata == testQuantity_1D.metadata + assert ( + testQuantity_1D_out.data == (testQuantity_1D.data * communicator.size) + ).all() + + communicator.all_reduce( + testQuantity_2D, ReductionOperator.SUM, testQuantity_2D_out + ) + assert testQuantity_2D_out.metadata == testQuantity_2D.metadata + assert ( + testQuantity_2D_out.data == (testQuantity_2D.data * communicator.size) + ).all() + + communicator.all_reduce( + testQuantity_3D, ReductionOperator.SUM, testQuantity_3D_out + ) + assert testQuantity_3D_out.metadata == testQuantity_3D.metadata + assert ( + testQuantity_3D_out.data == (testQuantity_3D.data * communicator.size) + ).all() diff --git a/tests/mpi/test_mpi_halo_update.py b/tests/mpi/test_mpi_halo_update.py index ab11b16e..b6c38e95 100644 --- a/tests/mpi/test_mpi_halo_update.py +++ b/tests/mpi/test_mpi_halo_update.py @@ -9,6 +9,7 @@ TilePartitioner, ) from ndsl.comm._boundary_utils import get_boundary_slice +from ndsl.comm.mpi import MPIComm from ndsl.constants import ( BOUNDARY_TYPES, EDGE_BOUNDARY_TYPES, @@ -176,7 +177,7 @@ def extent(n_points, dims, nz, ny, nx): @pytest.fixture() def communicator(cube_partitioner): return CubedSphereCommunicator( - comm=MPI.COMM_WORLD, + comm=MPIComm(), partitioner=cube_partitioner, ) diff --git a/tests/mpi/test_mpi_mock.py b/tests/mpi/test_mpi_mock.py index b8202995..6b441702 100644 --- a/tests/mpi/test_mpi_mock.py +++ b/tests/mpi/test_mpi_mock.py @@ -38,7 +38,7 @@ def send_recv(comm, numpy): comm.Send(data, dest=rank + 1) if rank > 0: if isinstance(comm, DummyComm): - print(f"recieving data from {rank - 1} to {rank}") + print(f"receiving data from {rank - 1} to {rank}") comm.Recv(data, source=rank - 1) return data @@ -55,7 +55,7 @@ def send_recv_big_data(comm, numpy): comm.Send(data, dest=rank + 1) if rank > 0: if isinstance(comm, DummyComm): - print(f"recieving data from {rank - 1} to {rank}") + 
print(f"receiving data from {rank - 1} to {rank}") comm.Recv(data, source=rank - 1) return data @@ -101,7 +101,7 @@ def send_f_contiguous_buffer(comm, numpy): comm.Send(data, dest=rank + 1) if rank > 0: if isinstance(comm, DummyComm): - print(f"recieving data from {rank - 1} to {rank}") + print(f"receiving data from {rank - 1} to {rank}") comm.Recv(data, source=rank - 1) return data @@ -156,7 +156,7 @@ def recv_to_subarray(comm, numpy): comm.Send(data, dest=rank + 1) if rank > 0: if isinstance(comm, DummyComm): - print(f"recieving data from {rank - 1} to {rank}") + print(f"receiving data from {rank - 1} to {rank}") try: comm.Recv(recv_buffer[1:-1, 1:-1, 1:-1], source=rank - 1) except Exception as err: diff --git a/tests/quantity/test_quantity.py b/tests/quantity/test_quantity.py index a6de628b..61e92025 100644 --- a/tests/quantity/test_quantity.py +++ b/tests/quantity/test_quantity.py @@ -1,8 +1,8 @@ import numpy as np import pytest -import ndsl.quantity as qty from ndsl import Quantity +from ndsl.quantity.bounds import _shift_slice try: @@ -229,7 +229,7 @@ def test_compute_view_edit_all_domain(quantity, n_halo, n_dims, extent_1d): ], ) def test_shift_slice(slice_in, shift, extent, slice_out): - result = qty.shift_slice(slice_in, shift, extent) + result = _shift_slice(slice_in, shift, extent) assert result == slice_out diff --git a/tests/test_boilerplate.py b/tests/test_boilerplate.py index a8de4075..c0211fb3 100644 --- a/tests/test_boilerplate.py +++ b/tests/test_boilerplate.py @@ -35,10 +35,10 @@ def test_boilerplate_import_numpy(): Dev Note: the import inside the function are part of the test. """ - from ndsl.boilerplate import get_factories_single_tile_numpy + from ndsl.boilerplate import get_factories_single_tile # Boilerplate - stencil_factory, quantity_factory = get_factories_single_tile_numpy( + stencil_factory, quantity_factory = get_factories_single_tile( nx=5, ny=5, nz=2, nhalo=1 ) @@ -50,10 +50,10 @@ def test_boilerplate_import_orchestrated_cpu(): Dev Note: the import inside the function are part of the test. 
""" - from ndsl.boilerplate import get_factories_single_tile_orchestrated_cpu + from ndsl.boilerplate import get_factories_single_tile_orchestrated # Boilerplate - stencil_factory, quantity_factory = get_factories_single_tile_orchestrated_cpu( + stencil_factory, quantity_factory = get_factories_single_tile_orchestrated( nx=5, ny=5, nz=2, nhalo=1 ) diff --git a/tests/test_halo_data_transformer.py b/tests/test_halo_data_transformer.py index e3f6d851..7e3385e7 100644 --- a/tests/test_halo_data_transformer.py +++ b/tests/test_halo_data_transformer.py @@ -353,8 +353,8 @@ def test_data_transformer_scalar_pack_unpack(quantity, rotation, n_halos): def test_data_transformer_vector_pack_unpack(quantity, rotation, n_halos): - targe_quanity_x = copy.deepcopy(quantity) - targe_quanity_y = copy.deepcopy(targe_quanity_x) + target_quantity_x = copy.deepcopy(quantity) + target_quantity_y = copy.deepcopy(target_quantity_x) x_quantity = quantity y_quantity = copy.deepcopy(x_quantity) @@ -450,8 +450,8 @@ def test_data_transformer_vector_pack_unpack(quantity, rotation, n_halos): quantity.dims, quantity.metadata.np, ) - targe_quanity_x.data[N_edge_boundaries[rotation][1]] = rotated_x - targe_quanity_y.data[N_edge_boundaries[rotation][1]] = rotated_y + target_quantity_x.data[N_edge_boundaries[rotation][1]] = rotated_x + target_quantity_y.data[N_edge_boundaries[rotation][1]] = rotated_y rotated_x, rotated_y = rotate_vector_data( quantity.data[NE_corner_boundaries[rotation][0]], quantity.data[NE_corner_boundaries[rotation][0]], @@ -459,8 +459,8 @@ def test_data_transformer_vector_pack_unpack(quantity, rotation, n_halos): quantity.dims, quantity.metadata.np, ) - targe_quanity_x.data[NE_corner_boundaries[rotation][1]] = rotated_x - targe_quanity_y.data[NE_corner_boundaries[rotation][1]] = rotated_y + target_quantity_x.data[NE_corner_boundaries[rotation][1]] = rotated_x + target_quantity_y.data[NE_corner_boundaries[rotation][1]] = rotated_y - assert (targe_quanity_x.data == x_quantity.data).all() - assert (targe_quanity_y.data == y_quantity.data).all() + assert (target_quantity_x.data == x_quantity.data).all() + assert (target_quantity_y.data == y_quantity.data).all() diff --git a/tests/test_halo_update.py b/tests/test_halo_update.py index 3d3bf501..9c1ac220 100644 --- a/tests/test_halo_update.py +++ b/tests/test_halo_update.py @@ -914,7 +914,7 @@ def test_halo_updater_stability( assert len(BUFFER_CACHE) == 1 assert len(next(iter(BUFFER_CACHE.values()))) == 0 - # Manually call finalize on the transfomers + # Manually call finalize on the transformers # This should recache all the buffers # DSL-816 will refactor that behavior out for halo_updater in halo_updaters: