From b7b84e04dbe33f9e41b423b673e58daa308974fd Mon Sep 17 00:00:00 2001 From: Oliver Elbert Date: Tue, 27 Aug 2024 13:15:10 -0400 Subject: [PATCH 01/78] updating 4d handling --- ndsl/dsl/gt4py_utils.py | 20 ++++++++++++++++++++ ndsl/stencils/testing/translate.py | 7 +++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/ndsl/dsl/gt4py_utils.py b/ndsl/dsl/gt4py_utils.py index f1facfe4..9970d4f1 100644 --- a/ndsl/dsl/gt4py_utils.py +++ b/ndsl/dsl/gt4py_utils.py @@ -154,6 +154,9 @@ def make_storage_data( data = _make_storage_data_2d( data, shape, start, dummy, axis, read_only, backend=backend ) + elif n_dims == 4: + + data = _make_storage_data_4d(data, shape, start, backend=backend) else: data = _make_storage_data_3d(data, shape, start, backend=backend) @@ -256,6 +259,23 @@ def _make_storage_data_3d( ] = asarray(data, type(buffer)) return buffer +def _make_storage_data_4d( + data: Field, + shape: Tuple[int, ...], + start: Tuple[int, ...] = (0, 0, 0, 0), + *, + backend: str, +) -> Field: + istart, jstart, kstart, lstart = start + isize, jsize, ksize, lsize = data.shape + buffer = zeros(shape, backend=backend) + buffer[ + istart : istart + isize, + jstart : jstart + jsize, + kstart : kstart + ksize, + lstart : lstart + lsize, + ] = asarray(data, type(buffer)) + return buffer def make_storage_from_shape( shape: Tuple[int, ...], diff --git a/ndsl/stencils/testing/translate.py b/ndsl/stencils/testing/translate.py index 2f07f82f..b3a80ae3 100644 --- a/ndsl/stencils/testing/translate.py +++ b/ndsl/stencils/testing/translate.py @@ -125,6 +125,9 @@ def make_storage_data( backend=self.stencil_factory.backend, ) else: + if len(array.shape == 4): + start = (int(istart), int(jstart), int(kstart), 0) + use_shape.append(array.shape[-1]) return utils.make_storage_data( array, tuple(use_shape), @@ -159,7 +162,7 @@ def collect_start_indices(self, datashape, varinfo): kstart = self.get_index_from_info(varinfo, "kstart", 0) return istart, jstart, kstart - def make_storage_data_input_vars(self, inputs, storage_vars=None): + def make_storage_data_input_vars(self, inputs, storage_vars=None, dict_4d=True): inputs_in = {**inputs} inputs_out = {} if storage_vars is None: @@ -183,7 +186,7 @@ def make_storage_data_input_vars(self, inputs, storage_vars=None): ) names_4d = None - if len(inputs_in[serialname].shape) == 4: + if (len(inputs_in[serialname].shape) == 4) and dict_4d: names_4d = info.get("names_4d", utils.tracer_variables) dummy_axes = info.get("dummy_axes", None) From bf4e2f22054cb2ed836d82ab2a828e36a0bbe8e9 Mon Sep 17 00:00:00 2001 From: Oliver Elbert Date: Tue, 27 Aug 2024 13:40:40 -0400 Subject: [PATCH 02/78] debug 4d test data --- ndsl/dsl/gt4py_utils.py | 5 ++++- ndsl/stencils/testing/translate.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ndsl/dsl/gt4py_utils.py b/ndsl/dsl/gt4py_utils.py index 9970d4f1..1bce6b6c 100644 --- a/ndsl/dsl/gt4py_utils.py +++ b/ndsl/dsl/gt4py_utils.py @@ -53,11 +53,14 @@ def wrapper(*args, **kwargs) -> Any: def _mask_to_dimensions( mask: Tuple[bool, ...], shape: Sequence[int] ) -> List[Union[str, int]]: - assert len(mask) == 3 + assert len(mask) >= 3 dimensions: List[Union[str, int]] = [] for i, axis in enumerate(("I", "J", "K")): if mask[i]: dimensions.append(axis) + if len(mask) > 3: + for i in range(len(mask) - 3): + dimensions.append(None) offset = int(sum(mask)) dimensions.extend(shape[offset:]) return dimensions diff --git a/ndsl/stencils/testing/translate.py b/ndsl/stencils/testing/translate.py index b3a80ae3..7e580b70 100644 --- 
a/ndsl/stencils/testing/translate.py +++ b/ndsl/stencils/testing/translate.py @@ -125,7 +125,7 @@ def make_storage_data( backend=self.stencil_factory.backend, ) else: - if len(array.shape == 4): + if len(array.shape) == 4: start = (int(istart), int(jstart), int(kstart), 0) use_shape.append(array.shape[-1]) return utils.make_storage_data( From dfc4e5f642a37b521ab8cf910449409789c4d6a5 Mon Sep 17 00:00:00 2001 From: Oliver Elbert Date: Tue, 27 Aug 2024 13:45:00 -0400 Subject: [PATCH 03/78] more iter --- ndsl/dsl/gt4py_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ndsl/dsl/gt4py_utils.py b/ndsl/dsl/gt4py_utils.py index 1bce6b6c..d263c5d2 100644 --- a/ndsl/dsl/gt4py_utils.py +++ b/ndsl/dsl/gt4py_utils.py @@ -59,8 +59,8 @@ def _mask_to_dimensions( if mask[i]: dimensions.append(axis) if len(mask) > 3: - for i in range(len(mask) - 3): - dimensions.append(None) + for i in range(3, len(mask)): + dimensions.append(str(shape[i])) offset = int(sum(mask)) dimensions.extend(shape[offset:]) return dimensions From 6ab1dd529602b08a9912b73bef3957e19c9fbf99 Mon Sep 17 00:00:00 2001 From: Oliver Elbert Date: Tue, 27 Aug 2024 14:44:23 -0400 Subject: [PATCH 04/78] moving ser_to_nc here --- ndsl/stencils/testing/serialbox_to_netcdf.py | 57 +++++++++++++++++--- 1 file changed, 51 insertions(+), 6 deletions(-) diff --git a/ndsl/stencils/testing/serialbox_to_netcdf.py b/ndsl/stencils/testing/serialbox_to_netcdf.py index 11814fff..c34b8fc3 100644 --- a/ndsl/stencils/testing/serialbox_to_netcdf.py +++ b/ndsl/stencils/testing/serialbox_to_netcdf.py @@ -32,6 +32,12 @@ def get_parser(): type=str, help="[Optional] Give the name of the data, will default to Generator_rankX", ) + parser.add_argument( + "-m", "--merge", + action='store_true', + default=False, + help="merges datastreams blocked into separate savepoints" + ) return parser @@ -58,7 +64,7 @@ def get_serializer(data_path: str, rank: int, data_name: Optional[str] = None): return serialbox.Serializer(serialbox.OpenModeKind.Read, data_path, name) -def main(data_path: str, output_path: str, data_name: Optional[str] = None): +def main(data_path: str, output_path: str, merge_blocks: bool, data_name: Optional[str] = None): os.makedirs(output_path, exist_ok=True) namelist_filename_in = os.path.join(data_path, "input.nml") @@ -69,9 +75,20 @@ def main(data_path: str, output_path: str, data_name: Optional[str] = None): if namelist_filename_out != namelist_filename_in: shutil.copyfile(os.path.join(data_path, "input.nml"), namelist_filename_out) namelist = f90nml.read(namelist_filename_out) - total_ranks = ( - 6 * namelist["fv_core_nml"]["layout"][0] * namelist["fv_core_nml"]["layout"][1] - ) + if namelist["fv_core_nml"]["grid_type"] <= 3: + total_ranks = ( + 6 * namelist["fv_core_nml"]["layout"][0] * namelist["fv_core_nml"]["layout"][1] + ) + else: + total_ranks = ( + namelist["fv_core_nml"]["layout"][0] * namelist["fv_core_nml"]["layout"][1] + ) + nx = int((namelist["fv_core_nml"]['npx'] - 1) / ( + namelist["fv_core_nml"]['layout'][0] + )) + ny = int((namelist["fv_core_nml"]['npy'] - 1) / ( + namelist["fv_core_nml"]['layout'][1] + )) # all ranks have the same names, just look at first one serializer_0 = get_serializer(data_path, rank=0, data_name=data_name) @@ -96,8 +113,33 @@ def main(data_path: str, output_path: str, data_name: Optional[str] = None): rank_data[name].append( read_serialized_data(serializer, savepoint, name) ) + if merge_blocks and len(rank_data[name] > 1): + full_data = np.array(rank_data[name]) + if len(full_data.shape) > 
1: + if (nx * ny == full_data.shape[0] * full_data.shape[1]): + # If we have an (i, x) array from each block reshape it + new_shape = (nx, ny) + full_data.shape[2:] + full_data = full_data.reshape(new_shape) + elif full_data.shape[1] == namelist["fv_core_nml"]['npz']: + # If it's a k-array from each block just take one + full_data = full_data[0] + else: + return IndexError( + "Shape mismatch in block merging: " + f"{full_data.shape[0]} by {full_data.shape[1]} " + f"is not compatible with {nx} by {ny}" + ) + elif len(full_data.shape) == 1: + # if it's a scalar from each block then just take one + full_data = full_data[0] + else: + raise IndexError(f"{name} data appears to be empty") + rank_data[name] = [full_data] rank_list.append(rank_data) - n_savepoints = len(savepoints) # checking from last rank is fine + if merge_blocks: + n_savepoints = 1 + else: + n_savepoints = len(savepoints) # checking from last rank is fine data_vars = {} if n_savepoints > 0: encoding = {} @@ -166,7 +208,10 @@ def entry_point(): parser = get_parser() args = parser.parse_args() main( - data_path=args.data_path, output_path=args.output_path, data_name=args.data_name + data_path=args.data_path, + output_path=args.output_path, + merge_blocks=args.merge, + data_name=args.data_name, ) From 2d7062a9e999cd2cdfa34c301f85717b0f279643 Mon Sep 17 00:00:00 2001 From: Oliver Elbert Date: Thu, 5 Sep 2024 12:33:45 -0400 Subject: [PATCH 05/78] updating datatype in translate test --- ndsl/stencils/testing/translate.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ndsl/stencils/testing/translate.py b/ndsl/stencils/testing/translate.py index 7e580b70..a25316ad 100644 --- a/ndsl/stencils/testing/translate.py +++ b/ndsl/stencils/testing/translate.py @@ -123,6 +123,7 @@ def make_storage_data( axis=axis, names=names_4d, backend=self.stencil_factory.backend, + dtype=array.dtype, ) else: if len(array.shape) == 4: @@ -137,6 +138,7 @@ def make_storage_data( axis=axis, read_only=read_only, backend=self.stencil_factory.backend, + dtype=array.dtype, ) def storage_vars(self): From d44551e5f7c6933cc9eb5c84fd591b7721aac13e Mon Sep 17 00:00:00 2001 From: Oliver Elbert Date: Thu, 5 Sep 2024 12:48:18 -0400 Subject: [PATCH 06/78] typing works --- ndsl/stencils/testing/translate.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/ndsl/stencils/testing/translate.py b/ndsl/stencils/testing/translate.py index a25316ad..09143083 100644 --- a/ndsl/stencils/testing/translate.py +++ b/ndsl/stencils/testing/translate.py @@ -5,7 +5,7 @@ import ndsl.dsl.gt4py_utils as utils from ndsl.dsl.stencil import StencilFactory -from ndsl.dsl.typing import Field, Float # noqa: F401 +from ndsl.dsl.typing import Field, Float, Int # noqa: F401 from ndsl.quantity import Quantity from ndsl.stencils.testing.grid import Grid # type: ignore @@ -113,6 +113,12 @@ def make_storage_data( elif not full_shape and len(array.shape) < 3 and axis == len(array.shape) - 1: use_shape[1] = 1 start = (int(istart), int(jstart), int(kstart)) + if 'float' in str(array.dtype): + dtype = Float + elif 'int' in str(array.dtype): + dtype = Int + else: + dtype = array.dtype if names_4d: return utils.make_storage_dict( array, @@ -123,7 +129,7 @@ def make_storage_data( axis=axis, names=names_4d, backend=self.stencil_factory.backend, - dtype=array.dtype, + dtype=dtype, ) else: if len(array.shape) == 4: @@ -138,7 +144,7 @@ def make_storage_data( axis=axis, read_only=read_only, backend=self.stencil_factory.backend, - dtype=array.dtype, + dtype=dtype, ) def 
storage_vars(self): From ed3d4319b5bea0c1ea87416954f3f3a49c5b5c39 Mon Sep 17 00:00:00 2001 From: Oliver Elbert Date: Thu, 5 Sep 2024 14:31:03 -0400 Subject: [PATCH 07/78] fix dict, lint --- ndsl/dsl/gt4py_utils.py | 6 +++++- ndsl/stencils/testing/translate.py | 6 +++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/ndsl/dsl/gt4py_utils.py b/ndsl/dsl/gt4py_utils.py index d263c5d2..227eb57d 100644 --- a/ndsl/dsl/gt4py_utils.py +++ b/ndsl/dsl/gt4py_utils.py @@ -158,7 +158,7 @@ def make_storage_data( data, shape, start, dummy, axis, read_only, backend=backend ) elif n_dims == 4: - + data = _make_storage_data_4d(data, shape, start, backend=backend) else: data = _make_storage_data_3d(data, shape, start, backend=backend) @@ -262,6 +262,7 @@ def _make_storage_data_3d( ] = asarray(data, type(buffer)) return buffer + def _make_storage_data_4d( data: Field, shape: Tuple[int, ...], @@ -280,6 +281,7 @@ def _make_storage_data_4d( ] = asarray(data, type(buffer)) return buffer + def make_storage_from_shape( shape: Tuple[int, ...], origin: Tuple[int, ...] = origin, @@ -333,6 +335,7 @@ def make_storage_dict( axis: int = 2, *, backend: str, + dtype: DTypes = Float, ) -> Dict[str, "Field"]: assert names is not None, "for 4d variable storages, specify a list of names" if shape is None: @@ -347,6 +350,7 @@ def make_storage_dict( dummy=dummy, axis=axis, backend=backend, + dtype=dtype, ) return data_dict diff --git a/ndsl/stencils/testing/translate.py b/ndsl/stencils/testing/translate.py index 09143083..779d1339 100644 --- a/ndsl/stencils/testing/translate.py +++ b/ndsl/stencils/testing/translate.py @@ -113,9 +113,9 @@ def make_storage_data( elif not full_shape and len(array.shape) < 3 and axis == len(array.shape) - 1: use_shape[1] = 1 start = (int(istart), int(jstart), int(kstart)) - if 'float' in str(array.dtype): + if "float" in str(array.dtype): dtype = Float - elif 'int' in str(array.dtype): + elif "int" in str(array.dtype): dtype = Int else: dtype = array.dtype @@ -133,7 +133,7 @@ def make_storage_data( ) else: if len(array.shape) == 4: - start = (int(istart), int(jstart), int(kstart), 0) + start = (int(istart), int(jstart), int(kstart), 0) # type: ignore use_shape.append(array.shape[-1]) return utils.make_storage_data( array, From 2225fd934d6bd4fb99540b476ba3eee3f7a28e19 Mon Sep 17 00:00:00 2001 From: Oliver Elbert Date: Thu, 5 Sep 2024 14:48:42 -0400 Subject: [PATCH 08/78] remove empty line --- ndsl/dsl/gt4py_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ndsl/dsl/gt4py_utils.py b/ndsl/dsl/gt4py_utils.py index 227eb57d..c1e81733 100644 --- a/ndsl/dsl/gt4py_utils.py +++ b/ndsl/dsl/gt4py_utils.py @@ -158,7 +158,6 @@ def make_storage_data( data, shape, start, dummy, axis, read_only, backend=backend ) elif n_dims == 4: - data = _make_storage_data_4d(data, shape, start, backend=backend) else: data = _make_storage_data_3d(data, shape, start, backend=backend) From f3cf32d03125749e5d363ba36e9eb6a09f0f46bf Mon Sep 17 00:00:00 2001 From: Oliver Elbert Date: Mon, 30 Sep 2024 14:52:30 -0400 Subject: [PATCH 09/78] change from 4d to Nd --- ndsl/dsl/gt4py_utils.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/ndsl/dsl/gt4py_utils.py b/ndsl/dsl/gt4py_utils.py index c1e81733..acfee07f 100644 --- a/ndsl/dsl/gt4py_utils.py +++ b/ndsl/dsl/gt4py_utils.py @@ -157,8 +157,8 @@ def make_storage_data( data = _make_storage_data_2d( data, shape, start, dummy, axis, read_only, backend=backend ) - elif n_dims == 4: - data = _make_storage_data_4d(data, shape, start, 
backend=backend) + elif n_dims >= 4: + data = _make_storage_data_Nd(data, shape, start, backend=backend) else: data = _make_storage_data_3d(data, shape, start, backend=backend) @@ -262,22 +262,18 @@ def _make_storage_data_3d( return buffer -def _make_storage_data_4d( +def _make_storage_data_Nd( data: Field, shape: Tuple[int, ...], - start: Tuple[int, ...] = (0, 0, 0, 0), + start: Tuple[int, ...] = None, *, backend: str, ) -> Field: - istart, jstart, kstart, lstart = start - isize, jsize, ksize, lsize = data.shape + if start is None: + start = tuple([0] * data.ndim) buffer = zeros(shape, backend=backend) - buffer[ - istart : istart + isize, - jstart : jstart + jsize, - kstart : kstart + ksize, - lstart : lstart + lsize, - ] = asarray(data, type(buffer)) + idx = tuple([slice(start[i], start[i] + data.shape[i]) for i in range(len(start))]) + buffer[idx] = asarray(data, type(buffer)) return buffer From ed4ddd46c71a0523749d61fdd0946bb7c076c568 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Mon, 7 Oct 2024 11:17:07 -0400 Subject: [PATCH 10/78] Expose `k_start` and `k_end` automatically for any FrozenStencil --- ndsl/dsl/stencil.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ndsl/dsl/stencil.py b/ndsl/dsl/stencil.py index 6a70b4d8..bb9f5c70 100644 --- a/ndsl/dsl/stencil.py +++ b/ndsl/dsl/stencil.py @@ -737,6 +737,8 @@ def axis_offsets( "local_js": gtscript.J[0] + self.jsc - origin[1], "j_end": j_end, "local_je": gtscript.J[-1] + self.jec - origin[1] - domain[1] + 1, + "k_start": self.origin[2], + "k_end": self.origin[2] + domain[2] - 1, } def get_origin_domain( From 5b09a676394de64d972ded94e04cabde4c647320 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Mon, 7 Oct 2024 11:37:22 -0400 Subject: [PATCH 11/78] Fix k_start + utest --- ndsl/dsl/stencil.py | 14 +++++++------- tests/dsl/test_stencil_factory.py | 17 +++++++++++++++++ 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/ndsl/dsl/stencil.py b/ndsl/dsl/stencil.py index bb9f5c70..085e4822 100644 --- a/ndsl/dsl/stencil.py +++ b/ndsl/dsl/stencil.py @@ -349,14 +349,14 @@ def __init__( ): unblock_waiting_tiles(MPI.COMM_WORLD) - self._timing_collector.build_info[ - _stencil_object_name(self.stencil_object) - ] = build_info + self._timing_collector.build_info[_stencil_object_name(self.stencil_object)] = ( + build_info + ) field_info = self.stencil_object.field_info - self._field_origins: Dict[ - str, Tuple[int, ...] 
- ] = FrozenStencil._compute_field_origins(field_info, self.origin) + self._field_origins: Dict[str, Tuple[int, ...]] = ( + FrozenStencil._compute_field_origins(field_info, self.origin) + ) """mapping from field names to field origins""" self._stencil_run_kwargs: Dict[str, Any] = { @@ -737,7 +737,7 @@ def axis_offsets( "local_js": gtscript.J[0] + self.jsc - origin[1], "j_end": j_end, "local_je": gtscript.J[-1] + self.jec - origin[1] - domain[1] + 1, - "k_start": self.origin[2], + "k_start": origin[2], "k_end": self.origin[2] + domain[2] - 1, } diff --git a/tests/dsl/test_stencil_factory.py b/tests/dsl/test_stencil_factory.py index ac189ad8..aa7a23fd 100644 --- a/tests/dsl/test_stencil_factory.py +++ b/tests/dsl/test_stencil_factory.py @@ -107,6 +107,23 @@ def test_get_stencils_with_varied_bounds_and_regions(backend: str): np.testing.assert_array_equal(q_orig.data, q_ref.data) +def test_stencil_vertical_bounds(backend: str): + factory = get_stencil_factory(backend) + origins = [(3, 3, 0), (2, 2, 1)] + domains = [(1, 1, 3), (2, 2, 4)] + stencils = get_stencils_with_varied_bounds( + add_1_in_region_stencil, + origins, + domains, + stencil_factory=factory, + ) + + assert "k_start" in stencils[0].externals and stencils[0].externals["k_start"] == 0 + assert "k_end" in stencils[0].externals and stencils[0].externals["k_end"] == 2 + assert "k_start" in stencils[1].externals and stencils[1].externals["k_start"] == 1 + assert "k_end" in stencils[1].externals and stencils[1].externals["k_end"] == 3 + + @pytest.mark.parametrize("enabled", [True, False]) def test_stencil_factory_numpy_comparison_from_dims_halo(enabled: bool): backend = "numpy" From b0b2940502f36d0f5c8fc18c9becae6546043d7f Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Mon, 7 Oct 2024 11:43:02 -0400 Subject: [PATCH 12/78] lint --- ndsl/dsl/stencil.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ndsl/dsl/stencil.py b/ndsl/dsl/stencil.py index 085e4822..25b3388a 100644 --- a/ndsl/dsl/stencil.py +++ b/ndsl/dsl/stencil.py @@ -349,14 +349,14 @@ def __init__( ): unblock_waiting_tiles(MPI.COMM_WORLD) - self._timing_collector.build_info[_stencil_object_name(self.stencil_object)] = ( - build_info - ) + self._timing_collector.build_info[ + _stencil_object_name(self.stencil_object) + ] = build_info field_info = self.stencil_object.field_info - self._field_origins: Dict[str, Tuple[int, ...]] = ( - FrozenStencil._compute_field_origins(field_info, self.origin) - ) + self._field_origins: Dict[ + str, Tuple[int, ...] 
+ ] = FrozenStencil._compute_field_origins(field_info, self.origin) """mapping from field names to field origins""" self._stencil_run_kwargs: Dict[str, Any] = { From 0c7c90225d7b6731aa1838113653cc3253aa37eb Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Mon, 7 Oct 2024 12:03:03 -0400 Subject: [PATCH 13/78] Fix for 2d stencils --- ndsl/dsl/stencil.py | 5 +++-- tests/dsl/test_stencil_factory.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ndsl/dsl/stencil.py b/ndsl/dsl/stencil.py index 25b3388a..5e917e66 100644 --- a/ndsl/dsl/stencil.py +++ b/ndsl/dsl/stencil.py @@ -737,8 +737,9 @@ def axis_offsets( "local_js": gtscript.J[0] + self.jsc - origin[1], "j_end": j_end, "local_je": gtscript.J[-1] + self.jec - origin[1] - domain[1] + 1, - "k_start": origin[2], - "k_end": self.origin[2] + domain[2] - 1, + "k_start": origin[2] if len(origin) > 2 else 0, + "k_end": (origin[2] if len(origin) > 2 else 0) + + (domain[2] - 1 if len(domain) > 2 else 0), } def get_origin_domain( diff --git a/tests/dsl/test_stencil_factory.py b/tests/dsl/test_stencil_factory.py index aa7a23fd..2af1218d 100644 --- a/tests/dsl/test_stencil_factory.py +++ b/tests/dsl/test_stencil_factory.py @@ -121,7 +121,7 @@ def test_stencil_vertical_bounds(backend: str): assert "k_start" in stencils[0].externals and stencils[0].externals["k_start"] == 0 assert "k_end" in stencils[0].externals and stencils[0].externals["k_end"] == 2 assert "k_start" in stencils[1].externals and stencils[1].externals["k_start"] == 1 - assert "k_end" in stencils[1].externals and stencils[1].externals["k_end"] == 3 + assert "k_end" in stencils[1].externals and stencils[1].externals["k_end"] == 4 @pytest.mark.parametrize("enabled", [True, False]) From 4b8d4b9d4510c562909b8bf31e75ac813f09ab49 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Mon, 7 Oct 2024 16:21:01 -0400 Subject: [PATCH 14/78] Add threshold overrides to the multimodal metric --- ndsl/stencils/testing/test_translate.py | 18 +++++-- ndsl/stencils/testing/translate.py | 5 +- ndsl/testing/comparison.py | 64 +++++++++++++++++++------ 3 files changed, 69 insertions(+), 18 deletions(-) diff --git a/ndsl/stencils/testing/test_translate.py b/ndsl/stencils/testing/test_translate.py index db8e6047..6a3f47bd 100644 --- a/ndsl/stencils/testing/test_translate.py +++ b/ndsl/stencils/testing/test_translate.py @@ -75,6 +75,18 @@ def process_override(threshold_overrides, testobj, test_name, backend): raise TypeError( "ignore_near_zero_errors is either a list or a dict" ) + if "multimodal" in match: + parsed_mutimodal = match["multimodal"] + if "mmr_absolute_eps" in parsed_mutimodal: + testobj.mmr_absolute_eps = float( + parsed_mutimodal["mmr_absolute_eps"] + ) + if "mmr_relative_fraction" in parsed_mutimodal: + testobj.mmr_relative_fraction = float( + parsed_mutimodal["mmr_relative_fraction"] + ) + if "mmr_ulp_threshold" in parsed_mutimodal: + testobj.mmr_ulp = float(parsed_mutimodal["mmr_ulp_threshold"]) if "skip_test" in match: testobj.skip_test = bool(match["skip_test"]) elif len(matches) > 1: @@ -197,9 +209,9 @@ def test_sequential_savepoint( metric = MultiModalFloatMetric( reference_values=ref_data, computed_values=output_data, - eps=case.testobj.max_error, - ignore_near_zero_errors=ignore_near_zero, - near_zero=case.testobj.near_zero, + absolute_eps_override=case.testobj.mmr_absolute_eps, + relative_fraction_override=case.testobj.mmr_relative_fraction, + ulp_override=case.testobj.mmr_ulp, ) else: metric = LegacyMetric( diff --git a/ndsl/stencils/testing/translate.py 
b/ndsl/stencils/testing/translate.py index 3d9d20f9..401a6f81 100644 --- a/ndsl/stencils/testing/translate.py +++ b/ndsl/stencils/testing/translate.py @@ -32,7 +32,7 @@ def pad_field_in_j(field, nj: int, backend: str): def as_numpy( - value: Union[Dict[str, Any], Quantity, np.ndarray] + value: Union[Dict[str, Any], Quantity, np.ndarray], ) -> Union[np.ndarray, Dict[str, np.ndarray]]: def _convert(value: Union[Quantity, np.ndarray]) -> np.ndarray: if isinstance(value, Quantity): @@ -53,6 +53,9 @@ def _convert(value: Union[Quantity, np.ndarray]) -> np.ndarray: class TranslateFortranData2Py: max_error = 1e-14 near_zero = 1e-18 + mmr_absolute_eps = -1 + mmr_relative_fraction = -1 + mmr_ulp = -1 def __init__(self, grid, stencil_factory: StencilFactory, origin=utils.origin): self.origin = origin diff --git a/ndsl/testing/comparison.py b/ndsl/testing/comparison.py index 9e2d1d59..cd1e2ffe 100644 --- a/ndsl/testing/comparison.py +++ b/ndsl/testing/comparison.py @@ -166,6 +166,21 @@ def __repr__(self) -> str: return "\n".join(report) +class _Metric: + def __init__(self, value): + self._value: float = value + self.is_default = True + + @property + def value(self) -> float: + return self._value + + @value.setter + def value(self, _value: float): + self._value = _value + self.is_default = False + + class MultiModalFloatMetric(BaseMetric): """Combination of absolute, relative & ULP comparison for floats @@ -177,16 +192,18 @@ class MultiModalFloatMetric(BaseMetric): Absolute errors for large amplitute """ - _f32_absolute_eps = 1e-10 - _f64_absolute_eps = 1e-13 - relative_fraction = 0.000001 # 0.0001% - ulp_threshold = 1.0 + _f32_absolute_eps = _Metric(1e-10) + _f64_absolute_eps = _Metric(1e-13) + relative_fraction = _Metric(0.000001) # 0.0001% + ulp_threshold = _Metric(1.0) def __init__( self, reference_values: np.ndarray, computed_values: np.ndarray, - eps: float, + absolute_eps_override: float = -1, + relative_fraction_override: float = -1, + ulp_override: float = -1, **kwargs, ): super().__init__(reference_values, computed_values) @@ -198,9 +215,17 @@ def __init__( self.ulp_distance_metric = np.empty_like(self.references, dtype=np.bool_) if self.references.dtype is (np.float32, np.int32): - self.absolute_eps = max(eps, self._f32_absolute_eps) + self.absolute_eps = _Metric(self._f32_absolute_eps.value) else: - self.absolute_eps = max(eps, self._f64_absolute_eps) + self.absolute_eps = _Metric(self._f64_absolute_eps.value) + + # Assign overrides if needed + if absolute_eps_override > self.absolute_eps.value: + self.absolute_eps.value = absolute_eps_override + if relative_fraction_override > self.relative_fraction.value: + self.relative_fraction.value = relative_fraction_override + if ulp_override > self.ulp_threshold.value: + self.ulp_threshold.value = ulp_override self.success = self._compute_all_metrics() self.check = np.all(self.success) @@ -214,17 +239,19 @@ def _compute_all_metrics( ) # Absolute distance self.absolute_distance = np.absolute(self.computed - self.references) - self.absolute_distance_metric = self.absolute_distance < self.absolute_eps + self.absolute_distance_metric = ( + self.absolute_distance < self.absolute_eps.value + ) # Relative distance (in pct) self.relative_distance = np.divide(self.absolute_distance, max_values) self.relative_distance_metric = ( - self.absolute_distance < self.relative_fraction * max_values + self.absolute_distance < self.relative_fraction.value * max_values ) # ULP distance self.ulp_distance = np.divide( self.absolute_distance, np.spacing(max_values) ) - 
self.ulp_distance_metric = self.ulp_distance <= self.ulp_threshold + self.ulp_distance_metric = self.ulp_distance <= self.ulp_threshold.value # Combine all distances into sucess or failure # Success = no NANs & ( abs or rel or ulp ) @@ -245,9 +272,18 @@ def _compute_all_metrics( f"recieved data with unexpected dtype {self.references.dtype}" ) + def _has_override(self) -> bool: + return ( + not self.relative_fraction.is_default + or not self.absolute_eps.is_default + or not self.ulp_threshold.is_default + ) + def report(self, file_path: Optional[str] = None) -> List[str]: report = [] - if self.check: + if self.check and self._has_override(): + report.append("❎ Numerical differences with threshold override") + elif self.check: report.append("✅ No numerical differences") else: report.append("❌ Numerical failures") @@ -260,9 +296,9 @@ def report(self, file_path: Optional[str] = None) -> List[str]: report = [ f"All failures ({bad_indices_count}/{full_count}) ({failures_pct}%),\n", f"Index Computed Reference " - f"Absolute E(<{self.absolute_eps:.2e}) " - f"Relative E(<{self.relative_fraction * 100:.2e}%) " - f"ULP E(<{self.ulp_threshold})", + f"{'❎ ' if not self.absolute_eps.is_default else '' }Absolute E(<{self.absolute_eps.value:.2e}) " + f"{'❎ ' if not self.relative_fraction.is_default else '' }Relative E(<{self.relative_fraction.value * 100:.2e}%) " + f"{'❎ ' if not self.ulp_threshold.is_default else '' }ULP E(<{self.ulp_threshold.value})", ] # Summary and worst result for iBad in range(bad_indices_count): From 720149aee413d70e4c9c38b9b7d3f381b6ef7350 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Wed, 9 Oct 2024 12:11:36 -0400 Subject: [PATCH 15/78] Always report results, add summary with one liners --- ndsl/stencils/testing/test_translate.py | 43 +++++++++++++++++-------- ndsl/testing/comparison.py | 36 ++++++++++++++------- 2 files changed, 54 insertions(+), 25 deletions(-) diff --git a/ndsl/stencils/testing/test_translate.py b/ndsl/stencils/testing/test_translate.py index 6a3f47bd..6bda31af 100644 --- a/ndsl/stencils/testing/test_translate.py +++ b/ndsl/stencils/testing/test_translate.py @@ -15,14 +15,14 @@ from ndsl.quantity import Quantity from ndsl.restart._legacy_restart import RESTART_PROPERTIES from ndsl.stencils.testing.savepoint import SavepointCase, dataset_to_dict -from ndsl.testing.comparison import LegacyMetric, MultiModalFloatMetric +from ndsl.testing.comparison import BaseMetric, LegacyMetric, MultiModalFloatMetric from ndsl.testing.perturbation import perturb # this only matters for manually-added print statements np.set_printoptions(threshold=4096) -OUTDIR = "./.translate-errors" +OUTDIR = "./.translate-outputs" GPU_MAX_ERR = 1e-10 GPU_NEAR_ZERO = 1e-15 @@ -197,6 +197,9 @@ def test_sequential_savepoint( passing_names: List[str] = [] all_ref_data = dataset_to_dict(case.ds_out) ref_data_out = {} + results = {} + + # Assign metrics and report on terminal any failures for varname in case.testobj.serialnames(case.testobj.out_vars): ignore_near_zero = case.testobj.ignore_near_zero_errors.get(varname, False) ref_data = all_ref_data[varname] @@ -221,16 +224,14 @@ def test_sequential_savepoint( ignore_near_zero_errors=ignore_near_zero, near_zero=case.testobj.near_zero, ) + results[varname] = metric if not metric.check: - os.makedirs(OUTDIR, exist_ok=True) - log_filename = os.path.join( - OUTDIR, - f"details-{case.savepoint_name}-{varname}-rank{case.rank}.log", - ) - metric.report(log_filename) pytest.fail(str(metric), pytrace=False) 
passing_names.append(failing_names.pop()) ref_data_out[varname] = [ref_data] + + # Reporting & data save + _report_results(case.savepoint_name, results) if len(failing_names) > 0: get_thresholds(case.testobj, input_data=original_input_data) os.makedirs(OUTDIR, exist_ok=True) @@ -344,6 +345,9 @@ def test_parallel_savepoint( passing_names = [] ref_data: Dict[str, Any] = {} all_ref_data = dataset_to_dict(case.ds_out) + results = {} + + # Assign metrics and report on terminal any failures for varname in out_vars: ref_data[varname] = [] new_ref_data = all_ref_data[varname] @@ -370,14 +374,13 @@ def test_parallel_savepoint( ignore_near_zero_errors=ignore_near_zero, near_zero=case.testobj.near_zero, ) + results[varname] = metric if not metric.check: - os.makedirs(OUTDIR, exist_ok=True) - log_filename = os.path.join( - OUTDIR, f"details-{case.savepoint_name}-{varname}.log" - ) - metric.report(log_filename) pytest.fail(str(metric), pytrace=False) passing_names.append(failing_names.pop()) + + # Reporting & data save + _report_results(case.savepoint_name, results) if len(failing_names) > 0: os.makedirs(OUTDIR, exist_ok=True) nct_filename = os.path.join( @@ -405,6 +408,20 @@ def test_parallel_savepoint( pytest.fail("No tests passed") +def _report_results(savepoint_name: str, results: Dict[str, BaseMetric]) -> None: + os.makedirs(OUTDIR, exist_ok=True) + + # Summary + with open(f"{OUTDIR}/summary.log", "w") as f: + for varname, metric in results.items(): + f.write(f"{varname}: {metric.one_line_report()}\n") + + # Detailled log + for varname, metric in results.items(): + log_filename = os.path.join(OUTDIR, f"details-{savepoint_name}-{varname}.log") + metric.report(log_filename) + + def save_netcdf( testobj, # first list over rank, second list over savepoint diff --git a/ndsl/testing/comparison.py b/ndsl/testing/comparison.py index cd1e2ffe..9fd14b58 100644 --- a/ndsl/testing/comparison.py +++ b/ndsl/testing/comparison.py @@ -23,6 +23,9 @@ def __repr__(self) -> str: def report(self, file_path: Optional[str] = None) -> List[str]: ... + def one_line_report(self) -> str: + ... + class LegacyMetric(BaseMetric): """Legacy (AI2) metric used for original FV3 port. 
@@ -91,13 +94,16 @@ def _compute_errors( ) return success - def report(self, file_path: Optional[str] = None) -> List[str]: - report = [] + def one_line_report(self) -> str: if self.check: - report.append("✅ No numerical differences") + return "✅ No numerical differences" else: - report.append("❌ Numerical failures") + return "❌ Numerical failures" + def report(self, file_path: Optional[str] = None) -> List[str]: + report = [] + report.append(self.one_line_report()) + if not self.check: found_indices = np.logical_not(self.success).nonzero() computed_failures = self.computed[found_indices] reference_failures = self.references[found_indices] @@ -279,15 +285,21 @@ def _has_override(self) -> bool: or not self.ulp_threshold.is_default ) - def report(self, file_path: Optional[str] = None) -> List[str]: - report = [] + def one_line_report(self) -> str: + metric_threholds = f"{'🔶 ' if not self.absolute_eps.is_default else '' }Absolute E(<{self.absolute_eps.value:.2e}) " + metric_threholds += f"{'🔶 ' if not self.relative_fraction.is_default else '' }Relative E(<{self.relative_fraction.value * 100:.2e}%) " + metric_threholds += f"{'🔶 ' if not self.ulp_threshold.is_default else '' }ULP E(<{self.ulp_threshold.value})" if self.check and self._has_override(): - report.append("❎ Numerical differences with threshold override") + return f"🔶 No numerical differences with threshold override ({metric_threholds})" elif self.check: - report.append("✅ No numerical differences") + return f"✅ No numerical differences ({metric_threholds})" else: - report.append("❌ Numerical failures") + return f"❌ Numerical failures ({metric_threholds})" + def report(self, file_path: Optional[str] = None) -> List[str]: + report = [] + report.append(self.one_line_report()) + if not self.check: found_indices = np.logical_not(self.success).nonzero() # List all errors to terminal and file bad_indices_count = len(found_indices[0]) @@ -296,9 +308,9 @@ def report(self, file_path: Optional[str] = None) -> List[str]: report = [ f"All failures ({bad_indices_count}/{full_count}) ({failures_pct}%),\n", f"Index Computed Reference " - f"{'❎ ' if not self.absolute_eps.is_default else '' }Absolute E(<{self.absolute_eps.value:.2e}) " - f"{'❎ ' if not self.relative_fraction.is_default else '' }Relative E(<{self.relative_fraction.value * 100:.2e}%) " - f"{'❎ ' if not self.ulp_threshold.is_default else '' }ULP E(<{self.ulp_threshold.value})", + f"{'🔶 ' if not self.absolute_eps.is_default else '' }Absolute E(<{self.absolute_eps.value:.2e}) " + f"{'🔶 ' if not self.relative_fraction.is_default else '' }Relative E(<{self.relative_fraction.value * 100:.2e}%) " + f"{'🔶 ' if not self.ulp_threshold.is_default else '' }ULP E(<{self.ulp_threshold.value})", ] # Summary and worst result for iBad in range(bad_indices_count): From 020a2596f8cc028a831e00b202e5c0274c6dbe1c Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Wed, 9 Oct 2024 12:29:37 -0400 Subject: [PATCH 16/78] Remove "mmr" from the keys --- ndsl/stencils/testing/test_translate.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/ndsl/stencils/testing/test_translate.py b/ndsl/stencils/testing/test_translate.py index 6bda31af..b9e212cd 100644 --- a/ndsl/stencils/testing/test_translate.py +++ b/ndsl/stencils/testing/test_translate.py @@ -77,16 +77,14 @@ def process_override(threshold_overrides, testobj, test_name, backend): ) if "multimodal" in match: parsed_mutimodal = match["multimodal"] - if "mmr_absolute_eps" in parsed_mutimodal: - testobj.mmr_absolute_eps = float( 
- parsed_mutimodal["mmr_absolute_eps"] - ) - if "mmr_relative_fraction" in parsed_mutimodal: + if "absolute_epsilon" in parsed_mutimodal: + testobj.mmr_absolute_eps = float(parsed_mutimodal["absolute_eps"]) + if "relative_fraction" in parsed_mutimodal: testobj.mmr_relative_fraction = float( - parsed_mutimodal["mmr_relative_fraction"] + parsed_mutimodal["relative_fraction"] ) - if "mmr_ulp_threshold" in parsed_mutimodal: - testobj.mmr_ulp = float(parsed_mutimodal["mmr_ulp_threshold"]) + if "ulp_threshold" in parsed_mutimodal: + testobj.mmr_ulp = float(parsed_mutimodal["ulp_threshold"]) if "skip_test" in match: testobj.skip_test = bool(match["skip_test"]) From d59918bfcb28926ad022b91224ad1ae573613afb Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Thu, 10 Oct 2024 09:42:09 -0400 Subject: [PATCH 17/78] README in testing --- ndsl/testing/README.md | 111 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 ndsl/testing/README.md diff --git a/ndsl/testing/README.md b/ndsl/testing/README.md new file mode 100644 index 00000000..ecd3be96 --- /dev/null +++ b/ndsl/testing/README.md @@ -0,0 +1,111 @@

# Translate test

## Summary

NDSL exposes a "Translate" test system which allows automatic numerical regression testing against pre-defined sets of NetCDFs.

To write a translate test, derive from `TranslateFortranData2Py`. The system works by matching the name of the Translate class and the data files, e.g.:

- if `TranslateNAME` is the name of the translate class
- then the name of the data should be `NAME-In.nc` for the inputs and `NAME-Out.nc` for the outputs that will be checked.

The test runs via the `pytest` harness and can be triggered with the `pytest` command.
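For instance, a minimal test class might look like the sketch below. The class name `TranslateDoubleK`, the `qvapor` field, and the doubling computation are hypothetical placeholders; the hooks used (`in_vars`, `out_vars`, `make_storage_data_input_vars`, `slice_output`) are the ones `TranslateFortranData2Py` exposes, so treat this as a sketch rather than the exact API:

```python
from ndsl.dsl.stencil import StencilFactory
from ndsl.stencils.testing.translate import TranslateFortranData2Py


class TranslateDoubleK(TranslateFortranData2Py):
    # Matched to DoubleK-In.nc / DoubleK-Out.nc by the naming convention.
    def __init__(self, grid, stencil_factory: StencilFactory):
        super().__init__(grid, stencil_factory)
        # Serialized fields read from DoubleK-In.nc (with optional per-field metadata)
        self.in_vars = {"data_vars": {"qvapor": {}}, "parameters": []}
        # Fields compared against DoubleK-Out.nc
        self.out_vars = {"qvapor": {}}

    def compute(self, inputs):
        # Turn the raw NetCDF arrays into backend storages
        self.make_storage_data_input_vars(inputs)
        inputs["qvapor"][:] *= 2.0  # stand-in for the ported computation
        # Slice the computed storages back to the serialized extent for comparison
        return self.slice_output(inputs)
```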
Options are:

- --insert-assert-print: Print statements that would be substituted for insert_assert(), instead of writing to files
- --insert-assert-fail: Fail tests which include one or more insert_assert() calls
- --backend=BACKEND: Backend to execute the test with, can only be one.
- --which_modules=WHICH_MODULES: Whitelist of modules to run. Only the part after Translate, e.g. in TranslateXYZ it'd be XYZ
- --skip_modules=SKIP_MODULES: Blacklist of modules to not run. Only the part after Translate, e.g. in TranslateXYZ it'd be XYZ
- --which_rank=WHICH_RANK: Restrict test to a single rank
- --data_path=DATA_PATH: Path of the NetCDF inputs and outputs. Naming pattern needs to be XYZ-In and XYZ-Out for a test class named TranslateXYZ
- --threshold_overrides_file=THRESHOLD_OVERRIDES_FILE: Path to a yaml overriding the default error threshold for a custom value.
- --print_failures: Print failure details. Defaults to True.
- --failure_stride=FAILURE_STRIDE: How many indices of failures to print from worst to best. Defaults to 1.
- --grid=GRID: Grid loading mode. "file" looks for "Grid-Info.nc", "compute" does the same but recomputes MetricTerms, "default" creates a simple grid with no metric terms. Defaults to "file".
- --topology=TOPOLOGY: Topology of the grid. "cubed-sphere" means a 6-faced grid, "doubly-periodic" means a 1-tile grid. Defaults to "cubed-sphere".
- --multimodal_metric: Use the multi-modal float metric. Defaults to False.

More options of `pytest` are available when doing `pytest --help`.

## Metrics

There are three states of a test in `pytest`: FAIL, PASS and XFAIL (expected fail). To clear the PASS status, the output data contained in `NAME-Out.nc` is compared to the computed data via the `TranslateNAME` test. Because this system was developed to port Fortran numerics to many targets (mostly C, but also Python, and CPU/GPU), we can't rely on bit-to-bit comparison and have been developing a couple of metrics.

### Legacy metric

The legacy metric was used throughout the development of the dynamical core and microphysics scheme at 64-bit precision. It tries to solve differences over big and small amplitude values with a single formula that goes as follows: $\abs{computed-reference}/reference$ where `reference` has been purged of 0.
NaN values are considered no-pass.
To pass, the metric has to be lower than `1e-14`; any value lower than `1e-18` will be considered pass by default. The pass threshold can be overridden (see below).

### Multi-modal metric

Moving to mixed precision code, the legacy metric didn't give enough flexibility to account for 32-bit precision errors that could accumulate. Another metric was built with the intent of breaking the one-size-fits-all concept and giving back flexibility. The metric is a combination of three differences:

- _Absolute Difference_ ($`\abs{computed-reference}`$): difference to the reference, expected under an epsilon (`1e-10` for 32-bit data, `1e-13` for 64-bit).
- _Relative Difference_ ($`\abs{computed-reference}/reference`$): difference as a fraction of the reference value, expected under 0.0001% by default.
- _ULP Difference_ ($`\abs{computed-reference}/ULP(reference)`$): difference in units of least precision, expected within 1 ULP by default.

A value passes if it carries no NaN and at least one of the three differences is under its threshold, as the sketch below illustrates.
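As a concrete illustration of the pass criterion, this sketch mirrors, for a single 64-bit value, the checks `MultiModalFloatMetric` performs in `ndsl/testing/comparison.py`; the sample numbers are illustrative only:

```python
import numpy as np

# One float64 value checked against the default thresholds listed above.
computed, reference = np.float64(1.0 + 1e-15), np.float64(1.0)
abs_diff = abs(computed - reference)
max_value = max(abs(computed), abs(reference))

ok_absolute = abs_diff < 1e-13                    # 64-bit absolute epsilon
ok_relative = abs_diff < 0.000001 * max_value     # 0.0001% of the reference
ok_ulp = abs_diff / np.spacing(max_value) <= 1.0  # within 1 ULP

# Success requires no NaN and at least one passing mode.
passes = not np.isnan(computed) and (ok_absolute or ok_relative or ok_ulp)
print(passes)  # True: the value sits a few ULPs away, so the ULP check
               # fails, but the absolute and relative checks both pass
```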
The override yaml file should have one of the following formats:

### One near zero value for all variables

```
Stencil_name:
  - backend: <backend>
    max_error: <number>
    near_zero: <number>
    ignore_near_zero_errors:
      - <var1>
      - <var2>
      - ...
```

### Variable specific near zero value

```
Stencil_name:
  - backend: <backend>
    max_error: <number>
    ignore_near_zero_errors:
      <var1>: <number>
      <var2>: <number>
      ...
```

### [optional] Global near zero value for remaining fields

```
Stencil_name:
  - backend: <backend>
    max_error: <number>
    ignore_near_zero_errors:
      <var1>: <number>
      <var2>: <number>
      all_other_near_zero: <global_value>
      ...
```

where fields other than `var1` and `var2` will use `global_value`.

### Multimodal overrides

```
Stencil_name:
  - backend: <backend>
    multimodal:
      absolute_eps: <number>
      relative_fraction: <number>
      ulp_threshold: <number>
```

From 4bdc914465c6cee117266510fd89b9706514e01d Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Thu, 10 Oct 2024 09:47:16 -0400 Subject: [PATCH 18/78] Better Latex (?) --- ndsl/testing/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ndsl/testing/README.md b/ndsl/testing/README.md index ecd3be96..4cdcec0e 100644 --- a/ndsl/testing/README.md +++ b/ndsl/testing/README.md @@ -43,9 +43,9 @@ To pass, the metric has to be lower than `1e-14`; any value lower than `1e-18` will be considered pass by default. Moving to mixed precision code, the legacy metric didn't give enough flexibility to account for 32-bit precision errors that could accumulate. Another metric was built with the intent of breaking the one-size-fits-all concept and giving back flexibility. The metric is a combination of three differences: -- _Absolute Difference_ ($`\abs{computed-reference}`$): difference to the reference, expected under an epsilon (`1e-10` for 32-bit data, `1e-13` for 64-bit). -- _Relative Difference_ ($`\abs{computed-reference}/reference`$): difference as a fraction of the reference value, expected under 0.0001% by default. -- _ULP Difference_ ($`\abs{computed-reference}/ULP(reference)`$): difference in units of least precision, expected within 1 ULP by default. +- _Absolute Difference_ ($`\|computed-reference\|`$): difference to the reference, expected under an epsilon (`1e-10` for 32-bit data, `1e-13` for 64-bit). +- _Relative Difference_ ($`\|computed-reference\|/reference`$): difference as a fraction of the reference value, expected under 0.0001% by default. +- _ULP Difference_ ($`\|computed-reference\|/ULP(reference)`$): difference in units of least precision, expected within 1 ULP by default. From: Florian Deconinck Date: Thu, 10 Oct 2024 09:48:18 -0400 Subject: [PATCH 19/78] Better Latex (?) --- ndsl/testing/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ndsl/testing/README.md b/ndsl/testing/README.md index 4cdcec0e..f6c7a27d 100644 --- a/ndsl/testing/README.md +++ b/ndsl/testing/README.md @@ -35,7 +35,7 @@ There are three states of a test in `pytest`: FAIL, PASS and XFAIL (expected fail) ### Legacy metric -The legacy metric was used throughout the development of the dynamical core and microphysics scheme at 64-bit precision. It tries to solve differences over big and small amplitude values with a single formula that goes as follows: $\abs{computed-reference}/reference$ where `reference` has been purged of 0. +The legacy metric was used throughout the development of the dynamical core and microphysics scheme at 64-bit precision. It tries to solve differences over big and small amplitude values with a single formula that goes as follows: $`\|computed-reference\|/reference`$ where `reference` has been purged of 0. NaN values are considered no-pass. To pass, the metric has to be lower than `1e-14`; any value lower than `1e-18` will be considered pass by default. The pass threshold can be overridden (see below). From e17539d3ea3ae2d14091c2c49c75d1b0186f2944 Mon Sep 17 00:00:00 2001 From: Oliver Elbert Date: Thu, 10 Oct 2024 09:58:37 -0400 Subject: [PATCH 20/78] fixing a typo that breaks bools in translate tests (#80) --- ndsl/stencils/testing/translate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ndsl/stencils/testing/translate.py b/ndsl/stencils/testing/translate.py index 2584deb1..c4b33171 100644 --- a/ndsl/stencils/testing/translate.py +++ b/ndsl/stencils/testing/translate.py @@ -179,7 +179,7 @@ def make_storage_data_input_vars(self, inputs, storage_vars=None, dict_4d=True): if type(inputs_in[p]) in [np.int64, np.int32]: inputs_out[p] = int(inputs_in[p]) elif type(inputs_in[p]) is bool: - inputs_out[p] == inputs_in[p] + inputs_out[p] = inputs_in[p] else: inputs_out[p] = Float(inputs_in[p]) for d, info in storage_vars.items(): From f8105dde9e6684b53080f16f52cf293e2fe3bc9b Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Fri, 11 Oct 2024 12:07:28 -0400 Subject: [PATCH 21/78] Fix summary filename --- ndsl/stencils/testing/test_translate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ndsl/stencils/testing/test_translate.py b/ndsl/stencils/testing/test_translate.py index b9e212cd..fdb2dc32 100644 --- a/ndsl/stencils/testing/test_translate.py +++ b/ndsl/stencils/testing/test_translate.py @@ -410,7 +410,7 @@ def _report_results(savepoint_name: str, results: Dict[str, BaseMetric]) -> None os.makedirs(OUTDIR, exist_ok=True) # Summary - with open(f"{OUTDIR}/summary.log", "w") as f: + with open(f"{OUTDIR}/{savepoint_name}-summary.log", "w") as f: for varname, metric in results.items(): f.write(f"{varname}: {metric.one_line_report()}\n") From 5ac067f9cf9a510202ea1dc0aa80dc18e3162814 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Sun, 13 Oct 2024 15:16:05 -0400 Subject: [PATCH 22/78] Fix report, filename --- ndsl/stencils/testing/test_translate.py | 2 +- ndsl/testing/comparison.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/ndsl/stencils/testing/test_translate.py b/ndsl/stencils/testing/test_translate.py index fdb2dc32..821b3d17 100644 --- a/ndsl/stencils/testing/test_translate.py +++ b/ndsl/stencils/testing/test_translate.py @@ -410,7 +410,7 @@ def _report_results(savepoint_name: str, results: Dict[str, BaseMetric]) -> None os.makedirs(OUTDIR, exist_ok=True) # Summary - with open(f"{OUTDIR}/{savepoint_name}-summary.log", "w") as f: + with open(f"{OUTDIR}/summary-{savepoint_name}.log", "w") as f: for varname, metric in results.items(): f.write(f"{varname}: {metric.one_line_report()}\n") diff --git a/ndsl/testing/comparison.py b/ndsl/testing/comparison.py index 9fd14b58..4b225ae1 100644 --- a/ndsl/testing/comparison.py +++ b/ndsl/testing/comparison.py @@ -290,11 +290,13 @@ def one_line_report(self) -> str: metric_threholds += f"{'🔶 ' if not self.relative_fraction.is_default else '' }Relative E(<{self.relative_fraction.value * 100:.2e}%) " metric_threholds += f"{'🔶 ' if not self.ulp_threshold.is_default else '' }ULP 
E(<{self.ulp_threshold.value})" if self.check and self._has_override(): - return f"🔶 No numerical differences with threshold override ({metric_threholds})" + return f"🔶 No numerical differences with threshold override - metric: {metric_threholds}" elif self.check: - return f"✅ No numerical differences ({metric_threholds})" + return f"✅ No numerical differences - metric: {metric_threholds}" else: - return f"❌ Numerical failures ({metric_threholds})" + failed_indices = len(np.logical_not(self.success).nonzero()[0]) + all_indices = len(self.references.flatten()) + return f"❌ Numerical failures: {failed_indices}/{all_indices} failed - metric: {metric_threholds}" def report(self, file_path: Optional[str] = None) -> List[str]: report = [] From 8870e46240ea3afc6319dad3aa7fdbec744c4de1 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Thu, 17 Oct 2024 12:31:04 -0400 Subject: [PATCH 23/78] Fix choosing right absolute difference for F32 --- ndsl/testing/comparison.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ndsl/testing/comparison.py b/ndsl/testing/comparison.py index 4b225ae1..c302ffe0 100644 --- a/ndsl/testing/comparison.py +++ b/ndsl/testing/comparison.py @@ -220,7 +220,7 @@ def __init__( self.ulp_distance = np.empty_like(self.references) self.ulp_distance_metric = np.empty_like(self.references, dtype=np.bool_) - if self.references.dtype is (np.float32, np.int32): + if self.references.dtype in [np.float32, np.int32]: self.absolute_eps = _Metric(self._f32_absolute_eps.value) else: self.absolute_eps = _Metric(self._f64_absolute_eps.value) From 4d6d96c629005197cfdffc680b918f7cd5fc5b0e Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Wed, 23 Oct 2024 12:28:24 -0400 Subject: [PATCH 24/78] Make robust for NaN value --- ndsl/testing/comparison.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ndsl/testing/comparison.py b/ndsl/testing/comparison.py index c302ffe0..9c59231e 100644 --- a/ndsl/testing/comparison.py +++ b/ndsl/testing/comparison.py @@ -317,11 +317,16 @@ def report(self, file_path: Optional[str] = None) -> List[str]: # Summary and worst result for iBad in range(bad_indices_count): fi = tuple([f[iBad] for f in found_indices]) + ulp_dist = ( + self.ulp_distance[fi] + if np.isnan(self.ulp_distance[fi]) + else int(self.ulp_distance[fi]) + ) report.append( f"{str(fi)} {self.computed[fi]:.16e} {self.references[fi]:.16e} " f"{self.absolute_distance[fi]:.2e} {'✅' if self.absolute_distance_metric[fi] else '❌'} " f"{self.relative_distance[fi] * 100:.2e} {'✅' if self.relative_distance_metric[fi] else '❌'} " - f"{int(self.ulp_distance[fi]):02} {'✅' if self.ulp_distance_metric[fi] else '❌'} " + f"{ulp_dist:02} {'✅' if self.ulp_distance_metric[fi] else '❌'} " ) if file_path: From 001d2bde4226a06be69130ccd08ddccf7caa2d06 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Thu, 24 Oct 2024 13:52:43 -0400 Subject: [PATCH 25/78] Detect when array have different dimensions, if only one dimension, collapse Clean up type infer and log work --- ndsl/stencils/testing/serialbox_to_netcdf.py | 79 +++++++++++++++----- 1 file changed, 61 insertions(+), 18 deletions(-) diff --git a/ndsl/stencils/testing/serialbox_to_netcdf.py b/ndsl/stencils/testing/serialbox_to_netcdf.py index a29139c5..927e8006 100644 --- a/ndsl/stencils/testing/serialbox_to_netcdf.py +++ b/ndsl/stencils/testing/serialbox_to_netcdf.py @@ -62,7 +62,7 @@ def get_serializer(data_path: str, rank: int, data_name: Optional[str] = None): name = data_name else: name = f"Generator_rank{rank}" - 
return serialbox.Serializer(serialbox.OpenModeKind.Read, data_path, name) + return serialbox.Serializer(serialbox.OpenModeKind.Read, data_path, name) # type: ignore def main( @@ -81,22 +81,13 @@ def main( if namelist_filename_out != namelist_filename_in: shutil.copyfile(os.path.join(data_path, "input.nml"), namelist_filename_out) namelist = f90nml.read(namelist_filename_out) - if namelist["fv_core_nml"]["grid_type"] <= 3: - total_ranks = ( - 6 - * namelist["fv_core_nml"]["layout"][0] - * namelist["fv_core_nml"]["layout"][1] - ) + fv_core_nml: Dict[str, Any] = namelist["fv_core_nml"] # type: ignore + if fv_core_nml["grid_type"] <= 3: + total_ranks = 6 * fv_core_nml["layout"][0] * fv_core_nml["layout"][1] else: - total_ranks = ( - namelist["fv_core_nml"]["layout"][0] * namelist["fv_core_nml"]["layout"][1] - ) - nx = int( - (namelist["fv_core_nml"]["npx"] - 1) / (namelist["fv_core_nml"]["layout"][0]) - ) - ny = int( - (namelist["fv_core_nml"]["npy"] - 1) / (namelist["fv_core_nml"]["layout"][1]) - ) + total_ranks = fv_core_nml["layout"][0] * fv_core_nml["layout"][1] + nx = int((fv_core_nml["npx"] - 1) / (fv_core_nml["layout"][0])) + ny = int((fv_core_nml["npy"] - 1) / (fv_core_nml["layout"][1])) # all ranks have the same names, just look at first one serializer_0 = get_serializer(data_path, rank=0, data_name=data_name) @@ -109,6 +100,7 @@ def main( serializer_0.get_savepoint(savepoint_name)[0] ) ) + print(f"Exporting {savepoint_name}") serializer_list = [] for rank in range(total_ranks): serializer = get_serializer(data_path, rank, data_name) @@ -149,7 +141,26 @@ def main( if n_savepoints > 0: encoding = {} for varname in set(names_list).difference(["rank"]): + # Check that all ranks have the same size. If not, aggregate and + # feedback on one rank + colapse_all_ranks = False data_shape = list(rank_list[0][varname][0].shape) + for rank in range(total_ranks): + this_shape = list(rank_list[rank][varname][0].shape) + if data_shape != this_shape: + if len(data_shape) != 1: + raise ValueError( + "Arrays have different dimensions. " + f"E.g. rank 0 is {data_shape} " + f"and rank {rank} is {this_shape} " + ) + else: + print( + f"... different shape for {varname} across ranks, collapsing in on rank." 
+ ) + colapse_all_ranks = True + break + if savepoint_name in [ "FVDynamics-In", "FVDynamics-Out", @@ -173,22 +184,54 @@ def main( data_vars[varname] = get_data( data_shape, total_ranks, n_savepoints, rank_list, varname ) + elif colapse_all_ranks: + data_vars[varname] = get_data_collapse_all_ranks( + total_ranks, n_savepoints, rank_list, varname + ) else: data_vars[varname] = get_data( data_shape, total_ranks, n_savepoints, rank_list, varname ) if len(data_shape) > 2: encoding[varname] = {"zlib": True, "complevel": 1} + dataset = xr.Dataset(data_vars=data_vars) dataset.to_netcdf( os.path.join(output_path, f"{savepoint_name}.nc"), encoding=encoding ) +def get_data_collapse_all_ranks(total_ranks, n_savepoints, output_list, varname): + if total_ranks <= 0: + return xr.DataArray([], dims=[]) + # Build array shape - we hypothesis there's only 1 axis + K_shape = 0 + for rank in range(total_ranks): + assert len(output_list[rank][varname][0].shape) == 1 + K_shape = K_shape + output_list[rank][varname][0].shape[0] + + array = np.full( + [n_savepoints] + [K_shape], + fill_value=np.nan, + dtype=output_list[0][varname][0].dtype, + ) + data = xr.DataArray(array, dims=["savepoint", f"dim_{varname}"]) + last_size = 0 + for rank in range(total_ranks): + for i_savepoint in range(n_savepoints): + this_rank_data = output_list[rank][varname][i_savepoint] + data[i_savepoint, last_size : last_size + this_rank_data.shape[0]] = ( + this_rank_data[:] + ) + last_size += this_rank_data.shape[0] + + return data + + def get_data(data_shape, total_ranks, n_savepoints, output_list, varname): - # Read in dtype if total_ranks <= 0: - return + return xr.DataArray([], dims=[]) + # Read in dtype varname_dtype = output_list[0][varname][0].dtype # Build data array array = np.full( From eaf1d20de94b870c6aee0a168fe82db801b89c4d Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Thu, 24 Oct 2024 14:03:53 -0400 Subject: [PATCH 26/78] Lint --- ndsl/stencils/testing/serialbox_to_netcdf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ndsl/stencils/testing/serialbox_to_netcdf.py b/ndsl/stencils/testing/serialbox_to_netcdf.py index 927e8006..573af712 100644 --- a/ndsl/stencils/testing/serialbox_to_netcdf.py +++ b/ndsl/stencils/testing/serialbox_to_netcdf.py @@ -220,9 +220,9 @@ def get_data_collapse_all_ranks(total_ranks, n_savepoints, output_list, varname) for rank in range(total_ranks): for i_savepoint in range(n_savepoints): this_rank_data = output_list[rank][varname][i_savepoint] - data[i_savepoint, last_size : last_size + this_rank_data.shape[0]] = ( - this_rank_data[:] - ) + data[ + i_savepoint, last_size : last_size + this_rank_data.shape[0] + ] = this_rank_data[:] last_size += this_rank_data.shape[0] return data From 489aab19343edaf17114799bd632b0c83ad307bb Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Thu, 24 Oct 2024 15:48:32 -0400 Subject: [PATCH 27/78] Add rank 0 to the data --- ndsl/stencils/testing/serialbox_to_netcdf.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/ndsl/stencils/testing/serialbox_to_netcdf.py b/ndsl/stencils/testing/serialbox_to_netcdf.py index 573af712..a0e2363c 100644 --- a/ndsl/stencils/testing/serialbox_to_netcdf.py +++ b/ndsl/stencils/testing/serialbox_to_netcdf.py @@ -211,19 +211,18 @@ def get_data_collapse_all_ranks(total_ranks, n_savepoints, output_list, varname) K_shape = K_shape + output_list[rank][varname][0].shape[0] array = np.full( - [n_savepoints] + [K_shape], + [n_savepoints, 1] + [K_shape], fill_value=np.nan, 
dtype=output_list[0][varname][0].dtype, ) - data = xr.DataArray(array, dims=["savepoint", f"dim_{varname}"]) + data = xr.DataArray(array, dims=["savepoint", "rank", f"dim_{varname}"]) last_size = 0 for rank in range(total_ranks): for i_savepoint in range(n_savepoints): - this_rank_data = output_list[rank][varname][i_savepoint] - data[ - i_savepoint, last_size : last_size + this_rank_data.shape[0] - ] = this_rank_data[:] - last_size += this_rank_data.shape[0] + rank_data = output_list[rank][varname][i_savepoint] + rank_data_size = rank_data.shape[0] + data[i_savepoint, 0, last_size : last_size + rank_data_size] = rank_data[:] + last_size += rank_data_size return data From 2af8dfb6f4155c86eadfd084092a86cec7d9c7ea Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Thu, 24 Oct 2024 15:55:56 -0400 Subject: [PATCH 28/78] Check data exists for rank, skip & print if not --- ndsl/stencils/testing/savepoint.py | 9 +++++++++ ndsl/stencils/testing/test_translate.py | 4 ++++ 2 files changed, 13 insertions(+) diff --git a/ndsl/stencils/testing/savepoint.py b/ndsl/stencils/testing/savepoint.py index 77d71917..7571befb 100644 --- a/ndsl/stencils/testing/savepoint.py +++ b/ndsl/stencils/testing/savepoint.py @@ -45,6 +45,15 @@ class SavepointCase: def __str__(self): return f"{self.savepoint_name}-rank={self.rank}-call={self.i_call}" + @property + def exists(self) -> bool: + return ( + xr.open_dataset( + os.path.join(self.data_dir, f"{self.savepoint_name}-In.nc") + ).sizes["rank"] + > self.rank + ) + @property def ds_in(self) -> xr.Dataset: return ( diff --git a/ndsl/stencils/testing/test_translate.py b/ndsl/stencils/testing/test_translate.py index 821b3d17..59c7a51f 100644 --- a/ndsl/stencils/testing/test_translate.py +++ b/ndsl/stencils/testing/test_translate.py @@ -177,6 +177,8 @@ def test_sequential_savepoint( ) if case.testobj.skip_test: return + if not case.exists: + pytest.skip(f"Data at rank {case.rank} does not exists") input_data = dataset_to_dict(case.ds_in) input_names = ( case.testobj.serialnames(case.testobj.in_vars["data_vars"]) @@ -334,6 +336,8 @@ def test_parallel_savepoint( return if (grid == "compute") and not case.testobj.compute_grid_option: pytest.xfail(f"Grid compute option not used for test {case.savepoint_name}") + if case.exists: + pytest.skip(f"Data at rank {case.rank} does not exists") input_data = dataset_to_dict(case.ds_in) # run python version of functionality output = case.testobj.compute_parallel(input_data, communicator) From ce38ce0f59494345e8b7bc3fe207e850e77f033b Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Thu, 31 Oct 2024 12:46:55 -0400 Subject: [PATCH 29/78] Fix bad logic on skip test for parallel --- ndsl/stencils/testing/test_translate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ndsl/stencils/testing/test_translate.py b/ndsl/stencils/testing/test_translate.py index 59c7a51f..f8181c6f 100644 --- a/ndsl/stencils/testing/test_translate.py +++ b/ndsl/stencils/testing/test_translate.py @@ -336,7 +336,7 @@ def test_parallel_savepoint( return if (grid == "compute") and not case.testobj.compute_grid_option: pytest.xfail(f"Grid compute option not used for test {case.savepoint_name}") - if case.exists: + if not case.exists: pytest.skip(f"Data at rank {case.rank} does not exists") input_data = dataset_to_dict(case.ds_in) # run python version of functionality From 05952aa282f1c70ec0017449e8179f7eb055a40d Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 12 Nov 2024 13:49:59 -0500 Subject: [PATCH 30/78] Verbose exported names --- 
ndsl/stencils/testing/serialbox_to_netcdf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ndsl/stencils/testing/serialbox_to_netcdf.py b/ndsl/stencils/testing/serialbox_to_netcdf.py index a0e2363c..d03f295d 100644 --- a/ndsl/stencils/testing/serialbox_to_netcdf.py +++ b/ndsl/stencils/testing/serialbox_to_netcdf.py @@ -145,6 +145,7 @@ def main( # feedback on one rank colapse_all_ranks = False data_shape = list(rank_list[0][varname][0].shape) + print(f" Exporting {varname} - {data_shape}") for rank in range(total_ranks): this_shape = list(rank_list[rank][varname][0].shape) if data_shape != this_shape: From 33474315cbcec46c0a2818b6f37f10305f4e0628 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 12 Nov 2024 16:16:52 -0500 Subject: [PATCH 31/78] Make boilerplate calls more nimble --- examples/NDSL/03_orchestration_basics.ipynb | 4 ++-- ndsl/boilerplate.py | 13 ++++++------- tests/test_boilerplate.py | 8 ++++---- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/examples/NDSL/03_orchestration_basics.ipynb b/examples/NDSL/03_orchestration_basics.ipynb index eea24154..01a77dd8 100644 --- a/examples/NDSL/03_orchestration_basics.ipynb +++ b/examples/NDSL/03_orchestration_basics.ipynb @@ -37,7 +37,7 @@ ")\n", "from ndsl.constants import X_DIM, Y_DIM, Z_DIM\n", "from ndsl.dsl.typing import FloatField, Float\n", - "from ndsl.boilerplate import get_factories_single_tile_orchestrated_cpu" + "from ndsl.boilerplate import get_factories_single_tile_orchestrated" ] }, { @@ -126,7 +126,7 @@ " tile_size = (3, 3, 3)\n", "\n", " # Setup\n", - " stencil_factory, qty_factory = get_factories_single_tile_orchestrated_cpu(\n", + " stencil_factory, qty_factory = get_factories_single_tile_orchestrated(\n", " nx=tile_size[0],\n", " ny=tile_size[1],\n", " nz=tile_size[2],\n", diff --git a/ndsl/boilerplate.py b/ndsl/boilerplate.py index f22e0b9f..dece7ce4 100644 --- a/ndsl/boilerplate.py +++ b/ndsl/boilerplate.py @@ -79,8 +79,8 @@ def _get_factories( return stencil_factory, quantity_factory -def get_factories_single_tile_orchestrated_cpu( - nx, ny, nz, nhalo +def get_factories_single_tile_orchestrated( + nx, ny, nz, nhalo, on_cpu: bool = True ) -> Tuple[StencilFactory, QuantityFactory]: """Build a Stencil & Quantity factory for orchestrated CPU, on a single tile topology.""" return _get_factories( @@ -88,22 +88,21 @@ def get_factories_single_tile_orchestrated_cpu( ny=ny, nz=nz, nhalo=nhalo, - backend="dace:cpu", + backend="dace:cpu" if on_cpu else "dace:gpu", orchestration=DaCeOrchestration.BuildAndRun, topology="tile", ) -def get_factories_single_tile_numpy( - nx, ny, nz, nhalo +def get_factories_single_tile( + nx, ny, nz, nhalo, backend: str = "numpy" ) -> Tuple[StencilFactory, QuantityFactory]: - """Build a Stencil & Quantity factory for Numpy, on a single tile topology.""" return _get_factories( nx=nx, ny=ny, nz=nz, nhalo=nhalo, - backend="numpy", + backend=backend, orchestration=DaCeOrchestration.Python, topology="tile", ) diff --git a/tests/test_boilerplate.py b/tests/test_boilerplate.py index a8de4075..c0211fb3 100644 --- a/tests/test_boilerplate.py +++ b/tests/test_boilerplate.py @@ -35,10 +35,10 @@ def test_boilerplate_import_numpy(): Dev Note: the import inside the function are part of the test. 
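     With the new signature, a non-default backend can be passed explicitly,
     e.g. get_factories_single_tile(nx=5, ny=5, nz=2, nhalo=1, backend="dace:cpu").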
""" - from ndsl.boilerplate import get_factories_single_tile_numpy + from ndsl.boilerplate import get_factories_single_tile # Boilerplate - stencil_factory, quantity_factory = get_factories_single_tile_numpy( + stencil_factory, quantity_factory = get_factories_single_tile( nx=5, ny=5, nz=2, nhalo=1 ) @@ -50,10 +50,10 @@ def test_boilerplate_import_orchestrated_cpu(): Dev Note: the import inside the function are part of the test. """ - from ndsl.boilerplate import get_factories_single_tile_orchestrated_cpu + from ndsl.boilerplate import get_factories_single_tile_orchestrated # Boilerplate - stencil_factory, quantity_factory = get_factories_single_tile_orchestrated_cpu( + stencil_factory, quantity_factory = get_factories_single_tile_orchestrated( nx=5, ny=5, nz=2, nhalo=1 ) From 1eda108c16275f7965a902523c53f4f85aa8ab12 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 12 Nov 2024 16:18:56 -0500 Subject: [PATCH 32/78] New option: `which_savepoint` Better error on bad output data Fix missing integer type check --- ndsl/stencils/testing/conftest.py | 19 ++++++++++++++++++- ndsl/stencils/testing/test_translate.py | 5 ++++- ndsl/stencils/testing/translate.py | 2 +- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/ndsl/stencils/testing/conftest.py b/ndsl/stencils/testing/conftest.py index af5bb6a6..f01d17be 100644 --- a/ndsl/stencils/testing/conftest.py +++ b/ndsl/stencils/testing/conftest.py @@ -47,6 +47,9 @@ def pytest_addoption(parser): parser.addoption( "--which_rank", action="store", help="Restrict test to a single rank" ) + parser.addoption( + "--which_savepoint", action="store", help="Restrict test to a single savepoint" + ) parser.addoption( "--data_path", action="store", @@ -204,6 +207,11 @@ def get_ranks(metafunc, layout): return [int(only_rank)] +def get_savepoint_restriction(metafunc): + svpt = metafunc.config.getoption("which_savepoint") + return int(svpt) if svpt else None + + def get_namelist(namelist_filename): return Namelist.from_f90nml(f90nml.read(namelist_filename)) @@ -226,11 +234,13 @@ def sequential_savepoint_cases(metafunc, data_path, namelist_filename, *, backen namelist = get_namelist(namelist_filename) stencil_config = get_config(backend, None) ranks = get_ranks(metafunc, namelist.layout) + savepoint_to_replay = get_savepoint_restriction(metafunc) grid_mode = metafunc.config.getoption("grid") topology_mode = metafunc.config.getoption("topology") return _savepoint_cases( savepoint_names, ranks, + savepoint_to_replay, stencil_config, namelist, backend, @@ -243,6 +253,7 @@ def sequential_savepoint_cases(metafunc, data_path, namelist_filename, *, backen def _savepoint_cases( savepoint_names, ranks, + savepoint_to_replay, stencil_config, namelist: Namelist, backend: str, @@ -289,7 +300,11 @@ def _savepoint_cases( n_calls = xr.open_dataset( os.path.join(data_path, f"{test_name}-In.nc") ).dims["savepoint"] - for i_call in range(n_calls): + if savepoint_to_replay is not None: + savepoint_iterator = range(savepoint_to_replay, savepoint_to_replay + 1) + else: + savepoint_iterator = range(n_calls) + for i_call in savepoint_iterator: return_list.append( SavepointCase( savepoint_name=test_name, @@ -322,9 +337,11 @@ def parallel_savepoint_cases( stencil_config = get_config(backend, communicator) savepoint_names = get_parallel_savepoint_names(metafunc, data_path) grid_mode = metafunc.config.getoption("grid") + savepoint_to_replay = get_savepoint_restriction(metafunc) return _savepoint_cases( savepoint_names, [mpi_rank], + savepoint_to_replay, stencil_config, 
        namelist,
        backend,
diff --git a/ndsl/stencils/testing/test_translate.py b/ndsl/stencils/testing/test_translate.py
index 821b3d17..a4709f84 100644
--- a/ndsl/stencils/testing/test_translate.py
+++ b/ndsl/stencils/testing/test_translate.py
@@ -200,7 +200,10 @@ def test_sequential_savepoint(
     # Assign metrics and report on terminal any failures
     for varname in case.testobj.serialnames(case.testobj.out_vars):
         ignore_near_zero = case.testobj.ignore_near_zero_errors.get(varname, False)
-        ref_data = all_ref_data[varname]
+        try:
+            ref_data = all_ref_data[varname]
+        except KeyError:
+            raise KeyError(f'Output "{varname}" couldn\'t be found in output data')
         if hasattr(case.testobj, "subset_output"):
             ref_data = case.testobj.subset_output(varname, ref_data)
         with subtests.test(varname=varname):
diff --git a/ndsl/stencils/testing/translate.py b/ndsl/stencils/testing/translate.py
index b9c9ecea..1a48c1e0 100644
--- a/ndsl/stencils/testing/translate.py
+++ b/ndsl/stencils/testing/translate.py
@@ -179,7 +179,7 @@ def make_storage_data_input_vars(self, inputs, storage_vars=None, dict_4d=True):
         if storage_vars is None:
             storage_vars = self.storage_vars()
         for p in self.in_vars["parameters"]:
-            if type(inputs_in[p]) in [np.int64, np.int32]:
+            if type(inputs_in[p]) in [int, np.int64, np.int32]:
                 inputs_out[p] = int(inputs_in[p])
             elif type(inputs_in[p]) is bool:
                 inputs_out[p] = inputs_in[p]

From b1b3ac07a26894386c007b2924673987ea940709 Mon Sep 17 00:00:00 2001
From: Florian Deconinck
Date: Tue, 12 Nov 2024 16:19:15 -0500
Subject: [PATCH 33/78] QOL for mypy/flake8 type hints

---
 ndsl/dsl/typing.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ndsl/dsl/typing.py b/ndsl/dsl/typing.py
index 5eab76ef..69b35c81 100644
--- a/ndsl/dsl/typing.py
+++ b/ndsl/dsl/typing.py
@@ -46,7 +46,6 @@ def global_set_floating_point_precision():
             NotImplementedError(
                 f"{precision_in_bit} bit precision not implemented or tested"
             )
-    return None


 # Default float and int types

From 88129fc4ef51e773826248f13e28c9001bddbd66 Mon Sep 17 00:00:00 2001
From: Florian Deconinck
Date: Mon, 2 Dec 2024 11:19:19 -0500
Subject: [PATCH 34/78] Add SECONDS_PER_DAY as a constant following mixed
 precision standards

---
 ndsl/constants.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ndsl/constants.py b/ndsl/constants.py
index 0f7d55da..ae1ae5a3 100644
--- a/ndsl/constants.py
+++ b/ndsl/constants.py
@@ -2,6 +2,7 @@
 from enum import Enum

 from ndsl.logging import ndsl_log
+from ndsl.dsl.typing import Float


 # The FV3GFS model ships with two sets of constants, one used in the GFS physics
@@ -119,6 +120,7 @@ class ConstantVersions(Enum):
 else:
     raise RuntimeError("Constant selector failed, bad code.")

+SECONDS_PER_DAY = Float(86400.0)
 DZ_MIN = 2.0
 CV_AIR = CP_AIR - RDGAS  # Heat capacity of dry air at constant volume
 RDG = -RDGAS / GRAV

From c436b0b9ba088b40a2d4667078db727937ec3d87 Mon Sep 17 00:00:00 2001
From: Florian Deconinck
Date: Mon, 2 Dec 2024 11:23:20 -0500
Subject: [PATCH 35/78] Lint

---
 ndsl/constants.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ndsl/constants.py b/ndsl/constants.py
index ae1ae5a3..c8be0605 100644
--- a/ndsl/constants.py
+++ b/ndsl/constants.py
@@ -1,8 +1,8 @@
 import os
 from enum import Enum

-from ndsl.logging import ndsl_log
 from ndsl.dsl.typing import Float
+from ndsl.logging import ndsl_log


 # The FV3GFS model ships with two sets of constants, one used in the GFS physics

From 9efb5f425a30bedcdb5edf992d788ac07e88d554 Mon Sep 17 00:00:00 2001
From: Roman Cattaneo <>
Date: Wed, 4 Dec 2024 09:59:12 +0100
Subject:
[PATCH 36/78] Cleanups in dace orchestration Readability improvements in dace orchestration including - early returns - spelling out variable names - fixing typos --- ndsl/dsl/dace/orchestration.py | 280 ++++++++++++++++----------------- 1 file changed, 134 insertions(+), 146 deletions(-) diff --git a/ndsl/dsl/dace/orchestration.py b/ndsl/dsl/dace/orchestration.py index b5417cec..12df4278 100644 --- a/ndsl/dsl/dace/orchestration.py +++ b/ndsl/dsl/dace/orchestration.py @@ -40,12 +40,12 @@ cp = None -def dace_inhibitor(func: Callable): +def dace_inhibitor(func: Callable) -> Callable: """Triggers callback generation wrapping `func` while doing DaCe parsing.""" return func -def _upload_to_device(host_data: List[Any]): +def _upload_to_device(host_data: List[Any]) -> None: """Make sure any ndarrays gets uploaded to the device This will raise an assertion if cupy is not installed. @@ -60,22 +60,11 @@ def _download_results_from_dace( config: DaceConfig, dace_result: Optional[List[Any]], args: List[Any] ): """Move all data from DaCe memory space to GT4Py""" - gt4py_results = None - if dace_result is not None: - if config.is_gpu_backend(): - gt4py_results = [ - gt4py.storage.from_array( - r, - backend=config.get_backend(), - ) - for r in dace_result - ] - else: - gt4py_results = [ - gt4py.storage.from_array(r, backend=config.get_backend()) - for r in dace_result - ] - return gt4py_results + if dace_result is None: + return None + + backend = config.get_backend() + return [gt4py.storage.from_array(result, backend=backend) for result in dace_result] def _to_gpu(sdfg: dace.SDFG): @@ -99,8 +88,8 @@ def _to_gpu(sdfg: dace.SDFG): else: arr.storage = dace.StorageType.GPU_Global - # All maps will be scedule on GPU - for mapentry, state in topmaps: + # All maps will be schedule on GPU + for mapentry, _state in topmaps: mapentry.schedule = dace.ScheduleType.GPU_Device # Deactivate OpenMP sections @@ -108,25 +97,30 @@ def _to_gpu(sdfg: dace.SDFG): sd.openmp_sections = False -def _simplify(sdfg: dace.SDFG, validate=True, verbose=False): +def _simplify( + sdfg: dace.SDFG, + *, + validate: bool = True, + validate_all: bool = False, + verbose: bool = False, +): """Override of sdfg.simplify to skip failing transformation per https://github.com/spcl/dace/issues/1328 """ return SimplifyPass( validate=validate, + validate_all=validate_all, verbose=verbose, skip=["ConstantPropagation"], ).apply_pass(sdfg, {}) def _build_sdfg( - daceprog: DaceProgram, sdfg: dace.SDFG, config: DaceConfig, args, kwargs + program: DaceProgram, sdfg: dace.SDFG, config: DaceConfig, args, kwargs ): """Build the .so out of the SDFG on the top tile ranks only""" - if DEACTIVATE_DISTRIBUTED_DACE_COMPILE: - is_compiling = True - else: - is_compiling = config.do_compile + is_compiling = True if DEACTIVATE_DISTRIBUTED_DACE_COMPILE else config.do_compile + if is_compiling: # Make the transients array persistents if config.is_gpu_backend(): @@ -136,18 +130,18 @@ def _build_sdfg( # Upload args to device _upload_to_device(list(args) + list(kwargs.values())) else: - for sd, _aname, arr in sdfg.arrays_recursive(): + for _sd, _aname, arr in sdfg.arrays_recursive(): if arr.shape == (1,): arr.storage = DaceStorageType.Register make_transients_persistent(sdfg=sdfg, device=DaceDeviceType.CPU) # Build non-constants & non-transients from the sdfg_kwargs - sdfg_kwargs = daceprog._create_sdfg_args(sdfg, args, kwargs) - for k in daceprog.constant_args: + sdfg_kwargs = program._create_sdfg_args(sdfg, args, kwargs) + for k in program.constant_args: if k in 
sdfg_kwargs: del sdfg_kwargs[k] sdfg_kwargs = {k: v for k, v in sdfg_kwargs.items() if v is not None} - for k, tup in daceprog.resolver.closure_arrays.items(): + for k, tup in program.resolver.closure_arrays.items(): if k in sdfg_kwargs and tup[1].transient: del sdfg_kwargs[k] @@ -204,13 +198,16 @@ def _build_sdfg( # On BuildAndRun: all ranks sync, then load the SDFG from # the expected path (made available by build). # We use a "FrozenCompiledSDFG" to minimize re-entry cost at call time + + mode = config.get_orchestrate() # DEV NOTE: we explicitly use MPI.COMM_WORLD here because it is # a true multi-machine sync, outside of our own communicator class. - if config.get_orchestrate() == DaCeOrchestration.Build: + if mode == DaCeOrchestration.Build: MPI.COMM_WORLD.Barrier() # Protect against early exist which kill SLURM jobs ndsl_log.info(f"{DaCeProgress.default_prefix(config)} Build only, exiting.") exit(0) - elif config.get_orchestrate() == DaCeOrchestration.BuildAndRun: + + if mode == DaCeOrchestration.BuildAndRun: if not is_compiling: ndsl_log.info( f"{DaCeProgress.default_prefix(config)} Rank is not compiling." @@ -219,52 +216,46 @@ def _build_sdfg( MPI.COMM_WORLD.Barrier() with DaCeProgress(config, "Loading"): - sdfg_path = get_sdfg_path(daceprog.name, config, override_run_only=True) - csdfg, _ = daceprog.load_precompiled_sdfg(sdfg_path, *args, **kwargs) - config.loaded_precompiled_SDFG[daceprog] = FrozenCompiledSDFG( - daceprog, csdfg, args, kwargs + sdfg_path = get_sdfg_path(program.name, config, override_run_only=True) + compiledSDFG, _ = program.load_precompiled_sdfg(sdfg_path, *args, **kwargs) + config.loaded_precompiled_SDFG[program] = FrozenCompiledSDFG( + program, compiledSDFG, args, kwargs ) - return _call_sdfg(daceprog, sdfg, config, args, kwargs) + return _call_sdfg(program, sdfg, config, args, kwargs) -def _call_sdfg( - daceprog: DaceProgram, sdfg: dace.SDFG, config: DaceConfig, args, kwargs -): +def _call_sdfg(program: DaceProgram, sdfg: dace.SDFG, config: DaceConfig, args, kwargs): """Dispatch the SDFG execution and/or build""" # Pre-compiled SDFG code path does away with any data checks and # cached the marshalling - leading to almost direct C call # DaceProgram performs argument transformation & checks for a cost ~200ms # of overhead - if daceprog in config.loaded_precompiled_SDFG: + if program in config.loaded_precompiled_SDFG: with DaCeProgress(config, "Run"): if config.is_gpu_backend(): _upload_to_device(list(args) + list(kwargs.values())) - res = config.loaded_precompiled_SDFG[daceprog]() + res = config.loaded_precompiled_SDFG[program]() res = _download_results_from_dace( config, res, list(args) + list(kwargs.values()) ) return res + + mode = config.get_orchestrate() + if mode in [DaCeOrchestration.Build, DaCeOrchestration.BuildAndRun]: + ndsl_log.info("Building DaCe orchestration") + return _build_sdfg(program, sdfg, config, args, kwargs) + + if mode == DaCeOrchestration.Run: + # We should never hit this, it should be caught by the + # loaded_precompiled_SDFG check above + raise RuntimeError("Unexpected call - pre-compiled SDFG failed to load") else: - if ( - config.get_orchestrate() == DaCeOrchestration.Build - or config.get_orchestrate() == DaCeOrchestration.BuildAndRun - ): - ndsl_log.info("Building DaCe orchestration") - res = _build_sdfg(daceprog, sdfg, config, args, kwargs) - elif config.get_orchestrate() == DaCeOrchestration.Run: - # We should never hit this, it should be caught by the - # loaded_precompiled_SDFG check above - raise RuntimeError("Unexpected 
call - csdfg didn't get caught") - else: - raise NotImplementedError( - f"Mode {config.get_orchestrate()} unimplemented at call time" - ) - return res + raise NotImplementedError(f"Mode '{mode}' unimplemented at call time") def _parse_sdfg( - daceprog: DaceProgram, + program: DaceProgram, config: DaceConfig, *args, **kwargs, @@ -273,45 +264,46 @@ def _parse_sdfg( Either parses, load a .sdfg or load .so (as a compiled sdfg) Attributes: - daceprog: the DaceProgram carrying reference to the original method/function + program: the DaceProgram carrying reference to the original method/function config: the DaceConfig configuration for this execution """ # Check cache for already loaded SDFG - if daceprog in config.loaded_precompiled_SDFG: - return config.loaded_precompiled_SDFG[daceprog] + if program in config.loaded_precompiled_SDFG: + return config.loaded_precompiled_SDFG[program] # Build expected path - sdfg_path = get_sdfg_path(daceprog.name, config) + sdfg_path = get_sdfg_path(program.name, config) if sdfg_path is None: - if DEACTIVATE_DISTRIBUTED_DACE_COMPILE: - is_compiling = True - else: - is_compiling = config.do_compile + is_compiling = ( + True if DEACTIVATE_DISTRIBUTED_DACE_COMPILE else config.do_compile + ) + if not is_compiling: # We can not parse the SDFG since we will load the proper # compiled SDFG from the compiling rank return None - with DaCeProgress(config, f"Parse code of {daceprog.name} to SDFG"): - sdfg = daceprog.to_sdfg( + + with DaCeProgress(config, f"Parse code of {program.name} to SDFG"): + sdfg = program.to_sdfg( *args, - **daceprog.__sdfg_closure__(), + **program.__sdfg_closure__(), **kwargs, save=False, simplify=False, ) return sdfg - else: - if os.path.isfile(sdfg_path): - with DaCeProgress(config, "Load .sdfg"): - sdfg, _ = daceprog.load_sdfg(sdfg_path, *args, **kwargs) - return sdfg - else: - with DaCeProgress(config, "Load precompiled .sdfg (.so)"): - csdfg, _ = daceprog.load_precompiled_sdfg(sdfg_path, *args, **kwargs) - config.loaded_precompiled_SDFG[daceprog] = FrozenCompiledSDFG( - daceprog, csdfg, args, kwargs - ) - return csdfg + + if os.path.isfile(sdfg_path): + with DaCeProgress(config, "Load .sdfg"): + sdfg, _ = program.load_sdfg(sdfg_path, *args, **kwargs) + return sdfg + + with DaCeProgress(config, "Load precompiled .sdfg (.so)"): + compiledSDFG, _ = program.load_precompiled_sdfg(sdfg_path, *args, **kwargs) + config.loaded_precompiled_SDFG[program] = FrozenCompiledSDFG( + program, compiledSDFG, args, kwargs + ) + return compiledSDFG class _LazyComputepathFunction(SDFGConvertible): @@ -448,7 +440,7 @@ def orchestrate( ): """ Orchestrate a method of an object with DaCe. - The method object is patched in place, replacing the orignal Callable with + The method object is patched in place, replacing the original Callable with a wrapper that will trigger orchestration at call time. If the model configuration doesn't demand orchestration, this won't do anything. @@ -463,72 +455,71 @@ def orchestrate( dace_compiletime_args = [] if config.is_dace_orchestrated(): - if hasattr(obj, method_to_orchestrate): - func = type.__getattribute__(type(obj), method_to_orchestrate) - - # Flag argument as dace.constant - for argument in dace_compiletime_args: - func.__annotations__[argument] = DaceCompiletime - - # Build DaCe orchestrated wrapper - # This is a JIT object, e.g. 
DaCe compilation will happen on call - wrapped = _LazyComputepathMethod(func, config).__get__(obj) - - if method_to_orchestrate == "__call__": - # Grab the function from the type of the child class - # Dev note: we need to use type for dunder call because: - # a = A() - # a() - # resolved to: type(a).__call__(a) - # therefore patching the instance call (e.g a.__call__) is not enough. - # We could patch the type(self), ergo the class itself - # but that would patch _every_ instance of A. - # What we can do is patch the instance.__class__ with a local made class - # in order to keep each instance with it's own patch. - # - # Re: type:ignore - # Mypy is unhappy about dynamic class name and the devs (per github - # issues discussion) is to make a plugin. Too much work -> ignore mypy - - class _(type(obj)): # type: ignore - __qualname__ = f"{type(obj).__qualname__}_patched" - __name__ = f"{type(obj).__name__}_patched" - - def __call__(self, *arg, **kwarg): - return wrapped(*arg, **kwarg) - - def __sdfg__(self, *args, **kwargs): - return wrapped.__sdfg__(*args, **kwargs) - - def __sdfg_closure__(self, reevaluate=None): - return wrapped.__sdfg_closure__(reevaluate) - - def __sdfg_signature__(self): - return wrapped.__sdfg_signature__() - - def closure_resolver( - self, constant_args, given_args, parent_closure=None - ): - return wrapped.closure_resolver( - constant_args, given_args, parent_closure - ) - - # We keep the original class type name to not perturb - # the workflows that uses it to build relevant info (path, hash...) - previous_cls_name = type(obj).__name__ - obj.__class__ = _ - type(obj).__name__ = previous_cls_name - else: - # For regular attribute - we can just patch as usual - setattr(obj, method_to_orchestrate, wrapped) - - else: + if not hasattr(obj, method_to_orchestrate): raise RuntimeError( f"Could not orchestrate, " f"{type(obj).__name__}.{method_to_orchestrate} " "does not exists" ) + func = type.__getattribute__(type(obj), method_to_orchestrate) + + # Flag argument as dace.constant + for argument in dace_compiletime_args: + func.__annotations__[argument] = DaceCompiletime + + # Build DaCe orchestrated wrapper + # This is a JIT object, e.g. DaCe compilation will happen on call + wrapped = _LazyComputepathMethod(func, config).__get__(obj) + + if method_to_orchestrate == "__call__": + # Grab the function from the type of the child class + # Dev note: we need to use type for dunder call because: + # a = A() + # a() + # resolved to: type(a).__call__(a) + # therefore patching the instance call (e.g a.__call__) is not enough. + # We could patch the type(self), ergo the class itself + # but that would patch _every_ instance of A. + # What we can do is patch the instance.__class__ with a local made class + # in order to keep each instance with it's own patch. + # + # Re: type:ignore + # Mypy is unhappy about dynamic class name and the devs (per github + # issues discussion) is to make a plugin. 
Too much work -> ignore mypy + + class _(type(obj)): # type: ignore + __qualname__ = f"{type(obj).__qualname__}_patched" + __name__ = f"{type(obj).__name__}_patched" + + def __call__(self, *arg, **kwarg): + return wrapped(*arg, **kwarg) + + def __sdfg__(self, *args, **kwargs): + return wrapped.__sdfg__(*args, **kwargs) + + def __sdfg_closure__(self, reevaluate=None): + return wrapped.__sdfg_closure__(reevaluate) + + def __sdfg_signature__(self): + return wrapped.__sdfg_signature__() + + def closure_resolver( + self, constant_args, given_args, parent_closure=None + ): + return wrapped.closure_resolver( + constant_args, given_args, parent_closure + ) + + # We keep the original class type name to not perturb + # the workflows that uses it to build relevant info (path, hash...) + previous_cls_name = type(obj).__name__ + obj.__class__ = _ + type(obj).__name__ = previous_cls_name + else: + # For regular attribute - we can just patch as usual + setattr(obj, method_to_orchestrate, wrapped) + def orchestrate_function( config: DaceConfig = None, @@ -553,9 +544,6 @@ def _wrapper(*args, **kwargs): func.__annotations__[argument] = DaceCompiletime return _LazyComputepathFunction(func, config) - if config.is_dace_orchestrated(): - return _wrapper(func) - else: - return func + return _wrapper(func) if config.is_dace_orchestrated() else func return _decorator From c7d6c4f6aa33e80ab9b7765ae374f8058a1d0a45 Mon Sep 17 00:00:00 2001 From: Roman Cattaneo <> Date: Wed, 4 Dec 2024 17:10:49 +0100 Subject: [PATCH 37/78] Rename program -> dace_program --- ndsl/dsl/dace/orchestration.py | 54 ++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/ndsl/dsl/dace/orchestration.py b/ndsl/dsl/dace/orchestration.py index 12df4278..767610c3 100644 --- a/ndsl/dsl/dace/orchestration.py +++ b/ndsl/dsl/dace/orchestration.py @@ -116,7 +116,7 @@ def _simplify( def _build_sdfg( - program: DaceProgram, sdfg: dace.SDFG, config: DaceConfig, args, kwargs + dace_program: DaceProgram, sdfg: dace.SDFG, config: DaceConfig, args, kwargs ): """Build the .so out of the SDFG on the top tile ranks only""" is_compiling = True if DEACTIVATE_DISTRIBUTED_DACE_COMPILE else config.do_compile @@ -136,12 +136,12 @@ def _build_sdfg( make_transients_persistent(sdfg=sdfg, device=DaceDeviceType.CPU) # Build non-constants & non-transients from the sdfg_kwargs - sdfg_kwargs = program._create_sdfg_args(sdfg, args, kwargs) - for k in program.constant_args: + sdfg_kwargs = dace_program._create_sdfg_args(sdfg, args, kwargs) + for k in dace_program.constant_args: if k in sdfg_kwargs: del sdfg_kwargs[k] sdfg_kwargs = {k: v for k, v in sdfg_kwargs.items() if v is not None} - for k, tup in program.resolver.closure_arrays.items(): + for k, tup in dace_program.resolver.closure_arrays.items(): if k in sdfg_kwargs and tup[1].transient: del sdfg_kwargs[k] @@ -216,26 +216,30 @@ def _build_sdfg( MPI.COMM_WORLD.Barrier() with DaCeProgress(config, "Loading"): - sdfg_path = get_sdfg_path(program.name, config, override_run_only=True) - compiledSDFG, _ = program.load_precompiled_sdfg(sdfg_path, *args, **kwargs) - config.loaded_precompiled_SDFG[program] = FrozenCompiledSDFG( - program, compiledSDFG, args, kwargs + sdfg_path = get_sdfg_path(dace_program.name, config, override_run_only=True) + compiledSDFG, _ = dace_program.load_precompiled_sdfg( + sdfg_path, *args, **kwargs + ) + config.loaded_precompiled_SDFG[dace_program] = FrozenCompiledSDFG( + dace_program, compiledSDFG, args, kwargs ) - return _call_sdfg(program, sdfg, config, args, 
kwargs) + return _call_sdfg(dace_program, sdfg, config, args, kwargs) -def _call_sdfg(program: DaceProgram, sdfg: dace.SDFG, config: DaceConfig, args, kwargs): +def _call_sdfg( + dace_program: DaceProgram, sdfg: dace.SDFG, config: DaceConfig, args, kwargs +): """Dispatch the SDFG execution and/or build""" # Pre-compiled SDFG code path does away with any data checks and # cached the marshalling - leading to almost direct C call # DaceProgram performs argument transformation & checks for a cost ~200ms # of overhead - if program in config.loaded_precompiled_SDFG: + if dace_program in config.loaded_precompiled_SDFG: with DaCeProgress(config, "Run"): if config.is_gpu_backend(): _upload_to_device(list(args) + list(kwargs.values())) - res = config.loaded_precompiled_SDFG[program]() + res = config.loaded_precompiled_SDFG[dace_program]() res = _download_results_from_dace( config, res, list(args) + list(kwargs.values()) ) @@ -244,7 +248,7 @@ def _call_sdfg(program: DaceProgram, sdfg: dace.SDFG, config: DaceConfig, args, mode = config.get_orchestrate() if mode in [DaCeOrchestration.Build, DaCeOrchestration.BuildAndRun]: ndsl_log.info("Building DaCe orchestration") - return _build_sdfg(program, sdfg, config, args, kwargs) + return _build_sdfg(dace_program, sdfg, config, args, kwargs) if mode == DaCeOrchestration.Run: # We should never hit this, it should be caught by the @@ -255,7 +259,7 @@ def _call_sdfg(program: DaceProgram, sdfg: dace.SDFG, config: DaceConfig, args, def _parse_sdfg( - program: DaceProgram, + dace_program: DaceProgram, config: DaceConfig, *args, **kwargs, @@ -264,15 +268,15 @@ def _parse_sdfg( Either parses, load a .sdfg or load .so (as a compiled sdfg) Attributes: - program: the DaceProgram carrying reference to the original method/function + dace_program: the DaceProgram carrying reference to the original method/function config: the DaceConfig configuration for this execution """ # Check cache for already loaded SDFG - if program in config.loaded_precompiled_SDFG: - return config.loaded_precompiled_SDFG[program] + if dace_program in config.loaded_precompiled_SDFG: + return config.loaded_precompiled_SDFG[dace_program] # Build expected path - sdfg_path = get_sdfg_path(program.name, config) + sdfg_path = get_sdfg_path(dace_program.name, config) if sdfg_path is None: is_compiling = ( True if DEACTIVATE_DISTRIBUTED_DACE_COMPILE else config.do_compile @@ -283,10 +287,10 @@ def _parse_sdfg( # compiled SDFG from the compiling rank return None - with DaCeProgress(config, f"Parse code of {program.name} to SDFG"): - sdfg = program.to_sdfg( + with DaCeProgress(config, f"Parse code of {dace_program.name} to SDFG"): + sdfg = dace_program.to_sdfg( *args, - **program.__sdfg_closure__(), + **dace_program.__sdfg_closure__(), **kwargs, save=False, simplify=False, @@ -295,13 +299,13 @@ def _parse_sdfg( if os.path.isfile(sdfg_path): with DaCeProgress(config, "Load .sdfg"): - sdfg, _ = program.load_sdfg(sdfg_path, *args, **kwargs) + sdfg, _ = dace_program.load_sdfg(sdfg_path, *args, **kwargs) return sdfg with DaCeProgress(config, "Load precompiled .sdfg (.so)"): - compiledSDFG, _ = program.load_precompiled_sdfg(sdfg_path, *args, **kwargs) - config.loaded_precompiled_SDFG[program] = FrozenCompiledSDFG( - program, compiledSDFG, args, kwargs + compiledSDFG, _ = dace_program.load_precompiled_sdfg(sdfg_path, *args, **kwargs) + config.loaded_precompiled_SDFG[dace_program] = FrozenCompiledSDFG( + dace_program, compiledSDFG, args, kwargs ) return compiledSDFG From ce3ac7e89d6814b16934eaa003120e5639839bc6 Mon 
Sep 17 00:00:00 2001 From: Florian Deconinck Date: Mon, 9 Dec 2024 12:31:20 -0500 Subject: [PATCH 38/78] Make sure all constants adhere to the floating point precision set by the system --- ndsl/constants.py | 111 ++++++++++++++++++++++++---------------------- 1 file changed, 57 insertions(+), 54 deletions(-) diff --git a/ndsl/constants.py b/ndsl/constants.py index c8be0605..f3b51dfe 100644 --- a/ndsl/constants.py +++ b/ndsl/constants.py @@ -1,6 +1,8 @@ import os from enum import Enum +import numpy as np + from ndsl.dsl.typing import Float from ndsl.logging import ndsl_log @@ -76,70 +78,71 @@ class ConstantVersions(Enum): # Physical constants ##################### if CONST_VERSION == ConstantVersions.GEOS: - RADIUS = 6.371e6 - PI = 3.14159265358979323846 - OMEGA = 2.0 * PI / 86164.0 - GRAV = 9.80665 - RGRAV = 1.0 / GRAV - RDGAS = 8314.47 / 28.965 - RVGAS = 8314.47 / 18.015 - HLV = 2.4665e6 - HLF = 3.3370e5 - KAPPA = RDGAS / (3.5 * RDGAS) + RADIUS = Float(6.371e6) + PI_8 = np.float64(3.14159265358979323846) + PI = Float(PI_8) + OMEGA = Float(2.0) * PI / Float(86164.0) + GRAV = Float(9.80665) + RGRAV = Float(1.0) / GRAV + RDGAS = Float(8314.47) / Float(28.965) + RVGAS = Float(8314.47) / Float(18.015) + HLV = Float(2.4665e6) + HLF = Float(3.3370e5) + KAPPA = RDGAS / (Float(3.5) * RDGAS) CP_AIR = RDGAS / KAPPA - TFREEZE = 273.15 - SAT_ADJUST_THRESHOLD = 1.0e-6 + TFREEZE = Float(273.16) + SAT_ADJUST_THRESHOLD = Float(1.0e-6) elif CONST_VERSION == ConstantVersions.GFS: - RADIUS = 6.3712e6 # Radius of the Earth [m] - PI = 3.1415926535897931 - OMEGA = 7.2921e-5 # Rotation of the earth - GRAV = 9.80665 # Acceleration due to gravity [m/s^2].04 - RGRAV = 1.0 / GRAV # Inverse of gravitational acceleration - RDGAS = 287.05 # Gas constant for dry air [J/kg/deg] # 287.04 - RVGAS = 461.50 # Gas constant for water vapor [J/kg/deg] - HLV = 2.5e6 # Latent heat of evaporation [J/kg] - HLF = 3.3358e5 # Latent heat of fusion [J/kg] # 3.34e5 - CP_AIR = 1004.6 + RADIUS = Float(6.3712e6) # Radius of the Earth [m] + PI = Float(3.1415926535897931) + OMEGA = Float(7.2921e-5) # Rotation of the earth + GRAV = Float(9.80665) # Acceleration due to gravity [m/s^2].04 + RGRAV = Float(1.0 / GRAV) # Inverse of gravitational acceleration + RDGAS = Float(287.05) # Gas constant for dry air [J/kg/deg] # 287.04 + RVGAS = Float(461.50) # Gas constant for water vapor [J/kg/deg] + HLV = Float(2.5e6) # Latent heat of evaporation [J/kg] + HLF = Float(3.3358e5) # Latent heat of fusion [J/kg] # 3.34e5 + CP_AIR = Float(1004.6) KAPPA = RDGAS / CP_AIR # Specific heat capacity of dry air at - TFREEZE = 273.15 - SAT_ADJUST_THRESHOLD = 1.0e-8 + TFREEZE = Float(273.15) + SAT_ADJUST_THRESHOLD = Float(1.0e-8) elif CONST_VERSION == ConstantVersions.GFDL: - RADIUS = 6371.0e3 # Radius of the Earth [m] #6371.0e3 - PI = 3.14159265358979323846 # 3.14159265358979323846 - OMEGA = 7.292e-5 # Rotation of the earth # 7.292e-5 - GRAV = 9.80 # Acceleration due to gravity [m/s^2].04 - RGRAV = 1.0 / GRAV # Inverse of gravitational acceleration - RDGAS = 287.04 # Gas constant for dry air [J/kg/deg] # 287.04 - RVGAS = 461.50 # Gas constant for water vapor [J/kg/deg] - HLV = 2.500e6 # Latent heat of evaporation [J/kg] - HLF = 3.34e5 # Latent heat of fusion [J/kg] # 3.34e5 - KAPPA = 2.0 / 7.0 + RADIUS = Float(6371.0e3) # Radius of the Earth [m] #6371.0e3 + PI = Float(3.14159265358979323846) # 3.14159265358979323846 + OMEGA = Float(7.292e-5) # Rotation of the earth # 7.292e-5 + GRAV = Float(9.80) # Acceleration due to gravity [m/s^2].04 + RGRAV = Float(1.0) / GRAV 
# Inverse of gravitational acceleration + RDGAS = Float(287.04) # Gas constant for dry air [J/kg/deg] # 287.04 + RVGAS = Float(461.50) # Gas constant for water vapor [J/kg/deg] + HLV = Float(2.500e6) # Latent heat of evaporation [J/kg] + HLF = Float(3.34e5) # Latent heat of fusion [J/kg] # 3.34e5 + KAPPA = Float(2.0) / Float(7.0) CP_AIR = RDGAS / KAPPA # Specific heat capacity of dry air at - TFREEZE = 273.16 # Freezing temperature of fresh water [K] - SAT_ADJUST_THRESHOLD = 1.0e-8 + TFREEZE = Float(273.16) # Freezing temperature of fresh water [K] + SAT_ADJUST_THRESHOLD = Float(1.0e-8) else: raise RuntimeError("Constant selector failed, bad code.") SECONDS_PER_DAY = Float(86400.0) -DZ_MIN = 2.0 +DZ_MIN = Float(2.0) CV_AIR = CP_AIR - RDGAS # Heat capacity of dry air at constant volume RDG = -RDGAS / GRAV -CNST_0P20 = 0.2 +CNST_0P20 = Float(0.2) K1K = RDGAS / CV_AIR -CNST_0P20 = 0.2 -CV_VAP = 3.0 * RVGAS # Heat capacity of water vapor at constant volume -ZVIR = RVGAS / RDGAS - 1 # con_fvirt in Fortran physics -C_ICE = 1972.0 # Heat capacity of ice at -15 degrees Celsius -C_ICE_0 = 2106.0 # Heat capacity of ice at 0 degrees Celsius -C_LIQ = 4.1855e3 # Heat capacity of water at 15 degrees Celsius -CP_VAP = 4.0 * RVGAS # Heat capacity of water vapor at constant pressure -TICE = 273.16 # Freezing temperature +CNST_0P20 = Float(0.2) +CV_VAP = Float(3.0) * RVGAS # Heat capacity of water vapor at constant volume +ZVIR = RVGAS / RDGAS - Float(1) # con_fvirt in Fortran physics +C_ICE = Float(1972.0) # Heat capacity of ice at -15 degrees Celsius +C_ICE_0 = Float(2106.0) # Heat capacity of ice at 0 degrees Celsius +C_LIQ = Float(4.1855e3) # Heat capacity of water at 15 degrees Celsius +CP_VAP = Float(4.0) * RVGAS # Heat capacity of water vapor at constant pressure +TICE = Float(273.16) # Freezing temperature DC_ICE = C_LIQ - C_ICE # Isobaric heating / cooling DC_VAP = CP_VAP - C_LIQ # Isobaric heating / cooling D2ICE = DC_VAP + DC_ICE # Isobaric heating / cooling LI0 = HLF - DC_ICE * TICE EPS = RDGAS / RVGAS -EPSM1 = EPS - 1.0 +EPSM1 = EPS - Float(1.0) LV0 = ( HLV - DC_VAP * TICE ) # 3.13905782e6, evaporation latent heat coefficient at 0 degrees Kelvin @@ -149,10 +152,10 @@ class ConstantVersions(Enum): LI2 = ( LV0 + LI00 ) # 2.86799816e6, sublimation latent heat coefficient at 0 degrees Kelvin -E00 = 611.21 # Saturation vapor pressure at 0 degrees Celsius (Pa) -PSAT = 610.78 # Saturation vapor pressure at H2O 3pt (Pa) -T_WFR = TICE - 40.0 # homogeneous freezing temperature -TICE0 = TICE - 0.01 -T_MIN = 178.0 # Minimum temperature to freeze-dry all water vapor -T_SAT_MIN = TICE - 160.0 -LAT2 = (HLV + HLF) ** 2 # used in bigg mechanism +E00 = Float(611.21) # Saturation vapor pressure at 0 degrees Celsius (Pa) +PSAT = Float(610.78) # Saturation vapor pressure at H2O 3pt (Pa) +T_WFR = TICE - Float(40.0) # homogeneous freezing temperature +TICE0 = TICE - Float(0.01) +T_MIN = Float(178.0) # Minimum temperature to freeze-dry all water vapor +T_SAT_MIN = TICE - Float(160.0) +LAT2 = np.power((HLV + HLF), 2, dtype=Float) # used in bigg mechanism From 502486f4847bc574bc84730d303097ece2f455d4 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 10 Dec 2024 09:37:44 -0500 Subject: [PATCH 39/78] Move `is_float` to `dsl.typing` --- ndsl/dsl/typing.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ndsl/dsl/typing.py b/ndsl/dsl/typing.py index 69b35c81..513ab1fb 100644 --- a/ndsl/dsl/typing.py +++ b/ndsl/dsl/typing.py @@ -79,3 +79,14 @@ def cast_to_index3d(val: Tuple[int, ...]) -> Index3D: if 
len(val) != 3: raise ValueError(f"expected 3d index, received {val}") return cast(Index3D, val) + + +def is_float(dtype: type): + """Expected floating point type""" + return dtype in [ + Float, + float, + np.float16, + np.float32, + np.float64, + ] From a13776fa2d91f558be4d8ce914022e4e6ccc9c4f Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 10 Dec 2024 09:38:16 -0500 Subject: [PATCH 40/78] Move Quantity to sub-directory + breakout the subcomponent --- ndsl/quantity/__init__.py | 9 + ndsl/quantity/bounds.py | 190 +++++++++++++++++++ ndsl/quantity/metadata.py | 56 ++++++ ndsl/{ => quantity}/quantity.py | 317 ++++---------------------------- 4 files changed, 288 insertions(+), 284 deletions(-) create mode 100644 ndsl/quantity/__init__.py create mode 100644 ndsl/quantity/bounds.py create mode 100644 ndsl/quantity/metadata.py rename ndsl/{ => quantity}/quantity.py (59%) diff --git a/ndsl/quantity/__init__.py b/ndsl/quantity/__init__.py new file mode 100644 index 00000000..c8f68ea5 --- /dev/null +++ b/ndsl/quantity/__init__.py @@ -0,0 +1,9 @@ +from ndsl.quantity.quantity import Quantity +from ndsl.quantity.metadata import QuantityMetadata, QuantityHaloSpec + + +__all__ = [ + "Quantity", + "QuantityMetadata", + "QuantityHaloSpec", +] diff --git a/ndsl/quantity/bounds.py b/ndsl/quantity/bounds.py new file mode 100644 index 00000000..419cff22 --- /dev/null +++ b/ndsl/quantity/bounds.py @@ -0,0 +1,190 @@ +from typing import Sequence, Tuple, Union + +import numpy as np + +import ndsl.constants as constants +from ndsl.comm._boundary_utils import bound_default_slice, shift_boundary_slice_tuple + + +class BoundaryArrayView: + def __init__(self, data, boundary_type, dims, origin, extent): + self._data = data + self._boundary_type = boundary_type + self._dims = dims + self._origin = origin + self._extent = extent + + def __getitem__(self, index): + if len(self._origin) == 0: + if isinstance(index, tuple) and len(index) > 0: + raise IndexError("more than one index given for a zero-dimension array") + elif isinstance(index, slice) and index != slice(None, None, None): + raise IndexError("cannot slice a zero-dimension array") + else: + return self._data # array[()] does not return an ndarray + else: + return self._data[self._get_array_index(index)] + + def __setitem__(self, index, value): + self._data[self._get_array_index(index)] = value + + def _get_array_index(self, index): + if isinstance(index, list): + index = tuple(index) + if not isinstance(index, tuple): + index = (index,) + if len(index) > len(self._dims): + raise IndexError( + f"{len(index)} is too many indices for a " + f"{len(self._dims)}-dimensional quantity" + ) + if len(index) < len(self._dims): + index = index + (slice(None, None),) * (len(self._dims) - len(index)) + return shift_boundary_slice_tuple( + self._dims, self._origin, self._extent, self._boundary_type, index + ) + + def sel(self, **kwargs: Union[slice, int]) -> np.ndarray: + """Convenience method to perform indexing using dimension names + without knowing dimension order. + + Args: + **kwargs: slice/index to retrieve for a given dimension name + + Returns: + view_selection: an ndarray-like selection of the given indices + on `self.view` + """ + return self[tuple(kwargs.get(dim, slice(None, None)) for dim in self._dims)] + + +class BoundedArrayView: + """ + A container of objects which provide indexing relative to corners and edges + of the computational domain for convenience. 
+ + Default start and end indices for all dimensions are modified to be the + start and end of the compute domain. When using edge and corner attributes, it is + recommended to explicitly write start and end offsets to avoid confusion. + + Indexing on the object itself (view[:]) is offset by the origin, and default + start and end indices are modified to be the start and end of the compute domain. + + For corner attributes e.g. `northwest`, modified indexing is done for the two + axes according to the edges which make up the corner. In other words, indexing + is offset relative to the intersection of the two edges which make the corner. + + For `interior`, start indices of the horizontal dimensions are relative to the + origin, and end indices are relative to the origin + extent. For example, + view.interior[0:0, 0:0, :] would retrieve the entire compute domain for an x/y/z + array, while view.interior[-1:1, -1:1, :] would also include one halo point. + """ + + def __init__( + self, array, dims: Sequence[str], origin: Sequence[int], extent: Sequence[int] + ): + self._data = array + self._dims = tuple(dims) + self._origin = tuple(origin) + self._extent = tuple(extent) + self._northwest = BoundaryArrayView( + array, constants.NORTHWEST, dims, origin, extent + ) + self._northeast = BoundaryArrayView( + array, constants.NORTHEAST, dims, origin, extent + ) + self._southwest = BoundaryArrayView( + array, constants.SOUTHWEST, dims, origin, extent + ) + self._southeast = BoundaryArrayView( + array, constants.SOUTHEAST, dims, origin, extent + ) + self._interior = BoundaryArrayView( + array, constants.INTERIOR, dims, origin, extent + ) + + @property + def origin(self) -> Tuple[int, ...]: + """the start of the computational domain""" + return self._origin + + @property + def extent(self) -> Tuple[int, ...]: + """the shape of the computational domain""" + return self._extent + + def __getitem__(self, index): + if len(self.origin) == 0: + if isinstance(index, tuple) and len(index) > 0: + raise IndexError("more than one index given for a zero-dimension array") + elif isinstance(index, slice) and index != slice(None, None, None): + raise IndexError("cannot slice a zero-dimension array") + else: + return self._data # array[()] does not return an ndarray + else: + return self._data[self._get_compute_index(index)] + + def __setitem__(self, index, value): + self._data[self._get_compute_index(index)] = value + + def _get_compute_index(self, index): + if not isinstance(index, (tuple, list)): + index = (index,) + if len(index) > len(self._dims): + raise IndexError( + f"{len(index)} is too many indices for a " + f"{len(self._dims)}-dimensional quantity" + ) + index = _fill_index(index, len(self._data.shape)) + shifted_index = [] + for entry, origin, extent in zip(index, self.origin, self.extent): + if isinstance(entry, slice): + shifted_slice = _shift_slice(entry, origin, extent) + shifted_index.append( + bound_default_slice(shifted_slice, origin, origin + extent) + ) + elif entry is None: + shifted_index.append(entry) + else: + shifted_index.append(entry + origin) + return tuple(shifted_index) + + @property + def northwest(self) -> BoundaryArrayView: + return self._northwest + + @property + def northeast(self) -> BoundaryArrayView: + return self._northeast + + @property + def southwest(self) -> BoundaryArrayView: + return self._southwest + + @property + def southeast(self) -> BoundaryArrayView: + return self._southeast + + @property + def interior(self) -> BoundaryArrayView: + return self._interior + + +def 
_fill_index(index, length): + return tuple(index) + (slice(None, None, None),) * (length - len(index)) + + +def _shift_slice(slice_in, shift, extent): + start = _shift_index(slice_in.start, shift, extent) + stop = _shift_index(slice_in.stop, shift, extent) + return slice(start, stop, slice_in.step) + + +def _shift_index(current_value, shift, extent): + if current_value is None: + new_value = None + else: + new_value = current_value + shift + if new_value < 0: + new_value = extent + new_value + return new_value diff --git a/ndsl/quantity/metadata.py b/ndsl/quantity/metadata.py new file mode 100644 index 00000000..0051f6b8 --- /dev/null +++ b/ndsl/quantity/metadata.py @@ -0,0 +1,56 @@ +import dataclasses +from typing import Any, Dict, Tuple, Union + +import numpy as np +from ndsl.optional_imports import cupy +from ndsl.types import NumpyModule + + +@dataclasses.dataclass +class QuantityMetadata: + origin: Tuple[int, ...] + "the start of the computational domain" + extent: Tuple[int, ...] + "the shape of the computational domain" + dims: Tuple[str, ...] + "names of each dimension" + units: str + "units of the quantity" + data_type: type + "ndarray-like type used to store the data" + dtype: type + "dtype of the data in the ndarray-like object" + gt4py_backend: Union[str, None] = None + "backend to use for gt4py storages" + + @property + def dim_lengths(self) -> Dict[str, int]: + """mapping of dimension names to their lengths""" + return dict(zip(self.dims, self.extent)) + + @property + def np(self) -> NumpyModule: + """numpy-like module used to interact with the data""" + if issubclass(self.data_type, cupy.ndarray): + return cupy + elif issubclass(self.data_type, np.ndarray): + return np + else: + raise TypeError( + f"quantity underlying data is of unexpected type {self.data_type}" + ) + + +@dataclasses.dataclass +class QuantityHaloSpec: + """Describe the memory to be exchanged, including size of the halo.""" + + n_points: int + strides: Tuple[int] + itemsize: int + shape: Tuple[int] + origin: Tuple[int, ...] + extent: Tuple[int, ...] + dims: Tuple[str, ...] + numpy_module: NumpyModule + dtype: Any diff --git a/ndsl/quantity.py b/ndsl/quantity/quantity.py similarity index 59% rename from ndsl/quantity.py rename to ndsl/quantity/quantity.py index b95a9aad..00e66f0a 100644 --- a/ndsl/quantity.py +++ b/ndsl/quantity/quantity.py @@ -1,273 +1,16 @@ -import dataclasses import warnings -from typing import Any, Dict, Iterable, Optional, Sequence, Tuple, Union, cast +from typing import Any, Iterable, Optional, Sequence, Tuple, Union, cast import matplotlib.pyplot as plt import numpy as np import ndsl.constants as constants -from ndsl.comm._boundary_utils import bound_default_slice, shift_boundary_slice_tuple -from ndsl.dsl.typing import Float +from ndsl.dsl.typing import Float, is_float from ndsl.optional_imports import cupy, dace, gt4py from ndsl.optional_imports import xarray as xr from ndsl.types import NumpyModule - - -if cupy is None: - import numpy as cupy - -__all__ = ["Quantity", "QuantityMetadata"] - - -@dataclasses.dataclass -class QuantityMetadata: - origin: Tuple[int, ...] - "the start of the computational domain" - extent: Tuple[int, ...] - "the shape of the computational domain" - dims: Tuple[str, ...] 
- "names of each dimension" - units: str - "units of the quantity" - data_type: type - "ndarray-like type used to store the data" - dtype: type - "dtype of the data in the ndarray-like object" - gt4py_backend: Union[str, None] = None - "backend to use for gt4py storages" - - @property - def dim_lengths(self) -> Dict[str, int]: - """mapping of dimension names to their lengths""" - return dict(zip(self.dims, self.extent)) - - @property - def np(self) -> NumpyModule: - """numpy-like module used to interact with the data""" - if issubclass(self.data_type, cupy.ndarray): - return cupy - elif issubclass(self.data_type, np.ndarray): - return np - else: - raise TypeError( - f"quantity underlying data is of unexpected type {self.data_type}" - ) - - -@dataclasses.dataclass -class QuantityHaloSpec: - """Describe the memory to be exchanged, including size of the halo.""" - - n_points: int - strides: Tuple[int] - itemsize: int - shape: Tuple[int] - origin: Tuple[int, ...] - extent: Tuple[int, ...] - dims: Tuple[str, ...] - numpy_module: NumpyModule - dtype: Any - - -class BoundaryArrayView: - def __init__(self, data, boundary_type, dims, origin, extent): - self._data = data - self._boundary_type = boundary_type - self._dims = dims - self._origin = origin - self._extent = extent - - def __getitem__(self, index): - if len(self._origin) == 0: - if isinstance(index, tuple) and len(index) > 0: - raise IndexError("more than one index given for a zero-dimension array") - elif isinstance(index, slice) and index != slice(None, None, None): - raise IndexError("cannot slice a zero-dimension array") - else: - return self._data # array[()] does not return an ndarray - else: - return self._data[self._get_array_index(index)] - - def __setitem__(self, index, value): - self._data[self._get_array_index(index)] = value - - def _get_array_index(self, index): - if isinstance(index, list): - index = tuple(index) - if not isinstance(index, tuple): - index = (index,) - if len(index) > len(self._dims): - raise IndexError( - f"{len(index)} is too many indices for a " - f"{len(self._dims)}-dimensional quantity" - ) - if len(index) < len(self._dims): - index = index + (slice(None, None),) * (len(self._dims) - len(index)) - return shift_boundary_slice_tuple( - self._dims, self._origin, self._extent, self._boundary_type, index - ) - - def sel(self, **kwargs: Union[slice, int]) -> np.ndarray: - """Convenience method to perform indexing using dimension names - without knowing dimension order. - - Args: - **kwargs: slice/index to retrieve for a given dimension name - - Returns: - view_selection: an ndarray-like selection of the given indices - on `self.view` - """ - return self[tuple(kwargs.get(dim, slice(None, None)) for dim in self._dims)] - - -class BoundedArrayView: - """ - A container of objects which provide indexing relative to corners and edges - of the computational domain for convenience. - - Default start and end indices for all dimensions are modified to be the - start and end of the compute domain. When using edge and corner attributes, it is - recommended to explicitly write start and end offsets to avoid confusion. - - Indexing on the object itself (view[:]) is offset by the origin, and default - start and end indices are modified to be the start and end of the compute domain. - - For corner attributes e.g. `northwest`, modified indexing is done for the two - axes according to the edges which make up the corner. In other words, indexing - is offset relative to the intersection of the two edges which make the corner. 
- - For `interior`, start indices of the horizontal dimensions are relative to the - origin, and end indices are relative to the origin + extent. For example, - view.interior[0:0, 0:0, :] would retrieve the entire compute domain for an x/y/z - array, while view.interior[-1:1, -1:1, :] would also include one halo point. - """ - - def __init__( - self, array, dims: Sequence[str], origin: Sequence[int], extent: Sequence[int] - ): - self._data = array - self._dims = tuple(dims) - self._origin = tuple(origin) - self._extent = tuple(extent) - self._northwest = BoundaryArrayView( - array, constants.NORTHWEST, dims, origin, extent - ) - self._northeast = BoundaryArrayView( - array, constants.NORTHEAST, dims, origin, extent - ) - self._southwest = BoundaryArrayView( - array, constants.SOUTHWEST, dims, origin, extent - ) - self._southeast = BoundaryArrayView( - array, constants.SOUTHEAST, dims, origin, extent - ) - self._interior = BoundaryArrayView( - array, constants.INTERIOR, dims, origin, extent - ) - - @property - def origin(self) -> Tuple[int, ...]: - """the start of the computational domain""" - return self._origin - - @property - def extent(self) -> Tuple[int, ...]: - """the shape of the computational domain""" - return self._extent - - def __getitem__(self, index): - if len(self.origin) == 0: - if isinstance(index, tuple) and len(index) > 0: - raise IndexError("more than one index given for a zero-dimension array") - elif isinstance(index, slice) and index != slice(None, None, None): - raise IndexError("cannot slice a zero-dimension array") - else: - return self._data # array[()] does not return an ndarray - else: - return self._data[self._get_compute_index(index)] - - def __setitem__(self, index, value): - self._data[self._get_compute_index(index)] = value - - def _get_compute_index(self, index): - if not isinstance(index, (tuple, list)): - index = (index,) - if len(index) > len(self._dims): - raise IndexError( - f"{len(index)} is too many indices for a " - f"{len(self._dims)}-dimensional quantity" - ) - index = fill_index(index, len(self._data.shape)) - shifted_index = [] - for entry, origin, extent in zip(index, self.origin, self.extent): - if isinstance(entry, slice): - shifted_slice = shift_slice(entry, origin, extent) - shifted_index.append( - bound_default_slice(shifted_slice, origin, origin + extent) - ) - elif entry is None: - shifted_index.append(entry) - else: - shifted_index.append(entry + origin) - return tuple(shifted_index) - - @property - def northwest(self) -> BoundaryArrayView: - return self._northwest - - @property - def northeast(self) -> BoundaryArrayView: - return self._northeast - - @property - def southwest(self) -> BoundaryArrayView: - return self._southwest - - @property - def southeast(self) -> BoundaryArrayView: - return self._southeast - - @property - def interior(self) -> BoundaryArrayView: - return self._interior - - -def ensure_int_tuple(arg, arg_name): - return_list = [] - for item in arg: - try: - return_list.append(int(item)) - except ValueError: - raise TypeError( - f"tuple arg {arg_name}={arg} contains item {item} of " - f"unexpected type {type(item)}" - ) - return tuple(return_list) - - -def _validate_quantity_property_lengths(shape, dims, origin, extent): - n_dims = len(shape) - for var, desc in ( - (dims, "dimension names"), - (origin, "origins"), - (extent, "extents"), - ): - if len(var) != n_dims: - raise ValueError( - f"received {len(var)} {desc} for {n_dims} dimensions: {var}" - ) - - -def _is_float(dtype): - """Expected floating point type for 
Pace""" - return ( - dtype == Float - or dtype == float - or dtype == np.float32 - or dtype == np.float64 - or dtype == np.float16 - ) +from ndsl.quantity.bounds import BoundedArrayView +from ndsl.quantity.metadata import QuantityMetadata, QuantityHaloSpec class Quantity: @@ -302,7 +45,7 @@ def __init__( if ( not allow_mismatch_float_precision - and _is_float(data.dtype) + and is_float(data.dtype) and data.dtype != Float ): raise ValueError( @@ -362,8 +105,8 @@ def __init__( _validate_quantity_property_lengths(data.shape, dims, origin, extent) self._metadata = QuantityMetadata( - origin=ensure_int_tuple(origin, "origin"), - extent=ensure_int_tuple(extent, "extent"), + origin=_ensure_int_tuple(origin, "origin"), + extent=_ensure_int_tuple(extent, "extent"), dims=tuple(dims), units=units, data_type=type(self._data), @@ -584,10 +327,10 @@ def transpose( transpose_order = [self.dims.index(dim) for dim in target_dims] transposed = Quantity( self.np.transpose(self.data, transpose_order), # type: ignore[attr-defined] - dims=transpose_sequence(self.dims, transpose_order), + dims=_transpose_sequence(self.dims, transpose_order), units=self.units, - origin=transpose_sequence(self.origin, transpose_order), - extent=transpose_sequence(self.extent, transpose_order), + origin=_transpose_sequence(self.origin, transpose_order), + extent=_transpose_sequence(self.extent, transpose_order), gt4py_backend=self.gt4py_backend, allow_mismatch_float_precision=allow_mismatch_float_precision, ) @@ -611,7 +354,7 @@ def plot_k_level(self, k_index=0): plt.show() -def transpose_sequence(sequence, order): +def _transpose_sequence(sequence, order): return sequence.__class__(sequence[i] for i in order) @@ -641,21 +384,27 @@ def _collapse_dims(target_dims, dims): return return_list -def fill_index(index, length): - return tuple(index) + (slice(None, None, None),) * (length - len(index)) - - -def shift_slice(slice_in, shift, extent): - start = shift_index(slice_in.start, shift, extent) - stop = shift_index(slice_in.stop, shift, extent) - return slice(start, stop, slice_in.step) +def _validate_quantity_property_lengths(shape, dims, origin, extent): + n_dims = len(shape) + for var, desc in ( + (dims, "dimension names"), + (origin, "origins"), + (extent, "extents"), + ): + if len(var) != n_dims: + raise ValueError( + f"received {len(var)} {desc} for {n_dims} dimensions: {var}" + ) -def shift_index(current_value, shift, extent): - if current_value is None: - new_value = None - else: - new_value = current_value + shift - if new_value < 0: - new_value = extent + new_value - return new_value +def _ensure_int_tuple(arg, arg_name): + return_list = [] + for item in arg: + try: + return_list.append(int(item)) + except ValueError: + raise TypeError( + f"tuple arg {arg_name}={arg} contains item {item} of " + f"unexpected type {type(item)}" + ) + return tuple(return_list) From 937417beda47d9d941a7872792d302c33c125715 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 10 Dec 2024 09:38:40 -0500 Subject: [PATCH 41/78] Fix tests --- tests/quantity/test_quantity.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/quantity/test_quantity.py b/tests/quantity/test_quantity.py index a6de628b..61e92025 100644 --- a/tests/quantity/test_quantity.py +++ b/tests/quantity/test_quantity.py @@ -1,8 +1,8 @@ import numpy as np import pytest -import ndsl.quantity as qty from ndsl import Quantity +from ndsl.quantity.bounds import _shift_slice try: @@ -229,7 +229,7 @@ def test_compute_view_edit_all_domain(quantity, n_halo, 
n_dims, extent_1d): ], ) def test_shift_slice(slice_in, shift, extent, slice_out): - result = qty.shift_slice(slice_in, shift, extent) + result = _shift_slice(slice_in, shift, extent) assert result == slice_out From 45c318050ef652e3f80775c1b1376a424bc1294f Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 10 Dec 2024 09:51:12 -0500 Subject: [PATCH 42/78] Lint --- ndsl/quantity/__init__.py | 2 +- ndsl/quantity/metadata.py | 1 + ndsl/quantity/quantity.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ndsl/quantity/__init__.py b/ndsl/quantity/__init__.py index c8f68ea5..fce0cf63 100644 --- a/ndsl/quantity/__init__.py +++ b/ndsl/quantity/__init__.py @@ -1,5 +1,5 @@ +from ndsl.quantity.metadata import QuantityHaloSpec, QuantityMetadata from ndsl.quantity.quantity import Quantity -from ndsl.quantity.metadata import QuantityMetadata, QuantityHaloSpec __all__ = [ diff --git a/ndsl/quantity/metadata.py b/ndsl/quantity/metadata.py index 0051f6b8..611cf909 100644 --- a/ndsl/quantity/metadata.py +++ b/ndsl/quantity/metadata.py @@ -2,6 +2,7 @@ from typing import Any, Dict, Tuple, Union import numpy as np + from ndsl.optional_imports import cupy from ndsl.types import NumpyModule diff --git a/ndsl/quantity/quantity.py b/ndsl/quantity/quantity.py index 00e66f0a..a7d89dfd 100644 --- a/ndsl/quantity/quantity.py +++ b/ndsl/quantity/quantity.py @@ -8,9 +8,9 @@ from ndsl.dsl.typing import Float, is_float from ndsl.optional_imports import cupy, dace, gt4py from ndsl.optional_imports import xarray as xr -from ndsl.types import NumpyModule from ndsl.quantity.bounds import BoundedArrayView -from ndsl.quantity.metadata import QuantityMetadata, QuantityHaloSpec +from ndsl.quantity.metadata import QuantityHaloSpec, QuantityMetadata +from ndsl.types import NumpyModule class Quantity: From 0330cdbea8c65760bcb4f3541556b82b30bc6623 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 10 Dec 2024 09:56:01 -0500 Subject: [PATCH 43/78] Remove `cp.ndarray` since cupy is optional --- ndsl/quantity/quantity.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ndsl/quantity/quantity.py b/ndsl/quantity/quantity.py index a7d89dfd..35c232ef 100644 --- a/ndsl/quantity/quantity.py +++ b/ndsl/quantity/quantity.py @@ -231,7 +231,7 @@ def view(self) -> BoundedArrayView: return self._compute_domain_view @property - def data(self) -> Union[np.ndarray, cupy.ndarray]: + def data(self) -> np.ndarray: """the underlying array of data""" return self._data From 18b2f3f8ed38b456e28e003b0a9735d498ee5599 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 10 Dec 2024 11:00:56 -0500 Subject: [PATCH 44/78] Restore workaround for optional cupy --- ndsl/quantity/quantity.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ndsl/quantity/quantity.py b/ndsl/quantity/quantity.py index 35c232ef..5e8b3176 100644 --- a/ndsl/quantity/quantity.py +++ b/ndsl/quantity/quantity.py @@ -13,6 +13,10 @@ from ndsl.types import NumpyModule +if cupy is None: + import numpy as cupy + + class Quantity: """ Data container for physical quantities. 
@@ -231,7 +235,7 @@ def view(self) -> BoundedArrayView:
         return self._compute_domain_view
 
     @property
-    def data(self) -> np.ndarray:
+    def data(self) -> Union[np.ndarray, cupy.ndarray]:
         """the underlying array of data"""
         return self._data
 

From 7076740d1e11608b7dcf80a25c810cea08532211 Mon Sep 17 00:00:00 2001
From: Florian Deconinck
Date: Tue, 10 Dec 2024 11:02:26 -0500
Subject: [PATCH 45/78] "GFS" -> "UFS"

---
 ndsl/constants.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/ndsl/constants.py b/ndsl/constants.py
index f3b51dfe..62482145 100644
--- a/ndsl/constants.py
+++ b/ndsl/constants.py
@@ -7,16 +7,16 @@
 from ndsl.logging import ndsl_log
 
 
-# The FV3GFS model ships with two sets of constants, one used in the GFS physics
+# The FV3GFS model ships with two sets of constants, one used in the UFS physics
 # package and the other used for the Dycore. Their differences are small but significant.
 # In addition, GSFC's GEOS model has its own variables
 class ConstantVersions(Enum):
     GFDL = "GFDL"  # NOAA's FV3 dynamical core constants (original port)
-    GFS = "GFS"  # Constant as defined in NOAA GFS
+    UFS = "UFS"  # Constant as defined in NOAA UFS
     GEOS = "GEOS"  # Constant as defined in GEOS v13
 
 
-CONST_VERSION_AS_STR = os.environ.get("PACE_CONSTANTS", "GFS")
+CONST_VERSION_AS_STR = os.environ.get("PACE_CONSTANTS", "UFS")
 
 try:
     CONST_VERSION = ConstantVersions[CONST_VERSION_AS_STR]
@@ -69,7 +69,7 @@ class ConstantVersions(Enum):
 if CONST_VERSION == ConstantVersions.GEOS:
     # 'qlcd' is exchanged in GEOS
     NQ = 9
-elif CONST_VERSION == ConstantVersions.GFS or CONST_VERSION == ConstantVersions.GFDL:
+elif CONST_VERSION == ConstantVersions.UFS or CONST_VERSION == ConstantVersions.GFDL:
     NQ = 8
 else:
     raise RuntimeError("Constant selector failed, bad code.")
@@ -92,7 +92,7 @@ class ConstantVersions(Enum):
     CP_AIR = RDGAS / KAPPA
     TFREEZE = Float(273.16)
     SAT_ADJUST_THRESHOLD = Float(1.0e-6)
-elif CONST_VERSION == ConstantVersions.GFS:
+elif CONST_VERSION == ConstantVersions.UFS:
     RADIUS = Float(6.3712e6)  # Radius of the Earth [m]
     PI = Float(3.1415926535897931)
     OMEGA = Float(7.2921e-5)  # Rotation of the earth

From a8a7c85accfac3ce62c992984008fe27b450756d Mon Sep 17 00:00:00 2001
From: Florian Deconinck
Date: Tue, 10 Dec 2024 11:08:00 -0500
Subject: [PATCH 46/78] Cupy trick for metadata

---
 ndsl/quantity/metadata.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ndsl/quantity/metadata.py b/ndsl/quantity/metadata.py
index 611cf909..e8591394 100644
--- a/ndsl/quantity/metadata.py
+++ b/ndsl/quantity/metadata.py
@@ -7,6 +7,10 @@
 from ndsl.types import NumpyModule
 
 
+if cupy is None:
+    import numpy as cupy
+
+
 @dataclasses.dataclass
 class QuantityMetadata:
     origin: Tuple[int, ...]
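The `import numpy as cupy` fallback in patches 44 and 46 exists so that annotations
such as `Union[np.ndarray, cupy.ndarray]` still evaluate at import time when CuPy is
not installed. A minimal standalone sketch of the pattern, with the try/except standing
in for `ndsl.optional_imports`:

    from typing import Union

    import numpy as np

    try:
        import cupy
    except ImportError:
        cupy = None

    if cupy is None:
        # On CPU-only installs, alias numpy as cupy so that annotations
        # mentioning cupy.ndarray resolve to np.ndarray instead of failing.
        import numpy as cupy

    def data(arr) -> Union[np.ndarray, cupy.ndarray]:
        # With the alias active this is simply Union[np.ndarray, np.ndarray].
        return arr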
From a7ee68fb01ca9924aeb7f1c12d3468732be638b2 Mon Sep 17 00:00:00 2001
From: Florian Deconinck
Date: Wed, 11 Dec 2024 08:59:33 -0500
Subject: [PATCH 47/78] Add comments for constant explanation

---
 ndsl/constants.py | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/ndsl/constants.py b/ndsl/constants.py
index 62482145..b7e91b18 100644
--- a/ndsl/constants.py
+++ b/ndsl/constants.py
@@ -12,8 +12,8 @@
 # In addition, GSFC's GEOS model has its own variables
 class ConstantVersions(Enum):
     GFDL = "GFDL"  # NOAA's FV3 dynamical core constants (original port)
-    UFS = "UFS"  # Constant as defined in NOAA UFS
-    GEOS = "GEOS"  # Constant as defined in GEOS v13
+    UFS = "UFS"  # Constant as defined in NOAA's UFS
+    GEOS = "GEOS"  # Constant as defined in GEOS v11.4.2
 
 
 CONST_VERSION_AS_STR = os.environ.get("PACE_CONSTANTS", "UFS")
 
 try:
     CONST_VERSION = ConstantVersions[CONST_VERSION_AS_STR]
@@ -78,19 +78,21 @@ class ConstantVersions(Enum):
 # Physical constants
 #####################
 if CONST_VERSION == ConstantVersions.GEOS:
-    RADIUS = Float(6.371e6)
+    RADIUS = Float(6.371e6)  # Radius of the Earth [m]
     PI_8 = np.float64(3.14159265358979323846)
     PI = Float(PI_8)
-    OMEGA = Float(2.0) * PI / Float(86164.0)
-    GRAV = Float(9.80665)
-    RGRAV = Float(1.0) / GRAV
-    RDGAS = Float(8314.47) / Float(28.965)
-    RVGAS = Float(8314.47) / Float(18.015)
-    HLV = Float(2.4665e6)
-    HLF = Float(3.3370e5)
-    KAPPA = RDGAS / (Float(3.5) * RDGAS)
+    OMEGA = Float(2.0) * PI / Float(86164.0)  # Rotation of the earth
+    GRAV = Float(9.80665)  # Acceleration due to gravity [m/s^2]
+    RGRAV = Float(1.0) / GRAV  # Inverse of gravitational acceleration
+    RDGAS = Float(8314.47) / Float(
+        28.965
+    )  # Gas constant for dry air [J/kg/deg] ~287.04
+    RVGAS = Float(8314.47) / Float(18.015)  # Gas constant for water vapor [J/kg/deg]
+    HLV = Float(2.4665e6)  # Latent heat of evaporation [J/kg]
+    HLF = Float(3.3370e5)  # Latent heat of fusion [J/kg] ~3.34e5
+    KAPPA = RDGAS / (Float(3.5) * RDGAS)  # Specific heat capacity of dry air at constant pressure
     CP_AIR = RDGAS / KAPPA
-    TFREEZE = Float(273.16)
+    TFREEZE = Float(273.16)  # Freezing temperature of fresh water [K]
     SAT_ADJUST_THRESHOLD = Float(1.0e-6)
 elif CONST_VERSION == ConstantVersions.UFS:
     RADIUS = Float(6.3712e6)  # Radius of the Earth [m]
     PI = Float(3.1415926535897931)
     OMEGA = Float(7.2921e-5)  # Rotation of the earth
     GRAV = Float(9.80665)  # Acceleration due to gravity [m/s^2]
     RGRAV = Float(1.0 / GRAV)  # Inverse of gravitational acceleration
-    RDGAS = Float(287.05)  # Gas constant for dry air [J/kg/deg] # 287.04
+    RDGAS = Float(287.05)  # Gas constant for dry air [J/kg/deg] ~287.04
     RVGAS = Float(461.50)  # Gas constant for water vapor [J/kg/deg]
     HLV = Float(2.5e6)  # Latent heat of evaporation [J/kg]
-    HLF = Float(3.3358e5)  # Latent heat of fusion [J/kg] # 3.34e5
+    HLF = Float(3.3358e5)  # Latent heat of fusion [J/kg] ~3.34e5
     CP_AIR = Float(1004.6)
     KAPPA = RDGAS / CP_AIR  # Specific heat capacity of dry air at constant pressure
-    TFREEZE = Float(273.15)
+    TFREEZE = Float(273.15)  # Freezing temperature of fresh water [K]
     SAT_ADJUST_THRESHOLD = Float(1.0e-8)
 elif CONST_VERSION == ConstantVersions.GFDL:
     RADIUS = Float(6371.0e3)  # Radius of the Earth [m] #6371.0e3
     PI = Float(3.141592653589793)
     OMEGA = Float(7.292e-5)  # Rotation of the earth # 7.292e-5
     GRAV = Float(9.80)  # Acceleration due to gravity [m/s^2]
     RGRAV = Float(1.0) / GRAV  # Inverse of gravitational acceleration
-    RDGAS = Float(287.04)  # Gas constant for dry air [J/kg/deg] # 287.04
+    RDGAS = Float(287.04)  # Gas constant for dry air [J/kg/deg] ~287.04
     RVGAS = Float(461.50)  # Gas constant for water vapor [J/kg/deg]
     HLV = Float(2.500e6)  # Latent heat of evaporation [J/kg]
-    HLF = Float(3.34e5)  # Latent heat of fusion [J/kg] # 3.34e5
+    HLF = Float(3.34e5)  # Latent heat of fusion [J/kg] ~3.34e5
     KAPPA = Float(2.0) / Float(7.0)
     CP_AIR = RDGAS / KAPPA  # Specific heat capacity of dry air at constant pressure
     TFREEZE = Float(273.16)  # Freezing temperature of fresh water [K]

From 28e23756ef6e30df0a8c0df0b56289a1f7be960c Mon Sep 17 00:00:00 2001
From: Florian Deconinck
Date: Wed, 11 Dec 2024 09:55:15 -0500
Subject: [PATCH 48/78] Describe 64/32-bit FloatFields

---
 ndsl/dsl/typing.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/ndsl/dsl/typing.py b/ndsl/dsl/typing.py
index 69b35c81..3b6ba44c 100644
--- a/ndsl/dsl/typing.py
+++ b/ndsl/dsl/typing.py
@@ -54,10 +54,20 @@ def global_set_floating_point_precision():
 Bool = np.bool_
 FloatField = Field[gtscript.IJK, Float]
+FloatField64 = Field[gtscript.IJK, np.float64]
+FloatField32 = Field[gtscript.IJK, np.float32]
 FloatFieldI = Field[gtscript.I, Float]
+FloatFieldI64 = Field[gtscript.I, np.float64]
+FloatFieldI32 = Field[gtscript.I, np.float32]
 FloatFieldJ = Field[gtscript.J, Float]
+FloatFieldJ64 = Field[gtscript.J, np.float64]
+FloatFieldJ32 = Field[gtscript.J, np.float32]
 FloatFieldIJ = Field[gtscript.IJ, Float]
+FloatFieldIJ64 = Field[gtscript.IJ, np.float64]
+FloatFieldIJ32 = Field[gtscript.IJ, np.float32]
 FloatFieldK = Field[gtscript.K, Float]
+FloatFieldK64 = Field[gtscript.K, np.float64]
+FloatFieldK32 = Field[gtscript.K, np.float32]
 IntField = Field[gtscript.IJK, Int]
 IntFieldIJ = Field[gtscript.IJ, Int]
 IntFieldK = Field[gtscript.K, Int]

From 8daf5bd9eb78e8c966a689d2df62f134431506f8 Mon Sep 17 00:00:00 2001
From: Florian Deconinck
Date: Wed, 11 Dec 2024 12:09:39 -0500
Subject: [PATCH 49/78] Make sure the `make_storage_data` respects the array dtype.
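
Previously the translate layer coerced any floating-point array to the global
`Float` and any integer array to `Int` before building storage; passing
`array.dtype` through keeps serialized data at the precision it was saved
with. A hypothetical before/after illustration (not part of the diff below):

    import numpy as np

    array = np.arange(6, dtype=np.float32).reshape(2, 3)

    # old behavior (sketch): dtype = Float       -> typically np.float64
    # new behavior:          dtype = array.dtype -> np.float32
    storage = np.zeros(array.shape, dtype=array.dtype)
    assert storage.dtype == np.float32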
---
 ndsl/stencils/testing/translate.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/ndsl/stencils/testing/translate.py b/ndsl/stencils/testing/translate.py
index 1a48c1e0..e3fc8845 100644
--- a/ndsl/stencils/testing/translate.py
+++ b/ndsl/stencils/testing/translate.py
@@ -116,12 +116,6 @@ def make_storage_data(
         elif not full_shape and len(array.shape) < 3 and axis == len(array.shape) - 1:
             use_shape[1] = 1
         start = (int(istart), int(jstart), int(kstart))
-        if "float" in str(array.dtype):
-            dtype = Float
-        elif "int" in str(array.dtype):
-            dtype = Int
-        else:
-            dtype = array.dtype
         if names_4d:
             return utils.make_storage_dict(
                 array,
@@ -132,7 +126,7 @@ def make_storage_data(
                 axis=axis,
                 names=names_4d,
                 backend=self.stencil_factory.backend,
-                dtype=dtype,
+                dtype=array.dtype,
             )
         else:
             if len(array.shape) == 4:
@@ -147,7 +141,7 @@ def make_storage_data(
                 axis=axis,
                 read_only=read_only,
                 backend=self.stencil_factory.backend,
-                dtype=dtype,
+                dtype=array.dtype,
             )
 
     def storage_vars(self):

From 9faa4051757bac722d1afcc9bef09e6506096a20 Mon Sep 17 00:00:00 2001
From: Florian Deconinck
Date: Wed, 11 Dec 2024 12:10:13 -0500
Subject: [PATCH 50/78] Fix logic for MultiModal metric and make it verbose

---
 ndsl/testing/comparison.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/ndsl/testing/comparison.py b/ndsl/testing/comparison.py
index 9c59231e..04cdc55f 100644
--- a/ndsl/testing/comparison.py
+++ b/ndsl/testing/comparison.py
@@ -260,15 +260,19 @@ def _compute_all_metrics(
             self.ulp_distance_metric = self.ulp_distance <= self.ulp_threshold.value
 
             # Combine all distances into success or failure
-            # Success = no NANs & ( abs or rel or ulp )
-            naninf_success = not np.logical_and(
+            # Success =
+            #   - no unexpected NANs (e.g.
NaN in the ref MUST BE in computation) OR + # - absolute distance pass OR + # - relative distance pass OR + # - ulp distance pass + naninf_success = np.logical_and( np.isnan(self.computed), np.isnan(self.references) - ).all() + ) metric_success = np.logical_or( self.relative_distance_metric, self.absolute_distance_metric ) metric_success = np.logical_or(metric_success, self.ulp_distance_metric) - success = np.logical_and(naninf_success, metric_success) + success = np.logical_or(naninf_success, metric_success) return success elif self.references.dtype in (np.bool_, bool): success = np.logical_xor(self.computed, self.references) From 75b47417d44164b423f6c7e959d3b6cff115c642 Mon Sep 17 00:00:00 2001 From: Christopher Kung Date: Wed, 11 Dec 2024 10:54:30 -0800 Subject: [PATCH 51/78] Added an MPI all_reduce for quantities based on SUM operation to communicator.py --- ndsl/comm/communicator.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/ndsl/comm/communicator.py b/ndsl/comm/communicator.py index ff270df5..e6cdc3b3 100644 --- a/ndsl/comm/communicator.py +++ b/ndsl/comm/communicator.py @@ -2,6 +2,7 @@ from typing import List, Mapping, Optional, Sequence, Tuple, Union, cast import numpy as np +from mpi4py import MPI import ndsl.constants as constants from ndsl.buffer import array_buffer, device_synchronize, recv_buffer, send_buffer @@ -93,6 +94,25 @@ def _device_synchronize(): # this is a method so we can profile it separately from other device syncs device_synchronize() + def _create_all_reduce_quantity( + self, input_metadata: QuantityMetadata, input_data + ) -> Quantity: + """Create a Quantity for all_reduce data and metadata""" + all_reduce_quantity = Quantity( + input_data, + dims=input_metadata.dims, + units=input_metadata.units, + origin=tuple([0 for dim in input_metadata.dims]), + gt4py_backend=input_metadata.gt4py_backend, + allow_mismatch_float_precision=True, + ) + return all_reduce_quantity + + def all_reduce_sum(self, quantity: Quantity): + reduced_quantity_data = self.comm.allreduce(quantity.data,MPI.SUM) + all_reduce_quantity = self._create_all_reduce_quantity(quantity.metadata, reduced_quantity_data) + return all_reduce_quantity + def _Scatter(self, numpy_module, sendbuf, recvbuf, **kwargs): with send_buffer(numpy_module.zeros, sendbuf) as send, recv_buffer( numpy_module.zeros, recvbuf From 4c8632c9a5b1aba1c391d6d878cc0d62f72d8027 Mon Sep 17 00:00:00 2001 From: Christopher Kung Date: Wed, 11 Dec 2024 13:41:55 -0800 Subject: [PATCH 52/78] linted --- ndsl/comm/communicator.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ndsl/comm/communicator.py b/ndsl/comm/communicator.py index e6cdc3b3..862844b3 100644 --- a/ndsl/comm/communicator.py +++ b/ndsl/comm/communicator.py @@ -2,7 +2,7 @@ from typing import List, Mapping, Optional, Sequence, Tuple, Union, cast import numpy as np -from mpi4py import MPI +from mpi4py import MPI import ndsl.constants as constants from ndsl.buffer import array_buffer, device_synchronize, recv_buffer, send_buffer @@ -109,10 +109,12 @@ def _create_all_reduce_quantity( return all_reduce_quantity def all_reduce_sum(self, quantity: Quantity): - reduced_quantity_data = self.comm.allreduce(quantity.data,MPI.SUM) - all_reduce_quantity = self._create_all_reduce_quantity(quantity.metadata, reduced_quantity_data) + reduced_quantity_data = self.comm.allreduce(quantity.data, MPI.SUM) + all_reduce_quantity = self._create_all_reduce_quantity( + quantity.metadata, reduced_quantity_data + ) return all_reduce_quantity - 
+ def _Scatter(self, numpy_module, sendbuf, recvbuf, **kwargs): with send_buffer(numpy_module.zeros, sendbuf) as send, recv_buffer( numpy_module.zeros, recvbuf From a2fac9f0df32dda33eb661335e230780ab661a46 Mon Sep 17 00:00:00 2001 From: Christopher Kung Date: Thu, 12 Dec 2024 21:41:16 -0800 Subject: [PATCH 53/78] Add initial skeleton of pytest test for all reduce --- tests/mpi/test_mpi_all_reduce_sum.py | 49 ++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 tests/mpi/test_mpi_all_reduce_sum.py diff --git a/tests/mpi/test_mpi_all_reduce_sum.py b/tests/mpi/test_mpi_all_reduce_sum.py new file mode 100644 index 00000000..25effc72 --- /dev/null +++ b/tests/mpi/test_mpi_all_reduce_sum.py @@ -0,0 +1,49 @@ +import pytest + +from tests.mpi.mpi_comm import MPI +# from ndsl.typing import Communicator + +from ndsl import ( + CubedSphereCommunicator, + CubedSpherePartitioner, + Quantity, + TilePartitioner, +) + +from ndsl.comm.partitioner import Partitioner + +@pytest.fixture +def layout(): + if MPI is not None: + size = MPI.COMM_WORLD.Get_size() + ranks_per_tile = size // 6 + ranks_per_edge = int(ranks_per_tile ** 0.5) + return (ranks_per_edge, ranks_per_edge) + else: + return (1, 1) + +@pytest.fixture(params=[0.1, 1.0]) +def edge_interior_ratio(request): + return request.param + +@pytest.fixture +def tile_partitioner(layout, edge_interior_ratio: float): + return TilePartitioner(layout, edge_interior_ratio=edge_interior_ratio) + +@pytest.fixture +def cube_partitioner(tile_partitioner): + return CubedSpherePartitioner(tile_partitioner) + +@pytest.fixture() +def communicator(cube_partitioner): + return CubedSphereCommunicator( + comm=MPI.COMM_WORLD, + partitioner=cube_partitioner, + ) + +def test_all_reduce_sum( + communicator, +): + print("Communicator rank = ", communicator.rank) + print("Communicator size = ", communicator.size) + assert True \ No newline at end of file From 8c5b5d5bd4979cac763866989f3760341df5b171 Mon Sep 17 00:00:00 2001 From: Christopher Kung Date: Fri, 13 Dec 2024 09:43:34 -0800 Subject: [PATCH 54/78] Added assertion tests for 1, 2 and 3D quantities passed through mpi_allreduce_sum --- tests/mpi/test_mpi_all_reduce_sum.py | 55 +++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/tests/mpi/test_mpi_all_reduce_sum.py b/tests/mpi/test_mpi_all_reduce_sum.py index 25effc72..f03787ee 100644 --- a/tests/mpi/test_mpi_all_reduce_sum.py +++ b/tests/mpi/test_mpi_all_reduce_sum.py @@ -1,7 +1,6 @@ import pytest - +import numpy as np from tests.mpi.mpi_comm import MPI -# from ndsl.typing import Communicator from ndsl import ( CubedSphereCommunicator, @@ -10,8 +9,11 @@ TilePartitioner, ) +from ndsl.quantity import Quantity from ndsl.comm.partitioner import Partitioner +from ndsl.dsl.typing import Float + @pytest.fixture def layout(): if MPI is not None: @@ -44,6 +46,49 @@ def communicator(cube_partitioner): def test_all_reduce_sum( communicator, ): - print("Communicator rank = ", communicator.rank) - print("Communicator size = ", communicator.size) - assert True \ No newline at end of file + + backend = "numpy" + base_array = np.array([i for i in range(5)], dtype=Float) + + testQuantity_1D = Quantity( + data=base_array, + dims=["K"], + units="Some 1D unit", + gt4py_backend=backend, + ) + + base_array = np.array([i for i in range(5*5)], dtype=Float) + base_array = base_array.reshape(5,5) + + testQuantity_2D = Quantity( + data=base_array, + dims=["I","J"], + units="Some 2D unit", + gt4py_backend=backend, + ) + + base_array = 
np.array([i for i in range(5*5*5)], dtype=Float) + base_array = base_array.reshape(5,5,5) + + testQuantity_3D = Quantity( + data=base_array, + dims=["I","J","K"], + units="Some 3D unit", + gt4py_backend=backend, + ) + + # print("Communicator rank = ", communicator.rank) + # print("Communicator size = ", communicator.size) + # print("nsize = ", nsize) + + global_sum_q = communicator.all_reduce_sum(testQuantity_1D) + assert global_sum_q.metadata == testQuantity_1D.metadata + assert (global_sum_q.data == (testQuantity_1D.data * communicator.size)).all() + + global_sum_q = communicator.all_reduce_sum(testQuantity_2D) + assert global_sum_q.metadata == testQuantity_2D.metadata + assert (global_sum_q.data == (testQuantity_2D.data * communicator.size)).all() + + global_sum_q = communicator.all_reduce_sum(testQuantity_3D) + assert global_sum_q.metadata == testQuantity_3D.metadata + assert (global_sum_q.data == (testQuantity_3D.data * communicator.size)).all() \ No newline at end of file From fb4e74010615f15f962a8a0572ed48f1267b5581 Mon Sep 17 00:00:00 2001 From: Christopher Kung Date: Fri, 13 Dec 2024 09:48:09 -0800 Subject: [PATCH 55/78] Linted --- tests/mpi/test_mpi_all_reduce_sum.py | 64 ++++++++++++++-------------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/tests/mpi/test_mpi_all_reduce_sum.py b/tests/mpi/test_mpi_all_reduce_sum.py index f03787ee..728ec4f8 100644 --- a/tests/mpi/test_mpi_all_reduce_sum.py +++ b/tests/mpi/test_mpi_all_reduce_sum.py @@ -1,6 +1,5 @@ -import pytest import numpy as np -from tests.mpi.mpi_comm import MPI +import pytest from ndsl import ( CubedSphereCommunicator, @@ -8,11 +7,9 @@ Quantity, TilePartitioner, ) - -from ndsl.quantity import Quantity -from ndsl.comm.partitioner import Partitioner - from ndsl.dsl.typing import Float +from tests.mpi.mpi_comm import MPI + @pytest.fixture def layout(): @@ -24,18 +21,22 @@ def layout(): else: return (1, 1) + @pytest.fixture(params=[0.1, 1.0]) def edge_interior_ratio(request): return request.param + @pytest.fixture def tile_partitioner(layout, edge_interior_ratio: float): return TilePartitioner(layout, edge_interior_ratio=edge_interior_ratio) + @pytest.fixture def cube_partitioner(tile_partitioner): return CubedSpherePartitioner(tile_partitioner) + @pytest.fixture() def communicator(cube_partitioner): return CubedSphereCommunicator( @@ -43,43 +44,40 @@ def communicator(cube_partitioner): partitioner=cube_partitioner, ) + def test_all_reduce_sum( - communicator, + communicator, ): - + backend = "numpy" base_array = np.array([i for i in range(5)], dtype=Float) testQuantity_1D = Quantity( - data=base_array, - dims=["K"], - units="Some 1D unit", - gt4py_backend=backend, - ) - - base_array = np.array([i for i in range(5*5)], dtype=Float) - base_array = base_array.reshape(5,5) + data=base_array, + dims=["K"], + units="Some 1D unit", + gt4py_backend=backend, + ) + + base_array = np.array([i for i in range(5 * 5)], dtype=Float) + base_array = base_array.reshape(5, 5) testQuantity_2D = Quantity( - data=base_array, - dims=["I","J"], - units="Some 2D unit", - gt4py_backend=backend, - ) + data=base_array, + dims=["I", "J"], + units="Some 2D unit", + gt4py_backend=backend, + ) - base_array = np.array([i for i in range(5*5*5)], dtype=Float) - base_array = base_array.reshape(5,5,5) + base_array = np.array([i for i in range(5 * 5 * 5)], dtype=Float) + base_array = base_array.reshape(5, 5, 5) testQuantity_3D = Quantity( - data=base_array, - dims=["I","J","K"], - units="Some 3D unit", - gt4py_backend=backend, - ) - - # 
print("Communicator rank = ", communicator.rank) - # print("Communicator size = ", communicator.size) - # print("nsize = ", nsize) + data=base_array, + dims=["I", "J", "K"], + units="Some 3D unit", + gt4py_backend=backend, + ) global_sum_q = communicator.all_reduce_sum(testQuantity_1D) assert global_sum_q.metadata == testQuantity_1D.metadata @@ -91,4 +89,4 @@ def test_all_reduce_sum( global_sum_q = communicator.all_reduce_sum(testQuantity_3D) assert global_sum_q.metadata == testQuantity_3D.metadata - assert (global_sum_q.data == (testQuantity_3D.data * communicator.size)).all() \ No newline at end of file + assert (global_sum_q.data == (testQuantity_3D.data * communicator.size)).all() From 34f82fb5ac6275f6a3de1b71794cda25d7fc3495 Mon Sep 17 00:00:00 2001 From: Christopher Kung Date: Fri, 13 Dec 2024 10:55:18 -0800 Subject: [PATCH 56/78] Added pytest.mark to skip test if mpi4py isn't available --- tests/mpi/test_mpi_all_reduce_sum.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/mpi/test_mpi_all_reduce_sum.py b/tests/mpi/test_mpi_all_reduce_sum.py index 728ec4f8..caddfa5f 100644 --- a/tests/mpi/test_mpi_all_reduce_sum.py +++ b/tests/mpi/test_mpi_all_reduce_sum.py @@ -45,6 +45,9 @@ def communicator(cube_partitioner): ) +@pytest.mark.skipif( + MPI is None, reason="mpi4py is not available or pytest was not run in parallel" +) def test_all_reduce_sum( communicator, ): From b4a6a5421149b43bf7c64ea2ca4bf86fe9ed2724 Mon Sep 17 00:00:00 2001 From: Christopher Kung Date: Mon, 16 Dec 2024 08:45:42 -0800 Subject: [PATCH 57/78] lint changes --- ndsl/comm/communicator.py | 7 ++- tests/mpi/test_mpi_all_reduce_sum.py | 84 ++++++++++++++-------------- 2 files changed, 48 insertions(+), 43 deletions(-) diff --git a/ndsl/comm/communicator.py b/ndsl/comm/communicator.py index 862844b3..5f19b2eb 100644 --- a/ndsl/comm/communicator.py +++ b/ndsl/comm/communicator.py @@ -102,9 +102,10 @@ def _create_all_reduce_quantity( input_data, dims=input_metadata.dims, units=input_metadata.units, - origin=tuple([0 for dim in input_metadata.dims]), + origin=input_metadata.origin, + extent=input_metadata.extent, gt4py_backend=input_metadata.gt4py_backend, - allow_mismatch_float_precision=True, + allow_mismatch_float_precision=False, ) return all_reduce_quantity @@ -114,6 +115,8 @@ def all_reduce_sum(self, quantity: Quantity): quantity.metadata, reduced_quantity_data ) return all_reduce_quantity + # quantity.data = reduced_quantity_data + # return quantity def _Scatter(self, numpy_module, sendbuf, recvbuf, **kwargs): with send_buffer(numpy_module.zeros, sendbuf) as send, recv_buffer( diff --git a/tests/mpi/test_mpi_all_reduce_sum.py b/tests/mpi/test_mpi_all_reduce_sum.py index caddfa5f..9ba01e0a 100644 --- a/tests/mpi/test_mpi_all_reduce_sum.py +++ b/tests/mpi/test_mpi_all_reduce_sum.py @@ -52,44 +52,46 @@ def test_all_reduce_sum( communicator, ): - backend = "numpy" - base_array = np.array([i for i in range(5)], dtype=Float) - - testQuantity_1D = Quantity( - data=base_array, - dims=["K"], - units="Some 1D unit", - gt4py_backend=backend, - ) - - base_array = np.array([i for i in range(5 * 5)], dtype=Float) - base_array = base_array.reshape(5, 5) - - testQuantity_2D = Quantity( - data=base_array, - dims=["I", "J"], - units="Some 2D unit", - gt4py_backend=backend, - ) - - base_array = np.array([i for i in range(5 * 5 * 5)], dtype=Float) - base_array = base_array.reshape(5, 5, 5) - - testQuantity_3D = Quantity( - data=base_array, - dims=["I", "J", "K"], - units="Some 3D unit", - gt4py_backend=backend, - ) - - 
global_sum_q = communicator.all_reduce_sum(testQuantity_1D) - assert global_sum_q.metadata == testQuantity_1D.metadata - assert (global_sum_q.data == (testQuantity_1D.data * communicator.size)).all() - - global_sum_q = communicator.all_reduce_sum(testQuantity_2D) - assert global_sum_q.metadata == testQuantity_2D.metadata - assert (global_sum_q.data == (testQuantity_2D.data * communicator.size)).all() - - global_sum_q = communicator.all_reduce_sum(testQuantity_3D) - assert global_sum_q.metadata == testQuantity_3D.metadata - assert (global_sum_q.data == (testQuantity_3D.data * communicator.size)).all() + backends = ["dace:cpu", "gt:cpu_kfirst", "numpy"] + + for backend in backends: + base_array = np.array([i for i in range(5)], dtype=Float) + + testQuantity_1D = Quantity( + data=base_array, + dims=["K"], + units="Some 1D unit", + gt4py_backend=backend, + ) + + base_array = np.array([i for i in range(5 * 5)], dtype=Float) + base_array = base_array.reshape(5, 5) + + testQuantity_2D = Quantity( + data=base_array, + dims=["I", "J"], + units="Some 2D unit", + gt4py_backend=backend, + ) + + base_array = np.array([i for i in range(5 * 5 * 5)], dtype=Float) + base_array = base_array.reshape(5, 5, 5) + + testQuantity_3D = Quantity( + data=base_array, + dims=["I", "J", "K"], + units="Some 3D unit", + gt4py_backend=backend, + ) + + global_sum_q = communicator.all_reduce_sum(testQuantity_1D) + assert global_sum_q.metadata == testQuantity_1D.metadata + assert (global_sum_q.data == (testQuantity_1D.data * communicator.size)).all() + + global_sum_q = communicator.all_reduce_sum(testQuantity_2D) + assert global_sum_q.metadata == testQuantity_2D.metadata + assert (global_sum_q.data == (testQuantity_2D.data * communicator.size)).all() + + global_sum_q = communicator.all_reduce_sum(testQuantity_3D) + assert global_sum_q.metadata == testQuantity_3D.metadata + assert (global_sum_q.data == (testQuantity_3D.data * communicator.size)).all() From f5ce8831a7918ed45be3fb89f962c558ccdc7035 Mon Sep 17 00:00:00 2001 From: Christopher Kung Date: Mon, 16 Dec 2024 08:45:42 -0800 Subject: [PATCH 58/78] Addressed PR comments and added additional CPU backends to unit test --- ndsl/comm/communicator.py | 7 ++- tests/mpi/test_mpi_all_reduce_sum.py | 84 ++++++++++++++-------------- 2 files changed, 48 insertions(+), 43 deletions(-) diff --git a/ndsl/comm/communicator.py b/ndsl/comm/communicator.py index 862844b3..5f19b2eb 100644 --- a/ndsl/comm/communicator.py +++ b/ndsl/comm/communicator.py @@ -102,9 +102,10 @@ def _create_all_reduce_quantity( input_data, dims=input_metadata.dims, units=input_metadata.units, - origin=tuple([0 for dim in input_metadata.dims]), + origin=input_metadata.origin, + extent=input_metadata.extent, gt4py_backend=input_metadata.gt4py_backend, - allow_mismatch_float_precision=True, + allow_mismatch_float_precision=False, ) return all_reduce_quantity @@ -114,6 +115,8 @@ def all_reduce_sum(self, quantity: Quantity): quantity.metadata, reduced_quantity_data ) return all_reduce_quantity + # quantity.data = reduced_quantity_data + # return quantity def _Scatter(self, numpy_module, sendbuf, recvbuf, **kwargs): with send_buffer(numpy_module.zeros, sendbuf) as send, recv_buffer( diff --git a/tests/mpi/test_mpi_all_reduce_sum.py b/tests/mpi/test_mpi_all_reduce_sum.py index caddfa5f..9ba01e0a 100644 --- a/tests/mpi/test_mpi_all_reduce_sum.py +++ b/tests/mpi/test_mpi_all_reduce_sum.py @@ -52,44 +52,46 @@ def test_all_reduce_sum( communicator, ): - backend = "numpy" - base_array = np.array([i for i in range(5)], 
dtype=Float) - - testQuantity_1D = Quantity( - data=base_array, - dims=["K"], - units="Some 1D unit", - gt4py_backend=backend, - ) - - base_array = np.array([i for i in range(5 * 5)], dtype=Float) - base_array = base_array.reshape(5, 5) - - testQuantity_2D = Quantity( - data=base_array, - dims=["I", "J"], - units="Some 2D unit", - gt4py_backend=backend, - ) - - base_array = np.array([i for i in range(5 * 5 * 5)], dtype=Float) - base_array = base_array.reshape(5, 5, 5) - - testQuantity_3D = Quantity( - data=base_array, - dims=["I", "J", "K"], - units="Some 3D unit", - gt4py_backend=backend, - ) - - global_sum_q = communicator.all_reduce_sum(testQuantity_1D) - assert global_sum_q.metadata == testQuantity_1D.metadata - assert (global_sum_q.data == (testQuantity_1D.data * communicator.size)).all() - - global_sum_q = communicator.all_reduce_sum(testQuantity_2D) - assert global_sum_q.metadata == testQuantity_2D.metadata - assert (global_sum_q.data == (testQuantity_2D.data * communicator.size)).all() - - global_sum_q = communicator.all_reduce_sum(testQuantity_3D) - assert global_sum_q.metadata == testQuantity_3D.metadata - assert (global_sum_q.data == (testQuantity_3D.data * communicator.size)).all() + backends = ["dace:cpu", "gt:cpu_kfirst", "numpy"] + + for backend in backends: + base_array = np.array([i for i in range(5)], dtype=Float) + + testQuantity_1D = Quantity( + data=base_array, + dims=["K"], + units="Some 1D unit", + gt4py_backend=backend, + ) + + base_array = np.array([i for i in range(5 * 5)], dtype=Float) + base_array = base_array.reshape(5, 5) + + testQuantity_2D = Quantity( + data=base_array, + dims=["I", "J"], + units="Some 2D unit", + gt4py_backend=backend, + ) + + base_array = np.array([i for i in range(5 * 5 * 5)], dtype=Float) + base_array = base_array.reshape(5, 5, 5) + + testQuantity_3D = Quantity( + data=base_array, + dims=["I", "J", "K"], + units="Some 3D unit", + gt4py_backend=backend, + ) + + global_sum_q = communicator.all_reduce_sum(testQuantity_1D) + assert global_sum_q.metadata == testQuantity_1D.metadata + assert (global_sum_q.data == (testQuantity_1D.data * communicator.size)).all() + + global_sum_q = communicator.all_reduce_sum(testQuantity_2D) + assert global_sum_q.metadata == testQuantity_2D.metadata + assert (global_sum_q.data == (testQuantity_2D.data * communicator.size)).all() + + global_sum_q = communicator.all_reduce_sum(testQuantity_3D) + assert global_sum_q.metadata == testQuantity_3D.metadata + assert (global_sum_q.data == (testQuantity_3D.data * communicator.size)).all() From 2e669dbae2fccce6c65dac33db9acf6a5ec564ac Mon Sep 17 00:00:00 2001 From: Christopher Kung Date: Wed, 18 Dec 2024 11:36:07 -0800 Subject: [PATCH 59/78] Added setters for various Quantity properties to enable setting of Quantity metadata and data properties. 
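
With the new optional `output_quantity` argument, `all_reduce_sum` supports
two call styles. A usage sketch with hypothetical names (`q` and `out_q` are
same-shaped Quantities, `communicator` a CubedSphereCommunicator):

    # allocate-and-return: a new Quantity is built from the input metadata
    global_sum = communicator.all_reduce_sum(q)

    # in-place: metadata and reduced data are written onto an existing Quantity
    communicator.all_reduce_sum(q, out_q)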
--- ndsl/comm/communicator.py | 30 +++++++++++++----- ndsl/quantity.py | 30 ++++++++++++++++++ tests/mpi/test_mpi_all_reduce_sum.py | 47 ++++++++++++++++++++++++++++ 3 files changed, 99 insertions(+), 8 deletions(-) diff --git a/ndsl/comm/communicator.py b/ndsl/comm/communicator.py index 5f19b2eb..55efa961 100644 --- a/ndsl/comm/communicator.py +++ b/ndsl/comm/communicator.py @@ -109,14 +109,28 @@ def _create_all_reduce_quantity( ) return all_reduce_quantity - def all_reduce_sum(self, quantity: Quantity): - reduced_quantity_data = self.comm.allreduce(quantity.data, MPI.SUM) - all_reduce_quantity = self._create_all_reduce_quantity( - quantity.metadata, reduced_quantity_data - ) - return all_reduce_quantity - # quantity.data = reduced_quantity_data - # return quantity + def all_reduce_sum( + self, input_quantity: Quantity, output_quantity: Quantity = None + ): + reduced_quantity_data = self.comm.allreduce(input_quantity.data, MPI.SUM) + if output_quantity is None: + all_reduce_quantity = self._create_all_reduce_quantity( + input_quantity.metadata, reduced_quantity_data + ) + return all_reduce_quantity + else: + if output_quantity.data.shape != input_quantity.data.shape: + raise TypeError("Shapes not matching") + + output_quantity.metadata.dims = input_quantity.metadata.dims + output_quantity.metadata.units = input_quantity.metadata.units + output_quantity.metadata.origin = input_quantity.metadata.origin + output_quantity.metadata.extent = input_quantity.metadata.extent + output_quantity.metadata.gt4py_backend = ( + input_quantity.metadata.gt4py_backend + ) + + output_quantity.data = reduced_quantity_data def _Scatter(self, numpy_module, sendbuf, recvbuf, **kwargs): with send_buffer(numpy_module.zeros, sendbuf) as send, recv_buffer( diff --git a/ndsl/quantity.py b/ndsl/quantity.py index b95a9aad..80bb4d06 100644 --- a/ndsl/quantity.py +++ b/ndsl/quantity.py @@ -458,10 +458,20 @@ def units(self) -> str: """units of the quantity""" return self.metadata.units + @units.setter + def units(self, newUnits): + if type(newUnits) is str: + self.metadata.units = newUnits + @property def gt4py_backend(self) -> Union[str, None]: return self.metadata.gt4py_backend + @gt4py_backend.setter + def gt4py_backend(self, newBackend): + if type(newBackend) is Union[str, None]: + self.metadata.gt4py_backend = newBackend + @property def attrs(self) -> dict: return dict(**self._attrs, units=self._metadata.units) @@ -471,6 +481,11 @@ def dims(self) -> Tuple[str, ...]: """names of each dimension""" return self.metadata.dims + @dims.setter + def dims(self, newDims): + if type(newDims) is Tuple: + self.metadata.dims = newDims + @property def values(self) -> np.ndarray: warnings.warn( @@ -492,16 +507,31 @@ def data(self) -> Union[np.ndarray, cupy.ndarray]: """the underlying array of data""" return self._data + @data.setter + def data(self, inputData): + if type(inputData) in [np.ndarray, cupy.ndarray]: + self._data = inputData + @property def origin(self) -> Tuple[int, ...]: """the start of the computational domain""" return self.metadata.origin + @origin.setter + def origin(self, newOrigin): + if type(newOrigin) is Tuple: + self.metadata.origin = newOrigin + @property def extent(self) -> Tuple[int, ...]: """the shape of the computational domain""" return self.metadata.extent + @extent.setter + def extent(self, newExtent): + if type(newExtent) is Tuple: + self.metadata.extent = newExtent + @property def data_array(self) -> xr.DataArray: return xr.DataArray(self.view[:], dims=self.dims, attrs=self.attrs) diff --git 
a/tests/mpi/test_mpi_all_reduce_sum.py b/tests/mpi/test_mpi_all_reduce_sum.py index 9ba01e0a..9c2b3a3a 100644 --- a/tests/mpi/test_mpi_all_reduce_sum.py +++ b/tests/mpi/test_mpi_all_reduce_sum.py @@ -95,3 +95,50 @@ def test_all_reduce_sum( global_sum_q = communicator.all_reduce_sum(testQuantity_3D) assert global_sum_q.metadata == testQuantity_3D.metadata assert (global_sum_q.data == (testQuantity_3D.data * communicator.size)).all() + + base_array = np.array([i for i in range(5)], dtype=Float) + testQuantity_1D_out = Quantity( + data=base_array, + dims=["K"], + units="New 1D unit", + gt4py_backend=backend, + origin=(8,), + extent=(7,), + ) + + base_array = np.array([i for i in range(5 * 5)], dtype=Float) + base_array = base_array.reshape(5, 5) + + testQuantity_2D_out = Quantity( + data=base_array, + dims=["I", "J"], + units="Some 2D unit", + gt4py_backend=backend, + ) + + base_array = np.array([i for i in range(5 * 5 * 5)], dtype=Float) + base_array = base_array.reshape(5, 5, 5) + + testQuantity_3D_out = Quantity( + data=base_array, + dims=["I", "J", "K"], + units="Some 3D unit", + gt4py_backend=backend, + ) + communicator.all_reduce_sum(testQuantity_1D, testQuantity_1D_out) + assert testQuantity_1D_out.metadata == testQuantity_1D_out.metadata + assert ( + testQuantity_1D_out.data == (testQuantity_1D.data * communicator.size) + ).all() + + communicator.all_reduce_sum(testQuantity_2D, testQuantity_2D_out) + assert testQuantity_2D_out.metadata == testQuantity_2D.metadata + assert ( + testQuantity_2D_out.data == (testQuantity_2D.data * communicator.size) + ).all() + + communicator.all_reduce_sum(testQuantity_3D, testQuantity_3D_out) + assert testQuantity_3D_out.metadata == testQuantity_3D.metadata + assert ( + testQuantity_3D_out.data == (testQuantity_3D.data * communicator.size) + ).all() From fd2fa97beb979b7bbd25eeca273b269e742f3de6 Mon Sep 17 00:00:00 2001 From: Christopher Kung Date: Thu, 19 Dec 2024 08:42:56 -0800 Subject: [PATCH 60/78] Added function in QuantityMetadata class that allows copying of Metadata properties from one class to another. 
Subsequent Quantity setters that performed the copying of QuantityMetadata properties were removed --- ndsl/comm/communicator.py | 8 +------ ndsl/quantity.py | 34 ++++++++-------------------- tests/mpi/test_mpi_all_reduce_sum.py | 2 +- 3 files changed, 11 insertions(+), 33 deletions(-) diff --git a/ndsl/comm/communicator.py b/ndsl/comm/communicator.py index 55efa961..ead24896 100644 --- a/ndsl/comm/communicator.py +++ b/ndsl/comm/communicator.py @@ -122,13 +122,7 @@ def all_reduce_sum( if output_quantity.data.shape != input_quantity.data.shape: raise TypeError("Shapes not matching") - output_quantity.metadata.dims = input_quantity.metadata.dims - output_quantity.metadata.units = input_quantity.metadata.units - output_quantity.metadata.origin = input_quantity.metadata.origin - output_quantity.metadata.extent = input_quantity.metadata.extent - output_quantity.metadata.gt4py_backend = ( - input_quantity.metadata.gt4py_backend - ) + input_quantity.metadata.duplicate_metadata(output_quantity.metadata) output_quantity.data = reduced_quantity_data diff --git a/ndsl/quantity.py b/ndsl/quantity.py index 80bb4d06..a38a7a5d 100644 --- a/ndsl/quantity.py +++ b/ndsl/quantity.py @@ -53,6 +53,15 @@ def np(self) -> NumpyModule: f"quantity underlying data is of unexpected type {self.data_type}" ) + def duplicate_metadata(self, metadata_copy): + metadata_copy.origin = self.origin + metadata_copy.extent = self.extent + metadata_copy.dims = self.dims + metadata_copy.units = self.units + metadata_copy.data_type = self.data_type + metadata_copy.dtype = self.dtype + metadata_copy.gt4py_backend = self.gt4py_backend + @dataclasses.dataclass class QuantityHaloSpec: @@ -458,20 +467,10 @@ def units(self) -> str: """units of the quantity""" return self.metadata.units - @units.setter - def units(self, newUnits): - if type(newUnits) is str: - self.metadata.units = newUnits - @property def gt4py_backend(self) -> Union[str, None]: return self.metadata.gt4py_backend - @gt4py_backend.setter - def gt4py_backend(self, newBackend): - if type(newBackend) is Union[str, None]: - self.metadata.gt4py_backend = newBackend - @property def attrs(self) -> dict: return dict(**self._attrs, units=self._metadata.units) @@ -481,11 +480,6 @@ def dims(self) -> Tuple[str, ...]: """names of each dimension""" return self.metadata.dims - @dims.setter - def dims(self, newDims): - if type(newDims) is Tuple: - self.metadata.dims = newDims - @property def values(self) -> np.ndarray: warnings.warn( @@ -517,21 +511,11 @@ def origin(self) -> Tuple[int, ...]: """the start of the computational domain""" return self.metadata.origin - @origin.setter - def origin(self, newOrigin): - if type(newOrigin) is Tuple: - self.metadata.origin = newOrigin - @property def extent(self) -> Tuple[int, ...]: """the shape of the computational domain""" return self.metadata.extent - @extent.setter - def extent(self, newExtent): - if type(newExtent) is Tuple: - self.metadata.extent = newExtent - @property def data_array(self) -> xr.DataArray: return xr.DataArray(self.view[:], dims=self.dims, attrs=self.attrs) diff --git a/tests/mpi/test_mpi_all_reduce_sum.py b/tests/mpi/test_mpi_all_reduce_sum.py index 9c2b3a3a..858a7f94 100644 --- a/tests/mpi/test_mpi_all_reduce_sum.py +++ b/tests/mpi/test_mpi_all_reduce_sum.py @@ -126,7 +126,7 @@ def test_all_reduce_sum( gt4py_backend=backend, ) communicator.all_reduce_sum(testQuantity_1D, testQuantity_1D_out) - assert testQuantity_1D_out.metadata == testQuantity_1D_out.metadata + assert testQuantity_1D_out.metadata == 
testQuantity_1D.metadata assert ( testQuantity_1D_out.data == (testQuantity_1D.data * communicator.size) ).all() From ad19be33a9124b3000b44047b701c520334feaad Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Fri, 20 Dec 2024 14:16:00 -0500 Subject: [PATCH 61/78] Expose all SG metric terms in grid_data --- ndsl/grid/helper.py | 60 +++++++++++++++++++++++++++++++++++ ndsl/stencils/testing/grid.py | 50 +++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) diff --git a/ndsl/grid/helper.py b/ndsl/grid/helper.py index 745dce37..6cb6a374 100644 --- a/ndsl/grid/helper.py +++ b/ndsl/grid/helper.py @@ -284,10 +284,20 @@ class AngleGridData: sin_sg2: Quantity sin_sg3: Quantity sin_sg4: Quantity + sin_sg5: Quantity + sin_sg6: Quantity + sin_sg7: Quantity + sin_sg8: Quantity + sin_sg9: Quantity cos_sg1: Quantity cos_sg2: Quantity cos_sg3: Quantity cos_sg4: Quantity + cos_sg5: Quantity + cos_sg6: Quantity + cos_sg7: Quantity + cos_sg8: Quantity + cos_sg9: Quantity @classmethod def new_from_metric_terms(cls, metric_terms: MetricTerms) -> "AngleGridData": @@ -296,10 +306,20 @@ def new_from_metric_terms(cls, metric_terms: MetricTerms) -> "AngleGridData": sin_sg2=metric_terms.sin_sg2, sin_sg3=metric_terms.sin_sg3, sin_sg4=metric_terms.sin_sg4, + sin_sg5=metric_terms.sin_sg5, + sin_sg6=metric_terms.sin_sg6, + sin_sg7=metric_terms.sin_sg7, + sin_sg8=metric_terms.sin_sg8, + sin_sg9=metric_terms.sin_sg9, cos_sg1=metric_terms.cos_sg1, cos_sg2=metric_terms.cos_sg2, cos_sg3=metric_terms.cos_sg3, cos_sg4=metric_terms.cos_sg4, + cos_sg5=metric_terms.cos_sg5, + cos_sg6=metric_terms.cos_sg6, + cos_sg7=metric_terms.cos_sg7, + cos_sg8=metric_terms.cos_sg8, + cos_sg9=metric_terms.cos_sg9, ) @@ -618,6 +638,26 @@ def sin_sg3(self): def sin_sg4(self): return self._angle_data.sin_sg4 + @property + def sin_sg5(self): + return self._angle_data.sin_sg5 + + @property + def sin_sg6(self): + return self._angle_data.sin_sg6 + + @property + def sin_sg7(self): + return self._angle_data.sin_sg7 + + @property + def sin_sg8(self): + return self._angle_data.sin_sg8 + + @property + def sin_sg9(self): + return self._angle_data.sin_sg9 + @property def cos_sg1(self): return self._angle_data.cos_sg1 @@ -634,6 +674,26 @@ def cos_sg3(self): def cos_sg4(self): return self._angle_data.cos_sg4 + @property + def cos_sg5(self): + return self._angle_data.cos_sg5 + + @property + def cos_sg6(self): + return self._angle_data.cos_sg6 + + @property + def cos_sg7(self): + return self._angle_data.cos_sg7 + + @property + def cos_sg8(self): + return self._angle_data.cos_sg8 + + @property + def cos_sg9(self): + return self._angle_data.cos_sg9 + @dataclasses.dataclass(frozen=True) class DriverGridData: diff --git a/ndsl/stencils/testing/grid.py b/ndsl/stencils/testing/grid.py index 273a0f3d..fbee10a5 100644 --- a/ndsl/stencils/testing/grid.py +++ b/ndsl/stencils/testing/grid.py @@ -734,6 +734,31 @@ def grid_data(self) -> "GridData": dims=GridDefinitions.sin_sg4.dims, units=GridDefinitions.sin_sg4.units, ), + sin_sg5=self.quantity_factory.from_array( + data=self.sin_sg5, + dims=GridDefinitions.sin_sg5.dims, + units=GridDefinitions.sin_sg5.units, + ), + sin_sg6=self.quantity_factory.from_array( + data=self.sin_sg6, + dims=GridDefinitions.sin_sg6.dims, + units=GridDefinitions.sin_sg6.units, + ), + sin_sg7=self.quantity_factory.from_array( + data=self.sin_sg7, + dims=GridDefinitions.sin_sg7.dims, + units=GridDefinitions.sin_sg7.units, + ), + sin_sg8=self.quantity_factory.from_array( + data=self.sin_sg8, + dims=GridDefinitions.sin_sg8.dims, + 
units=GridDefinitions.sin_sg8.units, + ), + sin_sg9=self.quantity_factory.from_array( + data=self.sin_sg9, + dims=GridDefinitions.sin_sg9.dims, + units=GridDefinitions.sin_sg9.units, + ), cos_sg1=self.quantity_factory.from_array( data=self.cos_sg1, dims=GridDefinitions.cos_sg1.dims, @@ -754,6 +779,31 @@ def grid_data(self) -> "GridData": dims=GridDefinitions.cos_sg4.dims, units=GridDefinitions.cos_sg4.units, ), + cos_sg5=self.quantity_factory.from_array( + data=self.cos_sg5, + dims=GridDefinitions.cos_sg5.dims, + units=GridDefinitions.cos_sg5.units, + ), + cos_sg6=self.quantity_factory.from_array( + data=self.cos_sg6, + dims=GridDefinitions.cos_sg6.dims, + units=GridDefinitions.cos_sg6.units, + ), + cos_sg7=self.quantity_factory.from_array( + data=self.cos_sg7, + dims=GridDefinitions.cos_sg7.dims, + units=GridDefinitions.cos_sg7.units, + ), + cos_sg8=self.quantity_factory.from_array( + data=self.cos_sg8, + dims=GridDefinitions.cos_sg8.dims, + units=GridDefinitions.cos_sg8.units, + ), + cos_sg9=self.quantity_factory.from_array( + data=self.cos_sg9, + dims=GridDefinitions.cos_sg9.dims, + units=GridDefinitions.cos_sg9.units, + ), ) self._grid_data = GridData( horizontal_data=horizontal, From cc620c6283dcd35dcf6829a37d411009e6a011c6 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Sun, 22 Dec 2024 10:42:42 -0500 Subject: [PATCH 62/78] Add `Allreduce` and all MPI OP --- ndsl/comm/caching_comm.py | 12 +++++++++--- ndsl/comm/comm_abc.py | 26 +++++++++++++++++++++++++- ndsl/comm/communicator.py | 19 +++++++++++++++---- ndsl/comm/mpi.py | 31 ++++++++++++++++++++++++++++--- ndsl/comm/null_comm.py | 10 +++++++--- 5 files changed, 84 insertions(+), 14 deletions(-) diff --git a/ndsl/comm/caching_comm.py b/ndsl/comm/caching_comm.py index 36587d73..42f92ea2 100644 --- a/ndsl/comm/caching_comm.py +++ b/ndsl/comm/caching_comm.py @@ -5,7 +5,7 @@ import numpy as np -from ndsl.comm.comm_abc import Comm, Request +from ndsl.comm.comm_abc import Comm, ReductionOperator, Request T = TypeVar("T") @@ -147,9 +147,12 @@ def Split(self, color, key) -> "CachingCommReader": new_data = self._data.get_split() return CachingCommReader(data=new_data) - def allreduce(self, sendobj, op=None) -> Any: + def allreduce(self, sendobj, op: Optional[ReductionOperator] = None) -> Any: return self._data.get_generic_obj() + def Allreduce(self, sendobj, recvobj, op: ReductionOperator) -> Any: + raise NotImplementedError("CachingCommReader.Allreduce") + @classmethod def load(cls, file: BinaryIO) -> "CachingCommReader": data = CachingCommData.load(file) @@ -229,7 +232,10 @@ def Split(self, color, key) -> "CachingCommWriter": def dump(self, file: BinaryIO): self._data.dump(file) - def allreduce(self, sendobj, op=None) -> Any: + def allreduce(self, sendobj, op: Optional[ReductionOperator] = None) -> Any: result = self._comm.allreduce(sendobj, op) self._data.generic_obj_buffers.append(copy.deepcopy(result)) return result + + def Allreduce(self, sendobj, recvobj, op: ReductionOperator) -> Any: + raise NotImplementedError("CachingCommWriter.Allreduce") diff --git a/ndsl/comm/comm_abc.py b/ndsl/comm/comm_abc.py index 77f56586..8192560b 100644 --- a/ndsl/comm/comm_abc.py +++ b/ndsl/comm/comm_abc.py @@ -1,10 +1,30 @@ import abc +import enum from typing import List, Optional, TypeVar T = TypeVar("T") +@enum.unique +class ReductionOperator(enum.Enum): + OP_NULL = enum.auto() + MAX = enum.auto() + MIN = enum.auto() + SUM = enum.auto() + PROD = enum.auto() + LAND = enum.auto() + BAND = enum.auto() + LOR = enum.auto() + BOR = enum.auto() + LXOR 
= enum.auto() + BXOR = enum.auto() + MAXLOC = enum.auto() + MINLOC = enum.auto() + REPLACE = enum.auto() + NO_OP = enum.auto() + + class Request(abc.ABC): @abc.abstractmethod def wait(self): @@ -69,5 +89,9 @@ def Split(self, color, key) -> "Comm": ... @abc.abstractmethod - def allreduce(self, sendobj: T, op=None) -> T: + def allreduce(self, sendobj: T, op: Optional[ReductionOperator] = None) -> T: + ... + + @abc.abstractmethod + def Allreduce(self, sendobj: T, recvobj: T, op: ReductionOperator) -> T: ... diff --git a/ndsl/comm/communicator.py b/ndsl/comm/communicator.py index ead24896..ed1f264d 100644 --- a/ndsl/comm/communicator.py +++ b/ndsl/comm/communicator.py @@ -2,11 +2,11 @@ from typing import List, Mapping, Optional, Sequence, Tuple, Union, cast import numpy as np -from mpi4py import MPI import ndsl.constants as constants from ndsl.buffer import array_buffer, device_synchronize, recv_buffer, send_buffer from ndsl.comm.boundary import Boundary +from ndsl.comm.comm_abc import ReductionOperator from ndsl.comm.partitioner import CubedSpherePartitioner, Partitioner, TilePartitioner from ndsl.halo.updater import HaloUpdater, HaloUpdateRequest, VectorInterfaceHaloUpdater from ndsl.performance.timer import NullTimer, Timer @@ -109,10 +109,13 @@ def _create_all_reduce_quantity( ) return all_reduce_quantity - def all_reduce_sum( - self, input_quantity: Quantity, output_quantity: Quantity = None + def all_reduce( + self, + input_quantity: Quantity, + op: ReductionOperator, + output_quantity: Quantity = None, ): - reduced_quantity_data = self.comm.allreduce(input_quantity.data, MPI.SUM) + reduced_quantity_data = self.comm.allreduce(input_quantity.data, op) if output_quantity is None: all_reduce_quantity = self._create_all_reduce_quantity( input_quantity.metadata, reduced_quantity_data @@ -126,6 +129,14 @@ def all_reduce_sum( output_quantity.data = reduced_quantity_data + def all_reduce_per_element( + self, + input_quantity: Quantity, + output_quantity: Quantity, + op: ReductionOperator, + ): + self.comm.Allreduce(input_quantity.data, output_quantity.data, op) + def _Scatter(self, numpy_module, sendbuf, recvbuf, **kwargs): with send_buffer(numpy_module.zeros, sendbuf) as send, recv_buffer( numpy_module.zeros, recvbuf diff --git a/ndsl/comm/mpi.py b/ndsl/comm/mpi.py index 6f47c791..b3b834b4 100644 --- a/ndsl/comm/mpi.py +++ b/ndsl/comm/mpi.py @@ -1,10 +1,11 @@ try: + import mpi4py from mpi4py import MPI except ImportError: MPI = None -from typing import List, Optional, TypeVar, cast +from typing import Dict, List, Optional, TypeVar, cast -from ndsl.comm.comm_abc import Comm, Request +from ndsl.comm.comm_abc import Comm, ReductionOperator, Request from ndsl.logging import ndsl_log @@ -12,6 +13,24 @@ class MPIComm(Comm): + _op_mapping: Dict[ReductionOperator, mpi4py.MPI.Op] = { + ReductionOperator.OP_NULL: mpi4py.MPI.OP_NULL, + ReductionOperator.MAX: mpi4py.MPI.MAX, + ReductionOperator.MIN: mpi4py.MPI.MIN, + ReductionOperator.SUM: mpi4py.MPI.SUM, + ReductionOperator.PROD: mpi4py.MPI.PROD, + ReductionOperator.LAND: mpi4py.MPI.LAND, + ReductionOperator.BAND: mpi4py.MPI.BAND, + ReductionOperator.LOR: mpi4py.MPI.LOR, + ReductionOperator.BOR: mpi4py.MPI.BOR, + ReductionOperator.LXOR: mpi4py.MPI.LXOR, + ReductionOperator.BXOR: mpi4py.MPI.BXOR, + ReductionOperator.MAXLOC: mpi4py.MPI.MAXLOC, + ReductionOperator.MINLOC: mpi4py.MPI.MINLOC, + ReductionOperator.REPLACE: mpi4py.MPI.REPLACE, + ReductionOperator.NO_OP: mpi4py.MPI.NO_OP, + } + def __init__(self): if MPI is None: raise RuntimeError("MPI not 
available") @@ -72,8 +91,14 @@ def Split(self, color, key) -> "Comm": ) return self._comm.Split(color, key) - def allreduce(self, sendobj: T, op=None) -> T: + def allreduce(self, sendobj: T, op: Optional[ReductionOperator] = None) -> T: ndsl_log.debug( "allreduce on rank %s with operator %s", self._comm.Get_rank(), op ) return self._comm.allreduce(sendobj, op) + + def Allreduce(self, sendobj: T, recvobj: T, op: ReductionOperator) -> T: + ndsl_log.debug( + "allreduce on rank %s with operator %s", self._comm.Get_rank(), op + ) + return self._comm.Allreduce(sendobj, recvobj, self._op_mapping[op]) diff --git a/ndsl/comm/null_comm.py b/ndsl/comm/null_comm.py index 7e0c07fa..5ca92359 100644 --- a/ndsl/comm/null_comm.py +++ b/ndsl/comm/null_comm.py @@ -1,7 +1,7 @@ import copy -from typing import Any, Mapping +from typing import Any, Mapping, Optional -from ndsl.comm.comm_abc import Comm, Request +from ndsl.comm.comm_abc import Comm, ReductionOperator, Request class NullAsyncResult(Request): @@ -91,5 +91,9 @@ def Split(self, color, key): self._split_comms[color].append(new_comm) return new_comm - def allreduce(self, sendobj, op=None) -> Any: + def allreduce(self, sendobj, op: Optional[ReductionOperator] = None) -> Any: return self._fill_value + + def Allreduce(self, sendobj, recvobj, op: ReductionOperator) -> Any: + recvobj = sendobj + return recvobj From 0e8089eed9c909bf91cc1a117ecf12cf6cfe7397 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Sun, 22 Dec 2024 10:45:06 -0500 Subject: [PATCH 63/78] Update utest --- tests/mpi/test_mpi_all_reduce_sum.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/tests/mpi/test_mpi_all_reduce_sum.py b/tests/mpi/test_mpi_all_reduce_sum.py index 858a7f94..4a15ad53 100644 --- a/tests/mpi/test_mpi_all_reduce_sum.py +++ b/tests/mpi/test_mpi_all_reduce_sum.py @@ -7,6 +7,7 @@ Quantity, TilePartitioner, ) +from ndsl.comm.comm_abc import ReductionOperator from ndsl.dsl.typing import Float from tests.mpi.mpi_comm import MPI @@ -48,10 +49,7 @@ def communicator(cube_partitioner): @pytest.mark.skipif( MPI is None, reason="mpi4py is not available or pytest was not run in parallel" ) -def test_all_reduce_sum( - communicator, -): - +def test_all_reduce(communicator): backends = ["dace:cpu", "gt:cpu_kfirst", "numpy"] for backend in backends: @@ -84,15 +82,15 @@ def test_all_reduce_sum( gt4py_backend=backend, ) - global_sum_q = communicator.all_reduce_sum(testQuantity_1D) + global_sum_q = communicator.all_reduce(testQuantity_1D, ReductionOperator.SUM) assert global_sum_q.metadata == testQuantity_1D.metadata assert (global_sum_q.data == (testQuantity_1D.data * communicator.size)).all() - global_sum_q = communicator.all_reduce_sum(testQuantity_2D) + global_sum_q = communicator.all_reduce(testQuantity_2D, ReductionOperator.SUM) assert global_sum_q.metadata == testQuantity_2D.metadata assert (global_sum_q.data == (testQuantity_2D.data * communicator.size)).all() - global_sum_q = communicator.all_reduce_sum(testQuantity_3D) + global_sum_q = communicator.all_reduce(testQuantity_3D, ReductionOperator.SUM) assert global_sum_q.metadata == testQuantity_3D.metadata assert (global_sum_q.data == (testQuantity_3D.data * communicator.size)).all() @@ -125,19 +123,25 @@ def test_all_reduce_sum( units="Some 3D unit", gt4py_backend=backend, ) - communicator.all_reduce_sum(testQuantity_1D, testQuantity_1D_out) + communicator.all_reduce( + testQuantity_1D, ReductionOperator.SUM, testQuantity_1D_out + ) assert testQuantity_1D_out.metadata == 
testQuantity_1D.metadata assert ( testQuantity_1D_out.data == (testQuantity_1D.data * communicator.size) ).all() - communicator.all_reduce_sum(testQuantity_2D, testQuantity_2D_out) + communicator.all_reduce( + testQuantity_2D, ReductionOperator.SUM, testQuantity_2D_out + ) assert testQuantity_2D_out.metadata == testQuantity_2D.metadata assert ( testQuantity_2D_out.data == (testQuantity_2D.data * communicator.size) ).all() - communicator.all_reduce_sum(testQuantity_3D, testQuantity_3D_out) + communicator.all_reduce( + testQuantity_3D, ReductionOperator.SUM, testQuantity_3D_out + ) assert testQuantity_3D_out.metadata == testQuantity_3D.metadata assert ( testQuantity_3D_out.data == (testQuantity_3D.data * communicator.size) From 2188c75cb2b005a76859332bcc906987e0495116 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Sun, 22 Dec 2024 11:01:14 -0500 Subject: [PATCH 64/78] Fix `local_comm` --- ndsl/comm/local_comm.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/ndsl/comm/local_comm.py b/ndsl/comm/local_comm.py index 5ebfb47d..1ae10177 100644 --- a/ndsl/comm/local_comm.py +++ b/ndsl/comm/local_comm.py @@ -189,8 +189,14 @@ def Split(self, color, key): self._split_comms[color].append(new_comm) return new_comm - def allreduce(self, sendobj, op=None) -> Any: + def allreduce(self, sendobj, op=None, recvobj=None) -> Any: raise NotImplementedError( - "sendrecv fundamentally cannot be written for LocalComm, " + "allreduce fundamentally cannot be written for LocalComm, " + "as it requires synchronicity" + ) + + def Allreduce(self, sendobj, recvobj, op) -> Any: + raise NotImplementedError( + "Allreduce fundamentally cannot be written for LocalComm, " "as it requires synchronicity" ) From f8cc2ce97617ee68df87652c5d75abf16b718db8 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Sun, 22 Dec 2024 11:08:38 -0500 Subject: [PATCH 65/78] Fix utest --- ndsl/comm/mpi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ndsl/comm/mpi.py b/ndsl/comm/mpi.py index b3b834b4..3873cc52 100644 --- a/ndsl/comm/mpi.py +++ b/ndsl/comm/mpi.py @@ -95,7 +95,7 @@ def allreduce(self, sendobj: T, op: Optional[ReductionOperator] = None) -> T: ndsl_log.debug( "allreduce on rank %s with operator %s", self._comm.Get_rank(), op ) - return self._comm.allreduce(sendobj, op) + return self._comm.allreduce(sendobj, self._op_mapping[op]) def Allreduce(self, sendobj: T, recvobj: T, op: ReductionOperator) -> T: ndsl_log.debug( From 7ad271f878ef8c605cba02e07828ee596055eed0 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Sun, 22 Dec 2024 11:30:04 -0500 Subject: [PATCH 66/78] Enforce `comm_abc.Comm` into Communicator --- ndsl/comm/communicator.py | 30 +++++++++++++++++++--------- tests/mpi/test_mpi_all_reduce_sum.py | 3 ++- tests/mpi/test_mpi_halo_update.py | 5 +++-- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/ndsl/comm/communicator.py b/ndsl/comm/communicator.py index ed1f264d..c952c022 100644 --- a/ndsl/comm/communicator.py +++ b/ndsl/comm/communicator.py @@ -6,6 +6,7 @@ import ndsl.constants as constants from ndsl.buffer import array_buffer, device_synchronize, recv_buffer, send_buffer from ndsl.comm.boundary import Boundary +from ndsl.comm.comm_abc import Comm as CommABC from ndsl.comm.comm_abc import ReductionOperator from ndsl.comm.partitioner import CubedSpherePartitioner, Partitioner, TilePartitioner from ndsl.halo.updater import HaloUpdater, HaloUpdateRequest, VectorInterfaceHaloUpdater @@ -45,7 +46,11 @@ def to_numpy(array, dtype=None) -> np.ndarray: 
class Communicator(abc.ABC): def __init__( - self, comm, partitioner, force_cpu: bool = False, timer: Optional[Timer] = None + self, + comm: CommABC, + partitioner, + force_cpu: bool = False, + timer: Optional[Timer] = None, ): self.comm = comm self.partitioner: Partitioner = partitioner @@ -62,7 +67,7 @@ def tile(self) -> "TileCommunicator": @abc.abstractmethod def from_layout( cls, - comm, + comm: CommABC, layout: Tuple[int, int], force_cpu: bool = False, timer: Optional[Timer] = None, @@ -138,15 +143,17 @@ def all_reduce_per_element( self.comm.Allreduce(input_quantity.data, output_quantity.data, op) def _Scatter(self, numpy_module, sendbuf, recvbuf, **kwargs): - with send_buffer(numpy_module.zeros, sendbuf) as send, recv_buffer( - numpy_module.zeros, recvbuf - ) as recv: + with ( + send_buffer(numpy_module.zeros, sendbuf) as send, + recv_buffer(numpy_module.zeros, recvbuf) as recv, + ): self.comm.Scatter(send, recv, **kwargs) def _Gather(self, numpy_module, sendbuf, recvbuf, **kwargs): - with send_buffer(numpy_module.zeros, sendbuf) as send, recv_buffer( - numpy_module.zeros, recvbuf - ) as recv: + with ( + send_buffer(numpy_module.zeros, sendbuf) as send, + recv_buffer(numpy_module.zeros, recvbuf) as recv, + ): self.comm.Gather(send, recv, **kwargs) def scatter( @@ -753,7 +760,7 @@ class CubedSphereCommunicator(Communicator): def __init__( self, - comm, + comm: CommABC, partitioner: CubedSpherePartitioner, force_cpu: bool = False, timer: Optional[Timer] = None, @@ -766,6 +773,11 @@ def __init__( force_cpu: Force all communication to go through central memory. timer: Time communication operations. """ + if not issubclass(type(comm), CommABC): + raise TypeError( + "Communictor needs to be instantiated with communication subsytem" + f" derived from `comm_abc.Comm`, got {type(comm)}." 
+ ) if comm.Get_size() != partitioner.total_ranks: raise ValueError( f"was given a partitioner for {partitioner.total_ranks} ranks but a " diff --git a/tests/mpi/test_mpi_all_reduce_sum.py b/tests/mpi/test_mpi_all_reduce_sum.py index 4a15ad53..bec096dd 100644 --- a/tests/mpi/test_mpi_all_reduce_sum.py +++ b/tests/mpi/test_mpi_all_reduce_sum.py @@ -8,6 +8,7 @@ TilePartitioner, ) from ndsl.comm.comm_abc import ReductionOperator +from ndsl.comm.mpi import MPIComm from ndsl.dsl.typing import Float from tests.mpi.mpi_comm import MPI @@ -41,7 +42,7 @@ def cube_partitioner(tile_partitioner): @pytest.fixture() def communicator(cube_partitioner): return CubedSphereCommunicator( - comm=MPI.COMM_WORLD, + comm=MPIComm(), partitioner=cube_partitioner, ) diff --git a/tests/mpi/test_mpi_halo_update.py b/tests/mpi/test_mpi_halo_update.py index ab11b16e..1e6aaefc 100644 --- a/tests/mpi/test_mpi_halo_update.py +++ b/tests/mpi/test_mpi_halo_update.py @@ -8,6 +8,7 @@ Quantity, TilePartitioner, ) +from ndsl.comm.mpi import MPIComm from ndsl.comm._boundary_utils import get_boundary_slice from ndsl.constants import ( BOUNDARY_TYPES, @@ -39,7 +40,7 @@ def layout(): if MPI is not None: size = MPI.COMM_WORLD.Get_size() ranks_per_tile = size // 6 - ranks_per_edge = int(ranks_per_tile ** 0.5) + ranks_per_edge = int(ranks_per_tile**0.5) return (ranks_per_edge, ranks_per_edge) else: return (1, 1) @@ -176,7 +177,7 @@ def extent(n_points, dims, nz, ny, nx): @pytest.fixture() def communicator(cube_partitioner): return CubedSphereCommunicator( - comm=MPI.COMM_WORLD, + comm=MPIComm(), partitioner=cube_partitioner, ) From 07cd0f32cb5e203ae98c0946a092c5edc9e7f7c0 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Sun, 22 Dec 2024 11:58:52 -0500 Subject: [PATCH 67/78] Fix `comm` object in serial utest --- tests/dsl/test_compilation_config.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/tests/dsl/test_compilation_config.py b/tests/dsl/test_compilation_config.py index 62049d91..95ca7f74 100644 --- a/tests/dsl/test_compilation_config.py +++ b/tests/dsl/test_compilation_config.py @@ -9,6 +9,7 @@ CubedSpherePartitioner, RunMode, TilePartitioner, + NullComm, ) @@ -33,8 +34,7 @@ def test_check_communicator_valid( partitioner = CubedSpherePartitioner( TilePartitioner((int(sqrt(size / 6)), int((sqrt(size / 6))))) ) - comm = unittest.mock.MagicMock() - comm.Get_size.return_value = size + comm = NullComm(rank=0, total_ranks=size) cubed_sphere_comm = CubedSphereCommunicator(comm, partitioner) config = CompilationConfig( run_mode=run_mode, use_minimal_caching=use_minimal_caching @@ -52,8 +52,7 @@ def test_check_communicator_invalid( nx: int, ny: int, use_minimal_caching: bool, run_mode: RunMode ): partitioner = CubedSpherePartitioner(TilePartitioner((nx, ny))) - comm = unittest.mock.MagicMock() - comm.Get_size.return_value = nx * ny * 6 + comm = NullComm(rank=0, total_ranks=nx * ny * 6) cubed_sphere_comm = CubedSphereCommunicator(comm, partitioner) config = CompilationConfig( run_mode=run_mode, use_minimal_caching=use_minimal_caching @@ -91,9 +90,7 @@ def test_get_decomposition_info_from_comm( partitioner = CubedSpherePartitioner( TilePartitioner((int(sqrt(size / 6)), int(sqrt(size / 6)))) ) - comm = unittest.mock.MagicMock() - comm.Get_rank.return_value = rank - comm.Get_size.return_value = size + comm = NullComm(rank=rank, total_ranks=size) cubed_sphere_comm = CubedSphereCommunicator(comm, partitioner) config = CompilationConfig(use_minimal_caching=True, run_mode=RunMode.Run) ( @@ -133,8 +130,7 @@ 
def test_determine_compiling_equivalent( TilePartitioner((sqrt(size / 6), sqrt(size / 6))) ) comm = unittest.mock.MagicMock() - comm.Get_rank.return_value = rank - comm.Get_size.return_value = size + comm = NullComm(rank=rank, total_ranks=size) cubed_sphere_comm = CubedSphereCommunicator(comm, partitioner) assert ( config.determine_compiling_equivalent(rank, cubed_sphere_comm.partitioner) From 224e6e24ecfc09e65bf8f0ec1f9b3f3b0d4c7ed2 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Sun, 22 Dec 2024 13:50:37 -0500 Subject: [PATCH 68/78] Lint + `MPIComm` on testing architecture --- ndsl/comm/communicator.py | 16 ++++++---------- ndsl/stencils/testing/conftest.py | 11 +++++------ ndsl/stencils/testing/test_translate.py | 17 +++++++++-------- tests/dsl/test_compilation_config.py | 2 +- tests/mpi/test_mpi_halo_update.py | 4 ++-- 5 files changed, 23 insertions(+), 27 deletions(-) diff --git a/ndsl/comm/communicator.py b/ndsl/comm/communicator.py index c952c022..1ea4f5a3 100644 --- a/ndsl/comm/communicator.py +++ b/ndsl/comm/communicator.py @@ -143,18 +143,14 @@ def all_reduce_per_element( self.comm.Allreduce(input_quantity.data, output_quantity.data, op) def _Scatter(self, numpy_module, sendbuf, recvbuf, **kwargs): - with ( - send_buffer(numpy_module.zeros, sendbuf) as send, - recv_buffer(numpy_module.zeros, recvbuf) as recv, - ): - self.comm.Scatter(send, recv, **kwargs) + with send_buffer(numpy_module.zeros, sendbuf) as send: + with recv_buffer(numpy_module.zeros, recvbuf) as recv: + self.comm.Scatter(send, recv, **kwargs) def _Gather(self, numpy_module, sendbuf, recvbuf, **kwargs): - with ( - send_buffer(numpy_module.zeros, sendbuf) as send, - recv_buffer(numpy_module.zeros, recvbuf) as recv, - ): - self.comm.Gather(send, recv, **kwargs) + with send_buffer(numpy_module.zeros, sendbuf) as send: + with recv_buffer(numpy_module.zeros, recvbuf) as recv: + self.comm.Gather(send, recv, **kwargs) def scatter( self, diff --git a/ndsl/stencils/testing/conftest.py b/ndsl/stencils/testing/conftest.py index af5bb6a6..9810fb4a 100644 --- a/ndsl/stencils/testing/conftest.py +++ b/ndsl/stencils/testing/conftest.py @@ -13,7 +13,7 @@ CubedSphereCommunicator, TileCommunicator, ) -from ndsl.comm.mpi import MPI +from ndsl.comm.mpi import MPI, MPIComm from ndsl.comm.partitioner import CubedSpherePartitioner, TilePartitioner from ndsl.dsl.dace.dace_config import DaceConfig from ndsl.namelist import Namelist @@ -308,7 +308,7 @@ def compute_grid_data(grid, namelist, backend, layout, topology_mode): npx=namelist.npx, npy=namelist.npy, npz=namelist.npz, - communicator=get_communicator(MPI.COMM_WORLD, layout, topology_mode), + communicator=get_communicator(MPIComm(), layout, topology_mode), backend=backend, ) @@ -360,13 +360,12 @@ def generate_parallel_stencil_tests(metafunc, *, backend: str): metafunc.config ) # get MPI environment - comm = MPI.COMM_WORLD - mpi_rank = comm.Get_rank() + comm = MPIComm() savepoint_cases = parallel_savepoint_cases( metafunc, data_path, namelist_filename, - mpi_rank, + comm.Get_rank(), backend=backend, comm=comm, ) @@ -376,7 +375,7 @@ def generate_parallel_stencil_tests(metafunc, *, backend: str): def get_communicator(comm, layout, topology_mode): - if (MPI.COMM_WORLD.Get_size() > 1) and (topology_mode == "cubed-sphere"): + if (comm.Get_size() > 1) and (topology_mode == "cubed-sphere"): partitioner = CubedSpherePartitioner(TilePartitioner(layout)) communicator = CubedSphereCommunicator(comm, partitioner) else: diff --git a/ndsl/stencils/testing/test_translate.py 
b/ndsl/stencils/testing/test_translate.py index db8e6047..64ae5f62 100644 --- a/ndsl/stencils/testing/test_translate.py +++ b/ndsl/stencils/testing/test_translate.py @@ -8,7 +8,7 @@ import ndsl.dsl.gt4py_utils as gt_utils from ndsl.comm.communicator import CubedSphereCommunicator, TileCommunicator -from ndsl.comm.mpi import MPI +from ndsl.comm.mpi import MPI, MPIComm from ndsl.comm.partitioner import CubedSpherePartitioner, TilePartitioner from ndsl.dsl.dace.dace_config import DaceConfig from ndsl.dsl.stencil import CompilationConfig, StencilConfig @@ -288,18 +288,19 @@ def test_parallel_savepoint( multimodal_metric, xy_indices=True, ): - if MPI.COMM_WORLD.Get_size() % 6 != 0: + mpi_comm = MPIComm() + if mpi_comm.Get_size() % 6 != 0: layout = ( - int(MPI.COMM_WORLD.Get_size() ** 0.5), - int(MPI.COMM_WORLD.Get_size() ** 0.5), + int(mpi_comm.Get_size() ** 0.5), + int(mpi_comm.Get_size() ** 0.5), ) - communicator = get_tile_communicator(MPI.COMM_WORLD, layout) + communicator = get_tile_communicator(mpi_comm, layout) else: layout = ( - int((MPI.COMM_WORLD.Get_size() // 6) ** 0.5), - int((MPI.COMM_WORLD.Get_size() // 6) ** 0.5), + int((mpi_comm.Get_size() // 6) ** 0.5), + int((mpi_comm.Get_size() // 6) ** 0.5), ) - communicator = get_communicator(MPI.COMM_WORLD, layout) + communicator = get_communicator(mpi_comm, layout) if case.testobj is None: pytest.xfail( f"no translate object available for savepoint {case.savepoint_name}" diff --git a/tests/dsl/test_compilation_config.py b/tests/dsl/test_compilation_config.py index 95ca7f74..fa323b06 100644 --- a/tests/dsl/test_compilation_config.py +++ b/tests/dsl/test_compilation_config.py @@ -7,9 +7,9 @@ CompilationConfig, CubedSphereCommunicator, CubedSpherePartitioner, + NullComm, RunMode, TilePartitioner, - NullComm, ) diff --git a/tests/mpi/test_mpi_halo_update.py b/tests/mpi/test_mpi_halo_update.py index 1e6aaefc..b6c38e95 100644 --- a/tests/mpi/test_mpi_halo_update.py +++ b/tests/mpi/test_mpi_halo_update.py @@ -8,8 +8,8 @@ Quantity, TilePartitioner, ) -from ndsl.comm.mpi import MPIComm from ndsl.comm._boundary_utils import get_boundary_slice +from ndsl.comm.mpi import MPIComm from ndsl.constants import ( BOUNDARY_TYPES, EDGE_BOUNDARY_TYPES, @@ -40,7 +40,7 @@ def layout(): if MPI is not None: size = MPI.COMM_WORLD.Get_size() ranks_per_tile = size // 6 - ranks_per_edge = int(ranks_per_tile**0.5) + ranks_per_edge = int(ranks_per_tile ** 0.5) return (ranks_per_edge, ranks_per_edge) else: return (1, 1) From f99914a3c47b038fb5f145126ceda9e3bfdd7cd4 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Fri, 27 Dec 2024 09:46:18 -0500 Subject: [PATCH 69/78] Make sure the correct allocator backend is used for Quantities --- ndsl/boilerplate.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ndsl/boilerplate.py b/ndsl/boilerplate.py index dece7ce4..a777cd82 100644 --- a/ndsl/boilerplate.py +++ b/ndsl/boilerplate.py @@ -16,6 +16,7 @@ TileCommunicator, TilePartitioner, ) +from ndsl.optional_imports import cupy as cp def _get_factories( @@ -74,7 +75,9 @@ def _get_factories( grid_indexing = GridIndexing.from_sizer_and_communicator(sizer, comm) stencil_factory = StencilFactory(config=stencil_config, grid_indexing=grid_indexing) - quantity_factory = QuantityFactory(sizer, np) + quantity_factory = QuantityFactory( + sizer, cp if stencil_config.is_gpu_backend else np + ) return stencil_factory, quantity_factory From 760578c4e8cf505c96e030f521eb3addf463b871 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Mon, 30 Dec 2024 11:21:39 -0500 
Subject: [PATCH 70/78] Add in_place option for Allreduce --- ndsl/comm/comm_abc.py | 3 +++ ndsl/comm/communicator.py | 5 +++++ ndsl/comm/mpi.py | 14 +++++++++++--- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/ndsl/comm/comm_abc.py b/ndsl/comm/comm_abc.py index 8192560b..45596f1e 100644 --- a/ndsl/comm/comm_abc.py +++ b/ndsl/comm/comm_abc.py @@ -95,3 +95,6 @@ def allreduce(self, sendobj: T, op: Optional[ReductionOperator] = None) -> T: @abc.abstractmethod def Allreduce(self, sendobj: T, recvobj: T, op: ReductionOperator) -> T: ... + + def Allreduce_inplace(self, obj: T, op: ReductionOperator) -> T: + ... diff --git a/ndsl/comm/communicator.py b/ndsl/comm/communicator.py index 1ea4f5a3..ba980d19 100644 --- a/ndsl/comm/communicator.py +++ b/ndsl/comm/communicator.py @@ -142,6 +142,11 @@ def all_reduce_per_element( ): self.comm.Allreduce(input_quantity.data, output_quantity.data, op) + def all_reduce_per_element_in_place( + self, quantity: Quantity, op: ReductionOperator + ): + self.comm.Allreduce_inplace(quantity.data, op) + def _Scatter(self, numpy_module, sendbuf, recvbuf, **kwargs): with send_buffer(numpy_module.zeros, sendbuf) as send: with recv_buffer(numpy_module.zeros, recvbuf) as recv: diff --git a/ndsl/comm/mpi.py b/ndsl/comm/mpi.py index 3873cc52..6b3ff17f 100644 --- a/ndsl/comm/mpi.py +++ b/ndsl/comm/mpi.py @@ -97,8 +97,16 @@ def allreduce(self, sendobj: T, op: Optional[ReductionOperator] = None) -> T: ) return self._comm.allreduce(sendobj, self._op_mapping[op]) - def Allreduce(self, sendobj: T, recvobj: T, op: ReductionOperator) -> T: + def Allreduce(self, sendobj_or_inplace: T, recvobj: T, op: ReductionOperator) -> T: ndsl_log.debug( - "allreduce on rank %s with operator %s", self._comm.Get_rank(), op + "Allreduce on rank %s with operator %s", self._comm.Get_rank(), op + ) + return self._comm.Allreduce(sendobj_or_inplace, recvobj, self._op_mapping[op]) + + def Allreduce_inplace(self, recvobj: T, op: ReductionOperator) -> T: + ndsl_log.debug( + "Allreduce (in place) on rank %s with operator %s", + self._comm.Get_rank(), + op, ) - return self._comm.Allreduce(sendobj, recvobj, self._op_mapping[op]) + return self._comm.Allreduce(mpi4py.MPI.IN_PLACE, recvobj, self._op_mapping[op]) From 4e06ee8063e636141b2c3e1f4b098da3fc408dce Mon Sep 17 00:00:00 2001 From: Roman Cattaneo Date: Tue, 7 Jan 2025 17:12:17 +0100 Subject: [PATCH 71/78] Cleanup ndsl/dsl/dace/utils.py (#96) * Fix typos * DaCeProgress: avoid double assignment of prefix * Add type hints/simplify kernel_theoretical_timing Adding type hints allowed to simplify `kernel_theoretical_timing`. 
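A minimal usage sketch of the cleaned-up `DaCeProgress` context manager, illustrative only and not part of this patch; the `DaceConfig` arguments mirror how `MaxBandwidthBenchmarkProgram` builds its config below, and "dace:cpu" is an assumed backend:

    # Hypothetical sketch, assuming debug logging is enabled so the
    # enter/exit messages emitted by DaCeProgress are visible.
    from ndsl.dsl.dace.dace_config import DaceConfig
    from ndsl.dsl.dace.orchestration import DaCeOrchestration
    from ndsl.dsl.dace.utils import DaCeProgress

    config = DaceConfig(None, "dace:cpu", orchestration=DaCeOrchestration.BuildAndRun)
    with DaCeProgress(config, "SDFG transforms"):
        ...  # the timed build step (e.g. an SDFG transformation) would run here

On exit, `DaCeProgress.__exit__` logs the elapsed time via `ndsl_log.debug`, so call sites need no explicit timer handling.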
--- ndsl/__init__.py | 2 +- ndsl/dsl/dace/utils.py | 86 +++++++++++++++++++++--------------------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/ndsl/__init__.py b/ndsl/__init__.py index a2f771cd..5ec303de 100644 --- a/ndsl/__init__.py +++ b/ndsl/__init__.py @@ -10,7 +10,7 @@ from .dsl.dace.utils import ( ArrayReport, DaCeProgress, - MaxBandwithBenchmarkProgram, + MaxBandwidthBenchmarkProgram, StorageReport, ) from .dsl.dace.wrapped_halo_exchange import WrappedHaloUpdater diff --git a/ndsl/dsl/dace/utils.py b/ndsl/dsl/dace/utils.py index 29e3211d..05fa5754 100644 --- a/ndsl/dsl/dace/utils.py +++ b/ndsl/dsl/dace/utils.py @@ -15,26 +15,22 @@ from ndsl.optional_imports import cupy as cp -# ---------------------------------------------------------- -# Rough timer & log for major operations of DaCe build stack -# ---------------------------------------------------------- class DaCeProgress: - """Timer and log to track build progress""" + """Rough timer & log for major operations of DaCe build stack.""" - def __init__(self, config: DaceConfig, label: str): + def __init__(self, config: DaceConfig, label: str) -> None: self.prefix = DaCeProgress.default_prefix(config) - self.prefix = f"[{config.get_orchestrate()}]" self.label = label @classmethod def default_prefix(cls, config: DaceConfig) -> str: return f"[{config.get_orchestrate()}]" - def __enter__(self): + def __enter__(self) -> None: ndsl_log.debug(f"{self.prefix} {self.label}...") self.start = time.time() - def __exit__(self, _type, _val, _traceback): + def __exit__(self, _type, _val, _traceback) -> None: elapsed = time.time() - self.start ndsl_log.debug(f"{self.prefix} {self.label}...{elapsed}s.") @@ -81,7 +77,7 @@ def memory_static_analysis( """Analysis an SDFG for memory pressure. The results split memory by type (dace.StorageType) and account for - allocated, unreferenced and top lovel (e.g. top-most SDFG) memory + allocated, unreferenced and top level (e.g. 
top-most SDFG) memory """ # We report all allocation type allocations: Dict[dace.StorageType, StorageReport] = {} @@ -92,7 +88,7 @@ def memory_static_analysis( array_size_in_bytes = arr.total_size * arr.dtype.bytes ref = _is_ref(sd, aname) - # Transient in maps (refrence and not referenced) + # Transient in maps (reference and not referenced) if sd is not sdfg and arr.transient: if arr.pool: allocations[arr.storage].in_pooled_in_bytes += array_size_in_bytes @@ -111,7 +107,7 @@ def memory_static_analysis( else: allocations[arr.storage].unreferenced_in_bytes += array_size_in_bytes - # SDFG-level memory (refrence, not referenced and pooled) + # SDFG-level memory (reference, not referenced and pooled) elif sd is sdfg: if arr.pool: allocations[arr.storage].in_pooled_in_bytes += array_size_in_bytes @@ -137,7 +133,7 @@ def memory_static_analysis( def report_memory_static_analysis( sdfg: dace.sdfg.SDFG, allocations: Dict[dace.StorageType, StorageReport], - detail_report=False, + detail_report: bool = False, ) -> str: """Create a human readable report form the memory analysis results""" report = f"{sdfg.name}:\n" @@ -145,14 +141,14 @@ def report_memory_static_analysis( alloc_in_mb = float(allocs.referenced_in_bytes / (1024 * 1024)) unref_alloc_in_mb = float(allocs.unreferenced_in_bytes / (1024 * 1024)) in_pooled_in_mb = float(allocs.in_pooled_in_bytes / (1024 * 1024)) - toplvlalloc_in_mb = float(allocs.top_level_in_bytes / (1024 * 1024)) - if alloc_in_mb or toplvlalloc_in_mb > 0: + top_level_alloc_in_mb = float(allocs.top_level_in_bytes / (1024 * 1024)) + if alloc_in_mb or top_level_alloc_in_mb > 0: report += ( f"{storage}:\n" f" Alloc ref {alloc_in_mb:.2f} mb\n" f" Alloc unref {unref_alloc_in_mb:.2f} mb\n" f" Pooled {in_pooled_in_mb:.2f} mb\n" - f" Top lvl alloc: {toplvlalloc_in_mb:.2f}mb\n" + f" Top lvl alloc: {top_level_alloc_in_mb:.2f}mb\n" ) if detail_report: report += "\n" @@ -172,7 +168,9 @@ def report_memory_static_analysis( return report -def memory_static_analysis_from_path(sdfg_path: str, detail_report=False) -> str: +def memory_static_analysis_from_path( + sdfg_path: str, detail_report: bool = False +) -> str: """Open a SDFG and report the memory analysis""" sdfg = dace.SDFG.from_file(sdfg_path) return report_memory_static_analysis( @@ -183,53 +181,55 @@ def memory_static_analysis_from_path(sdfg_path: str, detail_report=False) -> str # ---------------------------------------------------------- -# Theoritical bandwith from SDFG +# Theoretical bandwidth from SDFG # ---------------------------------------------------------- -def copy_defn(q_in: FloatField, q_out: FloatField): +def copy_kernel(q_in: FloatField, q_out: FloatField) -> None: with computation(PARALLEL), interval(...): q_in = q_out -class MaxBandwithBenchmarkProgram: +class MaxBandwidthBenchmarkProgram: def __init__(self, size, backend) -> None: from ndsl.dsl.dace.orchestration import DaCeOrchestration, orchestrate - dconfig = DaceConfig(None, backend, orchestration=DaCeOrchestration.BuildAndRun) + dace_config = DaceConfig( + None, backend, orchestration=DaCeOrchestration.BuildAndRun + ) c = CompilationConfig(backend=backend) - s = StencilConfig(dace_config=dconfig, compilation_config=c) + s = StencilConfig(dace_config=dace_config, compilation_config=c) self.copy_stencil = FrozenStencil( - func=copy_defn, + func=copy_kernel, origin=(0, 0, 0), domain=size, stencil_config=s, ) - orchestrate(obj=self, config=dconfig) + orchestrate(obj=self, config=dace_config) - def __call__(self, A, B, n: int): + def __call__(self, A, B, n: int) 
-> None: for i in dace.nounroll(range(n)): self.copy_stencil(A, B) def kernel_theoretical_timing( sdfg: dace.sdfg.SDFG, - hardware_bw_in_GB_s=None, - backend=None, + hardware_bw_in_GB_s: Optional[float] = None, + backend: Optional[str] = None, ) -> Dict[str, float]: """Compute a lower timing bound for kernels with the following hypothesis: - Performance is memory bound, e.g. arithmetic intensity isn't counted - Hardware bandwidth comes from a GT4Py/DaCe test rather than a spec sheet for - for higher accuracy. Best is to run a copy_stencils on a full domain + for higher accuracy. Best is to run a copy_stencil on a full domain - Memory pressure is mostly in read/write from global memory, inner scalar & shared memory is not counted towards memory movement. """ - if not hardware_bw_in_GB_s: + if hardware_bw_in_GB_s is None: size = np.array(sdfg.arrays["__g_self__w"].shape) print( - f"Calculating experimental hardware bandwith on {size}" + f"Calculating experimental hardware bandwidth on {size}" f" arrays at {Float} precision..." ) - bench = MaxBandwithBenchmarkProgram(size, backend) + bench = MaxBandwidthBenchmarkProgram(size, backend) if backend == "dace:gpu": A = cp.ones(size, dtype=Float) B = cp.ones(size, dtype=Float) @@ -248,13 +248,19 @@ def kernel_theoretical_timing( bench(A, B, n) dt.append((time.time() - s) / n) memory_size_in_b = np.prod(size) * np.dtype(Float).itemsize * 8 - bandwidth_in_bytes_s = memory_size_in_b / np.median(dt) - print( - f"Hardware bandwith computed: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s" - ) - else: - bandwidth_in_bytes_s = hardware_bw_in_GB_s * 1024 * 1024 * 1024 - print(f"Given hardware bandwith: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s") + measured_bandwidth_in_bytes_s = memory_size_in_b / np.median(dt) + + bandwidth_in_bytes_s = ( + measured_bandwidth_in_bytes_s + if hardware_bw_in_GB_s is None + else hardware_bw_in_GB_s * 1024 * 1024 * 1024 + ) + label = ( + "Hardware bandwidth computed" + if hardware_bw_in_GB_s is None + else "Given hardware bandwidth" + ) + print(f"{label}: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s") allmaps = [ (me, state) @@ -307,12 +313,6 @@ def kernel_theoretical_timing( except TypeError: pass - # Bad expansion - if not isinstance(newresult_in_us, sympy.core.numbers.Float) and not isinstance( - newresult_in_us, float - ): - continue - result[node.label] = float(newresult_in_us) return result From fcfb0584540eeaee558ac26bd40be74ceb47ca4d Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 7 Jan 2025 13:52:33 -0500 Subject: [PATCH 72/78] Fix merge --- ndsl/quantity/metadata.py | 9 +++++++++ ndsl/quantity/quantity.py | 9 --------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ndsl/quantity/metadata.py b/ndsl/quantity/metadata.py index e8591394..d7ddba0f 100644 --- a/ndsl/quantity/metadata.py +++ b/ndsl/quantity/metadata.py @@ -45,6 +45,15 @@ def np(self) -> NumpyModule: f"quantity underlying data is of unexpected type {self.data_type}" ) + def duplicate_metadata(self, metadata_copy): + metadata_copy.origin = self.origin + metadata_copy.extent = self.extent + metadata_copy.dims = self.dims + metadata_copy.units = self.units + metadata_copy.data_type = self.data_type + metadata_copy.dtype = self.dtype + metadata_copy.gt4py_backend = self.gt4py_backend + @dataclasses.dataclass class QuantityHaloSpec: diff --git a/ndsl/quantity/quantity.py b/ndsl/quantity/quantity.py index 6ddc90e5..c88ba140 100644 --- a/ndsl/quantity/quantity.py +++ b/ndsl/quantity/quantity.py @@ -262,15 +262,6 @@ def data_array(self) -> 
xr.DataArray: def np(self) -> NumpyModule: return self.metadata.np - def duplicate_metadata(self, metadata_copy): - metadata_copy.origin = self.origin - metadata_copy.extent = self.extent - metadata_copy.dims = self.dims - metadata_copy.units = self.units - metadata_copy.data_type = self.data_type - metadata_copy.dtype = self.dtype - metadata_copy.gt4py_backend = self.gt4py_backend - @property def __array_interface__(self): return self.data.__array_interface__ From 1fa8d79d13d3659ae0cc32c1fa6e2836752b6b37 Mon Sep 17 00:00:00 2001 From: Frank Malatino Date: Thu, 16 Jan 2025 13:58:40 -0500 Subject: [PATCH 73/78] Hotfix for grid generation use of mpi operators --- ndsl/grid/generation.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/ndsl/grid/generation.py b/ndsl/grid/generation.py index 75437272..50209d21 100644 --- a/ndsl/grid/generation.py +++ b/ndsl/grid/generation.py @@ -5,6 +5,7 @@ import numpy as np +from ndsl.comm.comm_abc import ReductionOperator from ndsl.comm.communicator import Communicator from ndsl.constants import ( N_HALO_DEFAULT, @@ -3428,7 +3429,11 @@ def _reduce_global_area_minmaxes(self): max_area = self._np.max(self.area.data[3:-4, 3:-4])[()] min_area_c = self._np.min(self.area_c.data[3:-4, 3:-4])[()] max_area_c = self._np.max(self.area_c.data[3:-4, 3:-4])[()] - self._da_min = float(self._comm.comm.allreduce(min_area, min)) - self._da_max = float(self._comm.comm.allreduce(max_area, max)) - self._da_min_c = float(self._comm.comm.allreduce(min_area_c, min)) - self._da_max_c = float(self._comm.comm.allreduce(max_area_c, max)) + self._da_min = float(self._comm.comm.allreduce(min_area, ReductionOperator.MIN)) + self._da_max = float(self._comm.comm.allreduce(max_area, ReductionOperator.MAX)) + self._da_min_c = float( + self._comm.comm.allreduce(min_area_c, ReductionOperator.MIN) + ) + self._da_max_c = float( + self._comm.comm.allreduce(max_area_c, ReductionOperator.MAX) + ) From 3252ec7f2834c56bd30edd22c6393785ae658447 Mon Sep 17 00:00:00 2001 From: Roman Cattaneo <1116746+romanc@users.noreply.github.com> Date: Mon, 20 Jan 2025 18:54:20 +0100 Subject: [PATCH 74/78] Merge examples/mpi/.gitignore into top-level .gitignore --- .gitignore | 3 +++ examples/mpi/.gitignore | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) delete mode 100644 examples/mpi/.gitignore diff --git a/.gitignore b/.gitignore index 24f794a4..35923df0 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,9 @@ driver/examples/comm 20*-*-*-*-*-*.json *.pkl +# example outputs +examples/mpi/output + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/examples/mpi/.gitignore b/examples/mpi/.gitignore deleted file mode 100644 index 53752db2..00000000 --- a/examples/mpi/.gitignore +++ /dev/null @@ -1 +0,0 @@ -output From 04ecf871b985f7608ddae62fa27c209998874552 Mon Sep 17 00:00:00 2001 From: Roman Cattaneo <1116746+romanc@users.noreply.github.com> Date: Mon, 20 Jan 2025 18:55:12 +0100 Subject: [PATCH 75/78] Remove hard-coded __version__ numbers Removes hard-coded version numbers from `__init__` files. 
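If downstream code still needs a version string, a common replacement is to resolve it from the installed package metadata instead; the sketch below is an assumption (including the distribution name "ndsl"), not part of this patch:

    from importlib.metadata import PackageNotFoundError, version

    try:
        __version__ = version("ndsl")
    except PackageNotFoundError:
        # Source checkout that was never pip-installed: no metadata available.
        __version__ = "0.0.0+unknown"

This keeps the package metadata as the single source of truth rather than per-module hard-coded strings.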
--- ndsl/dsl/__init__.py | 2 -- ndsl/stencils/__init__.py | 3 --- 2 files changed, 5 deletions(-) diff --git a/ndsl/dsl/__init__.py b/ndsl/dsl/__init__.py index ed44420a..b7034e07 100644 --- a/ndsl/dsl/__init__.py +++ b/ndsl/dsl/__init__.py @@ -9,5 +9,3 @@ gt4py.cartesian.config.cache_settings["dir_name"] = os.environ.get( "GT_CACHE_DIR_NAME", f".gt_cache_{MPI.COMM_WORLD.Get_rank():06}" ) - -__version__ = "0.2.0" diff --git a/ndsl/stencils/__init__.py b/ndsl/stencils/__init__.py index cbbd3f82..5115a28e 100644 --- a/ndsl/stencils/__init__.py +++ b/ndsl/stencils/__init__.py @@ -1,4 +1 @@ from .corners import CopyCorners, CopyCornersXY, FillCornersBGrid - - -__version__ = "0.2.0" From 4881b34c14f8d3ffcaae387c75ac016a9e520a85 Mon Sep 17 00:00:00 2001 From: Roman Cattaneo <1116746+romanc@users.noreply.github.com> Date: Mon, 20 Jan 2025 19:54:15 +0100 Subject: [PATCH 76/78] Fixing a bunch of typos --- ndsl/comm/boundary.py | 4 ++-- ndsl/comm/communicator.py | 6 +++--- ndsl/dsl/dace/dace_config.py | 4 ++-- ndsl/dsl/gt4py_utils.py | 2 +- ndsl/grid/generation.py | 14 ++++++------- ndsl/grid/global_setup.py | 6 +++--- ndsl/grid/gnomonic.py | 6 +++--- ndsl/grid/mirror.py | 2 +- ndsl/grid/stretch_transformation.py | 4 ++-- ndsl/halo/data_transformer.py | 2 +- ndsl/halo/updater.py | 10 ++++----- ndsl/io.py | 2 +- ndsl/namelist.py | 10 ++++----- ndsl/stencils/c2l_ord.py | 2 +- ndsl/stencils/corners.py | 2 +- ndsl/stencils/testing/README.md | 12 +++++------ ndsl/stencils/testing/serialbox_to_netcdf.py | 6 +++--- ndsl/stencils/testing/test_translate.py | 16 +++++++------- ndsl/testing/README.md | 10 ++++----- ndsl/testing/comparison.py | 22 ++++++++++---------- ndsl/utils.py | 2 +- tests/dsl/test_caches.py | 16 +++++++------- tests/grid/test_eta.py | 8 +++---- tests/mpi/test_mpi_mock.py | 8 +++---- tests/test_halo_data_transformer.py | 16 +++++++------- tests/test_halo_update.py | 2 +- 26 files changed, 97 insertions(+), 97 deletions(-) diff --git a/ndsl/comm/boundary.py b/ndsl/comm/boundary.py index 020798c6..f6016f6e 100644 --- a/ndsl/comm/boundary.py +++ b/ndsl/comm/boundary.py @@ -28,7 +28,7 @@ def send_view(self, quantity: Quantity, n_points: int): return self._view(quantity, n_points, interior=True) def recv_view(self, quantity: Quantity, n_points: int): - """Return a sliced view of points which should be recieved at this boundary. + """Return a sliced view of points which should be received at this boundary. Args: quantity: quantity for which to return a slice @@ -37,7 +37,7 @@ def recv_view(self, quantity: Quantity, n_points: int): return self._view(quantity, n_points, interior=False) def send_slice(self, specification: QuantityHaloSpec) -> Tuple[slice]: - """Return the index slices which shoud be sent at this boundary. + """Return the index slices which should be sent at this boundary. Args: specification: data specifications for the halo. Including shape diff --git a/ndsl/comm/communicator.py b/ndsl/comm/communicator.py index ba980d19..014142da 100644 --- a/ndsl/comm/communicator.py +++ b/ndsl/comm/communicator.py @@ -304,7 +304,7 @@ def gather_state(self, send_state=None, recv_state=None, transfer_type=None): Args: send_state: the model state to be sent containing the subtile data - recv_state: the pre-allocated state in which to recieve the full tile + recv_state: the pre-allocated state in which to receive the full tile state. Only variables which are scattered will be written to. 
Returns: recv_state: on the root rank, the state containing the entire tile @@ -340,7 +340,7 @@ def scatter_state(self, send_state=None, recv_state=None): Args: send_state: the model state to be sent containing the entire tile, required only from the root rank - recv_state: the pre-allocated state in which to recieve the scattered + recv_state: the pre-allocated state in which to receive the scattered state. Only variables which are scattered will be written to. Returns: rank_state: the state corresponding to this rank's subdomain @@ -776,7 +776,7 @@ def __init__( """ if not issubclass(type(comm), CommABC): raise TypeError( - "Communictor needs to be instantiated with communication subsytem" + "Communicator needs to be instantiated with communication subsystem" f" derived from `comm_abc.Comm`, got {type(comm)}." ) if comm.Get_size() != partitioner.total_ranks: diff --git a/ndsl/dsl/dace/dace_config.py b/ndsl/dsl/dace/dace_config.py index 7f1c1477..5129dac8 100644 --- a/ndsl/dsl/dace/dace_config.py +++ b/ndsl/dsl/dace/dace_config.py @@ -65,7 +65,7 @@ def _determine_compiling_ranks( 6 7 8 3 4 5 0 1 2 - Using the partitionner we find mapping of the given layout + Using the partitioner we find mapping of the given layout to all of those. For example on 4x4 layout 12 13 14 15 8 9 10 11 @@ -217,7 +217,7 @@ def __init__( # Block size/thread count is defaulted to an average value for recent # hardware (Pascal and upward). The problem of setting an optimized # block/thread is both hardware and problem dependant. Fine tuners - # available in DaCe should be relied on for futher tuning of this value. + # available in DaCe should be relied on for further tuning of this value. dace.config.Config.set( "compiler", "cuda", "default_block_size", value="64,8,1" ) diff --git a/ndsl/dsl/gt4py_utils.py b/ndsl/dsl/gt4py_utils.py index acfee07f..31c60ca7 100644 --- a/ndsl/dsl/gt4py_utils.py +++ b/ndsl/dsl/gt4py_utils.py @@ -68,7 +68,7 @@ def _mask_to_dimensions( def _translate_origin(origin: Sequence[int], mask: Tuple[bool, ...]) -> Sequence[int]: if len(origin) == int(sum(mask)): - # Correct length. Assumedd to be correctly specified. + # Correct length. Assumed to be correctly specified. 
return origin assert len(mask) == 3 diff --git a/ndsl/grid/generation.py b/ndsl/grid/generation.py index 50209d21..f77e2cd2 100644 --- a/ndsl/grid/generation.py +++ b/ndsl/grid/generation.py @@ -687,7 +687,7 @@ def ptop(self) -> Quantity: @property def ec1(self) -> Quantity: """ - cartesian components of the local unit vetcor + cartesian components of the local unit vector in the x-direction at the cell centers 3d array whose last dimension is length 3 and indicates cartesian x/y/z value """ @@ -698,8 +698,8 @@ def ec1(self) -> Quantity: @property def ec2(self) -> Quantity: """ - cartesian components of the local unit vetcor - in the y-direation at the cell centers + cartesian components of the local unit vector + in the y-direction at the cell centers 3d array whose last dimension is length 3 and indicates cartesian x/y/z value """ if self._ec2 is None: @@ -709,8 +709,8 @@ def ec2(self) -> Quantity: @property def ew1(self) -> Quantity: """ - cartesian components of the local unit vetcor - in the x-direation at the left/right cell edges + cartesian components of the local unit vector + in the x-direction at the left/right cell edges 3d array whose last dimension is length 3 and indicates cartesian x/y/z value """ if self._ew1 is None: @@ -720,8 +720,8 @@ def ew1(self) -> Quantity: @property def ew2(self) -> Quantity: """ - cartesian components of the local unit vetcor - in the y-direation at the left/right cell edges + cartesian components of the local unit vector + in the y-direction at the left/right cell edges 3d array whose last dimension is length 3 and indicates cartesian x/y/z value """ if self._ew2 is None: diff --git a/ndsl/grid/global_setup.py b/ndsl/grid/global_setup.py index a0237ec6..60bd3c3b 100644 --- a/ndsl/grid/global_setup.py +++ b/ndsl/grid/global_setup.py @@ -21,8 +21,8 @@ def gnomonic_grid(grid_type: int, lon, lat, np): args: grid_type: type of grid to apply - lon: longitute array with dimensions [x, y] - lat: latitude array with dimensionos [x, y] + lon: longitude array with dimensions [x, y] + lat: latitude array with dimensions [x, y] """ _check_shapes(lon, lat) if grid_type == 0: @@ -142,7 +142,7 @@ def global_mirror_grid( y1, grid_global[ng + npx - (i + 1), ng + npy - (j + 1), 1, nreg] ) - # force dateline/greenwich-meridion consistency + # force dateline/greenwich-meridian consistency if npx % 2 != 0: if i == (npx - 1) // 2: grid_global[ng + i, ng + j, 0, nreg] = 0.0 diff --git a/ndsl/grid/gnomonic.py b/ndsl/grid/gnomonic.py index 778d9064..0fc421ef 100644 --- a/ndsl/grid/gnomonic.py +++ b/ndsl/grid/gnomonic.py @@ -595,8 +595,8 @@ def get_rectangle_area(p1, p2, p3, p4, radius, np): counterclockwise order, return an array of spherical rectangle areas. NOTE, this is not the exact same order of operations as the Fortran code This results in some errors in the last digit, but the spherical_angle - is an exact match. The errors in the last digit multipled out by the radius - end up causing relative errors larger than 1e-14, but still wtihin 1e-12. + is an exact match. The errors in the last digit multiplied out by the radius + end up causing relative errors larger than 1e-14, but still within 1e-12. 
""" total_angle = spherical_angle(p2, p3, p1, np) for ( @@ -702,7 +702,7 @@ def spherical_cos(p_center, p2, p3, np): def get_unit_vector_direction(p1, p2, np): """ - Returms the unit vector pointing from a set of lonlat points p1 to lonlat points p2 + Returns the unit vector pointing from a set of lonlat points p1 to lonlat points p2 """ xyz1 = lon_lat_to_xyz(p1[:, :, 0], p1[:, :, 1], np) xyz2 = lon_lat_to_xyz(p2[:, :, 0], p2[:, :, 1], np) diff --git a/ndsl/grid/mirror.py b/ndsl/grid/mirror.py index cf547bdf..3f3a4b6b 100644 --- a/ndsl/grid/mirror.py +++ b/ndsl/grid/mirror.py @@ -63,7 +63,7 @@ def mirror_grid( y1, mirror_data["local"][i, j, 1] ) - # force dateline/greenwich-meridion consistency + # force dateline/greenwich-meridian consistency if npx % 2 != 0: if x_center_tile and i == ng + i_mid: mirror_data["local"][i, j, 0] = 0.0 diff --git a/ndsl/grid/stretch_transformation.py b/ndsl/grid/stretch_transformation.py index 5604a110..9a481ab9 100644 --- a/ndsl/grid/stretch_transformation.py +++ b/ndsl/grid/stretch_transformation.py @@ -21,9 +21,9 @@ def direct_transform( """ The direct_transform subroutine from fv_grid_utils.F90. Takes in latitude and longitude in radians. - Shrinks tile 6 by stretch factor in area to increse resolution locally. + Shrinks tile 6 by stretch factor in area to increase resolution locally. Then performs translation of all tiles so that the now-smaller tile 6 is - centeres on lon_target, lat_target. + centered on lon_target, lat_target. Args: lon (in) in radians diff --git a/ndsl/halo/data_transformer.py b/ndsl/halo/data_transformer.py index cb50cc12..9f9ab2f6 100644 --- a/ndsl/halo/data_transformer.py +++ b/ndsl/halo/data_transformer.py @@ -239,7 +239,7 @@ def get( """Construct a module from a numpy-like module. Args: - np_module: numpy-like module to determin child transformer type. + np_module: numpy-like module to determine child transformer type. exchange_descriptors_x: list of memory information describing an exchange. Used for scalar data and the x-component of vectors. exchange_descriptors_y: list of memory information describing an exchange. diff --git a/ndsl/halo/updater.py b/ndsl/halo/updater.py index 665d0b95..76f7608f 100644 --- a/ndsl/halo/updater.py +++ b/ndsl/halo/updater.py @@ -38,7 +38,7 @@ class HaloUpdater: - update and start/wait trigger the halo exchange - the class creates a "pattern" of exchange that can fit any memory given to do/start - - temporary references to the Quanitites are held between start and wait + - temporary references to the Quantities are held between start and wait """ def __init__( @@ -106,7 +106,7 @@ def from_scalar_specifications( numpy_like_module: module implementing numpy API specifications: data specifications to exchange, including number of halo points - boundaries: informations on the exchange boundaries. + boundaries: information on the exchange boundaries. tag: network tag (to differentiate messaging) for this node. optional_timer: timing of operations. @@ -161,7 +161,7 @@ def from_vector_specifications( Length must match y specifications. specifications_y: specifications to exchange along the y axis. Length must match x specifications. - boundaries: informations on the exchange boundaries. + boundaries: information on the exchange boundaries. tag: network tag (to differentiate messaging) for this node. optional_timer: timing of operations. 
@@ -210,7 +210,7 @@ def update( quantities_x: List[Quantity], quantities_y: Optional[List[Quantity]] = None, ): - """Exhange the data and blocks until finished.""" + """Exchange the data and blocks until finished.""" self.start(quantities_x, quantities_y) self.wait() @@ -283,7 +283,7 @@ def wait(self): for recv_req in self._recv_requests: recv_req.wait() - # Unpack buffers (updated by MPI with neighbouring halos) + # Unpack buffers (updated by MPI with neighboring halos) # to proper quantities with self._timer.clock("unpack"): for buffer in self._transformers.values(): diff --git a/ndsl/io.py b/ndsl/io.py index 9d9f9149..b07248bb 100644 --- a/ndsl/io.py +++ b/ndsl/io.py @@ -44,7 +44,7 @@ def write_state(state: dict, filename: str) -> None: def _extract_time(value: xr.DataArray) -> cftime.datetime: - """Exctract time value from read-in state.""" + """Extract time value from read-in state.""" if value.ndim > 0: raise ValueError( "State must be representative of a single scalar time. " f"Got {value}." diff --git a/ndsl/namelist.py b/ndsl/namelist.py index 205954ca..304d9160 100644 --- a/ndsl/namelist.py +++ b/ndsl/namelist.py @@ -42,7 +42,7 @@ class NamelistDefaults: qi_gen = 1.82e-6 # max cloud ice generation during remapping step qi_lim = 1.0 # cloud ice limiter to prevent large ice build up qi0_max = 1.0e-4 # max cloud ice value (by other sources) - rad_snow = True # consider snow in cloud fraciton calculation + rad_snow = True # consider snow in cloud fraction calculation rad_rain = True # consider rain in cloud fraction calculation rad_graupel = True # consider graupel in cloud fraction calculation tintqs = False # use temperature in the saturation mixing in PDF @@ -110,7 +110,7 @@ class NamelistDefaults: z_slope_liq = True # Use linear mono slope for autoconversions tice = 273.16 # set tice = 165. to turn off ice - phase phys (kessler emulator) alin = 842.0 # "a" in lin1983 - clin = 4.8 # "c" in lin 1983, 4.8 -- > 6. (to ehance ql -- > qs) + clin = 4.8 # "c" in lin 1983, 4.8 -- > 6. (to enhance ql -- > qs) isatmedmf = 0 # which version of satmedmfvdif to use dspheat = False # flag for tke dissipative heating xkzm_h = 1.0 # background vertical diffusion for heat q over ocean @@ -123,10 +123,10 @@ class NamelistDefaults: xkzm_lim = 0.01 # background vertical diffusion limit xkzminv = 0.15 # diffusivity in inversion layers xkgdx = 25.0e3 # background vertical diffusion threshold - rlmn = 30.0 # lower-limter on asymtotic mixing length in satmedmfdiff - rlmx = 300.0 # upper-limter on asymtotic mixing length in satmedmfdiff + rlmn = 30.0 # lower-limiter on asymtotic mixing length in satmedmfdiff + rlmx = 300.0 # upper-limiter on asymtotic mixing length in satmedmfdiff do_dk_hb19 = False # flag for using hb19 background diff formula in satmedmfdiff - cap_k0_land = False # flag for applying limter on background diff in inversion layer over land in satmedmfdiff + cap_k0_land = False # flag for applying limiter on background diff in inversion layer over land in satmedmfdiff @classmethod def as_dict(cls): diff --git a/ndsl/stencils/c2l_ord.py b/ndsl/stencils/c2l_ord.py index 67f2b5a1..39194232 100644 --- a/ndsl/stencils/c2l_ord.py +++ b/ndsl/stencils/c2l_ord.py @@ -227,7 +227,7 @@ def __init__( ) # TODO: - # To break the depedency to pyFV3 we allow ourselves to not have a type + # To break the dependency to pyFV3 we allow ourselves to not have a type # hint around state and we check for u and v to make sure we don't # have bad input. 
# This entire code should be retired when WrappedHaloUpdater is no longer diff --git a/ndsl/stencils/corners.py b/ndsl/stencils/corners.py index 5eb7767a..18c1f6c1 100644 --- a/ndsl/stencils/corners.py +++ b/ndsl/stencils/corners.py @@ -54,7 +54,7 @@ def __init__(self, direction: str, stencil_factory: StencilFactory) -> None: def __call__(self, field: FloatField): """ Fills cell quantity field using corners from itself and multipliers - in the dirction specified initialization of the instance of this class. + in the direction specified initialization of the instance of this class. """ self._copy_corners(field, field) diff --git a/ndsl/stencils/testing/README.md b/ndsl/stencils/testing/README.md index f8b0f04f..098dd489 100644 --- a/ndsl/stencils/testing/README.md +++ b/ndsl/stencils/testing/README.md @@ -6,7 +6,7 @@ First, make sure you have followed the instruction in the top level [README](../ The unit and regression tests of pace require data generated from the Fortran reference implementation which has to be downloaded from a Google Cloud Platform storage bucket. Since the bucket is setup as "requester pays", you need a valid GCP account to download the test data. -First, make sure you have configured the authentication with user credientials and configured Docker with the following commands: +First, make sure you have configured the authentication with user credentials and configured Docker with the following commands: ```shell gcloud auth login @@ -22,11 +22,11 @@ cd $(git rev-parse --show-toplevel)/physics make get_test_data ``` -If you do not have a GCP account, there is an option to download basic test data from a public FTP server and you can skip the GCP authentication step above. To download test data from the FTP server, use `make USE_FTP=yes get_test_data` instead and this will avoid fetching from a GCP storage bucket. You will need a valid in stallation of the `lftp` command. +If you do not have a GCP account, there is an option to download basic test data from a public FTP server and you can skip the GCP authentication step above. To download test data from the FTP server, use `make USE_FTP=yes get_test_data` instead and this will avoid fetching from a GCP storage bucket. You will need a valid installation of the `lftp` command. ## Running the tests (manually) -There are two ways to run the tests, manually by explicitly invoking `pytest` or autmatically using make targets. The former can be used both inside the Docker container as well as for a bare-metal installation and will be described here. +There are two ways to run the tests, manually by explicitly invoking `pytest` or automatically using make targets. The former can be used both inside the Docker container as well as for a bare-metal installation and will be described here. First enter the container and navigate to the pace directory: @@ -40,7 +40,7 @@ Note that by entering the container with the `make dev` command, volumes for cod There are two sets of tests. The "sequential tests" test components which do not require MPI-parallelism. The "parallel tests" can only within an MPI environment. 
-To run the sequential and parallel tests for the dynmical core (fv3core), you can execute the following commands (these take a bit of time): +To run the sequential and parallel tests for the dynamical core (fv3core), you can execute the following commands (these take a bit of time): ```shell pytest -v -s --data_path=/pace/fv3core/test_data/8.1.1/c12_6ranks_standard/dycore/ ./fv3core/tests @@ -79,8 +79,8 @@ DEV=y make physics_savepoint_tests_mpi ## Test failure Test are running for each gridpoint of the domain, unless the Translate class for the test specifically restricts it. -Upon failure, the test will drop a `netCDF` faile in a `./.translate-errors` directory and named `translate-TestCase(-Rank).nc` containing input, computed output, reference and errors. +Upon failure, the test will drop a `netCDF` file in a `./.translate-errors` directory and named `translate-TestCase(-Rank).nc` containing input, computed output, reference and errors. ## Environment variables -- `PACE_TEST_N_THRESHOLD_SAMPLES`: Upon failure the system will try to pertub the output in an attempt to check for numerical instability. This means re-running the test for N samples. Default is `10`, `0` or less turns this feature off. +- `PACE_TEST_N_THRESHOLD_SAMPLES`: Upon failure the system will try to perturb the output in an attempt to check for numerical instability. This means re-running the test for N samples. Default is `10`, `0` or less turns this feature off. diff --git a/ndsl/stencils/testing/serialbox_to_netcdf.py b/ndsl/stencils/testing/serialbox_to_netcdf.py index d03f295d..46996dbe 100644 --- a/ndsl/stencils/testing/serialbox_to_netcdf.py +++ b/ndsl/stencils/testing/serialbox_to_netcdf.py @@ -143,7 +143,7 @@ def main( for varname in set(names_list).difference(["rank"]): # Check that all ranks have the same size. If not, aggregate and # feedback on one rank - colapse_all_ranks = False + collapse_all_ranks = False data_shape = list(rank_list[0][varname][0].shape) print(f" Exporting {varname} - {data_shape}") for rank in range(total_ranks): @@ -159,7 +159,7 @@ def main( print( f"... different shape for {varname} across ranks, collapsing in on rank." 
) - colapse_all_ranks = True + collapse_all_ranks = True break if savepoint_name in [ @@ -185,7 +185,7 @@ def main( data_vars[varname] = get_data( data_shape, total_ranks, n_savepoints, rank_list, varname ) - elif colapse_all_ranks: + elif collapse_all_ranks: data_vars[varname] = get_data_collapse_all_ranks( total_ranks, n_savepoints, rank_list, varname ) diff --git a/ndsl/stencils/testing/test_translate.py b/ndsl/stencils/testing/test_translate.py index 9f0278d8..70480c16 100644 --- a/ndsl/stencils/testing/test_translate.py +++ b/ndsl/stencils/testing/test_translate.py @@ -76,15 +76,15 @@ def process_override(threshold_overrides, testobj, test_name, backend): "ignore_near_zero_errors is either a list or a dict" ) if "multimodal" in match: - parsed_mutimodal = match["multimodal"] - if "absolute_epsilon" in parsed_mutimodal: - testobj.mmr_absolute_eps = float(parsed_mutimodal["absolute_eps"]) - if "relative_fraction" in parsed_mutimodal: + parsed_multimodal = match["multimodal"] + if "absolute_epsilon" in parsed_multimodal: + testobj.mmr_absolute_eps = float(parsed_multimodal["absolute_eps"]) + if "relative_fraction" in parsed_multimodal: testobj.mmr_relative_fraction = float( - parsed_mutimodal["relative_fraction"] + parsed_multimodal["relative_fraction"] ) - if "ulp_threshold" in parsed_mutimodal: - testobj.mmr_ulp = float(parsed_mutimodal["ulp_threshold"]) + if "ulp_threshold" in parsed_multimodal: + testobj.mmr_ulp = float(parsed_multimodal["ulp_threshold"]) if "skip_test" in match: testobj.skip_test = bool(match["skip_test"]) elif len(matches) > 1: @@ -422,7 +422,7 @@ def _report_results(savepoint_name: str, results: Dict[str, BaseMetric]) -> None for varname, metric in results.items(): f.write(f"{varname}: {metric.one_line_report()}\n") - # Detailled log + # Detailed log for varname, metric in results.items(): log_filename = os.path.join(OUTDIR, f"details-{savepoint_name}-{varname}.log") metric.report(log_filename) diff --git a/ndsl/testing/README.md b/ndsl/testing/README.md index f6c7a27d..9154956c 100644 --- a/ndsl/testing/README.md +++ b/ndsl/testing/README.md @@ -31,21 +31,21 @@ More options of `pytest` are available when doing `pytest --help`. ## Metrics -There is three state of a test in `pytest`: FAIL, PASS and XFAIL (expected fail). To clear the PASS status, the output data contained in `NAME-Out.nc` is compared to the computed data via the `TranslateNAME` test. Because this system was developped to port Fortran numerics to many targets (mostly C, but also Python, and CPU/GPU), we can't rely on bit-to-bit comparison and have been developping a couple of metrics. +There is three state of a test in `pytest`: FAIL, PASS and XFAIL (expected fail). To clear the PASS status, the output data contained in `NAME-Out.nc` is compared to the computed data via the `TranslateNAME` test. Because this system was developed to port Fortran numerics to many targets (mostly C, but also Python, and CPU/GPU), we can't rely on bit-to-bit comparison and have been developing a couple of metrics. ### Legacy metric -The legacy metric was used throughout the developement of the dynamical core and microphysics scheme at 64-bit precision. It tries to solve differences over big and small amplitutde values with a single formula that goes as follows: $`\|computed-reference|/reference`$ where `reference` has been purged of 0. +The legacy metric was used throughout the development of the dynamical core and microphysics scheme at 64-bit precision. 
It tries to solve differences over big and small amplitude values with a single formula that goes as follows: $`\|computed-reference|/reference`$ where `reference` has been purged of 0. NaN values are considered no-pass. -To pass the metric has to be lower than `1e-14`, any value lower than `1e-18` will be considered pass by default. The pass threshold can be overriden (see below). +To pass the metric has to be lower than `1e-14`, any value lower than `1e-18` will be considered pass by default. The pass threshold can be overridden (see below). ### Multi-modal metric Moving to mixed precision code, the legacy metric didn't give enough flexibility to account for 32-bit precision errors that could accumulate. Another metric was built with the intent of breaking the one-fit-all concept and giving back flexibility. The metric is a combination of three differences: - _Absolute Difference_ ($`|computed-reference| List[str]: abs_errs = [] details = [ "All failures:", - "Index Computed Reference Absloute E Metric E", + "Index Computed Reference Absolute E Metric E", ] for b in range(bad_indices_count): full_index = tuple([f[b] for f in found_indices]) @@ -195,7 +195,7 @@ class MultiModalFloatMetric(BaseMetric): floating errors. ULP is used to clear noise (ULP<=1.0 passes) - Absolute errors for large amplitute + Absolute errors for large amplitude """ _f32_absolute_eps = _Metric(1e-10) @@ -259,7 +259,7 @@ def _compute_all_metrics( ) self.ulp_distance_metric = self.ulp_distance <= self.ulp_threshold.value - # Combine all distances into sucess or failure + # Combine all distances into success or failure # Success = # - no unexpected NANs (e.g. NaN in the ref MUST BE in computation) OR # - absolute distance pass OR @@ -279,7 +279,7 @@ def _compute_all_metrics( return success else: raise TypeError( - f"recieved data with unexpected dtype {self.references.dtype}" + f"received data with unexpected dtype {self.references.dtype}" ) def _has_override(self) -> bool: @@ -290,17 +290,17 @@ def _has_override(self) -> bool: ) def one_line_report(self) -> str: - metric_threholds = f"{'🔶 ' if not self.absolute_eps.is_default else '' }Absolute E(<{self.absolute_eps.value:.2e}) " - metric_threholds += f"{'🔶 ' if not self.relative_fraction.is_default else '' }Relative E(<{self.relative_fraction.value * 100:.2e}%) " - metric_threholds += f"{'🔶 ' if not self.ulp_threshold.is_default else '' }ULP E(<{self.ulp_threshold.value})" + metric_thresholds = f"{'🔶 ' if not self.absolute_eps.is_default else '' }Absolute E(<{self.absolute_eps.value:.2e}) " + metric_thresholds += f"{'🔶 ' if not self.relative_fraction.is_default else '' }Relative E(<{self.relative_fraction.value * 100:.2e}%) " + metric_thresholds += f"{'🔶 ' if not self.ulp_threshold.is_default else '' }ULP E(<{self.ulp_threshold.value})" if self.check and self._has_override(): - return f"🔶 No numerical differences with threshold override - metric: {metric_threholds}" + return f"🔶 No numerical differences with threshold override - metric: {metric_thresholds}" elif self.check: - return f"✅ No numerical differences - metric: {metric_threholds}" + return f"✅ No numerical differences - metric: {metric_thresholds}" else: failed_indices = len(np.logical_not(self.success).nonzero()[0]) all_indices = len(self.references.flatten()) - return f"❌ Numerical failures: {failed_indices}/{all_indices} failed - metric: {metric_threholds}" + return f"❌ Numerical failures: {failed_indices}/{all_indices} failed - metric: {metric_thresholds}" def report(self, file_path: Optional[str] = 
diff --git a/ndsl/utils.py b/ndsl/utils.py
index 0d22330c..36316680 100644
--- a/ndsl/utils.py
+++ b/ndsl/utils.py
@@ -90,7 +90,7 @@ def safe_mpi_allocate(
     """Make sure the allocation use an allocator that works with MPI

     For G2G transfer, MPICH requires the allocation to not be done
-    with managedmemory. Since we can't know what state `cupy` is in
+    with managed memory. Since we can't know what state `cupy` is in
     with switch for the default pooled allocator.
     If allocator comes from cupy, it must be cupy.empty or cupy.zeros.
diff --git a/tests/dsl/test_caches.py b/tests/dsl/test_caches.py
index 893fb89d..768238e2 100644
--- a/tests/dsl/test_caches.py
+++ b/tests/dsl/test_caches.py
@@ -60,7 +60,7 @@ def _build_stencil(backend, orchestrated: DaCeOrchestration):
     return built_stencil, grid_indexing, stencil_config

-class OrchestratedProgam:
+class OrchestratedProgram:
     def __init__(self, backend, orchestration):
         self.stencil, grid_indexing, stencil_config = _build_stencil(
             backend, orchestration
@@ -92,7 +92,7 @@ def test_relocatability_orchestration(backend):
     working_dir = str(os.getcwd())

     # Compile on default
-    p0 = OrchestratedProgam(backend, DaCeOrchestration.BuildAndRun)
+    p0 = OrchestratedProgram(backend, DaCeOrchestration.BuildAndRun)
     p0()
     assert os.path.exists(
         f"{working_dir}/.gt_cache_FV3_A/dacecache/"
@@ -105,7 +105,7 @@
     custom_path = f"{working_dir}/.my_cache_path"
     gt_config.cache_settings["root_path"] = custom_path
-    p1 = OrchestratedProgam(backend, DaCeOrchestration.BuildAndRun)
+    p1 = OrchestratedProgram(backend, DaCeOrchestration.BuildAndRun)
     p1()
     assert os.path.exists(
         f"{custom_path}/.gt_cache_FV3_A/dacecache/"
@@ -119,14 +119,14 @@
     relocated_path = f"{working_dir}/.my_relocated_cache_path"
     shutil.copytree(custom_path, relocated_path, dirs_exist_ok=True)
     gt_config.cache_settings["root_path"] = relocated_path
-    p2 = OrchestratedProgam(backend, DaCeOrchestration.Run)
+    p2 = OrchestratedProgram(backend, DaCeOrchestration.Run)
     p2()

     # Generate a file exists error to check for bad path
     bogus_path = "./nope/notatall/nothappening"
     gt_config.cache_settings["root_path"] = bogus_path
     with pytest.raises(RuntimeError):
-        OrchestratedProgam(backend, DaCeOrchestration.Run)
+        OrchestratedProgram(backend, DaCeOrchestration.Run)

     # Restore cache settings
     gt_config.cache_settings["root_path"] = original_root_directory
@@ -156,7 +156,7 @@ def test_relocatability(backend: str):
     backend_sanitized = backend.replace(":", "")

     # Compile on default
-    p0 = OrchestratedProgam(backend, DaCeOrchestration.Python)
+    p0 = OrchestratedProgram(backend, DaCeOrchestration.Python)
     p0()
     assert os.path.exists(
         f"./.gt_cache_000000/py38_1013/{backend_sanitized}/test_caches/_stencil/"
@@ -166,7 +166,7 @@
     custom_path = "./.my_cache_path"
     gt_config.cache_settings["root_path"] = custom_path
-    p1 = OrchestratedProgam(backend, DaCeOrchestration.Python)
+    p1 = OrchestratedProgram(backend, DaCeOrchestration.Python)
     p1()
     assert os.path.exists(
         f"{custom_path}/.gt_cache_000000/py38_1013/{backend_sanitized}"
@@ -178,7 +178,7 @@
     relocated_path = "./.my_relocated_cache_path"
     shutil.copytree("./.gt_cache_000000", relocated_path, dirs_exist_ok=True)
     gt_config.cache_settings["root_path"] = relocated_path
-    p2 = OrchestratedProgam(backend, DaCeOrchestration.Python)
+    p2 = OrchestratedProgram(backend, DaCeOrchestration.Python)
     p2()
     assert os.path.exists(
         f"{relocated_path}/.gt_cache_000000/py38_1013/{backend_sanitized}"
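The relocation pattern these tests exercise reduces to repointing GT4Py's cache root between builds; a sketch under the assumption that `gt_config` is `gt4py.cartesian.config` (as imported by the test) and that only `root_path` needs to move:

```python
from gt4py.cartesian import config as gt_config

# Repoint the cache root, build there, then restore the original setting.
original_root = gt_config.cache_settings["root_path"]
try:
    gt_config.cache_settings["root_path"] = "./.my_cache_path"
    # ... compile and run stencils here; artifacts land under ./.my_cache_path ...
finally:
    gt_config.cache_settings["root_path"] = original_root
```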
diff --git a/tests/grid/test_eta.py b/tests/grid/test_eta.py
index ab0539f8..1acd9c6d 100755
--- a/tests/grid/test_eta.py
+++ b/tests/grid/test_eta.py
@@ -22,8 +22,8 @@
 values are read-in and stored properly.
 In addition, this test checks to ensure
 that the function set_hybrid_pressure_coefficients
-fail as expected if the computed eta values
-vary non-mononitically and if the eta_file
+fails as expected if the computed eta values
+vary non-monotonically and if the eta_file
 is not provided.
 """
@@ -171,7 +171,7 @@ def test_set_hybrid_pressure_coefficients_not_mono():
     eta_file is specified in test_config_not_mono.yaml file and
     the ak and bk values in the eta_file
     have been changed nonsensically to result in
-    erronenous eta values.
+    erroneous eta values.
     """

     working_dir = str(os.getcwd())
@@ -216,7 +216,7 @@ def test_set_hybrid_pressure_coefficients_not_mono():
     if os.path.isfile(out_eta_file):
         os.remove(out_eta_file)
     if str(error) == "ETA values are not monotonically increasing":
-        pytest.xfail("testing eta values are not monotomincally increasing")
+        pytest.xfail("testing eta values are not monotonically increasing")
     else:
         pytest.fail(
             "ERROR in testing eta values not are not monotonically increasing"
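The monotonicity guard this test drives can be stated compactly; a sketch with an illustrative helper name and reference pressure — only the error string is taken from the test above, and the standard hybrid definition `eta = ak / p_ref + bk` is an assumption:

```python
import numpy as np


def check_eta_monotonic(ak: np.ndarray, bk: np.ndarray, p_ref: float = 1.0e5) -> None:
    eta = ak / p_ref + bk  # eta at layer interfaces from the hybrid coefficients
    if not np.all(np.diff(eta) > 0.0):
        raise ValueError("ETA values are not monotonically increasing")


check_eta_monotonic(np.array([0.0, 500.0, 0.0]), np.array([0.0, 0.3, 1.0]))  # passes silently
```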
diff --git a/tests/mpi/test_mpi_mock.py b/tests/mpi/test_mpi_mock.py
index b8202995..6b441702 100644
--- a/tests/mpi/test_mpi_mock.py
+++ b/tests/mpi/test_mpi_mock.py
@@ -38,7 +38,7 @@ def send_recv(comm, numpy):
         comm.Send(data, dest=rank + 1)
     if rank > 0:
         if isinstance(comm, DummyComm):
-            print(f"recieving data from {rank - 1} to {rank}")
+            print(f"receiving data from {rank - 1} to {rank}")
         comm.Recv(data, source=rank - 1)
     return data
@@ -55,7 +55,7 @@ def send_recv_big_data(comm, numpy):
         comm.Send(data, dest=rank + 1)
     if rank > 0:
         if isinstance(comm, DummyComm):
-            print(f"recieving data from {rank - 1} to {rank}")
+            print(f"receiving data from {rank - 1} to {rank}")
         comm.Recv(data, source=rank - 1)
     return data
@@ -101,7 +101,7 @@ def send_f_contiguous_buffer(comm, numpy):
         comm.Send(data, dest=rank + 1)
     if rank > 0:
         if isinstance(comm, DummyComm):
-            print(f"recieving data from {rank - 1} to {rank}")
+            print(f"receiving data from {rank - 1} to {rank}")
         comm.Recv(data, source=rank - 1)
     return data
@@ -156,7 +156,7 @@ def recv_to_subarray(comm, numpy):
         comm.Send(data, dest=rank + 1)
     if rank > 0:
         if isinstance(comm, DummyComm):
-            print(f"recieving data from {rank - 1} to {rank}")
+            print(f"receiving data from {rank - 1} to {rank}")
         try:
             comm.Recv(recv_buffer[1:-1, 1:-1, 1:-1], source=rank - 1)
         except Exception as err:
diff --git a/tests/test_halo_data_transformer.py b/tests/test_halo_data_transformer.py
index e3f6d851..7e3385e7 100644
--- a/tests/test_halo_data_transformer.py
+++ b/tests/test_halo_data_transformer.py
@@ -353,8 +353,8 @@ def test_data_transformer_scalar_pack_unpack(quantity, rotation, n_halos):

 def test_data_transformer_vector_pack_unpack(quantity, rotation, n_halos):
-    targe_quanity_x = copy.deepcopy(quantity)
-    targe_quanity_y = copy.deepcopy(targe_quanity_x)
+    target_quantity_x = copy.deepcopy(quantity)
+    target_quantity_y = copy.deepcopy(target_quantity_x)
     x_quantity = quantity
     y_quantity = copy.deepcopy(x_quantity)
@@ -450,8 +450,8 @@ def test_data_transformer_vector_pack_unpack(quantity, rotation, n_halos):
         quantity.dims,
         quantity.metadata.np,
     )
-    targe_quanity_x.data[N_edge_boundaries[rotation][1]] = rotated_x
-    targe_quanity_y.data[N_edge_boundaries[rotation][1]] = rotated_y
+    target_quantity_x.data[N_edge_boundaries[rotation][1]] = rotated_x
+    target_quantity_y.data[N_edge_boundaries[rotation][1]] = rotated_y
     rotated_x, rotated_y = rotate_vector_data(
         quantity.data[NE_corner_boundaries[rotation][0]],
         quantity.data[NE_corner_boundaries[rotation][0]],
         quantity.dims,
         quantity.metadata.np,
     )
-    targe_quanity_x.data[NE_corner_boundaries[rotation][1]] = rotated_x
-    targe_quanity_y.data[NE_corner_boundaries[rotation][1]] = rotated_y
+    target_quantity_x.data[NE_corner_boundaries[rotation][1]] = rotated_x
+    target_quantity_y.data[NE_corner_boundaries[rotation][1]] = rotated_y

-    assert (targe_quanity_x.data == x_quantity.data).all()
-    assert (targe_quanity_y.data == y_quantity.data).all()
+    assert (target_quantity_x.data == x_quantity.data).all()
+    assert (target_quantity_y.data == y_quantity.data).all()
diff --git a/tests/test_halo_update.py b/tests/test_halo_update.py
index 3d3bf501..9c1ac220 100644
--- a/tests/test_halo_update.py
+++ b/tests/test_halo_update.py
@@ -914,7 +914,7 @@ def test_halo_updater_stability(
     assert len(BUFFER_CACHE) == 1
     assert len(next(iter(BUFFER_CACHE.values()))) == 0

-    # Manually call finalize on the transfomers
+    # Manually call finalize on the transformers
     # This should recache all the buffers
     # DSL-816 will refactor that behavior out
     for halo_updater in halo_updaters:

From a51daf368bb8312b829d5f5fc73bce44cb42637a Mon Sep 17 00:00:00 2001
From: Oliver Elbert
Date: Wed, 22 Jan 2025 14:18:30 -0500
Subject: [PATCH 77/78] hotfix netcdf version for dockerfiles

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 03dcbf0f..1c272765 100644
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@ def local_pkg(name: str, relative_path: str) -> str:
     "xarray",
     "f90nml>=1.1.0",
     "fsspec",
-    "netcdf4==1.7.0",
+    "netcdf4==1.7.1",
     "scipy",  # restart capacities only
     "h5netcdf",  # for xarray
     "dask",  # for xarray

From 3f77863467cc84b385ed6bc99c58da69ecbc00e1 Mon Sep 17 00:00:00 2001
From: Frank Malatino
Date: Thu, 23 Jan 2025 11:42:54 -0500
Subject: [PATCH 78/78] Updated version number in setup.py to reflect new
 release, 2025.01.00

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 1c272765..0b19d38e 100644
--- a/setup.py
+++ b/setup.py
@@ -57,7 +57,7 @@ def local_pkg(name: str, relative_path: str) -> str:
     packages=find_namespace_packages(include=["ndsl", "ndsl.*"]),
     include_package_data=True,
     url="https://github.com/NOAA-GFDL/NDSL",
-    version="2024.09.00",
+    version="2025.01.00",
     zip_safe=False,
     entry_points={
         "console_scripts": [