add in-place MOZART LU decomp

NCAR · Dec 23, 2024 · 70a2956 · 70a2956
1 parent 1cbd28e
commit 70a2956
Show file tree

Hide file tree

Showing 8 changed files with 600 additions and 4 deletions.
diff --git a/include/micm/solver/lu_decomposition.hpp b/include/micm/solver/lu_decomposition.hpp
@@ -4,6 +4,7 @@
 
 #include "lu_decomposition_doolittle.hpp"
 #include "lu_decomposition_mozart.hpp"
+#include "lu_decomposition_mozart_in_place.hpp"
 
 namespace micm
 {

diff --git a/include/micm/solver/lu_decomposition_mozart_in_place.hpp b/include/micm/solver/lu_decomposition_mozart_in_place.hpp
@@ -0,0 +1,99 @@
+// Copyright (C) 2023-2024 National Center for Atmospheric Research
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include <micm/profiler/instrumentation.hpp>
+#include <micm/util/sparse_matrix.hpp>
+
+namespace micm
+{
+
+  /// @brief LU decomposer for SparseMatrix following the algorithm from the MOZART model
+  ///
+  /// This LU decomposition uses the algorithm from the MOZART chemistry preprocessor at:
+  /// https://github.com/ESCOMP/CHEM_PREPROCESSOR/blob/f923081508f4264e61fcef2753a9898e55d1598e/src/cam_chempp/make_lu_fac.f#L127-L214
+  ///
+  /// The MOZART function overwrote the A matrix with the L and U matrices. The pseudo-code
+  /// in C++ for the corresponding dense matrix algorithm for matrix A (inline change) would be:
+  ///
+  /// for i = 0...n-1                     // Outer loop over columns of the sparse matrix A
+  ///   for j = i+1...n-1                 // Multiply column below diagonal
+  ///     A[j][i] = A[j][i] / A[i][i]
+  ///   for k = i+1...n-1                 // Modify sub-matrix
+  ///     for j = i+1...n-1
+  ///       A[j][k] = A[j][k] – A[j][i] * A[i][k]
+  ///
+  /// For the sparse matrix algorithm, the indices of non-zero terms are stored in
+  /// several arrays during construction. These arrays are iterated through during
+  /// calls to Decompose to do the actual decomposition.
+  ///
+  /// The GetLUMatrices function creates a new sparse matrix that includes the superset
+  /// of the non-zero elements in the L and U matrices. It is expected that the elements
+  /// of the L and U matrices that are zero in the A matrix will be set to zero before the
+  /// combined matrix is passed to the decomposition function.
+  class LuDecompositionMozartInPlace
+  {
+   protected:
+    /// Index in A.data_ for all diagonal elements, the number of iterations of the inner (j) loop
+    /// for each (i) used to set A[j][i], and the number of iterations of the middle (k) loop for
+    /// each (i) used to set A[j][k]
+    std::vector<std::tuple<std::size_t, std::size_t, std::size_t>> aii_nji_nki_;
+    /// Index in A.data_ for A[j][i] for each iteration of the inner (j) loop
+    /// used to set the value of A[j][i]
+    std::vector<std::size_t> aji_;
+    /// Index in A.data_ for A[i][k] for each iteration of the middle (k) loop,
+    /// and the number of iterations of the inner (j) loop for each (k) used to set A[j][k]
+    std::vector<std::pair<std::size_t, std::size_t>> aik_njk_;
+    /// Index in A.data_ for A[j][k] and A[j][i] for each iteration of the inner (j) loop
+    std::vector<std::pair<std::size_t, std::size_t>> ajk_aji_;
+
+   public:
+    /// @brief default constructor
+    LuDecompositionMozartInPlace();
+
+    LuDecompositionMozartInPlace(const LuDecompositionMozartInPlace&) = delete;
+    LuDecompositionMozartInPlace& operator=(const LuDecompositionMozartInPlace&) = delete;
+
+    LuDecompositionMozartInPlace(LuDecompositionMozartInPlace&& other) = default;
+    LuDecompositionMozartInPlace& operator=(LuDecompositionMozartInPlace&&) = default;
+
+    /// @brief Construct an LU decomposition algorithm for a given sparse matrix
+    /// @param matrix Sparse matrix
+    template<class SparseMatrixPolicy>
+    requires(SparseMatrixConcept<SparseMatrixPolicy>) LuDecompositionMozartInPlace(const SparseMatrixPolicy& matrix);
+
+    ~LuDecompositionMozartInPlace() = default;
+
+    /// @brief Create an LU decomposition algorithm for a given sparse matrix policy
+    /// @param matrix Sparse matrix
+    template<class SparseMatrixPolicy>
+    requires(SparseMatrixConcept<SparseMatrixPolicy>) static LuDecompositionMozartInPlace Create(const SparseMatrixPolicy& matrix);
+
+    /// @brief Create a combined sparse L and U matrix for a given A matrix
+    /// @param A Sparse matrix that will be decomposed
+    /// @return combined L and U Sparse matrices
+    template<class SparseMatrixPolicy>
+    requires(SparseMatrixConcept<SparseMatrixPolicy>) static SparseMatrixPolicy GetLUMatrix(
+        const SparseMatrixPolicy& A,
+        typename SparseMatrixPolicy::value_type initial_value);
+
+    /// @brief Perform an LU decomposition on a given A matrix.
+    ///        All elements of L and U that are zero in A should be set to zero before calling this function.
+    /// @param ALU Sparse matrix to decompose (will be overwritten with L and U matrices)
+    template<class SparseMatrixPolicy>
+    requires(!VectorizableSparse<SparseMatrixPolicy>) void Decompose(
+        SparseMatrixPolicy& ALU) const;
+    template<class SparseMatrixPolicy>
+    requires(VectorizableSparse<SparseMatrixPolicy>) void Decompose(
+        SparseMatrixPolicy& ALU) const;
+
+   private:
+    /// @brief Initialize arrays for the LU decomposition
+    /// @param A Sparse matrix to decompose
+    template<class SparseMatrixPolicy>
+    requires(SparseMatrixConcept<SparseMatrixPolicy>) void Initialize(const SparseMatrixPolicy& matrix, auto initial_value);
+  };
+
+}  // namespace micm
+
+#include "lu_decomposition_mozart_in_place.inl"
diff --git a/include/micm/solver/lu_decomposition_mozart_in_place.inl b/include/micm/solver/lu_decomposition_mozart_in_place.inl
@@ -0,0 +1,183 @@
+// Copyright (C) 2023-2024 National Center for Atmospheric Research
+// SPDX-License-Identifier: Apache-2.0
+
+namespace micm
+{
+
+  inline LuDecompositionMozartInPlace::LuDecompositionMozartInPlace()
+  {
+  }
+
+  template<class SparseMatrixPolicy>
+  requires(SparseMatrixConcept<SparseMatrixPolicy>) inline LuDecompositionMozartInPlace::LuDecompositionMozartInPlace(const SparseMatrixPolicy& matrix)
+  {
+    Initialize<SparseMatrixPolicy>(matrix, typename SparseMatrixPolicy::value_type());
+  }
+
+  template<class SparseMatrixPolicy>
+  requires(SparseMatrixConcept<SparseMatrixPolicy>) inline LuDecompositionMozartInPlace LuDecompositionMozartInPlace::Create(
+      const SparseMatrixPolicy& matrix)
+  {
+    LuDecompositionMozartInPlace lu_decomp{};
+    lu_decomp.Initialize<SparseMatrixPolicy>(matrix, typename SparseMatrixPolicy::value_type());
+    return lu_decomp;
+  }
+
+  template<class SparseMatrixPolicy>
+  requires(SparseMatrixConcept<SparseMatrixPolicy>) inline void LuDecompositionMozartInPlace::Initialize(
+      const SparseMatrixPolicy& matrix,
+      auto initial_value)
+  {
+    MICM_PROFILE_FUNCTION();
+
+    std::size_t n = matrix.NumRows();
+    auto ALU = GetLUMatrix<SparseMatrixPolicy>(matrix, initial_value);
+    for (std::size_t i = 0; i < n; ++i)
+    {
+      if (ALU.IsZero(i, i)) {
+        throw std::runtime_error("Diagonal element is zero in LU decomposition");
+      }
+      std::tuple<std::size_t, std::size_t, std::size_t> aii_nji_nki(ALU.VectorIndex(0, i, i), 0, 0);
+      for (std::size_t j = i + 1; j < n; ++j)
+      {
+        if (ALU.IsZero(j, i))
+          continue;
+        aji_.push_back(ALU.VectorIndex(0, j, i));
+        ++(std::get<1>(aii_nji_nki));
+      }
+      for (std::size_t k = i + 1; k < n; ++k)
+      {
+        if (ALU.IsZero(i, k))
+          continue;
+        std::pair<std::size_t, std::size_t> aik_njk(ALU.VectorIndex(0, i, k), 0);
+        for (std::size_t j = i + 1; j < n; ++j)
+        {
+          if (ALU.IsZero(j, i))
+            continue;
+          std::pair<std::size_t, std::size_t> ajk_aji(ALU.VectorIndex(0, j, k), ALU.VectorIndex(0, j, i));
+          ajk_aji_.push_back(ajk_aji);
+          ++(std::get<1>(aik_njk));
+        }
+        aik_njk_.push_back(aik_njk);
+        ++(std::get<2>(aii_nji_nki));
+      }
+      aii_nji_nki_.push_back(aii_nji_nki);
+    }
+  }
+
+  template<class SparseMatrixPolicy>
+  requires(
+      SparseMatrixConcept<SparseMatrixPolicy>) inline SparseMatrixPolicy LuDecompositionMozartInPlace::
+      GetLUMatrix(const SparseMatrixPolicy& A, typename SparseMatrixPolicy::value_type initial_value)
+  {
+    MICM_PROFILE_FUNCTION();
+
+    std::size_t n = A.NumRows();
+    std::set<std::pair<std::size_t, std::size_t>> ALU_ids;
+    for (std::size_t i = 0; i < n; ++i)
+    {
+      for (std::size_t j = 0; j < n; ++j)
+        if (!A.IsZero(i, j))
+          ALU_ids.insert(std::make_pair(i, j));
+    }
+    for (std::size_t i = 0; i < n; ++i)
+    {
+      for (std::size_t j = i + 1; j < n; ++j)
+        if (std::find(ALU_ids.begin(), ALU_ids.end(), std::make_pair(j, i)) != ALU_ids.end())
+          ALU_ids.insert(std::make_pair(j, i));
+      for (std::size_t k = i + 1; k < n; ++k)
+        if (std::find(ALU_ids.begin(), ALU_ids.end(), std::make_pair(i, k)) != ALU_ids.end())
+          for (std::size_t j = i + 1; j < n; ++j)
+            if (std::find(ALU_ids.begin(), ALU_ids.end(), std::make_pair(j, i)) != ALU_ids.end())
+              ALU_ids.insert(std::make_pair(j, k));
+    }
+    auto ALU_builder = SparseMatrixPolicy::Create(n).SetNumberOfBlocks(A.NumberOfBlocks()).InitialValue(initial_value);
+    for (auto& pair : ALU_ids)
+    {
+      ALU_builder = ALU_builder.WithElement(pair.first, pair.second);
+    }
+    return SparseMatrixPolicy(ALU_builder);
+  }
+
+  template<class SparseMatrixPolicy>
+  requires(!VectorizableSparse<SparseMatrixPolicy>) inline void LuDecompositionMozartInPlace::Decompose(
+      SparseMatrixPolicy& ALU) const
+  {
+    MICM_PROFILE_FUNCTION();
+    const std::size_t n = ALU.NumRows();
+
+    // Loop over blocks
+    for (std::size_t i_block = 0; i_block < ALU.NumberOfBlocks(); ++i_block)
+    {
+      auto ALU_vector = std::next(ALU.AsVector().begin(), i_block * ALU.FlatBlockSize());
+      auto aji = aji_.begin();
+      auto aik_njk = aik_njk_.begin();
+      auto ajk_aji = ajk_aji_.begin();
+
+      for (const auto& aii_nji_nki : aii_nji_nki_)
+      {
+        const typename SparseMatrixPolicy::value_type Aii_inverse = 1.0 / ALU_vector[std::get<0>(aii_nji_nki)];
+        for (std::size_t ij = 0; ij < std::get<1>(aii_nji_nki); ++ij)
+        {
+          ALU_vector[*aji] *= Aii_inverse;
+          ++aji;
+        }
+        for (std::size_t ik = 0; ik < std::get<2>(aii_nji_nki); ++ik)
+        {
+          const typename SparseMatrixPolicy::value_type Aik = ALU_vector[std::get<0>(*aik_njk)];
+          for (std::size_t ijk = 0; ijk < std::get<1>(*aik_njk); ++ijk)
+          {
+            ALU_vector[ajk_aji->first] -= ALU_vector[ajk_aji->second] * Aik;
+            ++ajk_aji;
+          }
+          ++aik_njk;
+        }
+      }
+    }
+  }
+
+  template<class SparseMatrixPolicy>
+  requires(VectorizableSparse<SparseMatrixPolicy>) inline void LuDecompositionMozartInPlace::Decompose(
+      SparseMatrixPolicy& ALU) const
+  {
+    MICM_PROFILE_FUNCTION();
+
+    const std::size_t n = ALU.NumRows();
+    const std::size_t ALU_BlockSize = ALU.NumberOfBlocks();
+    constexpr std::size_t ALU_GroupVectorSize = SparseMatrixPolicy::GroupVectorSize();
+    const std::size_t ALU_GroupSizeOfFlatBlockSize = ALU.GroupSize();
+    double Aii_inverse[ALU_GroupVectorSize];
+
+    // Loop over groups of blocks
+    for (std::size_t i_group = 0; i_group < ALU.NumberOfGroups(ALU_BlockSize); ++i_group)
+    {
+      auto ALU_vector = std::next(ALU.AsVector().begin(), i_group * ALU_GroupSizeOfFlatBlockSize);
+      const std::size_t n_cells = std::min(ALU_GroupVectorSize, ALU_BlockSize - i_group * ALU_GroupVectorSize);
+      auto aji = aji_.begin();
+      auto aik_njk = aik_njk_.begin();
+      auto ajk_aji = ajk_aji_.begin();
+      for (const auto& aii_nji_nki : aii_nji_nki_)
+      {
+        for (std::size_t i = 0; i < n_cells; ++i)
+          Aii_inverse[i] = 1.0 / ALU_vector[std::get<0>(aii_nji_nki) + i];
+        for (std::size_t ij = 0; ij < std::get<1>(aii_nji_nki); ++ij)
+        {
+          for (std::size_t i = 0; i < n_cells; ++i)
+            ALU_vector[*aji + i] *= Aii_inverse[i];
+          ++aji;
+        }
+        for (std::size_t ik = 0; ik < std::get<2>(aii_nji_nki); ++ik)
+        {
+          const std::size_t aik = std::get<0>(*aik_njk);
+          for (std::size_t ijk = 0; ijk < std::get<1>(*aik_njk); ++ijk)
+          {
+            for (std::size_t i = 0; i < n_cells; ++i)
+              ALU_vector[ajk_aji->first + i] -= ALU_vector[ajk_aji->second + i] * ALU_vector[aik + i];
+            ++ajk_aji;
+          }
+          ++aik_njk;
+        }
+      }
+    }
+  }
+}  // namespace micm
diff --git a/test/unit/solver/CMakeLists.txt b/test/unit/solver/CMakeLists.txt
@@ -10,6 +10,7 @@ create_standard_test(NAME chapman_ode_solver SOURCES test_chapman_ode_solver.cpp
 create_standard_test(NAME linear_solver SOURCES test_linear_solver.cpp)
 create_standard_test(NAME lu_decomposition_doolittle SOURCES test_lu_decomposition_doolittle.cpp)
 create_standard_test(NAME lu_decomposition_mozart SOURCES test_lu_decomposition_mozart.cpp)
+create_standard_test(NAME lu_decomposition_mozart_in_place SOURCES test_lu_decomposition_mozart_in_place.cpp)
 create_standard_test(NAME rosenbrock SOURCES test_rosenbrock.cpp)
 create_standard_test(NAME state SOURCES test_state.cpp)
 create_standard_test(NAME backward_euler SOURCES test_backward_euler.cpp)

diff --git a/test/unit/solver/test_lu_decomposition_doolittle.cpp b/test/unit/solver/test_lu_decomposition_doolittle.cpp
@@ -29,7 +29,7 @@ TEST(LuDecompositionDoolittle, DiagonalMatrixStandardOrdering)
   testDiagonalMatrix<SparseMatrixTest, micm::LuDecompositionDoolittle>(5);
 }
 
-TEST(LuDecompositionDoolittle, AgnosticToInitialValue)
+TEST(LuDecompositionDoolittle, AgnosticToInitialValueStandardOrdering)
 {
   double initial_values[5] = { -INFINITY, -1.0, 0.0, 1.0, INFINITY };
   for (auto& value : initial_values)
@@ -62,7 +62,7 @@ TEST(LuDecompositionDoolittle, DiagonalMatrixVectorOrdering)
   testDiagonalMatrix<Group4SparseVectorMatrix, micm::LuDecompositionDoolittle>(5);
 }
 
-TEST(LuDecompositionDoolittle, VectorOrderingAgnosticToInitialValue)
+TEST(LuDecompositionDoolittle, AgnosticToInitialValueVectorOrdering)
 {
   double initial_values[5] = { -INFINITY, -1.0, 0.0, 1.0, INFINITY };
   for (auto& value : initial_values)