From f045c42cedab63de6142a7f1001c886ad30063c1 Mon Sep 17 00:00:00 2001 From: "Garth N. Wells" Date: Mon, 13 Jan 2025 08:14:25 +0000 Subject: [PATCH] Tidy up spare matrix code (#3596) --- cpp/dolfinx/la/MatrixCSR.h | 223 +++++++++++++++---------------- cpp/dolfinx/la/matrix_csr_impl.h | 168 +++++++++++------------ 2 files changed, 187 insertions(+), 204 deletions(-) diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h index c15e7248dd..39d8dec1d7 100644 --- a/cpp/dolfinx/la/MatrixCSR.h +++ b/cpp/dolfinx/la/MatrixCSR.h @@ -289,7 +289,31 @@ class MatrixCSR /// expanded. /// @return Dense copy of the part of the matrix on the calling rank. /// Storage is row-major. - std::vector to_dense() const; + std::vector to_dense() const + { + const std::size_t nrows = num_all_rows(); + const std::size_t ncols = _index_maps[1]->size_global(); + std::vector A(nrows * ncols * _bs[0] * _bs[1], 0.0); + for (std::size_t r = 0; r < nrows; ++r) + { + for (std::int32_t j = _row_ptr[r]; j < _row_ptr[r + 1]; ++j) + { + for (int i0 = 0; i0 < _bs[0]; ++i0) + { + for (int i1 = 0; i1 < _bs[1]; ++i1) + { + std::array local_col{_cols[j]}; + std::array global_col{0}; + _index_maps[1]->local_to_global(local_col, global_col); + A[(r * _bs[1] + i0) * ncols * _bs[0] + global_col[0] * _bs[1] + i1] + = _data[j * _bs[0] * _bs[1] + i0 * _bs[1] + i1]; + } + } + } + } + + return A; + } /// @brief Transfer ghost row data to the owning ranks accumulating /// received values on the owned rows, and zeroing any existing data @@ -310,18 +334,93 @@ class MatrixCSR /// must not be changed. /// @note This function does not change the matrix data. Data update /// only occurs with `scatter_rev_end()`. - void scatter_rev_begin(); + void scatter_rev_begin() + { + const std::int32_t local_size0 = _index_maps[0]->size_local(); + const std::int32_t num_ghosts0 = _index_maps[0]->num_ghosts(); + const int bs2 = _bs[0] * _bs[1]; + + // For each ghost row, pack and send values to send to neighborhood + std::vector insert_pos = _val_send_disp; + _ghost_value_data.resize(_val_send_disp.back()); + for (int i = 0; i < num_ghosts0; ++i) + { + const int rank = _ghost_row_to_rank[i]; + + // Get position in send buffer to place data to send to this + // neighbour + const std::int32_t val_pos = insert_pos[rank]; + std::copy(std::next(_data.data(), _row_ptr[local_size0 + i] * bs2), + std::next(_data.data(), _row_ptr[local_size0 + i + 1] * bs2), + std::next(_ghost_value_data.begin(), val_pos)); + insert_pos[rank] + += bs2 * (_row_ptr[local_size0 + i + 1] - _row_ptr[local_size0 + i]); + } + + _ghost_value_data_in.resize(_val_recv_disp.back()); + + // Compute data sizes for send and receive from displacements + std::vector val_send_count(_val_send_disp.size() - 1); + std::adjacent_difference(std::next(_val_send_disp.begin()), + _val_send_disp.end(), val_send_count.begin()); + + std::vector val_recv_count(_val_recv_disp.size() - 1); + std::adjacent_difference(std::next(_val_recv_disp.begin()), + _val_recv_disp.end(), val_recv_count.begin()); + + int status = MPI_Ineighbor_alltoallv( + _ghost_value_data.data(), val_send_count.data(), _val_send_disp.data(), + dolfinx::MPI::mpi_t, _ghost_value_data_in.data(), + val_recv_count.data(), _val_recv_disp.data(), + dolfinx::MPI::mpi_t, _comm.comm(), &_request); + assert(status == MPI_SUCCESS); + } /// @brief End transfer of ghost row data to owning ranks. /// @note Must be preceded by MatrixCSR::scatter_rev_begin(). /// @note Matrix data received from other processes will be /// accumulated into locally owned rows, and ghost rows will be /// zeroed. - void scatter_rev_end(); + void scatter_rev_end() + { + int status = MPI_Wait(&_request, MPI_STATUS_IGNORE); + assert(status == MPI_SUCCESS); + + _ghost_value_data.clear(); + _ghost_value_data.shrink_to_fit(); + + // Add to local rows + const int bs2 = _bs[0] * _bs[1]; + assert(_ghost_value_data_in.size() == _unpack_pos.size() * bs2); + for (std::size_t i = 0; i < _unpack_pos.size(); ++i) + for (int j = 0; j < bs2; ++j) + _data[_unpack_pos[i] * bs2 + j] += _ghost_value_data_in[i * bs2 + j]; + + _ghost_value_data_in.clear(); + _ghost_value_data_in.shrink_to_fit(); + + // Set ghost row data to zero + const std::int32_t local_size0 = _index_maps[0]->size_local(); + std::fill(std::next(_data.begin(), _row_ptr[local_size0] * bs2), + _data.end(), 0); + } /// @brief Compute the Frobenius norm squared across all processes. /// @note MPI Collective - double squared_norm() const; + double squared_norm() const + { + const std::size_t num_owned_rows = _index_maps[0]->size_local(); + const int bs2 = _bs[0] * _bs[1]; + assert(num_owned_rows < _row_ptr.size()); + double norm_sq_local = std::accumulate( + _data.cbegin(), + std::next(_data.cbegin(), _row_ptr[num_owned_rows] * bs2), double(0), + [](auto norm, value_type y) { return norm + std::norm(y); }); + double norm_sq; + MPI_Allreduce(&norm_sq_local, &norm_sq, 1, MPI_DOUBLE, MPI_SUM, + _comm.comm()); + return norm_sq; + } /// @brief Compute the product `y += Ax`. /// @@ -434,7 +533,7 @@ MatrixCSR::MatrixCSR(const SparsityPattern& p, BlockMode mode) for (int i = 0; i < 2; ++i) { const auto im = _index_maps[i]; - const int size_local = im->size_local() * _bs[i]; + const std::int32_t size_local = im->size_local() * _bs[i]; std::span ghost_i = im->ghosts(); std::vector ghosts; const std::vector ghost_owner_i(im->owners().begin(), @@ -448,7 +547,8 @@ MatrixCSR::MatrixCSR(const SparsityPattern& p, BlockMode mode) src_rank.push_back(ghost_owner_i[j]); } } - const std::array, 2> src_dest0 + + std::array, 2> src_dest0 = {std::vector(_index_maps[i]->src().begin(), _index_maps[i]->src().end()), std::vector(_index_maps[i]->dest().begin(), @@ -520,7 +620,7 @@ MatrixCSR::MatrixCSR(const SparsityPattern& p, BlockMode mode) { auto it = std::ranges::lower_bound(src_ranks, r); assert(it != src_ranks.end() and *it == r); - int pos = std::distance(src_ranks.begin(), it); + std::size_t pos = std::distance(src_ranks.begin(), it); _ghost_row_to_rank.push_back(pos); } @@ -545,7 +645,7 @@ MatrixCSR::MatrixCSR(const SparsityPattern& p, BlockMode mode) for (std::size_t i = 0; i < _ghost_row_to_rank.size(); ++i) { const int rank = _ghost_row_to_rank[i]; - int row_id = local_size[0] + i; + std::int32_t row_id = local_size[0] + i; for (int j = _row_ptr[row_id]; j < _row_ptr[row_id + 1]; ++j) { // Get position in send buffer @@ -639,110 +739,6 @@ MatrixCSR::MatrixCSR(const SparsityPattern& p, BlockMode mode) } } //----------------------------------------------------------------------------- -template -std::vector::value_type> -MatrixCSR::to_dense() const -{ - const std::size_t nrows = num_all_rows(); - const std::size_t ncols = _index_maps[1]->size_global(); - std::vector A(nrows * ncols * _bs[0] * _bs[1], 0.0); - for (std::size_t r = 0; r < nrows; ++r) - for (std::int32_t j = _row_ptr[r]; j < _row_ptr[r + 1]; ++j) - for (int i0 = 0; i0 < _bs[0]; ++i0) - for (int i1 = 0; i1 < _bs[1]; ++i1) - { - std::array local_col{_cols[j]}; - std::array global_col{0}; - _index_maps[1]->local_to_global(local_col, global_col); - A[(r * _bs[1] + i0) * ncols * _bs[0] + global_col[0] * _bs[1] + i1] - = _data[j * _bs[0] * _bs[1] + i0 * _bs[1] + i1]; - } - - return A; -} -//----------------------------------------------------------------------------- -template -void MatrixCSR::scatter_rev_begin() -{ - const std::int32_t local_size0 = _index_maps[0]->size_local(); - const std::int32_t num_ghosts0 = _index_maps[0]->num_ghosts(); - const int bs2 = _bs[0] * _bs[1]; - - // For each ghost row, pack and send values to send to neighborhood - std::vector insert_pos = _val_send_disp; - _ghost_value_data.resize(_val_send_disp.back()); - for (int i = 0; i < num_ghosts0; ++i) - { - const int rank = _ghost_row_to_rank[i]; - - // Get position in send buffer to place data to send to this - // neighbour - const std::int32_t val_pos = insert_pos[rank]; - std::copy(std::next(_data.data(), _row_ptr[local_size0 + i] * bs2), - std::next(_data.data(), _row_ptr[local_size0 + i + 1] * bs2), - std::next(_ghost_value_data.begin(), val_pos)); - insert_pos[rank] - += bs2 * (_row_ptr[local_size0 + i + 1] - _row_ptr[local_size0 + i]); - } - - _ghost_value_data_in.resize(_val_recv_disp.back()); - - // Compute data sizes for send and receive from displacements - std::vector val_send_count(_val_send_disp.size() - 1); - std::adjacent_difference(std::next(_val_send_disp.begin()), - _val_send_disp.end(), val_send_count.begin()); - - std::vector val_recv_count(_val_recv_disp.size() - 1); - std::adjacent_difference(std::next(_val_recv_disp.begin()), - _val_recv_disp.end(), val_recv_count.begin()); - - int status = MPI_Ineighbor_alltoallv( - _ghost_value_data.data(), val_send_count.data(), _val_send_disp.data(), - dolfinx::MPI::mpi_t, _ghost_value_data_in.data(), - val_recv_count.data(), _val_recv_disp.data(), - dolfinx::MPI::mpi_t, _comm.comm(), &_request); - assert(status == MPI_SUCCESS); -} -//----------------------------------------------------------------------------- -template -void MatrixCSR::scatter_rev_end() -{ - int status = MPI_Wait(&_request, MPI_STATUS_IGNORE); - assert(status == MPI_SUCCESS); - - _ghost_value_data.clear(); - _ghost_value_data.shrink_to_fit(); - - // Add to local rows - const int bs2 = _bs[0] * _bs[1]; - assert(_ghost_value_data_in.size() == _unpack_pos.size() * bs2); - for (std::size_t i = 0; i < _unpack_pos.size(); ++i) - for (int j = 0; j < bs2; ++j) - _data[_unpack_pos[i] * bs2 + j] += _ghost_value_data_in[i * bs2 + j]; - - _ghost_value_data_in.clear(); - _ghost_value_data_in.shrink_to_fit(); - - // Set ghost row data to zero - const std::int32_t local_size0 = _index_maps[0]->size_local(); - std::fill(std::next(_data.begin(), _row_ptr[local_size0] * bs2), _data.end(), - 0); -} -//----------------------------------------------------------------------------- -template -double MatrixCSR::squared_norm() const -{ - const std::size_t num_owned_rows = _index_maps[0]->size_local(); - const int bs2 = _bs[0] * _bs[1]; - assert(num_owned_rows < _row_ptr.size()); - double norm_sq_local = std::accumulate( - _data.cbegin(), std::next(_data.cbegin(), _row_ptr[num_owned_rows] * bs2), - double(0), [](auto norm, value_type y) { return norm + std::norm(y); }); - double norm_sq; - MPI_Allreduce(&norm_sq_local, &norm_sq, 1, MPI_DOUBLE, MPI_SUM, _comm.comm()); - return norm_sq; -} -//----------------------------------------------------------------------------- // The matrix A is distributed across P processes by blocks of rows: // A = | A_0 | @@ -766,7 +762,8 @@ double MatrixCSR::squared_norm() const // y[0] = |Ai[0] Ai[1]| | x[0] | = Ai[0] x[0] + Ai[1] x[1] // | x[1] | // -/// Computes y += A*x for a parallel CSR matrix A and parallel dense vectors x,y +/// Computes y += A*x for a parallel CSR matrix A and parallel dense vectors +/// x,y template void MatrixCSR::mult(la::Vector& x, la::Vector& y) diff --git a/cpp/dolfinx/la/matrix_csr_impl.h b/cpp/dolfinx/la/matrix_csr_impl.h index 8a218a130e..a7a918d177 100644 --- a/cpp/dolfinx/la/matrix_csr_impl.h +++ b/cpp/dolfinx/la/matrix_csr_impl.h @@ -6,7 +6,6 @@ #pragma once -#include #include #include #include @@ -16,28 +15,28 @@ namespace dolfinx::la { namespace impl { -/// @brief Incorporate data into a CSR matrix +/// @brief Incorporate data into a CSR matrix. /// -/// @tparam BS0 Row block size (of both matrix and data) -/// @tparam BS1 Column block size (of both matrix and data) -/// @tparam OP The operation (usually "set" or "add") -/// @param[out] data The CSR matrix data -/// @param[in] cols The CSR column indices -/// @param[in] row_ptr The pointer to the ith row in the CSR data +/// @tparam BS0 Row block size (of both matrix and data). +/// @tparam BS1 Column block size (of both matrix and data). +/// @tparam OP The operation (usually "set" or "add"). +/// @param[out] data CSR matrix data. +/// @param[in] cols CSR column indices. +/// @param[in] row_ptr Pointer to the ith row in the CSR data. /// @param[in] x The `m` by `n` dense block of values (row-major) to add -/// to the matrix -/// @param[in] xrows The row indices of `x` -/// @param[in] xcols The column indices of `x` -/// @param[in] op The operation (set or add) -/// @param[in] num_rows The maximum row index that can be set. Used -/// when debugging to check that rows beyond a permitted range -/// are not being set. +/// to the matrix. +/// @param[in] xrows Row indices of `x`. +/// @param[in] xcols Column indices of `x`. +/// @param[in] op The operation (set or add), +/// @param[in] num_rows Maximum row index that can be set. Used when +/// debugging to check that rows beyond a permitted range are not being +/// set. /// /// @note In the case of block data, where BS0 or BS1 are greater than -/// one, the layout of the input data is still the same. For example, the -/// following can be inserted into the top-left corner -/// with xrows={0,1} and xcols={0,1}, BS0=2, BS1=2 and -/// x={0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}. +/// one, the layout of the input data is still the same. For example, +/// the following can be inserted into the top-left corner with +/// `xrows={0,1}` and `xcols={0,1}`, `BS0=2`, `BS1=2` and +/// `x={0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}`. /// /// 0 1 | 2 3 /// 4 5 | 6 7 @@ -45,62 +44,6 @@ namespace impl /// 8 9 | 10 11 /// 12 13 | 14 15 /// -template -void insert_csr(U&& data, const V& cols, const W& row_ptr, const X& x, - const Y& xrows, const Y& xcols, OP op, - typename Y::value_type num_rows); - -/// @brief Incorporate blocked data with given block sizes into a non-blocked -/// MatrixCSR -/// @note Matrix block size (bs=1). Matrix sparsity must be correct to accept -/// the data. -/// @note see `insert_csr` for data layout -/// -/// @tparam BS0 Row block size of Data -/// @tparam BS1 Column block size of Data -/// @tparam OP The operation (usually "set" or "add") -/// @param[out] data The CSR matrix data -/// @param[in] cols The CSR column indices -/// @param[in] row_ptr The pointer to the ith row in the CSR data -/// @param[in] x The `m` by `n` dense block of values (row-major) to add -/// to the matrix -/// @param[in] xrows The row indices of `x` -/// @param[in] xcols The column indices of `x` -/// @param[in] op The operation (set or add) -/// @param[in] num_rows The maximum row index that can be set. Used -/// when debugging to check that rows beyond a permitted range -/// are not being set. -template -void insert_blocked_csr(U&& data, const V& cols, const W& row_ptr, const X& x, - const Y& xrows, const Y& xcols, OP op, - typename Y::value_type num_rows); - -/// @brief Incorporate non-blocked data into a blocked matrix (Data block size = -/// 1) -/// @note Matrix sparsity must be correct to accept the data -/// @note see `insert_csr` for data layout -/// @param[out] data The CSR matrix data -/// @param[in] cols The CSR column indices -/// @param[in] row_ptr The pointer to the ith row in the CSR data -/// @param[in] x The `m` by `n` dense block of values (row-major) to add -/// to the matrix -/// @param[in] xrows The row indices of `x` -/// @param[in] xcols The column indices of `x` -/// @param[in] op The operation (set or add) -/// @param[in] num_rows The maximum row index that can be set. Used -/// when debugging to check that rows beyond a permitted range -/// are not being set. -/// @param[in] bs0 Row block size of Matrix -/// @param[in] bs1 Column block size of Matrix -template -void insert_nonblocked_csr(U&& data, const V& cols, const W& row_ptr, - const X& x, const Y& xrows, const Y& xcols, OP op, - typename Y::value_type num_rows, int bs0, int bs1); - -//----------------------------------------------------------------------------- template void insert_csr(U&& data, const V& cols, const W& row_ptr, const X& x, @@ -127,13 +70,12 @@ void insert_csr(U&& data, const V& cols, const W& row_ptr, const X& x, { // Find position of column index auto it = std::lower_bound(cit0, cit1, xcols[c]); - if (it == cit1 or *it != xcols[c]) throw std::runtime_error("Entry not in sparsity"); std::size_t d = std::distance(cols.begin(), it); - int di = d * BS0 * BS1; - int xi = c * BS1; + std::size_t di = d * BS0 * BS1; + std::size_t xi = c * BS1; assert(di < data.size()); for (int i = 0; i < BS0; ++i) { @@ -145,8 +87,28 @@ void insert_csr(U&& data, const V& cols, const W& row_ptr, const X& x, } } } -//----------------------------------------------------------------------------- -// Insert with block insertion into a regular CSR (block size 1) + +/// @brief Incorporate blocked data with given block sizes into a +/// non-blocked MatrixCSR. +/// +/// @note Matrix block size (bs=1). Matrix sparsity must be correct to +/// accept the data. +/// @note See ::insert_csr for data layout. +/// +/// @tparam BS0 Row block size of data. +/// @tparam BS1 Column block size of data. +/// @tparam OP The operation (usually "set" or "add"). +/// @param[out] data CSR matrix data. +/// @param[in] cols CSR column indices. +/// @param[in] row_ptr Pointer to the ith row in the CSR data. +/// @param[in] x The `m` by `n` dense block of values (row-major) to add +/// to the matrix. +/// @param[in] xrows Row indices of `x`. +/// @param[in] xcols Column indices of `x`. +/// @param[in] op The operation (set or add). +/// @param[in] num_rows Maximum row index that can be set. Used when +/// debugging to check that rows beyond a permitted range are not being +/// set. template void insert_blocked_csr(U&& data, const V& cols, const W& row_ptr, const X& x, @@ -159,7 +121,6 @@ void insert_blocked_csr(U&& data, const V& cols, const W& row_ptr, const X& x, { // Row index and current data row auto row = xrows[r] * BS0; - #ifndef NDEBUG if (row >= num_rows) throw std::runtime_error("Local row out of range"); @@ -169,6 +130,7 @@ void insert_blocked_csr(U&& data, const V& cols, const W& row_ptr, const X& x, { using T = typename X::value_type; const T* xr = x.data() + (r * BS0 + i) * nc * BS1; + // Columns indices for row auto cit0 = std::next(cols.begin(), row_ptr[row + i]); auto cit1 = std::next(cols.begin(), row_ptr[row + i + 1]); @@ -176,27 +138,43 @@ void insert_blocked_csr(U&& data, const V& cols, const W& row_ptr, const X& x, { // Find position of column index auto it = std::lower_bound(cit0, cit1, xcols[c] * BS1); - if (it == cit1 or *it != xcols[c] * BS1) throw std::runtime_error("Entry not in sparsity"); std::size_t d = std::distance(cols.begin(), it); assert(d < data.size()); - int xi = c * BS1; + std::size_t xi = c * BS1; for (int j = 0; j < BS1; ++j) op(data[d + j], xr[xi + j]); } } } } -//----------------------------------------------------------------------------- -// Add individual entries in block-CSR storage + +/// @brief Incorporate non-blocked data into a blocked matrix (data +/// block size=1) +/// +/// @note Matrix sparsity must be correct to accept the data. +/// @note See ::insert_csr for data layout. +/// +/// @param[out] data CSR matrix data. +/// @param[in] cols CSR column indices. +/// @param[in] row_ptr Pointer to the ith row in the CSR data. +/// @param[in] x The `m` by `n` dense block of values (row-major) to add +/// to the matrix. +/// @param[in] xrows Rrow indices of `x`. +/// @param[in] xcols Column indices of `x`. +/// @param[in] op The operation (set or add). +/// @param[in] num_rows Maximum row index that can be set. Used when +/// debugging to check that rows beyond a permitted range are not being +/// set. +/// @param[in] bs0 Row block size of matrix. +/// @param[in] bs1 Column block size of matrix. template void insert_nonblocked_csr(U&& data, const V& cols, const W& row_ptr, const X& x, const Y& xrows, const Y& xcols, OP op, - [[maybe_unused]] typename Y::value_type num_rows, - int bs0, int bs1) + typename Y::value_type num_rows, int bs0, int bs1) { const std::size_t nc = xcols.size(); const int nbs = bs0 * bs1; @@ -221,19 +199,28 @@ void insert_nonblocked_csr(U&& data, const V& cols, const W& row_ptr, // Find position of column index auto cdiv = std::div(xcols[c], bs1); auto it = std::lower_bound(cit0, cit1, cdiv.quot); - if (it == cit1 or *it != cdiv.quot) throw std::runtime_error("Entry not in sparsity"); std::size_t d = std::distance(cols.begin(), it); - const int di = d * nbs + rdiv.rem * bs1 + cdiv.rem; + std::size_t di = d * nbs + rdiv.rem * bs1 + cdiv.rem; assert(di < data.size()); op(data[di], xr[c]); } } } -//----------------------------------------------------------------------------- +/// @brief Sparse matrix-vector product implementation. +/// @tparam T +/// @tparam BS1 +/// @param values +/// @param row_begin +/// @param row_end +/// @param indices +/// @param x +/// @param y +/// @param bs0 +/// @param bs1 template void spmv(std::span values, std::span row_begin, std::span row_end, @@ -241,7 +228,6 @@ void spmv(std::span values, std::span row_begin, std::span y, int bs0, int bs1) { assert(row_begin.size() == row_end.size()); - for (int k0 = 0; k0 < bs0; ++k0) { for (std::size_t i = 0; i < row_begin.size(); i++)