From c9ac694fa932b724365371400139abe8070f14d6 Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Fri, 11 Dec 2020 13:02:33 -0800 Subject: [PATCH 01/34] Add TransactionFuseable class --- source/SAMRAI/tbox/CMakeLists.txt | 2 ++ source/SAMRAI/tbox/TransactionFuseable.C | 0 source/SAMRAI/tbox/TransactionFuseable.h | 31 ++++++++++++++++++++++ source/SAMRAI/xfer/RefineCopyTransaction.h | 4 +-- 4 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 source/SAMRAI/tbox/TransactionFuseable.C create mode 100644 source/SAMRAI/tbox/TransactionFuseable.h diff --git a/source/SAMRAI/tbox/CMakeLists.txt b/source/SAMRAI/tbox/CMakeLists.txt index f508e26830..e35242155b 100644 --- a/source/SAMRAI/tbox/CMakeLists.txt +++ b/source/SAMRAI/tbox/CMakeLists.txt @@ -58,6 +58,7 @@ set ( tbox_headers TimerManager.h Tracer.h Transaction.h + TransactionFuseable.h Utilities.h) set_source_files_properties( @@ -115,6 +116,7 @@ set (tbox_sources TimerManager.C Tracer.C Transaction.C + TransactionFuseable.C Utilities.C) if (ENABLE_HDF5) diff --git a/source/SAMRAI/tbox/TransactionFuseable.C b/source/SAMRAI/tbox/TransactionFuseable.C new file mode 100644 index 0000000000..e69de29bb2 diff --git a/source/SAMRAI/tbox/TransactionFuseable.h b/source/SAMRAI/tbox/TransactionFuseable.h new file mode 100644 index 0000000000..40bbf42428 --- /dev/null +++ b/source/SAMRAI/tbox/TransactionFuseable.h @@ -0,0 +1,31 @@ +/************************************************************************* + * + * This file is part of the SAMRAI distribution. For full copyright + * information, see COPYRIGHT and LICENSE. 
+ * + * Copyright: (c) 1997-2020 Lawrence Livermore National Security, LLC + * Description: Abstract base class for all schedule transactions + * + ************************************************************************/ + +#ifndef included_tbox_TransactionFuseable +#define included_tbox_TransactionFuseable + +#include "SAMRAI/SAMRAI_config.h" + +#include "SAMRAI/tbox/Transaction.h" + +#include + +namespace SAMRAI { +namespace tbox { + + class TransactionFuseable : + public Transaction + { + }; + +} +} + +#endif \ No newline at end of file diff --git a/source/SAMRAI/xfer/RefineCopyTransaction.h b/source/SAMRAI/xfer/RefineCopyTransaction.h index 7859b23f2d..fbc10513a5 100644 --- a/source/SAMRAI/xfer/RefineCopyTransaction.h +++ b/source/SAMRAI/xfer/RefineCopyTransaction.h @@ -14,7 +14,7 @@ #include "SAMRAI/SAMRAI_config.h" -#include "SAMRAI/tbox/Transaction.h" +#include "SAMRAI/tbox/TransactionFuseable.h" #include "SAMRAI/hier/BaseGridGeometry.h" #include "SAMRAI/hier/PatchLevel.h" #include "SAMRAI/xfer/RefineClasses.h" @@ -40,7 +40,7 @@ namespace xfer { * @see tbox::Transaction */ -class RefineCopyTransaction:public tbox::Transaction +class RefineCopyTransaction:public tbox::TransactionFuseable { public: /*! From 5ff40e244de959035a5d597826935a0ad41cf50d Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Mon, 4 Jan 2021 17:33:40 -0800 Subject: [PATCH 02/34] Fixup handling fuseable/regular transactions Fuseable transactions still need to be fused. 
--- source/SAMRAI/pdat/CMakeLists.txt | 4 + source/SAMRAI/tbox/CMakeLists.txt | 3 + source/SAMRAI/tbox/Schedule.C | 393 +++++++++++++++++++++--------- source/SAMRAI/tbox/Schedule.h | 24 +- 4 files changed, 300 insertions(+), 124 deletions(-) diff --git a/source/SAMRAI/pdat/CMakeLists.txt b/source/SAMRAI/pdat/CMakeLists.txt index c3535b11c8..dba8331ccd 100644 --- a/source/SAMRAI/pdat/CMakeLists.txt +++ b/source/SAMRAI/pdat/CMakeLists.txt @@ -339,6 +339,10 @@ target_include_directories( $ $) +blt_print_target_properties(TARGET SAMRAI_pdat) +blt_print_target_properties(TARGET raja) +blt_print_target_properties(TARGET RAJA) + install(TARGETS SAMRAI_pdat EXPORT SAMRAITargets ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} diff --git a/source/SAMRAI/tbox/CMakeLists.txt b/source/SAMRAI/tbox/CMakeLists.txt index e35242155b..a0561da1af 100644 --- a/source/SAMRAI/tbox/CMakeLists.txt +++ b/source/SAMRAI/tbox/CMakeLists.txt @@ -178,6 +178,9 @@ target_include_directories( SAMRAI_tbox $ $) +blt_print_target_properties( + TARGET SAMRAI_tbox) + install(TARGETS SAMRAI_tbox EXPORT SAMRAITargets diff --git a/source/SAMRAI/tbox/Schedule.C b/source/SAMRAI/tbox/Schedule.C index dc10e37395..54fab2ef95 100644 --- a/source/SAMRAI/tbox/Schedule.C +++ b/source/SAMRAI/tbox/Schedule.C @@ -60,7 +60,6 @@ Schedule::s_initialize_finalize_handler( */ Schedule::Schedule(): - d_coms(0), d_com_stage(), d_mpi(SAMRAI_MPI::getSAMRAIWorld()), d_first_tag(s_default_first_tag), @@ -101,13 +100,28 @@ Schedule::addTransaction( const int src_id = transaction->getSourceProcessor(); const int dst_id = transaction->getDestinationProcessor(); + std::shared_ptr fuseable_transaction{ + std::dynamic_pointer_cast(transaction)}; + if ((d_mpi.getRank() == src_id) && (d_mpi.getRank() == dst_id)) { - d_local_set.push_front(transaction); + if (fuseable_transaction) { + d_local_set_fuseable.push_front(transaction); + } else { + d_local_set.push_front(transaction); + } } else { if (d_mpi.getRank() == dst_id) { - 
d_recv_sets[src_id].push_front(transaction); + if (fuseable_transaction) { + d_recv_sets_fuseable[src_id].push_front(transaction); + } else { + d_recv_sets[src_id].push_front(transaction); + } } else if (d_mpi.getRank() == src_id) { - d_send_sets[dst_id].push_front(transaction); + if (fuseable_transaction) { + d_send_sets_fuseable[dst_id].push_front(transaction); + } else { + d_send_sets[dst_id].push_front(transaction); + } } } } @@ -126,13 +140,28 @@ Schedule::appendTransaction( const int src_id = transaction->getSourceProcessor(); const int dst_id = transaction->getDestinationProcessor(); + std::shared_ptr fuseable_transaction{ + std::dynamic_pointer_cast(transaction)}; + if ((d_mpi.getRank() == src_id) && (d_mpi.getRank() == dst_id)) { - d_local_set.push_back(transaction); + if (fuseable_transaction) { + d_local_set_fuseable.push_back(transaction); + } else { + d_local_set.push_back(transaction); + } } else { if (d_mpi.getRank() == dst_id) { - d_recv_sets[src_id].push_back(transaction); + if (fuseable_transaction) { + d_recv_sets_fuseable[src_id].push_back(transaction); + } else { + d_recv_sets[src_id].push_back(transaction); + } } else if (d_mpi.getRank() == src_id) { - d_send_sets[dst_id].push_back(transaction); + if (fuseable_transaction) { + d_send_sets_fuseable[dst_id].push_back(transaction); + } else { + d_send_sets[dst_id].push_back(transaction); + } } } } @@ -149,8 +178,13 @@ Schedule::getNumSendTransactions( int size = 0; TransactionSets::const_iterator mi = d_send_sets.find(rank); if (mi != d_send_sets.end()) { - size = static_cast(mi->second.size()); + size += static_cast(mi->second.size()); } + mi = d_send_sets_fuseable.find(rank); + if (mi != d_send_sets_fuseable.end()) { + size += static_cast(mi->second.size()); + } + return size; } @@ -166,7 +200,11 @@ Schedule::getNumRecvTransactions( int size = 0; TransactionSets::const_iterator mi = d_recv_sets.find(rank); if (mi != d_recv_sets.end()) { - size = static_cast(mi->second.size()); + size += 
static_cast(mi->second.size()); + } + mi = d_recv_sets_fuseable.find(rank); + if (mi != d_recv_sets_fuseable.end()) { + size += static_cast(mi->second.size()); } return size; } @@ -245,7 +283,7 @@ Schedule::finalizeCommunication() void Schedule::postReceives() { - if (d_recv_sets.empty()) { + if (d_recv_sets.empty() && d_recv_sets_fuseable.empty()) { /* * Short cut because some looping logic in this method assumes * non-empty d_recv_sets. @@ -263,60 +301,87 @@ Schedule::postReceives() * send posted earlier is paired with a receive that is also posted * earlier. */ - AsyncCommPeer* recv_coms = d_coms; + for (CommMap::reverse_iterator comm_peer(d_recv_coms.lower_bound(rank)); + comm_peer != d_recv_coms.rend(); + ++comm_peer) { + const int recv_rank = (*comm_peer).first; + auto& comm = (*comm_peer).second; + // Compute incoming message size, if possible. + unsigned int byte_count = 0; + bool can_estimate_incoming_message_size = true; - // Initialize iterators to where we want to start looping. - size_t icom = 0; // Index into recv_coms. - while (icom < d_recv_sets.size() && - recv_coms[icom].getPeerRank() < rank) { - ++icom; - } - icom = icom > 0 ? icom - 1 : d_recv_sets.size() - 1; + for (const auto& t : d_recv_sets[recv_rank] ) { + if (!t->canEstimateIncomingMessageSize()) { + can_estimate_incoming_message_size = false; + break; + } + byte_count += + static_cast(t->computeIncomingMessageSize()); + } - // Map iterator mi corresponds to recv_coms[icom]. - TransactionSets::const_iterator mi = - d_recv_sets.find(recv_coms[icom].getPeerRank()); + for (const auto& t: d_recv_sets_fuseable[recv_rank]) { + if (!t->canEstimateIncomingMessageSize()) { + can_estimate_incoming_message_size = false; + break; + } + byte_count += + static_cast(t->computeIncomingMessageSize()); + } - for (size_t counter = 0; - counter < d_recv_sets.size(); - ++counter, --mi, --icom) { + // Set AsyncCommPeer to receive known message length. 
+ if (can_estimate_incoming_message_size) { + comm->limitFirstDataLength(byte_count); + } - TBOX_ASSERT(mi->first == recv_coms[icom].getPeerRank()); + // Begin non-blocking receive operation. + d_object_timers->t_post_receives->start(); + comm->beginRecv(); + if (comm->isDone()) { + comm->pushToCompletionQueue(); + } + d_object_timers->t_post_receives->stop(); + } + CommMap::reverse_iterator stop(d_recv_coms.lower_bound(rank)); + for (CommMap::reverse_iterator comm_peer = d_recv_coms.rbegin(); comm_peer != stop; ++comm_peer) { + const int recv_rank = (*comm_peer).first; + auto& comm = (*comm_peer).second; // Compute incoming message size, if possible. - const std::list >& transactions = - mi->second; unsigned int byte_count = 0; bool can_estimate_incoming_message_size = true; - for (ConstIterator r = transactions.begin(); - r != transactions.end(); ++r) { - if (!(*r)->canEstimateIncomingMessageSize()) { + + for (const auto& t : d_recv_sets[recv_rank] ) { + if (!t->canEstimateIncomingMessageSize()) { + can_estimate_incoming_message_size = false; + break; + } + byte_count += + static_cast(t->computeIncomingMessageSize()); + } + + for (const auto& t: d_recv_sets_fuseable[recv_rank]) { + if (!t->canEstimateIncomingMessageSize()) { can_estimate_incoming_message_size = false; break; } byte_count += - static_cast((*r)->computeIncomingMessageSize()); + static_cast(t->computeIncomingMessageSize()); } // Set AsyncCommPeer to receive known message length. if (can_estimate_incoming_message_size) { - recv_coms[icom].limitFirstDataLength(byte_count); + comm->limitFirstDataLength(byte_count); } // Begin non-blocking receive operation. d_object_timers->t_post_receives->start(); - recv_coms[icom].beginRecv(); - if (recv_coms[icom].isDone()) { - recv_coms[icom].pushToCompletionQueue(); + comm->beginRecv(); + if (comm->isDone()) { + comm->pushToCompletionQueue(); } d_object_timers->t_post_receives->stop(); - - if (mi == d_recv_sets.begin()) { - // Continue loop at the opposite end. 
- mi = d_recv_sets.end(); - icom = d_recv_sets.size(); - } } + } /* @@ -339,38 +404,86 @@ Schedule::postSends() int rank = d_mpi.getRank(); - AsyncCommPeer* send_coms = d_coms + d_recv_sets.size(); + for (auto comm_peer = d_send_coms.lower_bound(rank); + comm_peer != d_send_coms.end(); + ++comm_peer) { + const int peer_rank = (*comm_peer).first; + auto& comm = (*comm_peer).second; - // Initialize iterators to where we want to start looping. - TransactionSets::const_iterator mi = d_send_sets.upper_bound(rank); - size_t icom = 0; // send_coms[icom] corresponds to mi. - while (icom < d_send_sets.size() && - send_coms[icom].getPeerRank() < rank) { - ++icom; - } + size_t byte_count = 0; + bool can_estimate_incoming_message_size = true; + for (const auto& transaction : d_send_sets[peer_rank]) { + if (!transaction->canEstimateIncomingMessageSize()) { + can_estimate_incoming_message_size = false; + } + byte_count += transaction->computeOutgoingMessageSize(); + } - for (size_t counter = 0; - counter < d_send_sets.size(); - ++counter, ++mi, ++icom) { + for (const auto& transaction : d_send_sets_fuseable[peer_rank]) { + if (!transaction->canEstimateIncomingMessageSize()) { + can_estimate_incoming_message_size = false; + } + byte_count += transaction->computeOutgoingMessageSize(); + } - if (mi == d_send_sets.end()) { - // Continue loop at the opposite end. - mi = d_send_sets.begin(); - icom = 0; + // Pack outgoing data into a message. 
+ MessageStream outgoing_stream( + byte_count, + MessageStream::Write, + nullptr, + true +#ifdef HAVE_UMPIRE + , AllocatorDatabase::getDatabase()->getStreamAllocator() +#endif + ); + + d_object_timers->t_pack_stream->start(); + for (const auto& transaction : d_send_sets[peer_rank]) { + transaction->packStream(outgoing_stream); + } + for (const auto& transaction : d_send_sets_fuseable[peer_rank]) { + transaction->packStream(outgoing_stream); + } +#if defined(HAVE_RAJA) + parallel_synchronize(); +#endif + + d_object_timers->t_pack_stream->stop(); + + if (can_estimate_incoming_message_size) { + // Receiver knows message size so set it exactly. + comm->limitFirstDataLength(byte_count); } - TBOX_ASSERT(mi->first == send_coms[icom].getPeerRank()); - // Compute message size and whether receiver can estimate it. - const std::list >& transactions = - mi->second; + // Begin non-blocking send operation. + comm->beginSend( + (const char *)outgoing_stream.getBufferStart(), + static_cast(outgoing_stream.getCurrentSize())); + if (comm->isDone()) { + comm->pushToCompletionQueue(); + } + } + + for (auto comm_peer = d_send_coms.begin(); + comm_peer != d_send_coms.lower_bound(rank); + ++comm_peer) { + const int peer_rank = (*comm_peer).first; + auto& comm = (*comm_peer).second; + size_t byte_count = 0; bool can_estimate_incoming_message_size = true; - for (ConstIterator pack = transactions.begin(); - pack != transactions.end(); ++pack) { - if (!(*pack)->canEstimateIncomingMessageSize()) { + for (const auto& transaction : d_send_sets[peer_rank]) { + if (!transaction->canEstimateIncomingMessageSize()) { can_estimate_incoming_message_size = false; } - byte_count += (*pack)->computeOutgoingMessageSize(); + byte_count += transaction->computeOutgoingMessageSize(); + } + + for (const auto& transaction : d_send_sets_fuseable[peer_rank]) { + if (!transaction->canEstimateIncomingMessageSize()) { + can_estimate_incoming_message_size = false; + } + byte_count += 
transaction->computeOutgoingMessageSize(); } // Pack outgoing data into a message. @@ -385,9 +498,11 @@ Schedule::postSends() ); d_object_timers->t_pack_stream->start(); - for (ConstIterator pack = transactions.begin(); - pack != transactions.end(); ++pack) { - (*pack)->packStream(outgoing_stream); + for (const auto& transaction : d_send_sets[peer_rank]) { + transaction->packStream(outgoing_stream); + } + for (const auto& transaction : d_send_sets_fuseable[peer_rank]) { + transaction->packStream(outgoing_stream); } #if defined(HAVE_RAJA) parallel_synchronize(); @@ -397,15 +512,15 @@ Schedule::postSends() if (can_estimate_incoming_message_size) { // Receiver knows message size so set it exactly. - send_coms[icom].limitFirstDataLength(byte_count); + comm->limitFirstDataLength(byte_count); } // Begin non-blocking send operation. - send_coms[icom].beginSend( + comm->beginSend( (const char *)outgoing_stream.getBufferStart(), static_cast(outgoing_stream.getCurrentSize())); - if (send_coms[icom].isDone()) { - send_coms[icom].pushToCompletionQueue(); + if (comm->isDone()) { + comm->pushToCompletionQueue(); } } @@ -421,9 +536,12 @@ void Schedule::performLocalCopies() { d_object_timers->t_local_copies->start(); - for (Iterator local = d_local_set.begin(); - local != d_local_set.end(); ++local) { - (*local)->copyLocalData(); + // TODO: fuse these kernels + for (const auto& local : d_local_set_fuseable) { + local->copyLocalData(); + } + for (const auto& local : d_local_set) { + local->copyLocalData(); } d_object_timers->t_local_copies->stop(); } @@ -446,21 +564,20 @@ Schedule::processCompletedCommunications() if (d_unpack_in_deterministic_order) { // Unpack in deterministic order. Wait for receive as needed. 
+ // Deterministic order is lowest to highest recv rank int irecv = 0; - for (TransactionSets::iterator recv_itr = d_recv_sets.begin(); - recv_itr != d_recv_sets.end(); ++recv_itr, ++irecv) { - - int sender = recv_itr->first; - AsyncCommPeer& completed_comm = d_coms[irecv]; - TBOX_ASSERT(sender == completed_comm.getPeerRank()); - completed_comm.completeCurrentOperation(); - completed_comm.yankFromCompletionQueue(); + for (auto& comms : d_recv_coms) { + auto& completed_comm = comms.second; + int sender = comms.first; + TBOX_ASSERT(sender == completed_comm->getPeerRank()); + completed_comm->completeCurrentOperation(); + completed_comm->yankFromCompletionQueue(); MessageStream incoming_stream( - static_cast(completed_comm.getRecvSize()) * sizeof(char), + static_cast(completed_comm->getRecvSize()) * sizeof(char), MessageStream::Read, - completed_comm.getRecvData(), + completed_comm->getRecvData(), false /* don't use deep copy */ #ifdef HAVE_UMPIRE , AllocatorDatabase::getDatabase()->getStreamAllocator() @@ -468,17 +585,20 @@ Schedule::processCompletedCommunications() ); d_object_timers->t_unpack_stream->start(); - for (Iterator recv = d_recv_sets[sender].begin(); - recv != d_recv_sets[sender].end(); ++recv) { - (*recv)->unpackStream(incoming_stream); + for (const auto& transaction : d_recv_sets[sender]) { + transaction->unpackStream(incoming_stream); + } +#if defined(HAVE_RAJA) + parallel_synchronize(); +#endif + for (const auto& transaction : d_recv_sets_fuseable[sender]) { + transaction->unpackStream(incoming_stream); } #if defined(HAVE_RAJA) parallel_synchronize(); #endif - d_object_timers->t_unpack_stream->stop(); - completed_comm.clearRecvData(); - + completed_comm->clearRecvData(); } // Complete sends. 
@@ -499,7 +619,7 @@ Schedule::processCompletedCommunications() TBOX_ASSERT(completed_comm != 0); TBOX_ASSERT(completed_comm->isDone()); - if (static_cast(completed_comm - d_coms) < num_senders) { + if (!completed_comm->isSender()) { const int sender = completed_comm->getPeerRank(); @@ -514,9 +634,14 @@ Schedule::processCompletedCommunications() ); d_object_timers->t_unpack_stream->start(); - for (Iterator recv = d_recv_sets[sender].begin(); - recv != d_recv_sets[sender].end(); ++recv) { - (*recv)->unpackStream(incoming_stream); + for (const auto& transaction : d_recv_sets[sender]) { + transaction->unpackStream(incoming_stream); + } +#if defined(HAVE_RAJA) + parallel_synchronize(); +#endif + for (const auto& transaction : d_recv_sets_fuseable[sender]) { + transaction->unpackStream(incoming_stream); } #if defined(HAVE_RAJA) parallel_synchronize(); @@ -542,37 +667,69 @@ Schedule::processCompletedCommunications() void Schedule::allocateCommunicationObjects() { - const size_t length = d_recv_sets.size() + d_send_sets.size(); - if (length > 0) { - d_coms = new AsyncCommPeer[length]; + for (const auto& transaction : d_recv_sets) { + int rank = transaction.first; + + auto peer = std::make_shared>(); + peer->initialize(&d_com_stage); + peer->setPeerRank(rank); + peer->setMPITag(d_first_tag, d_second_tag); + peer->setMPI(d_mpi); + peer->limitFirstDataLength(d_first_message_length); +#ifdef HAVE_UMPIRE + peer->setAllocator(AllocatorDatabase::getDatabase()->getStreamAllocator()); +#endif + d_recv_coms[rank] = peer; + } + + for (const auto transaction : d_recv_sets_fuseable) { + int rank = transaction.first; + + if (d_recv_coms.find(rank) == d_recv_coms.end()) { + auto peer = std::make_shared>(); + peer->initialize(&d_com_stage); + peer->setPeerRank(rank); + peer->setMPITag(d_first_tag, d_second_tag); + peer->setMPI(d_mpi); + peer->limitFirstDataLength(d_first_message_length); +#ifdef HAVE_UMPIRE + peer->setAllocator(AllocatorDatabase::getDatabase()->getStreamAllocator()); 
+#endif + d_recv_coms[rank] = peer; + } } - size_t counter = 0; - for (TransactionSets::iterator ti = d_recv_sets.begin(); - ti != d_recv_sets.end(); - ++ti) { - d_coms[counter].initialize(&d_com_stage); - d_coms[counter].setPeerRank(ti->first); - d_coms[counter].setMPITag(d_first_tag, d_second_tag); - d_coms[counter].setMPI(d_mpi); - d_coms[counter].limitFirstDataLength(d_first_message_length); + for (const auto& transaction : d_send_sets) { + int rank = transaction.first; + auto peer = std::make_shared>(); + + peer->initialize(&d_com_stage); + peer->setPeerRank(rank); + peer->setMPITag(d_first_tag, d_second_tag); + peer->setMPI(d_mpi); + peer->limitFirstDataLength(d_first_message_length); #ifdef HAVE_UMPIRE - d_coms[counter].setAllocator(AllocatorDatabase::getDatabase()->getStreamAllocator()); + peer->setAllocator(AllocatorDatabase::getDatabase()->getStreamAllocator()); #endif - ++counter; + d_send_coms[rank] = peer; } - for (TransactionSets::iterator ti = d_send_sets.begin(); - ti != d_send_sets.end(); - ++ti) { - d_coms[counter].initialize(&d_com_stage); - d_coms[counter].setPeerRank(ti->first); - d_coms[counter].setMPITag(d_first_tag, d_second_tag); - d_coms[counter].setMPI(d_mpi); - d_coms[counter].limitFirstDataLength(d_first_message_length); + + for (const auto& transaction : d_send_sets_fuseable) { + int rank = transaction.first; + + if (d_send_coms.find(rank) == d_send_coms.end()) { + auto peer = std::make_shared>(); + + peer->initialize(&d_com_stage); + peer->setPeerRank(rank); + peer->setMPITag(d_first_tag, d_second_tag); + peer->setMPI(d_mpi); + peer->limitFirstDataLength(d_first_message_length); #ifdef HAVE_UMPIRE - d_coms[counter].setAllocator(AllocatorDatabase::getDatabase()->getStreamAllocator()); + peer->setAllocator(AllocatorDatabase::getDatabase()->getStreamAllocator()); #endif - ++counter; + d_send_coms[rank] = peer; + } } } diff --git a/source/SAMRAI/tbox/Schedule.h b/source/SAMRAI/tbox/Schedule.h index 31f0a34734..21adf15983 100644 --- 
a/source/SAMRAI/tbox/Schedule.h +++ b/source/SAMRAI/tbox/Schedule.h @@ -17,6 +17,7 @@ #include "SAMRAI/tbox/SAMRAI_MPI.h" #include "SAMRAI/tbox/MessageStream.h" #include "SAMRAI/tbox/Transaction.h" +#include "SAMRAI/tbox/TransactionFuseable.h" #include #include @@ -275,7 +276,7 @@ class Schedule bool allocatedCommunicationObjects() { - return d_coms != 0; + return (d_recv_coms.size() > 0 && d_send_coms.size() > 0); } /*! @@ -293,10 +294,8 @@ class Schedule void deallocateCommunicationObjects() { - if (d_coms) { - delete[] d_coms; - } - d_coms = 0; + d_send_coms.clear(); + d_recv_coms.clear(); } void @@ -344,12 +343,22 @@ class Schedule TransactionSets d_send_sets; TransactionSets d_recv_sets; + TransactionSets d_send_sets_fuseable; + TransactionSets d_recv_sets_fuseable; + /* * @brief Transactions where the source and destination are the * local process. */ std::list > d_local_set; + /* + * @brief Fuseable transactions where the source and destination are the + * local process. + */ + std::list > d_local_set_fuseable; + + //@{ @name High-level asynchronous messages passing objects /*! @@ -359,7 +368,10 @@ class Schedule * d_coms is typed for byte sending because our data is of * unknown mixed type. */ - AsyncCommPeer* d_coms; + using CommMap = std::map>>; + CommMap d_send_coms; + CommMap d_recv_coms; + /*! * @brief Stage for advancing communication operations to * completion. 
From 5cc28c3398fabc6f63ea11a428182fab254955ad Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Thu, 4 Mar 2021 13:23:47 -0800 Subject: [PATCH 03/34] Adding new API for fusing --- source/SAMRAI/hier/PatchData.C | 27 ++++++++++ source/SAMRAI/hier/PatchData.h | 41 +++++++++++++++ source/SAMRAI/tbox/CMakeLists.txt | 4 +- source/SAMRAI/tbox/ExecutionPolicy.h | 5 ++ source/SAMRAI/tbox/KernelFuser.h | 46 +++++++++++++++++ source/SAMRAI/tbox/Schedule.C | 60 +++++++++++++++++----- source/SAMRAI/tbox/Schedule.h | 5 ++ source/SAMRAI/tbox/TransactionFuseable.C | 19 +++++++ source/SAMRAI/tbox/TransactionFuseable.h | 15 ++++-- source/SAMRAI/xfer/RefineCopyTransaction.C | 6 +-- 10 files changed, 206 insertions(+), 22 deletions(-) create mode 100644 source/SAMRAI/tbox/KernelFuser.h diff --git a/source/SAMRAI/hier/PatchData.C b/source/SAMRAI/hier/PatchData.C index 97425ab921..90d4e986cd 100644 --- a/source/SAMRAI/hier/PatchData.C +++ b/source/SAMRAI/hier/PatchData.C @@ -31,6 +31,33 @@ PatchData::~PatchData() { } +void +PatchData::copy( + const PatchData& src, + const BoxOverlap& overlap, + tbox::KernelFuser& fuser) +{ + copy(src, overlap); +} + +void +PatchData::packStream( + tbox::MessageStream& stream, + const BoxOverlap& overlap, + tbox::KernelFuser& fuser) +{ + packStream(stream, overlap); +} + +void +PatchData::unpackStream( + tbox::MessageStream& stream, + const BoxOverlap& overlap, + tbox::KernelFuser& fuser) +{ + unpackStream(stream, overlap); +} + /* ************************************************************************* * diff --git a/source/SAMRAI/hier/PatchData.h b/source/SAMRAI/hier/PatchData.h index d8ac6de6a2..21ddadee18 100644 --- a/source/SAMRAI/hier/PatchData.h +++ b/source/SAMRAI/hier/PatchData.h @@ -20,6 +20,15 @@ #include "SAMRAI/tbox/Utilities.h" namespace SAMRAI { + +/* + * Forward declaration of KernelFuser class - required here because it sucks in + * RAJA and requires CUDA. 
+ */ +namespace tbox { +class KernelFuser; +} + namespace hier { /** @@ -160,6 +169,12 @@ class PatchData const PatchData& src, const BoxOverlap& overlap) = 0; + virtual void + copy( + const PatchData& src, + const BoxOverlap& overlap, + tbox::KernelFuser& fuser); + /** * Copy data from the source into the destination using the designated * overlap descriptor. The overlap description will have been computed @@ -206,6 +221,19 @@ class PatchData tbox::MessageStream& stream, const BoxOverlap& overlap) const = 0; + /** + * Pack data lying on the specified index set into the output stream using + * the given KernelFuser. The default implementation of this method will + * call packStream without the fuser argument. See the abstract stream + * virtual base class for more information about the packing operators + * defined for streams. + */ + virtual void + packStream( + tbox::MessageStream& stream, + const BoxOverlap& overlap, + tbox::KernelFuser& fuser); + /** * Unpack data from the message stream into the specified index set. * See the abstract stream virtual base class for more information about @@ -216,6 +244,19 @@ class PatchData tbox::MessageStream& stream, const BoxOverlap& overlap) = 0; + /** + * Unpack data from the message stream into the specified index set using + * the given KernelFuser. The default implementation of this method will + * call unpackStream without the fuser argument. See the abstract stream + * virtual base class for more information about the packing operators + * defined for streams. + */ + virtual void + unpackStream( + tbox::MessageStream& stream, + const BoxOverlap& overlap, + tbox::KernelFuser& fuser); + /** * Checks that class version and restart file version are equal. 
If so, * reads in the data members common to all patch data types from restart diff --git a/source/SAMRAI/tbox/CMakeLists.txt b/source/SAMRAI/tbox/CMakeLists.txt index a0561da1af..4724edd0d5 100644 --- a/source/SAMRAI/tbox/CMakeLists.txt +++ b/source/SAMRAI/tbox/CMakeLists.txt @@ -148,9 +148,11 @@ if (ENABLE_RAJA) endif () if (ENABLE_CUDA) - set(cuda_sources Schedule.C) + set(cuda_sources TransactionFuseable.C Schedule.C) set_source_files_properties(${cuda_sources} PROPERTIES LANGUAGE CUDA) + set (tbox_depends ${tbox_depends} cuda) + if (ENABLE_NVTX_REGIONS) find_package(CUDA REQUIRED) diff --git a/source/SAMRAI/tbox/ExecutionPolicy.h b/source/SAMRAI/tbox/ExecutionPolicy.h index 11989f7457..4ddecd5a56 100644 --- a/source/SAMRAI/tbox/ExecutionPolicy.h +++ b/source/SAMRAI/tbox/ExecutionPolicy.h @@ -111,6 +111,11 @@ struct policy_traits { >; using ReductionPolicy = RAJA::cuda_reduce; + + using WorkGroupPolicy = RAJA::WorkGroupPolicy< + RAJA::cuda_work_async<1024>, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects>; }; #else diff --git a/source/SAMRAI/tbox/KernelFuser.h b/source/SAMRAI/tbox/KernelFuser.h new file mode 100644 index 0000000000..5c434fc148 --- /dev/null +++ b/source/SAMRAI/tbox/KernelFuser.h @@ -0,0 +1,46 @@ +#ifndef included_tbox_KernelFuser +#define included_tbox_KernelFuser + +#include "SAMRAI/tbox/ExecutionPolicy.h" +#include "SAMRAI/tbox/AllocatorDatabase.h" + +// #include "RAJA/RAJA.hpp" + +namespace SAMRAI { +namespace tbox { + +class KernelFuser +{ +public: + // KernelFuser() : + // d_workpool(AllocatorDatabase::getDatabase()->getStreamAllocator()) + // {} + + template + void enqueue(int begin, int end, Kernel&& kernel) { + //d_workpool.enqueue(RAJA::RangeSegment(begin, end), std::forward(kernel)); + } + + void launch() + { + // d_workgroup = d_workpool.instantiate(); + // d_worksite = d_workgroup.run(); + } + +private: + using Allocator = umpire::TypedAllocator; + + // using Policy = typename 
tbox::detail::policy_traits< tbox::policy::parallel >::WorkGroupPolicy; + // using WorkPool = RAJA::WorkPool , Allocator>; + // using WorkGroup = RAJA::WorkGroup, Allocator>; + // using WorkSite = RAJA::WorkSite , Allocator>; + + // WorkPool d_workpool; + // WorkGroup d_workgroup; + // WorkSite d_worksite; +}; + +} +} + +#endif \ No newline at end of file diff --git a/source/SAMRAI/tbox/Schedule.C b/source/SAMRAI/tbox/Schedule.C index 54fab2ef95..d593af9518 100644 --- a/source/SAMRAI/tbox/Schedule.C +++ b/source/SAMRAI/tbox/Schedule.C @@ -105,20 +105,32 @@ Schedule::addTransaction( if ((d_mpi.getRank() == src_id) && (d_mpi.getRank() == dst_id)) { if (fuseable_transaction) { - d_local_set_fuseable.push_front(transaction); + if (!d_local_fuser) { + d_local_fuser = new KernelFuser{}; + } + fuseable_transaction->setKernelFuser(d_local_fuser); + d_local_set_fuseable.push_front(fuseable_transaction); } else { d_local_set.push_front(transaction); } } else { if (d_mpi.getRank() == dst_id) { if (fuseable_transaction) { - d_recv_sets_fuseable[src_id].push_front(transaction); + if (!d_recv_fuser) { + d_recv_fuser = new KernelFuser{}; + } + fuseable_transaction->setKernelFuser(d_recv_fuser); + d_recv_sets_fuseable[src_id].push_front(fuseable_transaction); } else { d_recv_sets[src_id].push_front(transaction); } } else if (d_mpi.getRank() == src_id) { if (fuseable_transaction) { - d_send_sets_fuseable[dst_id].push_front(transaction); + if (!d_send_fuser) { + d_send_fuser = new KernelFuser{}; + } + fuseable_transaction->setKernelFuser(d_send_fuser); + d_send_sets_fuseable[dst_id].push_front(fuseable_transaction); } else { d_send_sets[dst_id].push_front(transaction); } @@ -145,19 +157,31 @@ Schedule::appendTransaction( if ((d_mpi.getRank() == src_id) && (d_mpi.getRank() == dst_id)) { if (fuseable_transaction) { - d_local_set_fuseable.push_back(transaction); + if (!d_local_fuser) { + d_local_fuser = new KernelFuser{}; + } + fuseable_transaction->setKernelFuser(d_local_fuser); + 
d_local_set_fuseable.push_back(fuseable_transaction); } else { d_local_set.push_back(transaction); } } else { if (d_mpi.getRank() == dst_id) { if (fuseable_transaction) { - d_recv_sets_fuseable[src_id].push_back(transaction); + if (!d_recv_fuser) { + d_recv_fuser = new KernelFuser{}; + } + fuseable_transaction->setKernelFuser(d_recv_fuser); + d_recv_sets_fuseable[src_id].push_back(fuseable_transaction); } else { d_recv_sets[src_id].push_back(transaction); } } else if (d_mpi.getRank() == src_id) { if (fuseable_transaction) { + if (!d_send_fuser) { + d_send_fuser = new KernelFuser{}; + } + fuseable_transaction->setKernelFuser(d_send_fuser); d_send_sets_fuseable[dst_id].push_back(transaction); } else { d_send_sets[dst_id].push_back(transaction); @@ -438,10 +462,13 @@ Schedule::postSends() ); d_object_timers->t_pack_stream->start(); - for (const auto& transaction : d_send_sets[peer_rank]) { + + for (const auto& transaction : d_send_sets_fuseable[peer_rank]) { transaction->packStream(outgoing_stream); } - for (const auto& transaction : d_send_sets_fuseable[peer_rank]) { + d_send_fuser->launch(); + + for (const auto& transaction : d_send_sets[peer_rank]) { transaction->packStream(outgoing_stream); } #if defined(HAVE_RAJA) @@ -498,10 +525,12 @@ Schedule::postSends() ); d_object_timers->t_pack_stream->start(); - for (const auto& transaction : d_send_sets[peer_rank]) { + for (const auto& transaction : d_send_sets_fuseable[peer_rank]) { transaction->packStream(outgoing_stream); } - for (const auto& transaction : d_send_sets_fuseable[peer_rank]) { + d_send_fuser->launch(); + + for (const auto& transaction : d_send_sets[peer_rank]) { transaction->packStream(outgoing_stream); } #if defined(HAVE_RAJA) @@ -536,10 +565,11 @@ void Schedule::performLocalCopies() { d_object_timers->t_local_copies->start(); - // TODO: fuse these kernels for (const auto& local : d_local_set_fuseable) { local->copyLocalData(); } + d_local_fuser->launch(); + for (const auto& local : d_local_set) { 
local->copyLocalData(); } @@ -585,13 +615,14 @@ Schedule::processCompletedCommunications() ); d_object_timers->t_unpack_stream->start(); - for (const auto& transaction : d_recv_sets[sender]) { + for (const auto& transaction : d_recv_sets_fuseable[sender]) { transaction->unpackStream(incoming_stream); } + d_recv_fuser->launch(); #if defined(HAVE_RAJA) parallel_synchronize(); #endif - for (const auto& transaction : d_recv_sets_fuseable[sender]) { + for (const auto& transaction : d_recv_sets[sender]) { transaction->unpackStream(incoming_stream); } #if defined(HAVE_RAJA) @@ -634,13 +665,14 @@ Schedule::processCompletedCommunications() ); d_object_timers->t_unpack_stream->start(); - for (const auto& transaction : d_recv_sets[sender]) { + for (const auto& transaction : d_recv_sets_fuseable[sender]) { transaction->unpackStream(incoming_stream); } + d_recv_fuser->launch(); #if defined(HAVE_RAJA) parallel_synchronize(); #endif - for (const auto& transaction : d_recv_sets_fuseable[sender]) { + for (const auto& transaction : d_recv_sets[sender]) { transaction->unpackStream(incoming_stream); } #if defined(HAVE_RAJA) diff --git a/source/SAMRAI/tbox/Schedule.h b/source/SAMRAI/tbox/Schedule.h index 21adf15983..ad085dd25b 100644 --- a/source/SAMRAI/tbox/Schedule.h +++ b/source/SAMRAI/tbox/Schedule.h @@ -18,6 +18,7 @@ #include "SAMRAI/tbox/MessageStream.h" #include "SAMRAI/tbox/Transaction.h" #include "SAMRAI/tbox/TransactionFuseable.h" +#include "SAMRAI/tbox/KernelFuser.h" #include #include @@ -346,6 +347,9 @@ class Schedule TransactionSets d_send_sets_fuseable; TransactionSets d_recv_sets_fuseable; + KernelFuser* d_send_fuser{nullptr}; + KernelFuser* d_recv_fuser{nullptr}; + /* * @brief Transactions where the source and destination are the * local process. 
@@ -358,6 +362,7 @@ class Schedule */ std::list > d_local_set_fuseable; + KernelFuser* d_local_fuser{nullptr}; //@{ @name High-level asynchronous messages passing objects diff --git a/source/SAMRAI/tbox/TransactionFuseable.C b/source/SAMRAI/tbox/TransactionFuseable.C index e69de29bb2..aeec68e8f1 100644 --- a/source/SAMRAI/tbox/TransactionFuseable.C +++ b/source/SAMRAI/tbox/TransactionFuseable.C @@ -0,0 +1,19 @@ +#include "SAMRAI/tbox/TransactionFuseable.h" + +namespace SAMRAI { +namespace tbox { + +void +TransactionFuseable::setKernelFuser(KernelFuser* fuser) +{ + d_fuser = fuser; +} + +KernelFuser* +TransactionFuseable::getKernelFuser() +{ + return d_fuser; +} + +} +} \ No newline at end of file diff --git a/source/SAMRAI/tbox/TransactionFuseable.h b/source/SAMRAI/tbox/TransactionFuseable.h index 40bbf42428..79265713cb 100644 --- a/source/SAMRAI/tbox/TransactionFuseable.h +++ b/source/SAMRAI/tbox/TransactionFuseable.h @@ -14,16 +14,23 @@ #include "SAMRAI/SAMRAI_config.h" #include "SAMRAI/tbox/Transaction.h" +#include "SAMRAI/tbox/KernelFuser.h" #include namespace SAMRAI { namespace tbox { - class TransactionFuseable : - public Transaction - { - }; +class TransactionFuseable : + public Transaction +{ +public: + void setKernelFuser(KernelFuser* fuser); + KernelFuser* getKernelFuser(); + +private: + KernelFuser* d_fuser{nullptr}; +}; } } diff --git a/source/SAMRAI/xfer/RefineCopyTransaction.C b/source/SAMRAI/xfer/RefineCopyTransaction.C index 2048799d65..a67b201984 100644 --- a/source/SAMRAI/xfer/RefineCopyTransaction.C +++ b/source/SAMRAI/xfer/RefineCopyTransaction.C @@ -131,7 +131,7 @@ RefineCopyTransaction::packStream( tbox::MessageStream& stream) { d_src_patch->getPatchData(d_refine_data[d_item_id]->d_src) - ->packStream(stream, *d_overlap); + ->packStream(stream, *d_overlap, *getKernelFuser()); } void @@ -139,7 +139,7 @@ RefineCopyTransaction::unpackStream( tbox::MessageStream& stream) { d_dst_patch->getPatchData(d_refine_data[d_item_id]->d_scratch) - 
->unpackStream(stream, *d_overlap); + ->unpackStream(stream, *d_overlap, *getKernelFuser()); } void @@ -151,7 +151,7 @@ RefineCopyTransaction::copyLocalData() const hier::PatchData& src_data = *d_src_patch->getPatchData(d_refine_data[d_item_id]->d_src); - dst_data.copy(src_data, *d_overlap); + dst_data.copy(src_data, *d_overlap, *getKernelFuser()); } /* From beccc9f2375a1779ab0651478886f5c27d119090 Mon Sep 17 00:00:00 2001 From: "Noah S. Elliott" Date: Wed, 14 Apr 2021 10:27:34 -0700 Subject: [PATCH 04/34] Add guard for non-umpire builds --- source/SAMRAI/tbox/KernelFuser.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/source/SAMRAI/tbox/KernelFuser.h b/source/SAMRAI/tbox/KernelFuser.h index 5c434fc148..5cddb091b1 100644 --- a/source/SAMRAI/tbox/KernelFuser.h +++ b/source/SAMRAI/tbox/KernelFuser.h @@ -28,7 +28,11 @@ class KernelFuser } private: +#ifdef HAVE_UMPIRE using Allocator = umpire::TypedAllocator; +#else + using Allocator = ResourceAllocator; +#endif // using Policy = typename tbox::detail::policy_traits< tbox::policy::parallel >::WorkGroupPolicy; // using WorkPool = RAJA::WorkPool , Allocator>; @@ -43,4 +47,4 @@ class KernelFuser } } -#endif \ No newline at end of file +#endif From cdf11110b585ff6a5d8f4e0a48291be9d743143d Mon Sep 17 00:00:00 2001 From: "Noah S. 
Elliott" Date: Thu, 30 Sep 2021 15:02:11 -0700 Subject: [PATCH 05/34] Add #define to signal existence of KernelFuser --- config/SAMRAI_config.h.cmake.in | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/config/SAMRAI_config.h.cmake.in b/config/SAMRAI_config.h.cmake.in index 013a8e5609..4360106474 100644 --- a/config/SAMRAI_config.h.cmake.in +++ b/config/SAMRAI_config.h.cmake.in @@ -358,7 +358,9 @@ /* Configure for compiling on BGL family of machines */ #undef __BGL_FAMILY__ - +#ifdef HAVE_RAJA +#define HAVE_KERNEL_FUSER +#endif namespace SAMRAI { static const unsigned short MAX_DIM_VAL = SAMRAI_MAXIMUM_DIMENSION; From 6d600c8ea506212f58be9e83525502d4068257da Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Fri, 15 Oct 2021 11:04:12 -0700 Subject: [PATCH 06/34] Make CoarsenCopyTransaction fuseable --- source/SAMRAI/xfer/CoarsenCopyTransaction.C | 6 +++--- source/SAMRAI/xfer/CoarsenCopyTransaction.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/source/SAMRAI/xfer/CoarsenCopyTransaction.C b/source/SAMRAI/xfer/CoarsenCopyTransaction.C index 2a2e3fd8ea..87440693ee 100644 --- a/source/SAMRAI/xfer/CoarsenCopyTransaction.C +++ b/source/SAMRAI/xfer/CoarsenCopyTransaction.C @@ -131,7 +131,7 @@ CoarsenCopyTransaction::packStream( tbox::MessageStream& stream) { d_src_patch->getPatchData(d_coarsen_data[d_item_id]->d_src) - ->packStream(stream, *d_overlap); + ->packStream(stream, *d_overlap, *getKernelFuser()); } void @@ -139,7 +139,7 @@ CoarsenCopyTransaction::unpackStream( tbox::MessageStream& stream) { d_dst_patch->getPatchData(d_coarsen_data[d_item_id]->d_dst) - ->unpackStream(stream, *d_overlap); + ->unpackStream(stream, *d_overlap, *getKernelFuser()); } void @@ -151,7 +151,7 @@ CoarsenCopyTransaction::copyLocalData() const hier::PatchData& src_data = *d_src_patch->getPatchData(d_coarsen_data[d_item_id]->d_src); - dst_data.copy(src_data, *d_overlap); + dst_data.copy(src_data, *d_overlap, *getKernelFuser()); } /* diff --git
a/source/SAMRAI/xfer/CoarsenCopyTransaction.h b/source/SAMRAI/xfer/CoarsenCopyTransaction.h index eace0a2513..92c54c334c 100644 --- a/source/SAMRAI/xfer/CoarsenCopyTransaction.h +++ b/source/SAMRAI/xfer/CoarsenCopyTransaction.h @@ -14,7 +14,7 @@ #include "SAMRAI/SAMRAI_config.h" -#include "SAMRAI/tbox/Transaction.h" +#include "SAMRAI/tbox/TransactionFuseable.h" #include "SAMRAI/hier/BaseGridGeometry.h" #include "SAMRAI/hier/PatchLevel.h" #include "SAMRAI/xfer/CoarsenClasses.h" @@ -40,7 +40,7 @@ namespace xfer { * @see tbox::Transaction */ -class CoarsenCopyTransaction:public tbox::Transaction +class CoarsenCopyTransaction:public tbox::TransactionFuseable { public: /*! From 91f0636e6da8d3fa5e9df7ba2d7bfff64c917250 Mon Sep 17 00:00:00 2001 From: "Noah S. Elliott" Date: Wed, 10 Nov 2021 16:14:56 -0800 Subject: [PATCH 07/34] Start to turn on workgroups for KernelFuser, and add placeholder methods in pdat classes that take KernelFuser. --- source/SAMRAI/hier/PatchData.C | 2 +- source/SAMRAI/hier/PatchData.h | 2 +- source/SAMRAI/pdat/CellData.h | 31 ++++++++++++++++++++++++++++ source/SAMRAI/pdat/EdgeData.h | 31 ++++++++++++++++++++++++++++ source/SAMRAI/pdat/FaceData.h | 30 +++++++++++++++++++++++++++ source/SAMRAI/pdat/NodeData.h | 31 ++++++++++++++++++++++++++++ source/SAMRAI/pdat/OuteredgeData.h | 30 +++++++++++++++++++++++++++ source/SAMRAI/pdat/OuterfaceData.h | 30 +++++++++++++++++++++++++++ source/SAMRAI/pdat/OuternodeData.h | 30 +++++++++++++++++++++++++++ source/SAMRAI/pdat/OutersideData.h | 31 ++++++++++++++++++++++++++++ source/SAMRAI/pdat/SideData.h | 30 +++++++++++++++++++++++++++ source/SAMRAI/tbox/KernelFuser.h | 33 +++++++++++++++++------------- 12 files changed, 295 insertions(+), 16 deletions(-) diff --git a/source/SAMRAI/hier/PatchData.C b/source/SAMRAI/hier/PatchData.C index 08e559b256..cd06eec80f 100644 --- a/source/SAMRAI/hier/PatchData.C +++ b/source/SAMRAI/hier/PatchData.C @@ -44,7 +44,7 @@ void PatchData::packStream( tbox::MessageStream& stream, 
const BoxOverlap& overlap, - tbox::KernelFuser& fuser) + tbox::KernelFuser& fuser) const { packStream(stream, overlap); } diff --git a/source/SAMRAI/hier/PatchData.h b/source/SAMRAI/hier/PatchData.h index 503697927f..d314c6898b 100644 --- a/source/SAMRAI/hier/PatchData.h +++ b/source/SAMRAI/hier/PatchData.h @@ -232,7 +232,7 @@ class PatchData packStream( tbox::MessageStream& stream, const BoxOverlap& overlap, - tbox::KernelFuser& fuser); + tbox::KernelFuser& fuser) const; /** * Unpack data from the message stream into the specified index set. diff --git a/source/SAMRAI/pdat/CellData.h b/source/SAMRAI/pdat/CellData.h index 35b1cd01d3..84f6343ada 100644 --- a/source/SAMRAI/pdat/CellData.h +++ b/source/SAMRAI/pdat/CellData.h @@ -279,6 +279,16 @@ class CellData:public hier::PatchData const hier::PatchData& src, const hier::BoxOverlap& overlap); + virtual void + copy( + const hier::PatchData& src, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) + { + NULL_USE(fuser); + copy(src, overlap); + } + /*! * @brief Copy data from source (i.e., this) to destination * patch data object on the given overlap. @@ -356,6 +366,16 @@ class CellData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap) const; + virtual void + packStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) const + { + NULL_USE(fuser); + packStream(stream, overlap); + } + /*! * @brief Unpack data from stream into this patch data object * over the specified box overlap region. The overlap must be a @@ -368,6 +388,17 @@ class CellData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); + virtual void + unpackStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) + { + NULL_USE(fuser); + unpackStream(stream, overlap); + } + + /*! * @brief Add data from source to destination (i.e., this) * patch data object on the given overlap. 
diff --git a/source/SAMRAI/pdat/EdgeData.h b/source/SAMRAI/pdat/EdgeData.h index df48ee55b9..cdbe766a4f 100644 --- a/source/SAMRAI/pdat/EdgeData.h +++ b/source/SAMRAI/pdat/EdgeData.h @@ -305,6 +305,16 @@ class EdgeData:public hier::PatchData const hier::PatchData& src, const hier::BoxOverlap& overlap); + virtual void + copy( + const hier::PatchData& src, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) + { + NULL_USE(fuser); + copy(src, overlap); + } + /*! * @brief Copy data from source (i.e., this) to destination * patch data object on the given overlap. @@ -383,6 +393,16 @@ class EdgeData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap) const; + virtual void + packStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) const + { + NULL_USE(fuser); + packStream(stream, overlap); + } + /*! * @brief Unpack data from stream into this patch data object over * the specified box overlap region. The overlap must be an @@ -395,6 +415,17 @@ class EdgeData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); + virtual void + unpackStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) + { + NULL_USE(fuser); + unpackStream(stream, overlap); + } + + /*! * @brief Fill all values at depth d with the value t. * diff --git a/source/SAMRAI/pdat/FaceData.h b/source/SAMRAI/pdat/FaceData.h index 069e78ecdc..7755d705a0 100644 --- a/source/SAMRAI/pdat/FaceData.h +++ b/source/SAMRAI/pdat/FaceData.h @@ -309,6 +309,16 @@ class FaceData:public hier::PatchData const hier::PatchData& src, const hier::BoxOverlap& overlap); + virtual void + copy( + const hier::PatchData& src, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) + { + NULL_USE(fuser); + copy(src, overlap); + } + /*! * @brief Copy data from source (i.e., this) to destination * patch data object on the given overlap. 
@@ -387,6 +397,16 @@ class FaceData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap) const; + virtual void + packStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) const + { + NULL_USE(fuser); + packStream(stream, overlap); + } + /*! * @brief Unpack data from stream into this patch data object over * the specified box overlap region. The overlap must be an @@ -399,6 +419,16 @@ class FaceData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); + virtual void + unpackStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) + { + NULL_USE(fuser); + unpackStream(stream, overlap); + } + /*! * @brief Fill all values at depth d with the value t. * diff --git a/source/SAMRAI/pdat/NodeData.h b/source/SAMRAI/pdat/NodeData.h index c80853d033..56e2fa4288 100644 --- a/source/SAMRAI/pdat/NodeData.h +++ b/source/SAMRAI/pdat/NodeData.h @@ -284,6 +284,16 @@ class NodeData:public hier::PatchData const hier::PatchData& src, const hier::BoxOverlap& overlap); + virtual void + copy( + const hier::PatchData& src, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) + { + NULL_USE(fuser); + copy(src, overlap); + } + /*! * @brief Copy data from source (i.e., this) to destination * patch data object on the given overlap. @@ -362,6 +372,16 @@ class NodeData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap) const; + virtual void + packStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) const + { + NULL_USE(fuser); + packStream(stream, overlap); + } + /*! * @brief Unpack data from stream into this patch data object over * the specified box overlap region. 
The overlap must be a @@ -374,6 +394,17 @@ class NodeData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); + virtual void + unpackStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) + { + NULL_USE(fuser); + unpackStream(stream, overlap); + } + + /*! * @brief Fill all values at depth d with the value t. * diff --git a/source/SAMRAI/pdat/OuteredgeData.h b/source/SAMRAI/pdat/OuteredgeData.h index c99d9448fc..1cc7c8c34a 100644 --- a/source/SAMRAI/pdat/OuteredgeData.h +++ b/source/SAMRAI/pdat/OuteredgeData.h @@ -443,6 +443,16 @@ class OuteredgeData:public hier::PatchData const hier::PatchData& src, const hier::BoxOverlap& overlap); + virtual void + copy( + const hier::PatchData& src, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) + { + NULL_USE(fuser); + copy(src, overlap); + } + /*! * @brief Copy data from source (i.e., this) to destination * patch data object on the given overlap. @@ -542,6 +552,16 @@ class OuteredgeData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap) const; + virtual void + packStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) const + { + NULL_USE(fuser); + packStream(stream, overlap); + } + /*! * @brief Unpack data from stream into this patch data object over * the specified box overlap region. The overlap must be an @@ -554,6 +574,16 @@ class OuteredgeData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); + virtual void + unpackStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) + { + NULL_USE(fuser); + unpackStream(stream, overlap); + } + /*! * @brief Unpack data from stream and add into this patch data object * over the specified box overlap region. 
diff --git a/source/SAMRAI/pdat/OuterfaceData.h b/source/SAMRAI/pdat/OuterfaceData.h index 40b115199f..78b22dc335 100644 --- a/source/SAMRAI/pdat/OuterfaceData.h +++ b/source/SAMRAI/pdat/OuterfaceData.h @@ -334,6 +334,16 @@ class OuterfaceData:public hier::PatchData const hier::PatchData& src, const hier::BoxOverlap& overlap); + virtual void + copy( + const hier::PatchData& src, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) + { + NULL_USE(fuser); + copy(src, overlap); + } + /*! * @brief Copy data from source (i.e., this) to destination * patch data object on the given overlap. @@ -415,6 +425,16 @@ class OuterfaceData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap) const; + virtual void + packStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) const + { + NULL_USE(fuser); + packStream(stream, overlap); + } + /*! * @brief Unpack data from stream into this patch data object over * the specified box overlap region. The overlap must be an @@ -427,6 +447,16 @@ class OuterfaceData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); + virtual void + unpackStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) + { + NULL_USE(fuser); + unpackStream(stream, overlap); + } + /*! * @brief Fill all values at depth d with the value t. * diff --git a/source/SAMRAI/pdat/OuternodeData.h b/source/SAMRAI/pdat/OuternodeData.h index ebcb2705c8..4899f6a24b 100644 --- a/source/SAMRAI/pdat/OuternodeData.h +++ b/source/SAMRAI/pdat/OuternodeData.h @@ -379,6 +379,16 @@ class OuternodeData:public hier::PatchData const hier::PatchData& src, const hier::BoxOverlap& overlap); + virtual void + copy( + const hier::PatchData& src, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) + { + NULL_USE(fuser); + copy(src, overlap); + } + /*! 
* @brief Copy data from source (i.e., this) to destination * patch data object on the given overlap. @@ -478,6 +488,16 @@ class OuternodeData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap) const; + virtual void + packStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) const + { + NULL_USE(fuser); + packStream(stream, overlap); + } + /*! * @brief Unpack data from stream into this patch data object over * the specified box overlap region. The overlap must be an @@ -490,6 +510,16 @@ class OuternodeData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); + virtual void + unpackStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) + { + NULL_USE(fuser); + unpackStream(stream, overlap); + } + /*! * @brief Unpack data from stream and add into this patch data object * over the specified box overlap region. The overlap must be an diff --git a/source/SAMRAI/pdat/OutersideData.h b/source/SAMRAI/pdat/OutersideData.h index 20d9482d31..fb529f6fbc 100644 --- a/source/SAMRAI/pdat/OutersideData.h +++ b/source/SAMRAI/pdat/OutersideData.h @@ -333,6 +333,16 @@ class OutersideData:public hier::PatchData const hier::PatchData& src, const hier::BoxOverlap& overlap); + virtual void + copy( + const hier::PatchData& src, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) + { + NULL_USE(fuser); + copy(src, overlap); + } + /*! * @brief Copy data from source (i.e., this) to destination * patch data object on the given overlap. @@ -413,6 +423,16 @@ class OutersideData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap) const; + virtual void + packStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) const + { + NULL_USE(fuser); + packStream(stream, overlap); + } + /*! 
* @brief Unpack data from stream into this patch data object over * the specified box overlap region. The overlap must be an @@ -425,6 +445,17 @@ class OutersideData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); + virtual void + unpackStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) + { + NULL_USE(fuser); + unpackStream(stream, overlap); + } + + /*! * @brief Fill all values at depth d with the value t. * diff --git a/source/SAMRAI/pdat/SideData.h b/source/SAMRAI/pdat/SideData.h index 249d848622..00c0e62a98 100644 --- a/source/SAMRAI/pdat/SideData.h +++ b/source/SAMRAI/pdat/SideData.h @@ -379,6 +379,16 @@ class SideData:public hier::PatchData const hier::PatchData& src, const hier::BoxOverlap& overlap); + virtual void + copy( + const hier::PatchData& src, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) + { + NULL_USE(fuser); + copy(src, overlap); + } + /*! * @brief Copy data from source (i.e., this) to destination * patch data object on the given overlap. @@ -459,6 +469,16 @@ class SideData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap) const; + virtual void + packStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) const + { + NULL_USE(fuser); + packStream(stream, overlap); + } + /*! * @brief Unpack data from stream into this patch data object over * the specified box overlap region. The overlap must be an @@ -471,6 +491,16 @@ class SideData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); + virtual void + unpackStream( + tbox::MessageStream& stream, + const hier::BoxOverlap& overlap, + tbox::KernelFuser& fuser) + { + NULL_USE(fuser); + unpackStream(stream, overlap); + } + /*! * @brief Fill all values at depth d with the value t. 
* diff --git a/source/SAMRAI/tbox/KernelFuser.h b/source/SAMRAI/tbox/KernelFuser.h index 5cddb091b1..c74923d24e 100644 --- a/source/SAMRAI/tbox/KernelFuser.h +++ b/source/SAMRAI/tbox/KernelFuser.h @@ -4,7 +4,9 @@ #include "SAMRAI/tbox/ExecutionPolicy.h" #include "SAMRAI/tbox/AllocatorDatabase.h" -// #include "RAJA/RAJA.hpp" +#ifdef HAVE_RAJA +#include "RAJA/RAJA.hpp" +#endif namespace SAMRAI { namespace tbox { @@ -12,19 +14,22 @@ namespace tbox { class KernelFuser { public: - // KernelFuser() : - // d_workpool(AllocatorDatabase::getDatabase()->getStreamAllocator()) - // {} + KernelFuser() : + d_workpool(AllocatorDatabase::getDatabase()->getStreamAllocator()), + d_workgroup(d_workpool.instantiate()), + d_worksite(d_workgroup.run()) + {} template void enqueue(int begin, int end, Kernel&& kernel) { - //d_workpool.enqueue(RAJA::RangeSegment(begin, end), std::forward(kernel)); + d_workpool.enqueue(RAJA::RangeSegment(begin, end), std::forward(kernel)); } void launch() { - // d_workgroup = d_workpool.instantiate(); - // d_worksite = d_workgroup.run(); + //These might not work with umpire::TypedAllocator in the template? 
+// d_workgroup = d_workpool.instantiate(); +// d_worksite = d_workgroup.run(); } private: @@ -34,14 +39,14 @@ class KernelFuser using Allocator = ResourceAllocator; #endif - // using Policy = typename tbox::detail::policy_traits< tbox::policy::parallel >::WorkGroupPolicy; - // using WorkPool = RAJA::WorkPool , Allocator>; - // using WorkGroup = RAJA::WorkGroup, Allocator>; - // using WorkSite = RAJA::WorkSite , Allocator>; + using Policy = typename tbox::detail::policy_traits< tbox::policy::parallel >::WorkGroupPolicy; + using WorkPool = RAJA::WorkPool , Allocator>; + using WorkGroup = RAJA::WorkGroup, Allocator>; + using WorkSite = RAJA::WorkSite , Allocator>; - // WorkPool d_workpool; - // WorkGroup d_workgroup; - // WorkSite d_worksite; + WorkPool d_workpool; + WorkGroup d_workgroup; + WorkSite d_worksite; }; } From 31b37983aec9074d9f14dd9903afc9413ef66dff Mon Sep 17 00:00:00 2001 From: "Noah S. Elliott" Date: Wed, 17 Nov 2021 16:06:01 -0800 Subject: [PATCH 08/34] Add to the implementation of KernelFuser --- config/SAMRAI_config.h.cmake.in | 2 +- source/SAMRAI/tbox/CMakeLists.txt | 1 + source/SAMRAI/tbox/KernelFuser.h | 40 +++++++++++++++++++++++++++---- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/config/SAMRAI_config.h.cmake.in b/config/SAMRAI_config.h.cmake.in index 4360106474..aaa58af346 100644 --- a/config/SAMRAI_config.h.cmake.in +++ b/config/SAMRAI_config.h.cmake.in @@ -359,7 +359,7 @@ #undef __BGL_FAMILY__ #ifdef HAVE_RAJA -#define HAVE_KERNEL_FUSER +#define SAMRAI_HAVE_KERNEL_FUSER #endif namespace SAMRAI { diff --git a/source/SAMRAI/tbox/CMakeLists.txt b/source/SAMRAI/tbox/CMakeLists.txt index 166c485608..2c05b3251c 100644 --- a/source/SAMRAI/tbox/CMakeLists.txt +++ b/source/SAMRAI/tbox/CMakeLists.txt @@ -27,6 +27,7 @@ set ( tbox_headers InputDatabase.h InputManager.h IOStream.h + KernelFuser.h Logger.h MathUtilities.h MathUtilities.C diff --git a/source/SAMRAI/tbox/KernelFuser.h b/source/SAMRAI/tbox/KernelFuser.h index 
c74923d24e..827e519358 100644 --- a/source/SAMRAI/tbox/KernelFuser.h +++ b/source/SAMRAI/tbox/KernelFuser.h @@ -1,8 +1,11 @@ #ifndef included_tbox_KernelFuser #define included_tbox_KernelFuser +#include "SAMRAI/SAMRAI_config.h" + #include "SAMRAI/tbox/ExecutionPolicy.h" #include "SAMRAI/tbox/AllocatorDatabase.h" +#include "SAMRAI/tbox/Utilities.h" #ifdef HAVE_RAJA #include "RAJA/RAJA.hpp" @@ -17,21 +20,46 @@ class KernelFuser KernelFuser() : d_workpool(AllocatorDatabase::getDatabase()->getStreamAllocator()), d_workgroup(d_workpool.instantiate()), - d_worksite(d_workgroup.run()) - {} + d_worksite(d_workgroup.run()), + d_launched(false) + { + } template void enqueue(int begin, int end, Kernel&& kernel) { + if (d_launched) { + TBOX_ERROR("KernelFuser Error: Cannont enqueue until cleanup called after previous launch."); + } + d_workpool.enqueue(RAJA::RangeSegment(begin, end), std::forward(kernel)); } void launch() { - //These might not work with umpire::TypedAllocator in the template? -// d_workgroup = d_workpool.instantiate(); -// d_worksite = d_workgroup.run(); + if (d_launched) { + TBOX_ERROR("KernelFuser Error: This KernelFuser already launched."); + } + + d_workgroup = d_workpool.instantiate(); + d_worksite = d_workgroup.run(); + + d_launched = true; } + void cleanup() + { + d_workpool.clear(); + d_workgroup.clear(); + d_worksite.clear(); + d_launched = false; + } + + bool launched() const + { + return d_launched; + } + + private: #ifdef HAVE_UMPIRE using Allocator = umpire::TypedAllocator; @@ -47,6 +75,8 @@ class KernelFuser WorkPool d_workpool; WorkGroup d_workgroup; WorkSite d_worksite; + + bool d_launched; }; } From c249b869c4421e979f17e829476d0ba08f04f661 Mon Sep 17 00:00:00 2001 From: "Noah S. 
Elliott" Date: Thu, 18 Nov 2021 15:31:21 -0800 Subject: [PATCH 09/34] Add kernel fuser allocator to AllocatorDatabase --- source/SAMRAI/tbox/AllocatorDatabase.C | 17 +++++++++++++++++ source/SAMRAI/tbox/AllocatorDatabase.h | 7 +++++++ source/SAMRAI/tbox/KernelFuser.h | 4 ++-- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/source/SAMRAI/tbox/AllocatorDatabase.C b/source/SAMRAI/tbox/AllocatorDatabase.C index 35590f699f..6b13ddf734 100644 --- a/source/SAMRAI/tbox/AllocatorDatabase.C +++ b/source/SAMRAI/tbox/AllocatorDatabase.C @@ -100,6 +100,16 @@ AllocatorDatabase::initialize() rm.makeAllocator("samrai::stream_allocator", allocator); } + if (!rm.isAllocator("samrai::fuser_allocator")) { +#if defined(HAVE_CUDA) + auto allocator = rm.getAllocator(umpire::resource::Pinned); +#else + auto allocator = rm.getAllocator(umpire::resource::Host); +#endif + + rm.makeAllocator("samrai::fuser_allocator", allocator); + } + if (!rm.isAllocator("samrai::temporary_data_allocator")) { #if defined(HAVE_CUDA) //auto allocator = rm.getAllocator(umpire::resource::Device); @@ -133,6 +143,13 @@ AllocatorDatabase::getStreamAllocator() return umpire::TypedAllocator(rm.getAllocator("samrai::stream_allocator")); } +umpire::TypedAllocator +AllocatorDatabase::getKernelFuserAllocator() +{ + umpire::ResourceManager& rm = umpire::ResourceManager::getInstance(); + return umpire::TypedAllocator(rm.getAllocator("samrai::fuser_allocator")); +} + umpire::TypedAllocator AllocatorDatabase::getInternalHostAllocator() { diff --git a/source/SAMRAI/tbox/AllocatorDatabase.h b/source/SAMRAI/tbox/AllocatorDatabase.h index 311d9c4abc..51869eca73 100644 --- a/source/SAMRAI/tbox/AllocatorDatabase.h +++ b/source/SAMRAI/tbox/AllocatorDatabase.h @@ -88,6 +88,13 @@ class AllocatorDatabase umpire::TypedAllocator getStreamAllocator(); #endif + /*! + * @brief Get the kernel fuser allocator. + */ +#ifdef HAVE_UMPIRE + umpire::TypedAllocator getKernelFuserAllocator(); +#endif + /*! 
* @brief Get a host allocator. */ diff --git a/source/SAMRAI/tbox/KernelFuser.h b/source/SAMRAI/tbox/KernelFuser.h index 827e519358..699cb4a1d0 100644 --- a/source/SAMRAI/tbox/KernelFuser.h +++ b/source/SAMRAI/tbox/KernelFuser.h @@ -17,8 +17,9 @@ namespace tbox { class KernelFuser { public: + KernelFuser() : - d_workpool(AllocatorDatabase::getDatabase()->getStreamAllocator()), + d_workpool(AllocatorDatabase::getDatabase()->getKernelFuserAllocator()), d_workgroup(d_workpool.instantiate()), d_worksite(d_workgroup.run()), d_launched(false) @@ -58,7 +59,6 @@ class KernelFuser { return d_launched; } - private: #ifdef HAVE_UMPIRE From e91952a14bf88e9352fdcac2f7736b2fe4ae13ef Mon Sep 17 00:00:00 2001 From: "Noah S. Elliott" Date: Fri, 19 Nov 2021 10:22:02 -0800 Subject: [PATCH 10/34] Add checks to do fuser launch and synchronize only when needed. --- source/SAMRAI/tbox/Schedule.C | 76 +++++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 17 deletions(-) diff --git a/source/SAMRAI/tbox/Schedule.C b/source/SAMRAI/tbox/Schedule.C index 0369a63c5c..f7bcd93b01 100644 --- a/source/SAMRAI/tbox/Schedule.C +++ b/source/SAMRAI/tbox/Schedule.C @@ -288,9 +288,6 @@ Schedule::finalizeCommunication() { d_object_timers->t_finalize_communication->start(); performLocalCopies(); -#if defined(HAVE_RAJA) - parallel_synchronize(); -#endif processCompletedCommunications(); deallocateCommunicationObjects(); d_object_timers->t_finalize_communication->stop(); @@ -463,16 +460,23 @@ Schedule::postSends() d_object_timers->t_pack_stream->start(); + bool have_fuseable = !(d_send_sets_fuseable[peer_rank].empty()); + for (const auto& transaction : d_send_sets_fuseable[peer_rank]) { transaction->packStream(outgoing_stream); } - d_send_fuser->launch(); - + if (d_send_fuser && have_fuseable) { + d_send_fuser->launch(); + } + for (const auto& transaction : d_send_sets[peer_rank]) { transaction->packStream(outgoing_stream); } -#if defined(HAVE_RAJA) - parallel_synchronize(); +#if 
defined(HAVE_RAJA) + bool have_non_fuseable = !(d_send_sets[peer_rank].empty()); + if (have_fuseable || have_non_fuseable) { + parallel_synchronize(); + } #endif d_object_timers->t_pack_stream->stop(); @@ -524,17 +528,24 @@ Schedule::postSends() #endif ); + bool have_fuseable = !(d_send_sets_fuseable[peer_rank].empty()); + d_object_timers->t_pack_stream->start(); for (const auto& transaction : d_send_sets_fuseable[peer_rank]) { transaction->packStream(outgoing_stream); } - d_send_fuser->launch(); + if (d_send_fuser && have_fuseable) { + d_send_fuser->launch(); + } for (const auto& transaction : d_send_sets[peer_rank]) { transaction->packStream(outgoing_stream); } -#if defined(HAVE_RAJA) - parallel_synchronize(); +#if defined(HAVE_RAJA) + bool have_non_fuseable = !(d_send_sets[peer_rank].empty()); + if (have_fuseable || have_non_fuseable) { + parallel_synchronize(); + } #endif d_object_timers->t_pack_stream->stop(); @@ -564,16 +575,28 @@ Schedule::postSends() void Schedule::performLocalCopies() { + bool have_fuseable = !d_local_set_fuseable.empty(); + d_object_timers->t_local_copies->start(); for (const auto& local : d_local_set_fuseable) { local->copyLocalData(); } - d_local_fuser->launch(); + if (d_local_fuser && have_fuseable) { + d_local_fuser->launch(); + } for (const auto& local : d_local_set) { local->copyLocalData(); } d_object_timers->t_local_copies->stop(); + +#if defined(HAVE_RAJA) + bool have_non_fuseable = !d_local_set.empty(); + if (have_fuseable || have_non_fuseable) { + parallel_synchronize(); + } +#endif + } /* @@ -614,19 +637,29 @@ Schedule::processCompletedCommunications() #endif ); + + bool have_fuseable = !(d_recv_sets_fuseable[sender].empty()); + d_object_timers->t_unpack_stream->start(); for (const auto& transaction : d_recv_sets_fuseable[sender]) { transaction->unpackStream(incoming_stream); } - d_recv_fuser->launch(); + if (d_recv_fuser || have_fuseable) { + d_recv_fuser->launch(); + } #if defined(HAVE_RAJA) - parallel_synchronize(); + if 
(have_fuseable) { + parallel_synchronize(); + } #endif for (const auto& transaction : d_recv_sets[sender]) { transaction->unpackStream(incoming_stream); } #if defined(HAVE_RAJA) - parallel_synchronize(); + bool have_non_fuseable = !(d_recv_sets[sender].empty()); + if (have_non_fuseable) { + parallel_synchronize(); + } #endif d_object_timers->t_unpack_stream->stop(); completed_comm->clearRecvData(); @@ -664,19 +697,28 @@ Schedule::processCompletedCommunications() #endif ); + bool have_fuseable = !(d_recv_sets_fuseable[sender].empty()); + d_object_timers->t_unpack_stream->start(); for (const auto& transaction : d_recv_sets_fuseable[sender]) { transaction->unpackStream(incoming_stream); } - d_recv_fuser->launch(); + if (d_recv_fuser && have_fuseable) { + d_recv_fuser->launch(); + } #if defined(HAVE_RAJA) - parallel_synchronize(); + if (have_fuseable) { + parallel_synchronize(); + } #endif for (const auto& transaction : d_recv_sets[sender]) { transaction->unpackStream(incoming_stream); } #if defined(HAVE_RAJA) - parallel_synchronize(); + bool have_non_fuseable = !(d_recv_sets[sender].empty()); + if (have_non_fuseable) { + parallel_synchronize(); + } #endif d_object_timers->t_unpack_stream->stop(); completed_comm->clearRecvData(); From 3ab15d845095addec2cfe668a0ce72f33e0e0000 Mon Sep 17 00:00:00 2001 From: "Noah S. 
Elliott" Date: Fri, 19 Nov 2021 10:58:36 -0800 Subject: [PATCH 11/34] Add kernel fuser cleanup --- source/SAMRAI/tbox/Schedule.C | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/source/SAMRAI/tbox/Schedule.C b/source/SAMRAI/tbox/Schedule.C index f7bcd93b01..a2f3e8fa09 100644 --- a/source/SAMRAI/tbox/Schedule.C +++ b/source/SAMRAI/tbox/Schedule.C @@ -476,6 +476,7 @@ Schedule::postSends() bool have_non_fuseable = !(d_send_sets[peer_rank].empty()); if (have_fuseable || have_non_fuseable) { parallel_synchronize(); + if (d_send_fuser) d_send_fuser->cleanup(); } #endif @@ -545,6 +546,7 @@ Schedule::postSends() bool have_non_fuseable = !(d_send_sets[peer_rank].empty()); if (have_fuseable || have_non_fuseable) { parallel_synchronize(); + if (d_send_fuser) d_send_fuser->cleanup(); } #endif @@ -594,6 +596,7 @@ Schedule::performLocalCopies() bool have_non_fuseable = !d_local_set.empty(); if (have_fuseable || have_non_fuseable) { parallel_synchronize(); + if (d_local_fuser) d_local_fuser->cleanup(); } #endif @@ -650,6 +653,7 @@ Schedule::processCompletedCommunications() #if defined(HAVE_RAJA) if (have_fuseable) { parallel_synchronize(); + if (d_recv_fuser) d_recv_fuser->cleanup(); } #endif for (const auto& transaction : d_recv_sets[sender]) { @@ -709,6 +713,7 @@ Schedule::processCompletedCommunications() #if defined(HAVE_RAJA) if (have_fuseable) { parallel_synchronize(); + if (d_recv_fuser) d_recv_fuser->cleanup(); } #endif for (const auto& transaction : d_recv_sets[sender]) { From eaf6fd232d7fd878304efc4a89c24ae67152a666 Mon Sep 17 00:00:00 2001 From: "Noah S. 
Elliott" Date: Wed, 24 Nov 2021 12:09:16 -0800 Subject: [PATCH 12/34] Change KernelFuser to a singleton and begin adding it to some ArrayData for_all calls --- source/SAMRAI/hier/ForAll.h | 75 +++++++++++++++++++++++++++++++ source/SAMRAI/pdat/ArrayData.C | 32 +++++++++---- source/SAMRAI/pdat/ArrayData.h | 11 +++++ source/SAMRAI/pdat/NodeData.h | 8 +++- source/SAMRAI/tbox/CMakeLists.txt | 1 + source/SAMRAI/tbox/KernelFuser.C | 63 ++++++++++++++++++++++++++ source/SAMRAI/tbox/KernelFuser.h | 39 +++++++++++++++- source/SAMRAI/tbox/Schedule.C | 18 +++++--- 8 files changed, 230 insertions(+), 17 deletions(-) create mode 100644 source/SAMRAI/tbox/KernelFuser.C diff --git a/source/SAMRAI/hier/ForAll.h b/source/SAMRAI/hier/ForAll.h index 55efb87e14..53d3901747 100644 --- a/source/SAMRAI/hier/ForAll.h +++ b/source/SAMRAI/hier/ForAll.h @@ -19,6 +19,7 @@ #include "SAMRAI/hier/Box.h" #include "SAMRAI/hier/Index.h" #include "SAMRAI/tbox/ExecutionPolicy.h" +#include "SAMRAI/tbox/KernelFuser.h" #include #include @@ -205,6 +206,21 @@ inline void for_all(int begin, int end, LoopBody body) RAJA::forall::Policy>(RAJA::RangeSegment(begin, end), body); } +template ::value, int>::type = 0> +inline void for_all(tbox::KernelFuser* fuser, int begin, int end, LoopBody body) +{ + if (fuser == nullptr) { + RAJA::forall::Policy>(RAJA::RangeSegment(begin, end), body); + } else { + //fuser->template enqueue(begin, end, body); + + //same as above, until fuser enqueue is correct. 
+ RAJA::forall::Policy>(RAJA::RangeSegment(begin, end), body); + + } +} + template ::value, int>::type = 0> inline void for_all(int begin, int end, LoopBody body) @@ -212,6 +228,20 @@ inline void for_all(int begin, int end, LoopBody body) RAJA::forall(RAJA::RangeSegment(begin, end), body); } +template ::value, int>::type = 0> +inline void for_all(tbox::KernelFuser* fuser, int begin, int end, LoopBody body) +{ + if (fuser == nullptr) { + RAJA::forall(RAJA::RangeSegment(begin, end), body); + } else { + //fuser->template enqueue(begin, end, body); + + //same as above, until fuser enqueue is correct. + RAJA::forall(RAJA::RangeSegment(begin, end), body); + } +} + // does NOT include end template inline void parallel_for_all(int begin, int end, LoopBody body) @@ -219,6 +249,16 @@ inline void parallel_for_all(int begin, int end, LoopBody body) for_all(begin, end, body); } +template +inline void parallel_for_all(tbox::KernelFuser* fuser, int begin, int end, LoopBody body) +{ + if (fuser == nullptr) { + for_all(begin, end, body); + } else { + for_all(fuser, begin, end, body); + } +} + template inline void host_parallel_for_all(int begin, int end, LoopBody body) { @@ -231,12 +271,25 @@ inline void for_all(const hier::Box& box, const int dim, LoopBody body) for_all(box.lower()(dim), box.upper()(dim) + 1, body); } + +template +inline void for_all(tbox::KernelFuser* fuser, const hier::Box& box, const int dim, LoopBody body) +{ + for_all(fuser, box.lower()(dim), box.upper()(dim) + 1, body); +} + template inline void parallel_for_all(const hier::Box& box, const int dim, LoopBody body) { for_all(box.lower()(dim), box.upper()(dim) + 1, body); } +template +inline void parallel_for_all(tbox::KernelFuser* fuser, const hier::Box& box, const int dim, LoopBody body) +{ + for_all(fuser, box.lower()(dim), box.upper()(dim) + 1, body); +} + template inline void host_parallel_for_all(const hier::Box& box, const int dim, LoopBody body) { @@ -250,12 +303,34 @@ inline void for_all(const 
hier::Box& box, LoopBody body) detail::for_all::template eval(box.lower(), box.upper(), body); } +template +inline void for_all(tbox::KernelFuser* fuser, const hier::Box& box, LoopBody body) +{ + if (fuser == nullptr) { + for_all(box, body); + } else { + //We need enqueue for box expansion into multi-dimensional loops + //in addition to the current 1D begin, end. + + //This is the unchanged code until there is a good enqueue here or + //inside eval + constexpr int arg_count = detail::function_traits::argument_count; + detail::for_all::template eval(box.lower(), box.upper(), body); + } +} + template inline void parallel_for_all(const hier::Box& box, LoopBody body) { for_all(box, body); } +template +inline void parallel_for_all(tbox::KernelFuser* fuser, const hier::Box& box, LoopBody body) +{ + for_all(fuser, box, body); +} + template inline void host_parallel_for_all(const hier::Box& box, LoopBody body) { diff --git a/source/SAMRAI/pdat/ArrayData.C b/source/SAMRAI/pdat/ArrayData.C index 17b102c465..3794d5993b 100644 --- a/source/SAMRAI/pdat/ArrayData.C +++ b/source/SAMRAI/pdat/ArrayData.C @@ -11,6 +11,7 @@ #ifndef included_pdat_ArrayData_C #define included_pdat_ArrayData_C +#include "SAMRAI/tbox/KernelFuser.h" #include "SAMRAI/tbox/MessageStream.h" #include "SAMRAI/tbox/Utilities.h" #include "SAMRAI/tbox/MathUtilities.h" @@ -101,7 +102,8 @@ ArrayData::ArrayData( , d_array(d_depth * d_offset), #endif - d_on_host(true) + d_on_host(true), + d_use_fuser(false) { TBOX_ASSERT(depth > 0); @@ -298,12 +300,14 @@ void ArrayData::copy( const TYPE* const src_ptr = &src.d_array[0]; const size_t n = d_offset * d_depth; #if defined(HAVE_RAJA) + tbox::KernelFuser* fuser = d_use_fuser ? 
+ tbox::KernelFuser::getFuser() : nullptr; if (d_on_host) { hier::host_parallel_for_all(0, n, [=] (int i) { copyop(dst_ptr[i], src_ptr[i]); }); } else { - hier::parallel_for_all(0, n, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, 0, n, [=] SAMRAI_HOST_DEVICE(int i) { copyop(dst_ptr[i], src_ptr[i]); }); } @@ -492,12 +496,15 @@ void ArrayData::copyDepth( #if defined(HAVE_RAJA) + tbox::KernelFuser* fuser = d_use_fuser ? + tbox::KernelFuser::getFuser() : nullptr; + if (d_on_host) { hier::host_parallel_for_all(0, d_offset, [=] (int i) { copyop(dst_ptr_d[i], src_ptr_d[i]); }); } else { - hier::parallel_for_all(0, d_offset, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, 0, d_offset, [=] SAMRAI_HOST_DEVICE(int i) { copyop(dst_ptr_d[i], src_ptr_d[i]); }); } @@ -1009,12 +1016,15 @@ void ArrayData::fillAll( TYPE* ptr = &d_array[0]; const size_t n = d_depth * d_offset; #if defined(HAVE_RAJA) + tbox::KernelFuser* fuser = d_use_fuser ? + tbox::KernelFuser::getFuser() : nullptr; + if (d_on_host) { hier::host_parallel_for_all(0, n, [=] (int i) { ptr[i] = t; }); } else { - hier::parallel_for_all(0, n, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, 0, n, [=] SAMRAI_HOST_DEVICE(int i) { ptr[i] = t; }); } @@ -1051,12 +1061,15 @@ void ArrayData::fill( const size_t n = d_offset; if (!d_box.empty()) { #if defined(HAVE_RAJA) + tbox::KernelFuser* fuser = d_use_fuser ? + tbox::KernelFuser::getFuser() : nullptr; + if (d_on_host) { hier::host_parallel_for_all(0, n, [=] (int i) { ptr[i] = t; }); } else { - hier::parallel_for_all(0, n, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, 0, n, [=] SAMRAI_HOST_DEVICE(int i) { ptr[i] = t; }); } @@ -1082,6 +1095,9 @@ void ArrayData::fill( if (!ispace.empty()) { #if defined(HAVE_RAJA) + tbox::KernelFuser* fuser = d_use_fuser ? 
+ tbox::KernelFuser::getFuser() : nullptr; + switch (ispace.getDim().getValue()) { case 1: { auto data = getView<1>(d); @@ -1090,7 +1106,7 @@ void ArrayData::fill( data(i) = t; }); } else { - hier::parallel_for_all(ispace, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, ispace, [=] SAMRAI_HOST_DEVICE(int i) { data(i) = t; }); } @@ -1103,7 +1119,7 @@ void ArrayData::fill( data(i,j) = t; }); } else { - hier::parallel_for_all(ispace, [=] SAMRAI_HOST_DEVICE(int i, int j) { + hier::parallel_for_all(fuser, ispace, [=] SAMRAI_HOST_DEVICE(int i, int j) { data(i,j) = t; }); } @@ -1116,7 +1132,7 @@ void ArrayData::fill( data(i,j,k) = t; }); } else { - hier::parallel_for_all(ispace, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { + hier::parallel_for_all(fuser, ispace, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { data(i,j,k) = t; }); } diff --git a/source/SAMRAI/pdat/ArrayData.h b/source/SAMRAI/pdat/ArrayData.h index 69fbf1906f..5ae263bd0d 100644 --- a/source/SAMRAI/pdat/ArrayData.h +++ b/source/SAMRAI/pdat/ArrayData.h @@ -653,6 +653,16 @@ class ArrayData return d_on_host; } + void startKernelFuser() + { + d_use_fuser = true; + } + + void stopKernelFuser() + { + d_use_fuser = false; + } + /*! * The array data iterator iterates over the elements of a box * associated with an ArrayData object. This typedef is @@ -729,6 +739,7 @@ class ArrayData #endif bool d_on_host; + bool d_use_fuser; }; #if defined(HAVE_RAJA) diff --git a/source/SAMRAI/pdat/NodeData.h b/source/SAMRAI/pdat/NodeData.h index 56e2fa4288..8cd034b58e 100644 --- a/source/SAMRAI/pdat/NodeData.h +++ b/source/SAMRAI/pdat/NodeData.h @@ -291,7 +291,9 @@ class NodeData:public hier::PatchData tbox::KernelFuser& fuser) { NULL_USE(fuser); + d_data->startKernelFuser(); copy(src, overlap); + d_data->stopKernelFuser(); } /*! 
@@ -378,8 +380,9 @@ class NodeData:public hier::PatchData const hier::BoxOverlap& overlap, tbox::KernelFuser& fuser) const { - NULL_USE(fuser); + d_data->startKernelFuser(); packStream(stream, overlap); + d_data->stopKernelFuser(); } /*! @@ -400,8 +403,9 @@ class NodeData:public hier::PatchData const hier::BoxOverlap& overlap, tbox::KernelFuser& fuser) { - NULL_USE(fuser); + d_data->startKernelFuser(); unpackStream(stream, overlap); + d_data->stopKernelFuser(); } diff --git a/source/SAMRAI/tbox/CMakeLists.txt b/source/SAMRAI/tbox/CMakeLists.txt index 2c05b3251c..d894b434c2 100644 --- a/source/SAMRAI/tbox/CMakeLists.txt +++ b/source/SAMRAI/tbox/CMakeLists.txt @@ -90,6 +90,7 @@ set (tbox_sources HDFDatabaseFactory.C IEEE.C InputManager.C + KernelFuser.C Logger.C MathUtilitiesSpecial.C MemoryDatabase.C diff --git a/source/SAMRAI/tbox/KernelFuser.C b/source/SAMRAI/tbox/KernelFuser.C new file mode 100644 index 0000000000..ff7f85c59f --- /dev/null +++ b/source/SAMRAI/tbox/KernelFuser.C @@ -0,0 +1,63 @@ +/************************************************************************* + * + * This file is part of the SAMRAI distribution. For full copyright + * information, see COPYRIGHT and LICENSE. 
+ * + * Copyright: (c) 1997-2021 Lawrence Livermore National Security, LLC + * Description: Singleton kernel fuser + * + ************************************************************************/ + +#include "SAMRAI/tbox/KernelFuser.h" + + +namespace SAMRAI { +namespace tbox { + +KernelFuser* KernelFuser::s_kernel_fuser_instance(nullptr); + +StartupShutdownManager::Handler +KernelFuser::s_startup_handler( + 0, + KernelFuser::startupCallback, + 0, + 0, + tbox::StartupShutdownManager::priorityArenaManager); + +void +KernelFuser::startupCallback() +{ + KernelFuser::getFuser()->initialize(); +} + +void +KernelFuser::shutdownCallback() +{ + if (s_kernel_fuser_instance) { + delete s_kernel_fuser_instance; + } + s_kernel_fuser_instance = nullptr; +} + +KernelFuser * +KernelFuser::getFuser() +{ + if (!s_kernel_fuser_instance) { + s_kernel_fuser_instance = new KernelFuser(); + } + return s_kernel_fuser_instance; +} + +KernelFuser::~KernelFuser() +{ +} + +void +KernelFuser::initialize() +{ +} + + +} +} + diff --git a/source/SAMRAI/tbox/KernelFuser.h b/source/SAMRAI/tbox/KernelFuser.h index 699cb4a1d0..4bf5557029 100644 --- a/source/SAMRAI/tbox/KernelFuser.h +++ b/source/SAMRAI/tbox/KernelFuser.h @@ -3,6 +3,11 @@ #include "SAMRAI/SAMRAI_config.h" +#ifdef HAVE_UMPIRE +#include "umpire/ResourceManager.hpp" +#include "umpire/TypedAllocator.hpp" +#endif + #include "SAMRAI/tbox/ExecutionPolicy.h" #include "SAMRAI/tbox/AllocatorDatabase.h" #include "SAMRAI/tbox/Utilities.h" @@ -11,13 +16,19 @@ #include "RAJA/RAJA.hpp" #endif + +#ifdef HAVE_UMPIRE +#include "umpire/Allocator.hpp" +#include "umpire/TypedAllocator.hpp" +#endif + namespace SAMRAI { namespace tbox { class KernelFuser { public: - +/* KernelFuser() : d_workpool(AllocatorDatabase::getDatabase()->getKernelFuserAllocator()), d_workgroup(d_workpool.instantiate()), @@ -25,6 +36,8 @@ class KernelFuser d_launched(false) { } +*/ + static KernelFuser* getFuser(); template void enqueue(int begin, int end, Kernel&& kernel) { @@ -60,6 
+73,21 @@ class KernelFuser return d_launched; } + void initialize(); + +protected: + KernelFuser() : + d_workpool(AllocatorDatabase::getDatabase()->getKernelFuserAllocator()), + d_workgroup(d_workpool.instantiate()), + d_worksite(d_workgroup.run()), + d_launched(false) + { + } + + + virtual ~KernelFuser(); + + private: #ifdef HAVE_UMPIRE using Allocator = umpire::TypedAllocator; @@ -72,6 +100,15 @@ class KernelFuser using WorkGroup = RAJA::WorkGroup, Allocator>; using WorkSite = RAJA::WorkSite , Allocator>; + static void startupCallback(); + static void shutdownCallback(); + + static KernelFuser* s_kernel_fuser_instance; + + static StartupShutdownManager::Handler + s_startup_handler; + + WorkPool d_workpool; WorkGroup d_workgroup; WorkSite d_worksite; diff --git a/source/SAMRAI/tbox/Schedule.C b/source/SAMRAI/tbox/Schedule.C index a2f3e8fa09..72a9f7ebd2 100644 --- a/source/SAMRAI/tbox/Schedule.C +++ b/source/SAMRAI/tbox/Schedule.C @@ -106,7 +106,8 @@ Schedule::addTransaction( if ((d_mpi.getRank() == src_id) && (d_mpi.getRank() == dst_id)) { if (fuseable_transaction) { if (!d_local_fuser) { - d_local_fuser = new KernelFuser{}; + //d_local_fuser = new KernelFuser{}; + d_local_fuser = KernelFuser::getFuser(); } fuseable_transaction->setKernelFuser(d_local_fuser); d_local_set_fuseable.push_front(fuseable_transaction); @@ -117,7 +118,8 @@ Schedule::addTransaction( if (d_mpi.getRank() == dst_id) { if (fuseable_transaction) { if (!d_recv_fuser) { - d_recv_fuser = new KernelFuser{}; + //d_recv_fuser = new KernelFuser{}; + d_recv_fuser = KernelFuser::getFuser(); } fuseable_transaction->setKernelFuser(d_recv_fuser); d_recv_sets_fuseable[src_id].push_front(fuseable_transaction); @@ -127,7 +129,8 @@ Schedule::addTransaction( } else if (d_mpi.getRank() == src_id) { if (fuseable_transaction) { if (!d_send_fuser) { - d_send_fuser = new KernelFuser{}; + d_send_fuser = //new KernelFuser{}; + d_send_fuser = KernelFuser::getFuser(); } 
fuseable_transaction->setKernelFuser(d_send_fuser); d_send_sets_fuseable[dst_id].push_front(fuseable_transaction); @@ -158,7 +161,8 @@ Schedule::appendTransaction( if ((d_mpi.getRank() == src_id) && (d_mpi.getRank() == dst_id)) { if (fuseable_transaction) { if (!d_local_fuser) { - d_local_fuser = new KernelFuser{}; + //d_local_fuser = new KernelFuser{}; + d_local_fuser = KernelFuser::getFuser(); } fuseable_transaction->setKernelFuser(d_local_fuser); d_local_set_fuseable.push_back(fuseable_transaction); @@ -169,7 +173,8 @@ Schedule::appendTransaction( if (d_mpi.getRank() == dst_id) { if (fuseable_transaction) { if (!d_recv_fuser) { - d_recv_fuser = new KernelFuser{}; + //d_recv_fuser = new KernelFuser{}; + d_recv_fuser = KernelFuser::getFuser(); } fuseable_transaction->setKernelFuser(d_recv_fuser); d_recv_sets_fuseable[src_id].push_back(fuseable_transaction); @@ -179,7 +184,8 @@ Schedule::appendTransaction( } else if (d_mpi.getRank() == src_id) { if (fuseable_transaction) { if (!d_send_fuser) { - d_send_fuser = new KernelFuser{}; + //d_send_fuser = new KernelFuser{}; + d_send_fuser = KernelFuser::getFuser(); } fuseable_transaction->setKernelFuser(d_send_fuser); d_send_sets_fuseable[dst_id].push_back(transaction); From d43092ed6626e0cf5ce8a9759b1884eef717a058 Mon Sep 17 00:00:00 2001 From: "Noah S. 
Elliott" Date: Tue, 30 Nov 2021 14:53:15 -0800 Subject: [PATCH 13/34] Add enqueue calls to some of the for_alls, add fuser in more places --- source/SAMRAI/hier/ForAll.h | 89 +++++++++++++++---- source/SAMRAI/pdat/ArrayData.h | 9 +- .../SAMRAI/pdat/ArrayDataOperationUtilities.C | 41 ++++++--- .../SAMRAI/pdat/ArrayDataOperationUtilities.h | 1 - source/SAMRAI/tbox/KernelFuser.h | 12 +-- 5 files changed, 112 insertions(+), 40 deletions(-) diff --git a/source/SAMRAI/hier/ForAll.h b/source/SAMRAI/hier/ForAll.h index 53d3901747..0a62c62e3c 100644 --- a/source/SAMRAI/hier/ForAll.h +++ b/source/SAMRAI/hier/ForAll.h @@ -146,8 +146,37 @@ struct for_all<1> { RAJA::make_tuple(make_range(ifirst, ilast, 0)), body); } + + template ::value, int>::type = 0> + inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) + { + if (fuser == nullptr) { + RAJA::kernel::Policy1d>( + RAJA::make_tuple(make_range(ifirst, ilast, 0)), + body); + } else { + fuser->enqueue(ifirst(0), ilast(0), body); + } + } + + template ::value, int>::type = 0> + inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) + { + if (fuser == nullptr) { + RAJA::kernel( + RAJA::make_tuple(make_range(ifirst, ilast, 0)), + body); + } else { + fuser->enqueue(ifirst(0), ilast(0), body); + } + } }; + +// 2D and 3D don't use the fuser for anything pending suppor for +// multidimensional loops in KernelFuser. 
template <> struct for_all<2> { template { make_range(ifirst, ilast, 1)), body); } + + template ::value, int>::type = 0> + inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) + { + RAJA::kernel::Policy2d>( + RAJA::make_tuple(make_range(ifirst, ilast, 0), + make_range(ifirst, ilast, 1)), + body); + } + + template ::value, int>::type = 0> + inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) + { + RAJA::kernel( + RAJA::make_tuple(make_range(ifirst, ilast, 0), + make_range(ifirst, ilast, 1)), + body); + } }; template <> @@ -194,6 +243,28 @@ struct for_all<3> { make_range(ifirst, ilast, 2)), body); } + + template ::value, int>::type = 0> + inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) + { + RAJA::kernel::Policy3d>( + RAJA::make_tuple(make_range(ifirst, ilast, 0), + make_range(ifirst, ilast, 1), + make_range(ifirst, ilast, 2)), + body); + } + + template ::value, int>::type = 0> + inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) + { + RAJA::kernel( + RAJA::make_tuple(make_range(ifirst, ilast, 0), + make_range(ifirst, ilast, 1), + make_range(ifirst, ilast, 2)), + body); + } }; } // namespace detail @@ -213,11 +284,7 @@ inline void for_all(tbox::KernelFuser* fuser, int begin, int end, LoopBody body) if (fuser == nullptr) { RAJA::forall::Policy>(RAJA::RangeSegment(begin, end), body); } else { - //fuser->template enqueue(begin, end, body); - - //same as above, until fuser enqueue is correct. 
- RAJA::forall::Policy>(RAJA::RangeSegment(begin, end), body); - + fuser->enqueue(begin, end, body); } } @@ -235,10 +302,7 @@ inline void for_all(tbox::KernelFuser* fuser, int begin, int end, LoopBody body) if (fuser == nullptr) { RAJA::forall(RAJA::RangeSegment(begin, end), body); } else { - //fuser->template enqueue(begin, end, body); - - //same as above, until fuser enqueue is correct. - RAJA::forall(RAJA::RangeSegment(begin, end), body); + fuser->enqueue(begin, end, body); } } @@ -309,13 +373,8 @@ inline void for_all(tbox::KernelFuser* fuser, const hier::Box& box, LoopBody bod if (fuser == nullptr) { for_all(box, body); } else { - //We need enqueue for box expansion into multi-dimensional loops - //in addition to the current 1D begin, end. - - //This is the unchanged code until there is a good enqueue here or - //inside eval constexpr int arg_count = detail::function_traits::argument_count; - detail::for_all::template eval(box.lower(), box.upper(), body); + detail::for_all::template eval(fuser, box.lower(), box.upper(), body); } } diff --git a/source/SAMRAI/pdat/ArrayData.h b/source/SAMRAI/pdat/ArrayData.h index 5ae263bd0d..8fc79a31d5 100644 --- a/source/SAMRAI/pdat/ArrayData.h +++ b/source/SAMRAI/pdat/ArrayData.h @@ -656,12 +656,17 @@ class ArrayData void startKernelFuser() { d_use_fuser = true; - } + } void stopKernelFuser() { d_use_fuser = false; - } + } + + bool useFuser() const + { + return d_use_fuser; + } /*! * The array data iterator iterates over the elements of a box diff --git a/source/SAMRAI/pdat/ArrayDataOperationUtilities.C b/source/SAMRAI/pdat/ArrayDataOperationUtilities.C index a33902e491..9cdfc837a5 100644 --- a/source/SAMRAI/pdat/ArrayDataOperationUtilities.C +++ b/source/SAMRAI/pdat/ArrayDataOperationUtilities.C @@ -112,6 +112,11 @@ void ArrayDataOperationUtilities::doArrayDataOperationOnBox( bool on_host = (src_on_host && dst_on_host); #endif + bool use_fuser = dst.useFuser(); + tbox::KernelFuser* fuser = use_fuser ? 
+ tbox::KernelFuser::getFuser() : nullptr; + + /* * Loop over the depth sections of the data arrays. */ @@ -131,7 +136,7 @@ void ArrayDataOperationUtilities::doArrayDataOperationOnBox( op(dest(i), s2(i)); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i) { op(dest(i), s2(i)); }); } @@ -148,7 +153,7 @@ void ArrayDataOperationUtilities::doArrayDataOperationOnBox( op(dest(i, j), s2(i, j)); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { op(dest(i, j), s2(i, j)); }); } @@ -166,7 +171,7 @@ void ArrayDataOperationUtilities::doArrayDataOperationOnBox( op(dest(i, j, k), s2(i, j, k)); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { op(dest(i, j, k), s2(i, j, k)); }); } @@ -312,6 +317,10 @@ void ArrayDataOperationUtilities::doArrayDataBufferOperationOnBox( bool on_host = arraydata.dataOnHost(); #endif + bool use_fuser = arraydata.useFuser(); + tbox::KernelFuser* fuser = use_fuser ? + tbox::KernelFuser::getFuser() : nullptr; + /* * Loop over the depth sections of the data arrays. 
*/ @@ -335,7 +344,7 @@ void ArrayDataOperationUtilities::doArrayDataBufferOperationOnBox( op(dest(i), source(i)); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i) { op(dest(i), source(i)); }); } @@ -349,7 +358,7 @@ void ArrayDataOperationUtilities::doArrayDataBufferOperationOnBox( op(dest(i, j), source(i, j)); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { op(dest(i, j), source(i, j)); }); } @@ -363,7 +372,7 @@ void ArrayDataOperationUtilities::doArrayDataBufferOperationOnBox( op(dest(i, j, k), source(i, j, k)); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { op(dest(i, j, k), source(i, j, k)); }); } @@ -511,6 +520,10 @@ inline void ArrayDataOperationUtilities >::doArr bool on_host = (src_on_host && dst_on_host); #endif + bool use_fuser = dst.useFuser(); + tbox::KernelFuser* fuser = use_fuser ? + tbox::KernelFuser::getFuser() : nullptr; + /* * Loop over the depth sections of the data arrays. 
*/ @@ -535,7 +548,7 @@ inline void ArrayDataOperationUtilities >::doArr sumop_dbl(dest_imag, s2_imag); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i) { double &dest_real = reinterpret_cast(dest(i))[0]; double &dest_imag = reinterpret_cast(dest(i))[1]; const double &s2_real = reinterpret_cast(s2(i))[0]; @@ -562,7 +575,7 @@ inline void ArrayDataOperationUtilities >::doArr sumop_dbl(dest_imag, s2_imag); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { double &dest_real = reinterpret_cast(dest(i,j))[0]; double &dest_imag = reinterpret_cast(dest(i,j))[1]; const double &s2_real = reinterpret_cast(s2(i,j))[0]; @@ -590,7 +603,7 @@ inline void ArrayDataOperationUtilities >::doArr sumop_dbl(dest_imag, s2_imag); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { double &dest_real = reinterpret_cast(dest(i,j,k))[0]; double &dest_imag = reinterpret_cast(dest(i,j,k))[1]; const double &s2_real = reinterpret_cast(s2(i,j,k))[0]; @@ -745,6 +758,10 @@ inline void ArrayDataOperationUtilities >::doAr bool on_host = arraydata.dataOnHost(); #endif + bool use_fuser = arraydata.useFuser(); + tbox::KernelFuser* fuser = use_fuser ? + tbox::KernelFuser::getFuser() : nullptr; + /* * Loop over the depth sections of the data arrays. 
*/ @@ -773,7 +790,7 @@ inline void ArrayDataOperationUtilities >::doAr sumop_dbl(dest_imag, source_imag); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i) { double &dest_real = reinterpret_cast(dest(i))[0]; double &dest_imag = reinterpret_cast(dest(i))[1]; double &source_real = reinterpret_cast(source(i))[0]; @@ -797,7 +814,7 @@ inline void ArrayDataOperationUtilities >::doAr sumop_dbl(dest_imag, source_imag); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { double &dest_real = reinterpret_cast(dest(i,j))[0]; double &dest_imag = reinterpret_cast(dest(i,j))[1]; double &source_real = reinterpret_cast(source(i,j))[0]; @@ -821,7 +838,7 @@ inline void ArrayDataOperationUtilities >::doAr sumop_dbl(dest_imag, source_imag); }); } else { - hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { + hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { double &dest_real = reinterpret_cast(dest(i,j,k))[0]; double &dest_imag = reinterpret_cast(dest(i,j,k))[1]; double &source_real = reinterpret_cast(source(i,j,k))[0]; diff --git a/source/SAMRAI/pdat/ArrayDataOperationUtilities.h b/source/SAMRAI/pdat/ArrayDataOperationUtilities.h index 33c22a088e..430cf38405 100644 --- a/source/SAMRAI/pdat/ArrayDataOperationUtilities.h +++ b/source/SAMRAI/pdat/ArrayDataOperationUtilities.h @@ -113,7 +113,6 @@ class ArrayDataOperationUtilities ArrayDataOperationUtilities& operator = ( const ArrayDataOperationUtilities&); - }; } diff --git a/source/SAMRAI/tbox/KernelFuser.h b/source/SAMRAI/tbox/KernelFuser.h index 4bf5557029..31e1cd8fa2 100644 --- a/source/SAMRAI/tbox/KernelFuser.h +++ b/source/SAMRAI/tbox/KernelFuser.h @@ -28,16 +28,8 @@ namespace tbox { class KernelFuser { public: -/* - KernelFuser() : - 
d_workpool(AllocatorDatabase::getDatabase()->getKernelFuserAllocator()), - d_workgroup(d_workpool.instantiate()), - d_worksite(d_workgroup.run()), - d_launched(false) - { - } -*/ - static KernelFuser* getFuser(); + + static KernelFuser* getFuser(); template void enqueue(int begin, int end, Kernel&& kernel) { From f4269135b0d7f5f4c2e2c5b97de5919ebda413e4 Mon Sep 17 00:00:00 2001 From: "Noah S. Elliott" Date: Tue, 14 Dec 2021 09:36:41 -0800 Subject: [PATCH 14/34] Change name of virtual methods for fuseable operations in PatchData, and add needed guards for non-cuda builds --- source/SAMRAI/hier/ForAll.h | 4 ++ source/SAMRAI/hier/PatchData.C | 15 +++---- source/SAMRAI/hier/PatchData.h | 21 +++++----- .../SAMRAI/pdat/ArrayDataOperationUtilities.C | 39 ++++++++++++------- source/SAMRAI/pdat/CellData.h | 18 +++------ source/SAMRAI/pdat/EdgeData.h | 30 -------------- source/SAMRAI/pdat/FaceData.h | 30 -------------- source/SAMRAI/pdat/NodeData.h | 16 +++----- source/SAMRAI/pdat/OuteredgeData.h | 30 -------------- source/SAMRAI/pdat/OuterfaceData.h | 10 ----- source/SAMRAI/pdat/OuternodeData.h | 30 -------------- source/SAMRAI/pdat/OutersideData.h | 31 --------------- source/SAMRAI/pdat/SideData.h | 30 -------------- source/SAMRAI/tbox/ExecutionPolicy.h | 6 +++ source/SAMRAI/tbox/KernelFuser.h | 13 ++++++- source/SAMRAI/tbox/Schedule.C | 6 --- source/SAMRAI/xfer/CoarsenCopyTransaction.C | 6 +-- source/SAMRAI/xfer/RefineCopyTransaction.C | 6 +-- 18 files changed, 81 insertions(+), 260 deletions(-) diff --git a/source/SAMRAI/hier/ForAll.h b/source/SAMRAI/hier/ForAll.h index 0a62c62e3c..a67504288e 100644 --- a/source/SAMRAI/hier/ForAll.h +++ b/source/SAMRAI/hier/ForAll.h @@ -203,6 +203,7 @@ struct for_all<2> { typename std::enable_if::value, int>::type = 0> inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) { + NULL_USE(fuser); RAJA::kernel::Policy2d>( RAJA::make_tuple(make_range(ifirst, ilast, 0), 
make_range(ifirst, ilast, 1)), @@ -213,6 +214,7 @@ struct for_all<2> { typename std::enable_if::value, int>::type = 0> inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) { + NULL_USE(fuser); RAJA::kernel( RAJA::make_tuple(make_range(ifirst, ilast, 0), make_range(ifirst, ilast, 1)), @@ -248,6 +250,7 @@ struct for_all<3> { typename std::enable_if::value, int>::type = 0> inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) { + NULL_USE(fuser); RAJA::kernel::Policy3d>( RAJA::make_tuple(make_range(ifirst, ilast, 0), make_range(ifirst, ilast, 1), @@ -259,6 +262,7 @@ struct for_all<3> { typename std::enable_if::value, int>::type = 0> inline static void eval(tbox::KernelFuser* fuser, const hier::Index& ifirst, const hier::Index& ilast, LoopBody body) { + NULL_USE(fuser); RAJA::kernel( RAJA::make_tuple(make_range(ifirst, ilast, 0), make_range(ifirst, ilast, 1), diff --git a/source/SAMRAI/hier/PatchData.C b/source/SAMRAI/hier/PatchData.C index cd06eec80f..78d88abecc 100644 --- a/source/SAMRAI/hier/PatchData.C +++ b/source/SAMRAI/hier/PatchData.C @@ -32,28 +32,25 @@ PatchData::~PatchData() } void -PatchData::copy( +PatchData::copyFuseable( const PatchData& src, - const BoxOverlap& overlap, - tbox::KernelFuser& fuser) + const BoxOverlap& overlap) { copy(src, overlap); } void -PatchData::packStream( +PatchData::packStreamFuseable( tbox::MessageStream& stream, - const BoxOverlap& overlap, - tbox::KernelFuser& fuser) const + const BoxOverlap& overlap) const { packStream(stream, overlap); } void -PatchData::unpackStream( +PatchData::unpackStreamFuseable( tbox::MessageStream& stream, - const BoxOverlap& overlap, - tbox::KernelFuser& fuser) + const BoxOverlap& overlap) { unpackStream(stream, overlap); } diff --git a/source/SAMRAI/hier/PatchData.h b/source/SAMRAI/hier/PatchData.h index d314c6898b..e1915fafcb 100644 --- a/source/SAMRAI/hier/PatchData.h 
+++ b/source/SAMRAI/hier/PatchData.h @@ -25,9 +25,9 @@ namespace SAMRAI { * Forward declaration of KernelFuser class - required here because it sucks in * RAJA and requires CUDA. */ -namespace tbox { -class KernelFuser; -} +//namespace tbox { +//class KernelFuser; +//} namespace hier { @@ -170,10 +170,9 @@ class PatchData const BoxOverlap& overlap) = 0; virtual void - copy( + copyFuseable( const PatchData& src, - const BoxOverlap& overlap, - tbox::KernelFuser& fuser); + const BoxOverlap& overlap); /** * Copy data from the source into the destination using the designated @@ -229,10 +228,9 @@ class PatchData * defined for streams. */ virtual void - packStream( + packStreamFuseable( tbox::MessageStream& stream, - const BoxOverlap& overlap, - tbox::KernelFuser& fuser) const; + const BoxOverlap& overlap) const; /** * Unpack data from the message stream into the specified index set. @@ -252,10 +250,9 @@ class PatchData * defined for streams. */ virtual void - unpackStream( + unpackStreamFuseable( tbox::MessageStream& stream, - const BoxOverlap& overlap, - tbox::KernelFuser& fuser); + const BoxOverlap& overlap); /** * Checks that class version and restart file version are equal. If so, diff --git a/source/SAMRAI/pdat/ArrayDataOperationUtilities.C b/source/SAMRAI/pdat/ArrayDataOperationUtilities.C index 9cdfc837a5..d61e36485c 100644 --- a/source/SAMRAI/pdat/ArrayDataOperationUtilities.C +++ b/source/SAMRAI/pdat/ArrayDataOperationUtilities.C @@ -112,10 +112,11 @@ void ArrayDataOperationUtilities::doArrayDataOperationOnBox( bool on_host = (src_on_host && dst_on_host); #endif +#if defined(HAVE_RAJA) bool use_fuser = dst.useFuser(); tbox::KernelFuser* fuser = use_fuser ? tbox::KernelFuser::getFuser() : nullptr; - +#endif /* * Loop over the depth sections of the data arrays. 
@@ -317,9 +318,11 @@ void ArrayDataOperationUtilities::doArrayDataBufferOperationOnBox( bool on_host = arraydata.dataOnHost(); #endif +#if defined(HAVE_RAJA) bool use_fuser = arraydata.useFuser(); tbox::KernelFuser* fuser = use_fuser ? tbox::KernelFuser::getFuser() : nullptr; +#endif /* * Loop over the depth sections of the data arrays. @@ -520,9 +523,11 @@ inline void ArrayDataOperationUtilities >::doArr bool on_host = (src_on_host && dst_on_host); #endif - bool use_fuser = dst.useFuser(); - tbox::KernelFuser* fuser = use_fuser ? - tbox::KernelFuser::getFuser() : nullptr; +#if defined(HAVE_RAJA) +// bool use_fuser = dst.useFuser(); +// tbox::KernelFuser* fuser = use_fuser ? +// tbox::KernelFuser::getFuser() : nullptr; +#endif /* * Loop over the depth sections of the data arrays. @@ -548,7 +553,8 @@ inline void ArrayDataOperationUtilities >::doArr sumop_dbl(dest_imag, s2_imag); }); } else { - hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i) { + //hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i) { double &dest_real = reinterpret_cast(dest(i))[0]; double &dest_imag = reinterpret_cast(dest(i))[1]; const double &s2_real = reinterpret_cast(s2(i))[0]; @@ -575,7 +581,8 @@ inline void ArrayDataOperationUtilities >::doArr sumop_dbl(dest_imag, s2_imag); }); } else { - hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { + //hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { + hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { double &dest_real = reinterpret_cast(dest(i,j))[0]; double &dest_imag = reinterpret_cast(dest(i,j))[1]; const double &s2_real = reinterpret_cast(s2(i,j))[0]; @@ -603,7 +610,8 @@ inline void ArrayDataOperationUtilities >::doArr sumop_dbl(dest_imag, s2_imag); }); } else { - hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { + //hier::parallel_for_all(fuser, 
opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { + hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { double &dest_real = reinterpret_cast(dest(i,j,k))[0]; double &dest_imag = reinterpret_cast(dest(i,j,k))[1]; const double &s2_real = reinterpret_cast(s2(i,j,k))[0]; @@ -758,9 +766,11 @@ inline void ArrayDataOperationUtilities >::doAr bool on_host = arraydata.dataOnHost(); #endif - bool use_fuser = arraydata.useFuser(); - tbox::KernelFuser* fuser = use_fuser ? - tbox::KernelFuser::getFuser() : nullptr; +#if defined(HAVE_RAJA) +// bool use_fuser = arraydata.useFuser(); +// tbox::KernelFuser* fuser = use_fuser ? +// tbox::KernelFuser::getFuser() : nullptr; +#endif /* * Loop over the depth sections of the data arrays. @@ -790,7 +800,8 @@ inline void ArrayDataOperationUtilities >::doAr sumop_dbl(dest_imag, source_imag); }); } else { - hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i) { + //hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i) { + hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i) { double &dest_real = reinterpret_cast(dest(i))[0]; double &dest_imag = reinterpret_cast(dest(i))[1]; double &source_real = reinterpret_cast(source(i))[0]; @@ -814,7 +825,8 @@ inline void ArrayDataOperationUtilities >::doAr sumop_dbl(dest_imag, source_imag); }); } else { - hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { + //hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { + hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j) { double &dest_real = reinterpret_cast(dest(i,j))[0]; double &dest_imag = reinterpret_cast(dest(i,j))[1]; double &source_real = reinterpret_cast(source(i,j))[0]; @@ -838,7 +850,8 @@ inline void ArrayDataOperationUtilities >::doAr sumop_dbl(dest_imag, source_imag); }); } else { - hier::parallel_for_all(fuser, opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { + //hier::parallel_for_all(fuser, opbox, [=] 
SAMRAI_HOST_DEVICE(int i, int j, int k) { + hier::parallel_for_all(opbox, [=] SAMRAI_HOST_DEVICE(int i, int j, int k) { double &dest_real = reinterpret_cast(dest(i,j,k))[0]; double &dest_imag = reinterpret_cast(dest(i,j,k))[1]; double &source_real = reinterpret_cast(source(i,j,k))[0]; diff --git a/source/SAMRAI/pdat/CellData.h b/source/SAMRAI/pdat/CellData.h index 84f6343ada..6f76dcd09a 100644 --- a/source/SAMRAI/pdat/CellData.h +++ b/source/SAMRAI/pdat/CellData.h @@ -280,12 +280,10 @@ class CellData:public hier::PatchData const hier::BoxOverlap& overlap); virtual void - copy( + copyFuseable( const hier::PatchData& src, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) + const hier::BoxOverlap& overlap) { - NULL_USE(fuser); copy(src, overlap); } @@ -367,12 +365,10 @@ class CellData:public hier::PatchData const hier::BoxOverlap& overlap) const; virtual void - packStream( + packStreamFuseable( tbox::MessageStream& stream, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) const + const hier::BoxOverlap& overlap) const { - NULL_USE(fuser); packStream(stream, overlap); } @@ -389,12 +385,10 @@ class CellData:public hier::PatchData const hier::BoxOverlap& overlap); virtual void - unpackStream( + unpackStreamFuseable( tbox::MessageStream& stream, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) + const hier::BoxOverlap& overlap) { - NULL_USE(fuser); unpackStream(stream, overlap); } diff --git a/source/SAMRAI/pdat/EdgeData.h b/source/SAMRAI/pdat/EdgeData.h index cdbe766a4f..24dd37b380 100644 --- a/source/SAMRAI/pdat/EdgeData.h +++ b/source/SAMRAI/pdat/EdgeData.h @@ -305,16 +305,6 @@ class EdgeData:public hier::PatchData const hier::PatchData& src, const hier::BoxOverlap& overlap); - virtual void - copy( - const hier::PatchData& src, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) - { - NULL_USE(fuser); - copy(src, overlap); - } - /*! 
* @brief Copy data from source (i.e., this) to destination * patch data object on the given overlap. @@ -393,16 +383,6 @@ class EdgeData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap) const; - virtual void - packStream( - tbox::MessageStream& stream, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) const - { - NULL_USE(fuser); - packStream(stream, overlap); - } - /*! * @brief Unpack data from stream into this patch data object over * the specified box overlap region. The overlap must be an @@ -415,16 +395,6 @@ class EdgeData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); - virtual void - unpackStream( - tbox::MessageStream& stream, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) - { - NULL_USE(fuser); - unpackStream(stream, overlap); - } - /*! * @brief Fill all values at depth d with the value t. diff --git a/source/SAMRAI/pdat/FaceData.h b/source/SAMRAI/pdat/FaceData.h index 7755d705a0..069e78ecdc 100644 --- a/source/SAMRAI/pdat/FaceData.h +++ b/source/SAMRAI/pdat/FaceData.h @@ -309,16 +309,6 @@ class FaceData:public hier::PatchData const hier::PatchData& src, const hier::BoxOverlap& overlap); - virtual void - copy( - const hier::PatchData& src, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) - { - NULL_USE(fuser); - copy(src, overlap); - } - /*! * @brief Copy data from source (i.e., this) to destination * patch data object on the given overlap. @@ -397,16 +387,6 @@ class FaceData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap) const; - virtual void - packStream( - tbox::MessageStream& stream, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) const - { - NULL_USE(fuser); - packStream(stream, overlap); - } - /*! * @brief Unpack data from stream into this patch data object over * the specified box overlap region. 
The overlap must be an @@ -419,16 +399,6 @@ class FaceData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); - virtual void - unpackStream( - tbox::MessageStream& stream, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) - { - NULL_USE(fuser); - unpackStream(stream, overlap); - } - /*! * @brief Fill all values at depth d with the value t. * diff --git a/source/SAMRAI/pdat/NodeData.h b/source/SAMRAI/pdat/NodeData.h index 8cd034b58e..570540e2c5 100644 --- a/source/SAMRAI/pdat/NodeData.h +++ b/source/SAMRAI/pdat/NodeData.h @@ -285,12 +285,10 @@ class NodeData:public hier::PatchData const hier::BoxOverlap& overlap); virtual void - copy( + copyFuseable( const hier::PatchData& src, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) + const hier::BoxOverlap& overlap) { - NULL_USE(fuser); d_data->startKernelFuser(); copy(src, overlap); d_data->stopKernelFuser(); @@ -375,10 +373,9 @@ class NodeData:public hier::PatchData const hier::BoxOverlap& overlap) const; virtual void - packStream( + packStreamFuseable( tbox::MessageStream& stream, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) const + const hier::BoxOverlap& overlap) const { d_data->startKernelFuser(); packStream(stream, overlap); @@ -398,10 +395,9 @@ class NodeData:public hier::PatchData const hier::BoxOverlap& overlap); virtual void - unpackStream( + unpackStreamFuseable( tbox::MessageStream& stream, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) + const hier::BoxOverlap& overlap) { d_data->startKernelFuser(); unpackStream(stream, overlap); diff --git a/source/SAMRAI/pdat/OuteredgeData.h b/source/SAMRAI/pdat/OuteredgeData.h index 1cc7c8c34a..c99d9448fc 100644 --- a/source/SAMRAI/pdat/OuteredgeData.h +++ b/source/SAMRAI/pdat/OuteredgeData.h @@ -443,16 +443,6 @@ class OuteredgeData:public hier::PatchData const hier::PatchData& src, const hier::BoxOverlap& overlap); - virtual void - copy( - const hier::PatchData& src, - const 
hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) - { - NULL_USE(fuser); - copy(src, overlap); - } - /*! * @brief Copy data from source (i.e., this) to destination * patch data object on the given overlap. @@ -552,16 +542,6 @@ class OuteredgeData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap) const; - virtual void - packStream( - tbox::MessageStream& stream, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) const - { - NULL_USE(fuser); - packStream(stream, overlap); - } - /*! * @brief Unpack data from stream into this patch data object over * the specified box overlap region. The overlap must be an @@ -574,16 +554,6 @@ class OuteredgeData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); - virtual void - unpackStream( - tbox::MessageStream& stream, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) - { - NULL_USE(fuser); - unpackStream(stream, overlap); - } - /*! * @brief Unpack data from stream and add into this patch data object * over the specified box overlap region. diff --git a/source/SAMRAI/pdat/OuterfaceData.h b/source/SAMRAI/pdat/OuterfaceData.h index 78b22dc335..dce697a517 100644 --- a/source/SAMRAI/pdat/OuterfaceData.h +++ b/source/SAMRAI/pdat/OuterfaceData.h @@ -334,16 +334,6 @@ class OuterfaceData:public hier::PatchData const hier::PatchData& src, const hier::BoxOverlap& overlap); - virtual void - copy( - const hier::PatchData& src, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) - { - NULL_USE(fuser); - copy(src, overlap); - } - /*! * @brief Copy data from source (i.e., this) to destination * patch data object on the given overlap. 
diff --git a/source/SAMRAI/pdat/OuternodeData.h b/source/SAMRAI/pdat/OuternodeData.h index 4899f6a24b..ebcb2705c8 100644 --- a/source/SAMRAI/pdat/OuternodeData.h +++ b/source/SAMRAI/pdat/OuternodeData.h @@ -379,16 +379,6 @@ class OuternodeData:public hier::PatchData const hier::PatchData& src, const hier::BoxOverlap& overlap); - virtual void - copy( - const hier::PatchData& src, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) - { - NULL_USE(fuser); - copy(src, overlap); - } - /*! * @brief Copy data from source (i.e., this) to destination * patch data object on the given overlap. @@ -488,16 +478,6 @@ class OuternodeData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap) const; - virtual void - packStream( - tbox::MessageStream& stream, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) const - { - NULL_USE(fuser); - packStream(stream, overlap); - } - /*! * @brief Unpack data from stream into this patch data object over * the specified box overlap region. The overlap must be an @@ -510,16 +490,6 @@ class OuternodeData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); - virtual void - unpackStream( - tbox::MessageStream& stream, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) - { - NULL_USE(fuser); - unpackStream(stream, overlap); - } - /*! * @brief Unpack data from stream and add into this patch data object * over the specified box overlap region. 
The overlap must be an diff --git a/source/SAMRAI/pdat/OutersideData.h b/source/SAMRAI/pdat/OutersideData.h index fb529f6fbc..20d9482d31 100644 --- a/source/SAMRAI/pdat/OutersideData.h +++ b/source/SAMRAI/pdat/OutersideData.h @@ -333,16 +333,6 @@ class OutersideData:public hier::PatchData const hier::PatchData& src, const hier::BoxOverlap& overlap); - virtual void - copy( - const hier::PatchData& src, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) - { - NULL_USE(fuser); - copy(src, overlap); - } - /*! * @brief Copy data from source (i.e., this) to destination * patch data object on the given overlap. @@ -423,16 +413,6 @@ class OutersideData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap) const; - virtual void - packStream( - tbox::MessageStream& stream, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) const - { - NULL_USE(fuser); - packStream(stream, overlap); - } - /*! * @brief Unpack data from stream into this patch data object over * the specified box overlap region. The overlap must be an @@ -445,17 +425,6 @@ class OutersideData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); - virtual void - unpackStream( - tbox::MessageStream& stream, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) - { - NULL_USE(fuser); - unpackStream(stream, overlap); - } - - /*! * @brief Fill all values at depth d with the value t. * diff --git a/source/SAMRAI/pdat/SideData.h b/source/SAMRAI/pdat/SideData.h index 00c0e62a98..249d848622 100644 --- a/source/SAMRAI/pdat/SideData.h +++ b/source/SAMRAI/pdat/SideData.h @@ -379,16 +379,6 @@ class SideData:public hier::PatchData const hier::PatchData& src, const hier::BoxOverlap& overlap); - virtual void - copy( - const hier::PatchData& src, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) - { - NULL_USE(fuser); - copy(src, overlap); - } - /*! 
* @brief Copy data from source (i.e., this) to destination * patch data object on the given overlap. @@ -469,16 +459,6 @@ class SideData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap) const; - virtual void - packStream( - tbox::MessageStream& stream, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) const - { - NULL_USE(fuser); - packStream(stream, overlap); - } - /*! * @brief Unpack data from stream into this patch data object over * the specified box overlap region. The overlap must be an @@ -491,16 +471,6 @@ class SideData:public hier::PatchData tbox::MessageStream& stream, const hier::BoxOverlap& overlap); - virtual void - unpackStream( - tbox::MessageStream& stream, - const hier::BoxOverlap& overlap, - tbox::KernelFuser& fuser) - { - NULL_USE(fuser); - unpackStream(stream, overlap); - } - /*! * @brief Fill all values at depth d with the value t. * diff --git a/source/SAMRAI/tbox/ExecutionPolicy.h b/source/SAMRAI/tbox/ExecutionPolicy.h index eb87d3efbd..8965b88c40 100644 --- a/source/SAMRAI/tbox/ExecutionPolicy.h +++ b/source/SAMRAI/tbox/ExecutionPolicy.h @@ -151,6 +151,12 @@ struct policy_traits { >; using ReductionPolicy = RAJA::seq_reduce; + + using WorkGroupPolicy = RAJA::WorkGroupPolicy< + RAJA::loop_work, + RAJA::reverse_ordered, + RAJA::ragged_array_of_objects>; + }; #endif // HAVE_CUDA diff --git a/source/SAMRAI/tbox/KernelFuser.h b/source/SAMRAI/tbox/KernelFuser.h index 31e1cd8fa2..682603ec1e 100644 --- a/source/SAMRAI/tbox/KernelFuser.h +++ b/source/SAMRAI/tbox/KernelFuser.h @@ -31,6 +31,7 @@ class KernelFuser static KernelFuser* getFuser(); +#ifdef HAVE_RAJA template void enqueue(int begin, int end, Kernel&& kernel) { if (d_launched) { @@ -39,6 +40,7 @@ class KernelFuser d_workpool.enqueue(RAJA::RangeSegment(begin, end), std::forward(kernel)); } +#endif void launch() { @@ -46,17 +48,21 @@ class KernelFuser TBOX_ERROR("KernelFuser Error: This KernelFuser already launched."); } +#ifdef HAVE_RAJA 
d_workgroup = d_workpool.instantiate(); d_worksite = d_workgroup.run(); +#endif d_launched = true; } void cleanup() { +#ifdef HAVE_RAJA d_workpool.clear(); d_workgroup.clear(); d_worksite.clear(); +#endif d_launched = false; } @@ -69,9 +75,11 @@ class KernelFuser protected: KernelFuser() : +#ifdef HAVE_RAJA d_workpool(AllocatorDatabase::getDatabase()->getKernelFuserAllocator()), d_workgroup(d_workpool.instantiate()), d_worksite(d_workgroup.run()), +#endif d_launched(false) { } @@ -87,10 +95,12 @@ class KernelFuser using Allocator = ResourceAllocator; #endif +#ifdef HAVE_RAJA using Policy = typename tbox::detail::policy_traits< tbox::policy::parallel >::WorkGroupPolicy; using WorkPool = RAJA::WorkPool , Allocator>; using WorkGroup = RAJA::WorkGroup, Allocator>; using WorkSite = RAJA::WorkSite , Allocator>; +#endif static void startupCallback(); static void shutdownCallback(); @@ -100,10 +110,11 @@ class KernelFuser static StartupShutdownManager::Handler s_startup_handler; - +#ifdef HAVE_RAJA WorkPool d_workpool; WorkGroup d_workgroup; WorkSite d_worksite; +#endif bool d_launched; }; diff --git a/source/SAMRAI/tbox/Schedule.C b/source/SAMRAI/tbox/Schedule.C index 72a9f7ebd2..778cf40e14 100644 --- a/source/SAMRAI/tbox/Schedule.C +++ b/source/SAMRAI/tbox/Schedule.C @@ -106,7 +106,6 @@ Schedule::addTransaction( if ((d_mpi.getRank() == src_id) && (d_mpi.getRank() == dst_id)) { if (fuseable_transaction) { if (!d_local_fuser) { - //d_local_fuser = new KernelFuser{}; d_local_fuser = KernelFuser::getFuser(); } fuseable_transaction->setKernelFuser(d_local_fuser); @@ -118,7 +117,6 @@ Schedule::addTransaction( if (d_mpi.getRank() == dst_id) { if (fuseable_transaction) { if (!d_recv_fuser) { - //d_recv_fuser = new KernelFuser{}; d_recv_fuser = KernelFuser::getFuser(); } fuseable_transaction->setKernelFuser(d_recv_fuser); @@ -129,7 +127,6 @@ Schedule::addTransaction( } else if (d_mpi.getRank() == src_id) { if (fuseable_transaction) { if (!d_send_fuser) { - d_send_fuser = //new 
KernelFuser{}; d_send_fuser = KernelFuser::getFuser(); } fuseable_transaction->setKernelFuser(d_send_fuser); @@ -161,7 +158,6 @@ Schedule::appendTransaction( if ((d_mpi.getRank() == src_id) && (d_mpi.getRank() == dst_id)) { if (fuseable_transaction) { if (!d_local_fuser) { - //d_local_fuser = new KernelFuser{}; d_local_fuser = KernelFuser::getFuser(); } fuseable_transaction->setKernelFuser(d_local_fuser); @@ -173,7 +169,6 @@ Schedule::appendTransaction( if (d_mpi.getRank() == dst_id) { if (fuseable_transaction) { if (!d_recv_fuser) { - //d_recv_fuser = new KernelFuser{}; d_recv_fuser = KernelFuser::getFuser(); } fuseable_transaction->setKernelFuser(d_recv_fuser); @@ -184,7 +179,6 @@ Schedule::appendTransaction( } else if (d_mpi.getRank() == src_id) { if (fuseable_transaction) { if (!d_send_fuser) { - //d_send_fuser = new KernelFuser{}; d_send_fuser = KernelFuser::getFuser(); } fuseable_transaction->setKernelFuser(d_send_fuser); diff --git a/source/SAMRAI/xfer/CoarsenCopyTransaction.C b/source/SAMRAI/xfer/CoarsenCopyTransaction.C index 87440693ee..971756c211 100644 --- a/source/SAMRAI/xfer/CoarsenCopyTransaction.C +++ b/source/SAMRAI/xfer/CoarsenCopyTransaction.C @@ -131,7 +131,7 @@ CoarsenCopyTransaction::packStream( tbox::MessageStream& stream) { d_src_patch->getPatchData(d_coarsen_data[d_item_id]->d_src) - ->packStream(stream, *d_overlap, *getKernelFuser()); + ->packStreamFuseable(stream, *d_overlap); } void @@ -139,7 +139,7 @@ CoarsenCopyTransaction::unpackStream( tbox::MessageStream& stream) { d_dst_patch->getPatchData(d_coarsen_data[d_item_id]->d_dst) - ->unpackStream(stream, *d_overlap, *getKernelFuser()); + ->unpackStreamFuseable(stream, *d_overlap); } void @@ -151,7 +151,7 @@ CoarsenCopyTransaction::copyLocalData() const hier::PatchData& src_data = *d_src_patch->getPatchData(d_coarsen_data[d_item_id]->d_src); - dst_data.copy(src_data, *d_overlap, *getKernelFuser()); + dst_data.copyFuseable(src_data, *d_overlap); } /* diff --git 
a/source/SAMRAI/xfer/RefineCopyTransaction.C b/source/SAMRAI/xfer/RefineCopyTransaction.C index fd420d895a..a8cb4441d3 100644 --- a/source/SAMRAI/xfer/RefineCopyTransaction.C +++ b/source/SAMRAI/xfer/RefineCopyTransaction.C @@ -131,7 +131,7 @@ RefineCopyTransaction::packStream( tbox::MessageStream& stream) { d_src_patch->getPatchData(d_refine_data[d_item_id]->d_src) - ->packStream(stream, *d_overlap, *getKernelFuser()); + ->packStreamFuseable(stream, *d_overlap); } void @@ -139,7 +139,7 @@ RefineCopyTransaction::unpackStream( tbox::MessageStream& stream) { d_dst_patch->getPatchData(d_refine_data[d_item_id]->d_scratch) - ->unpackStream(stream, *d_overlap, *getKernelFuser()); + ->unpackStreamFuseable(stream, *d_overlap); } void @@ -151,7 +151,7 @@ RefineCopyTransaction::copyLocalData() const hier::PatchData& src_data = *d_src_patch->getPatchData(d_refine_data[d_item_id]->d_src); - dst_data.copy(src_data, *d_overlap, *getKernelFuser()); + dst_data.copyFuseable(src_data, *d_overlap); } /* From f8a55187b8aeb57ea9f69ca97f4e4d84bae80541 Mon Sep 17 00:00:00 2001 From: "Noah S. Elliott" Date: Wed, 22 Dec 2021 09:22:21 -0800 Subject: [PATCH 15/34] Add missing initialization in ArrayData --- source/SAMRAI/pdat/ArrayData.C | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/SAMRAI/pdat/ArrayData.C b/source/SAMRAI/pdat/ArrayData.C index 3794d5993b..5a02a57c1a 100644 --- a/source/SAMRAI/pdat/ArrayData.C +++ b/source/SAMRAI/pdat/ArrayData.C @@ -134,7 +134,8 @@ ArrayData::ArrayData( #else d_array(d_depth * d_offset), #endif - d_on_host(true) + d_on_host(true), + d_use_fuser(false) { #ifndef HAVE_UMPIRE NULL_USE(allocator); From 4fe3d2cebf82e698323d928bb0fda1a6f3d3a9bc Mon Sep 17 00:00:00 2001 From: "Noah S. Elliott" Date: Fri, 7 Jan 2022 14:21:14 -0800 Subject: [PATCH 16/34] Make KernelFuser a true no-op in non-RAJA builds. 
--- source/SAMRAI/tbox/KernelFuser.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/source/SAMRAI/tbox/KernelFuser.h b/source/SAMRAI/tbox/KernelFuser.h index 682603ec1e..7cf6588a06 100644 --- a/source/SAMRAI/tbox/KernelFuser.h +++ b/source/SAMRAI/tbox/KernelFuser.h @@ -51,9 +51,8 @@ class KernelFuser #ifdef HAVE_RAJA d_workgroup = d_workpool.instantiate(); d_worksite = d_workgroup.run(); -#endif - d_launched = true; +#endif } void cleanup() @@ -62,8 +61,8 @@ class KernelFuser d_workpool.clear(); d_workgroup.clear(); d_worksite.clear(); -#endif d_launched = false; +#endif } bool launched() const From c57914d044c64ca181947d23432c49f226acd3db Mon Sep 17 00:00:00 2001 From: "Noah S. Elliott" Date: Fri, 7 Jan 2022 16:42:00 -0800 Subject: [PATCH 17/34] Rearrange split message receives in AsyncCommPeer to avoid CUDA reallocations --- source/SAMRAI/tbox/AsyncCommPeer.C | 67 +++++++++++++++++++++++++++--- source/SAMRAI/tbox/AsyncCommPeer.h | 5 ++- source/SAMRAI/tbox/Schedule.C | 4 ++ 3 files changed, 70 insertions(+), 6 deletions(-) diff --git a/source/SAMRAI/tbox/AsyncCommPeer.C b/source/SAMRAI/tbox/AsyncCommPeer.C index e024b81202..3e005983f6 100644 --- a/source/SAMRAI/tbox/AsyncCommPeer.C +++ b/source/SAMRAI/tbox/AsyncCommPeer.C @@ -76,6 +76,7 @@ AsyncCommPeer::AsyncCommPeer(): d_external_buf(0), d_internal_buf_size(0), d_internal_buf(0), + d_count_buf(0), d_mpi(SAMRAI_MPI::getSAMRAIWorld()), d_tag0(-1), d_tag1(-1), @@ -119,6 +120,7 @@ AsyncCommPeer::AsyncCommPeer( d_external_buf(0), d_internal_buf_size(0), d_internal_buf(0), + d_count_buf(0), d_mpi(SAMRAI_MPI::getSAMRAIWorld()), d_tag0(-1), d_tag1(-1), @@ -165,6 +167,16 @@ AsyncCommPeer::~AsyncCommPeer() #endif d_internal_buf = 0; } + if (d_count_buf) { +#ifdef HAVE_UMPIRE + d_allocator.deallocate( + (char*)d_count_buf, 2 * sizeof(FlexData)); +#else + free(d_count_buf); +#endif + d_count_buf = 0; + } + d_first_recv_buf = 0; } @@ -570,7 +582,31 @@ AsyncCommPeer::checkRecv( // Post receive for 
first (and maybe only) chunk of data. const size_t first_chunk_count = getNumberOfFlexData( d_max_first_data_len); - resizeBuffer(first_chunk_count + 2); + + if (first_chunk_count > 0) { + resizeBuffer(first_chunk_count + 2); + d_first_recv_buf = d_internal_buf; + } else { + // If the size of the first chunk is zero, due to + // d_max_first_data_len being set to zero, then we use + // a small buffer to get only the full count size, deferring + // the receipt of the full data to the second Irecv. + if (d_count_buf) { +#ifdef HAVE_UMPIRE + d_allocator.deallocate( + (char*)d_count_buf, 2 * sizeof(FlexData)); +#else + free(d_count_buf); +#endif + } +#ifdef HAVE_UMPIRE + d_count_buf = + (FlexData *)d_allocator.allocate(2 * sizeof(FlexData)); +#else + d_count_buf = (FlexData *)malloc(2 * sizeof(FlexData)); +#endif + d_first_recv_buf = d_count_buf; + } TBOX_ASSERT(req[0] == MPI_REQUEST_NULL); #ifdef DEBUG_CHECK_ASSERTIONS @@ -578,7 +614,7 @@ AsyncCommPeer::checkRecv( #endif t_recv_timer->start(); d_mpi_err = d_mpi.Irecv( - d_internal_buf, + d_first_recv_buf, static_cast(sizeof(FlexData) * (first_chunk_count + 2)), MPI_BYTE, d_peer_rank, @@ -647,9 +683,9 @@ AsyncCommPeer::checkRecv( TBOX_ASSERT(mpi_status[0].MPI_SOURCE == d_peer_rank); TBOX_ASSERT(req[0] == MPI_REQUEST_NULL); // Get full count embedded in message. - d_full_count = d_internal_buf[count - 1].d_i; + d_full_count = d_first_recv_buf[count - 1].d_i; - TBOX_ASSERT(d_internal_buf[count - 2].d_i == 0); // Sequence number check. + TBOX_ASSERT(d_first_recv_buf[count - 2].d_i == 0); // Sequence number check. 
TBOX_ASSERT(getNumberOfFlexData(d_full_count) >= count - 2); if (d_full_count > d_max_first_data_len) { @@ -664,7 +700,18 @@ AsyncCommPeer::checkRecv( const size_t second_chunk_count = getNumberOfFlexData( d_full_count - d_max_first_data_len); - resizeBuffer(d_internal_buf_size + second_chunk_count); + size_t new_internal_buf_size = + d_internal_buf_size + second_chunk_count; + + // If the first Irecv didn't use d_internal_buf, then + // the message in the second Irecv will contain the entire + // buffer of data for this communication instance, and we need + // to add 2 to the buffer size to make room for the trailing + // metadata. + if (d_internal_buf_size == 0) { + new_internal_buf_size += 2; + } + resizeBuffer(new_internal_buf_size); TBOX_ASSERT(req[1] == MPI_REQUEST_NULL); req[1] = MPI_REQUEST_NULL; @@ -971,6 +1018,16 @@ AsyncCommPeer::clearRecvData() #endif d_internal_buf = 0; } + if (d_count_buf) { +#ifdef HAVE_UMPIRE + d_allocator.deallocate( + (char*)d_count_buf, 2 * sizeof(FlexData)); +#else + free(d_count_buf); +#endif + d_count_buf = 0; + } + d_first_recv_buf = 0; } /* diff --git a/source/SAMRAI/tbox/AsyncCommPeer.h b/source/SAMRAI/tbox/AsyncCommPeer.h index aa52c9969e..331ef44494 100644 --- a/source/SAMRAI/tbox/AsyncCommPeer.h +++ b/source/SAMRAI/tbox/AsyncCommPeer.h @@ -615,7 +615,10 @@ class AsyncCommPeer:public AsyncCommStage::Member * for overhead data. */ size_t d_internal_buf_size; - FlexData* d_internal_buf; + FlexData* d_internal_buf = nullptr; + + FlexData* d_count_buf = nullptr; + FlexData* d_first_recv_buf = nullptr; /*! * diff --git a/source/SAMRAI/tbox/Schedule.C b/source/SAMRAI/tbox/Schedule.C index 778cf40e14..811b0cfe8d 100644 --- a/source/SAMRAI/tbox/Schedule.C +++ b/source/SAMRAI/tbox/Schedule.C @@ -40,7 +40,11 @@ const int Schedule::s_default_second_tag = 1; * MPI communication. This parameter should be dependent on the MPI * implementation. 
*/ +#if defined(HAVE_CUDA) +const size_t Schedule::s_default_first_message_length = 0; +#else const size_t Schedule::s_default_first_message_length = 1000; +#endif const std::string Schedule::s_default_timer_prefix("tbox::Schedule"); std::map Schedule::s_static_timers; From b6fee9f018463ecfa602ebc2712a4b47e26bae96 Mon Sep 17 00:00:00 2001 From: "Noah S. Elliott" Date: Thu, 3 Feb 2022 10:46:40 -0800 Subject: [PATCH 18/34] Add cmake option for setting number of threads for RAJA WorkGroup policy --- CMakeLists.txt | 3 +-- config/SAMRAI_config.h.cmake.in | 2 ++ source/SAMRAI/tbox/ExecutionPolicy.h | 4 +++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 92233af642..17a56d7609 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,8 +59,6 @@ option(ENABLE_SAMRAI_TESTS "Enable SAMRAI Test Programs" On) option(ENABLE_PERF_TESTS "Enable Performance Tests." Off) set(NUM_PERF_PROCS 8 CACHE INT "Number of processors for performance tests.") option(ENABLE_CHECK_ASSERTIONS "Enable assertion checking." On) -option(ENABLE_CHECK_DEV_ASSERTIONS "Enable SAMRAI developer assertion checking." Off) -option(ENABLE_CHECK_DIM_ASSERTIONS "Enable assertion checking for dimensions." Off) option(ENABLE_BOX_COUNTING "Turns on box telemetry." Off) option(ENABLE_DEPRECATED "Build with deprecated features." On) option(ENABLE_TIMERS "Enable SAMRAI timers." 
On) @@ -72,6 +70,7 @@ set(CUDA_ARCH "sm_70" CACHE STRING "Compute architecture to pass to CUDA builds" set(CMAKE_CUDA_FLAGS "" CACHE STRING "") set(CMAKE_INSTALL_LIBDIR lib) #set(CMAKE_INSTALL_RPATH_USE_LINK_PATH Off CACHE Bool "Rpath uses Link path") +set(SAMRAI_RAJA_WORKGROUP_THREADS 512 CACHE INT "Number of workgroup threads") include(GNUInstallDirs) diff --git a/config/SAMRAI_config.h.cmake.in b/config/SAMRAI_config.h.cmake.in index aaa58af346..ad9a805b93 100644 --- a/config/SAMRAI_config.h.cmake.in +++ b/config/SAMRAI_config.h.cmake.in @@ -340,6 +340,8 @@ /* Maximum dimension allowed */ #define SAMRAI_MAXIMUM_DIMENSION @SAMRAI_MAXIMUM_DIMENSION@ +#define SAMRAI_RAJA_WORKGROUP_THREADS @SAMRAI_RAJA_WORKGROUP_THREADS@ + /* Define to 1 if you have the ANSI C header files. */ #undef STDC_HEADERS diff --git a/source/SAMRAI/tbox/ExecutionPolicy.h b/source/SAMRAI/tbox/ExecutionPolicy.h index 8965b88c40..b79255991e 100644 --- a/source/SAMRAI/tbox/ExecutionPolicy.h +++ b/source/SAMRAI/tbox/ExecutionPolicy.h @@ -11,6 +11,8 @@ #ifndef included_tbox_ExecutionPolicy #define included_tbox_ExecutionPolicy +#include "SAMRAI/SAMRAI_config.h" + #if defined(HAVE_RAJA) #include "RAJA/RAJA.hpp" @@ -114,7 +116,7 @@ struct policy_traits { using ReductionPolicy = RAJA::cuda_reduce; using WorkGroupPolicy = RAJA::WorkGroupPolicy< - RAJA::cuda_work_async<1024>, + RAJA::cuda_work_async, RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, RAJA::constant_stride_array_of_objects>; }; From 899a9fac1137c3eccd1c923ac21cad9f08c4d2dc Mon Sep 17 00:00:00 2001 From: "Noah S. 
Elliott" Date: Tue, 8 Feb 2022 13:12:13 -0800 Subject: [PATCH 19/34] Add logic to avoid synchronize calls when it is known that no kernels have been launched --- source/SAMRAI/tbox/Schedule.C | 33 +++++++++++++++++++++------- source/SAMRAI/tbox/Schedule.h | 7 ++++++ source/SAMRAI/xfer/CoarsenSchedule.C | 4 +++- source/SAMRAI/xfer/RefineSchedule.C | 28 +++++++++++++++-------- source/SAMRAI/xfer/RefineSchedule.h | 4 +++- 5 files changed, 57 insertions(+), 19 deletions(-) diff --git a/source/SAMRAI/tbox/Schedule.C b/source/SAMRAI/tbox/Schedule.C index 811b0cfe8d..6089a3a9bc 100644 --- a/source/SAMRAI/tbox/Schedule.C +++ b/source/SAMRAI/tbox/Schedule.C @@ -252,6 +252,7 @@ Schedule::communicate() #endif d_object_timers->t_communicate->start(); + d_completed_transactions = false; beginCommunication(); finalizeCommunication(); d_object_timers->t_communicate->stop(); @@ -316,6 +317,8 @@ Schedule::postReceives() return; } + d_completed_transactions = true; + int rank = d_mpi.getRank(); /* @@ -476,13 +479,16 @@ Schedule::postSends() for (const auto& transaction : d_send_sets[peer_rank]) { transaction->packStream(outgoing_stream); } -#if defined(HAVE_RAJA) + bool have_non_fuseable = !(d_send_sets[peer_rank].empty()); + if (have_fuseable || have_non_fuseable) { + d_completed_transactions = true; +#if defined(HAVE_RAJA) parallel_synchronize(); if (d_send_fuser) d_send_fuser->cleanup(); - } #endif + } d_object_timers->t_pack_stream->stop(); @@ -546,13 +552,15 @@ Schedule::postSends() for (const auto& transaction : d_send_sets[peer_rank]) { transaction->packStream(outgoing_stream); } -#if defined(HAVE_RAJA) bool have_non_fuseable = !(d_send_sets[peer_rank].empty()); + if (have_fuseable || have_non_fuseable) { + d_completed_transactions = true; +#if defined(HAVE_RAJA) parallel_synchronize(); if (d_send_fuser) d_send_fuser->cleanup(); - } #endif + } d_object_timers->t_pack_stream->stop(); @@ -596,13 +604,14 @@ Schedule::performLocalCopies() } 
d_object_timers->t_local_copies->stop(); -#if defined(HAVE_RAJA) bool have_non_fuseable = !d_local_set.empty(); if (have_fuseable || have_non_fuseable) { + d_completed_transactions = true; +#if defined(HAVE_RAJA) parallel_synchronize(); if (d_local_fuser) d_local_fuser->cleanup(); - } #endif + } } @@ -663,12 +672,16 @@ Schedule::processCompletedCommunications() for (const auto& transaction : d_recv_sets[sender]) { transaction->unpackStream(incoming_stream); } -#if defined(HAVE_RAJA) bool have_non_fuseable = !(d_recv_sets[sender].empty()); +#if defined(HAVE_RAJA) if (have_non_fuseable) { parallel_synchronize(); } #endif + if (have_fuseable || have_non_fuseable) { + d_completed_transactions = true; + } + d_object_timers->t_unpack_stream->stop(); completed_comm->clearRecvData(); } @@ -723,12 +736,16 @@ Schedule::processCompletedCommunications() for (const auto& transaction : d_recv_sets[sender]) { transaction->unpackStream(incoming_stream); } -#if defined(HAVE_RAJA) bool have_non_fuseable = !(d_recv_sets[sender].empty()); +#if defined(HAVE_RAJA) if (have_non_fuseable) { parallel_synchronize(); } #endif + if (have_fuseable || have_non_fuseable) { + d_completed_transactions = true; + } + d_object_timers->t_unpack_stream->stop(); completed_comm->clearRecvData(); } else { diff --git a/source/SAMRAI/tbox/Schedule.h b/source/SAMRAI/tbox/Schedule.h index 3638918c7f..3ba41b4991 100644 --- a/source/SAMRAI/tbox/Schedule.h +++ b/source/SAMRAI/tbox/Schedule.h @@ -289,6 +289,11 @@ class Schedule return "Schedule"; } + bool completedTransactions() const + { + return d_completed_transactions; + } + private: void allocateCommunicationObjects(); @@ -350,6 +355,8 @@ class Schedule KernelFuser* d_send_fuser{nullptr}; KernelFuser* d_recv_fuser{nullptr}; + bool d_completed_transactions = false; + /* * @brief Transactions where the source and destination are the * local process. 
diff --git a/source/SAMRAI/xfer/CoarsenSchedule.C b/source/SAMRAI/xfer/CoarsenSchedule.C index d5acde0729..871ec7ddda 100644 --- a/source/SAMRAI/xfer/CoarsenSchedule.C +++ b/source/SAMRAI/xfer/CoarsenSchedule.C @@ -313,7 +313,9 @@ CoarsenSchedule::coarsenData() const d_schedule->communicate(); #if defined(HAVE_RAJA) - tbox::parallel_synchronize(); + if (d_schedule->completedTransactions()) { + tbox::parallel_synchronize(); + } #endif /* diff --git a/source/SAMRAI/xfer/RefineSchedule.C b/source/SAMRAI/xfer/RefineSchedule.C index 74075519b8..bbeddfa3b4 100644 --- a/source/SAMRAI/xfer/RefineSchedule.C +++ b/source/SAMRAI/xfer/RefineSchedule.C @@ -2098,9 +2098,11 @@ RefineSchedule::fillData( * space. */ - copyScratchToDestination(); + bool copied = copyScratchToDestination(); #if defined(HAVE_RAJA) - tbox::parallel_synchronize(); + if (copied) { + tbox::parallel_synchronize(); + } #endif /* @@ -2149,6 +2151,8 @@ RefineSchedule::recursiveFill( double fill_time, bool do_physical_boundary_fill) const { + int rank = d_dst_level->getBoxLevel()->getMPI().getRank(); + /* * Copy data from the source interiors of the source level into the ghost * cells and interiors of the scratch space on the destination level @@ -2156,7 +2160,9 @@ RefineSchedule::recursiveFill( */ d_coarse_priority_level_schedule->communicate(); #if defined(HAVE_RAJA) - tbox::parallel_synchronize(); + if (d_coarse_priority_level_schedule->completedTransactions()) { + tbox::parallel_synchronize(); + } #endif /* @@ -2213,7 +2219,6 @@ RefineSchedule::recursiveFill( * Recursively call the fill routine to fill the required coarse fill * boxes on the coarser level. 
*/ - d_coarse_interp_schedule->recursiveFill(fill_time, do_physical_boundary_fill); @@ -2338,7 +2343,9 @@ RefineSchedule::recursiveFill( */ d_fine_priority_level_schedule->communicate(); #if defined(HAVE_RAJA) - tbox::parallel_synchronize(); + if (d_fine_priority_level_schedule->completedTransactions()) { + tbox::parallel_synchronize(); + } #endif /* @@ -2604,10 +2611,11 @@ RefineSchedule::allocateWorkSpace( ************************************************************************** */ -void +bool RefineSchedule::copyScratchToDestination() const { TBOX_ASSERT(d_dst_level); + bool copied = false; for (hier::PatchLevel::iterator p(d_dst_level->begin()); p != d_dst_level->end(); ++p) { @@ -2621,11 +2629,12 @@ RefineSchedule::copyScratchToDestination() const getPatchData(dst_id)->getTime(), patch->getPatchData(src_id)->getTime())); patch->getPatchData(dst_id)->copy(*patch->getPatchData(src_id)); + copied = true; } } - } + return copied; } /* @@ -2647,6 +2656,7 @@ RefineSchedule::refineScratchData( overlaps) const { t_refine_scratch_data->start(); + int rank = d_dst_level->getBoxLevel()->getMPI().getRank(); #ifdef DEBUG_CHECK_ASSERTIONS bool is_encon = (fine_level == d_encon_level); @@ -2773,12 +2783,12 @@ RefineSchedule::refineScratchData( d_nbr_blk_fill_level->getPatch(unfilled_id)); if (d_refine_patch_strategy) { - d_refine_patch_strategy->preprocessRefineBoxes(*nbr_fill_patch, + d_refine_patch_strategy->preprocessRefineBoxes(*nbr_fill_patch, *crse_patch, fill_boxes, local_ratio); #if defined(HAVE_RAJA) - tbox::parallel_synchronize(); + tbox::parallel_synchronize(); #endif } diff --git a/source/SAMRAI/xfer/RefineSchedule.h b/source/SAMRAI/xfer/RefineSchedule.h index a501943c5e..aa935b0e96 100644 --- a/source/SAMRAI/xfer/RefineSchedule.h +++ b/source/SAMRAI/xfer/RefineSchedule.h @@ -534,9 +534,11 @@ class RefineSchedule * If the scratch and destination patch data components are the same, * then no copying is performed. 
* + * @return Returns true only if copies were performed. + * * @pre d_dst_level */ - void + bool copyScratchToDestination() const; /*! From 7c24184acca533f48c1f71c6af4c6abafd60e7f7 Mon Sep 17 00:00:00 2001 From: "Noah S. Elliott" Date: Mon, 4 Apr 2022 16:35:53 -0700 Subject: [PATCH 20/34] Add methods for applications to indicate need for synchronization --- source/SAMRAI/xfer/CoarsenPatchStrategy.h | 33 +++++++++++++++++++++++ source/SAMRAI/xfer/CoarsenSchedule.cpp | 16 +++++++---- source/SAMRAI/xfer/RefinePatchStrategy.h | 30 +++++++++++++++++++++ source/SAMRAI/xfer/RefineSchedule.cpp | 32 +++++++++++++++++----- 4 files changed, 100 insertions(+), 11 deletions(-) diff --git a/source/SAMRAI/xfer/CoarsenPatchStrategy.h b/source/SAMRAI/xfer/CoarsenPatchStrategy.h index 5a087d6693..03f20ee206 100644 --- a/source/SAMRAI/xfer/CoarsenPatchStrategy.h +++ b/source/SAMRAI/xfer/CoarsenPatchStrategy.h @@ -139,6 +139,37 @@ class CoarsenPatchStrategy const hier::Box& coarse_box, const hier::IntVector& ratio) = 0; + /*! + * @brief Check flag for if host-device synchronization is needed. + * + * Returns current value of the flag while setting the flag back to + * the default value of true. + */ + bool + needSynchronize() + { + bool flag = d_need_synchronize; + d_need_synchronize = true; + return flag; + } + +protected: + + /*! + * @brief Set flag indicating if device synchronization is needed after + * a child class operation. + * + * This allows implementations of methods such as preprocessCoarsen and + * postprocessCoarsen to set the flag to false if they have done nothing + * that requires host-device synchronization and do not need + * CoarsenSchedule to call the synchronize routine. + */ + void + setNeedCoarsenSynchronize(bool flag) + { + d_need_synchronize = flag; + } + private: /*! 
* @brief Get the set of CoarsenPatchStrategy objects that have been @@ -163,6 +194,8 @@ class CoarsenPatchStrategy current_objects.insert(this); } + bool d_need_synchronize = true; + }; } diff --git a/source/SAMRAI/xfer/CoarsenSchedule.cpp b/source/SAMRAI/xfer/CoarsenSchedule.cpp index 4a87b36ea8..67fa2953f9 100644 --- a/source/SAMRAI/xfer/CoarsenSchedule.cpp +++ b/source/SAMRAI/xfer/CoarsenSchedule.cpp @@ -1025,10 +1025,13 @@ CoarsenSchedule::coarsenSourceData( patch_strategy->preprocessCoarsen(*temp_patch, *fine_patch, box, block_ratio); #if defined(HAVE_RAJA) - tbox::parallel_synchronize(); + if (patch_strategy->needSynchronize()) { + tbox::parallel_synchronize(); + } #endif } + bool need_sync = false; for (size_t ici = 0; ici < d_number_coarsen_items; ++ici) { const CoarsenClasses::Data * const crs_item = d_coarsen_items[ici]; @@ -1037,11 +1040,12 @@ CoarsenSchedule::coarsenSourceData( crs_item->d_opcoarsen->coarsen(*temp_patch, *fine_patch, source_id, source_id, box, block_ratio); + need_sync = true; } } -#if defined(HAVE_RAJA) - tbox::parallel_synchronize(); -#endif + if (need_sync) { + tbox::parallel_synchronize(); + } if (patch_strategy) { patch_strategy->postprocessCoarsen(*temp_patch, @@ -1049,7 +1053,9 @@ CoarsenSchedule::coarsenSourceData( box, block_ratio); #if defined(HAVE_RAJA) - tbox::parallel_synchronize(); + if (patch_strategy->needSynchronize()) { + tbox::parallel_synchronize(); + } #endif } } diff --git a/source/SAMRAI/xfer/RefinePatchStrategy.h b/source/SAMRAI/xfer/RefinePatchStrategy.h index a5f79e7e43..f5f1e9db3e 100644 --- a/source/SAMRAI/xfer/RefinePatchStrategy.h +++ b/source/SAMRAI/xfer/RefinePatchStrategy.h @@ -265,6 +265,7 @@ class RefinePatchStrategy NULL_USE(coarse_to_unfilled); NULL_USE(overlaps); NULL_USE(refine_items); + setNeedRefineSynchronize(false); } /*! @@ -290,6 +291,33 @@ class RefinePatchStrategy NULL_USE(coarse_level); NULL_USE(coarse_to_fine); NULL_USE(coarse_to_unfilled); + setNeedRefineSynchronize(false); + } + + /*! 
+ * @brief Check if a synchronization is required. + */ + bool + needSynchronize() + { + bool flag = d_need_synchronize; + d_need_synchronize = true; + return flag; + } + +protected: + + /*! + * @brief Set flag to indicate synchronization is needed. + * + * Implementations of RefinePatchStrategy should call this to set + * the flag to false if they have not launched RAJA kernels that require + * a synchronization call afterward. + */ + void + setNeedRefineSynchronize(bool flag) + { + d_need_synchronize = flag; } private: @@ -327,6 +355,8 @@ class RefinePatchStrategy current_objects.erase(this); } + bool d_need_synchronize = true; + }; } diff --git a/source/SAMRAI/xfer/RefineSchedule.cpp b/source/SAMRAI/xfer/RefineSchedule.cpp index dc61a807b3..782722d1ca 100644 --- a/source/SAMRAI/xfer/RefineSchedule.cpp +++ b/source/SAMRAI/xfer/RefineSchedule.cpp @@ -2671,6 +2671,11 @@ RefineSchedule::refineScratchData( coarse_to_unfilled, overlaps, d_refine_items); +#if defined(HAVE_RAJA) + if (d_refine_patch_strategy->needSynchronize()) { + tbox::parallel_synchronize(); + } +#endif } const hier::IntVector ratio(fine_level->getRatioToLevelZero() @@ -2723,10 +2728,14 @@ RefineSchedule::refineScratchData( fill_boxes, local_ratio); #if defined(HAVE_RAJA) - tbox::parallel_synchronize(); + if (d_refine_patch_strategy->needSynchronize()) { + tbox::parallel_synchronize(); + } #endif } + bool need_sync = false; + for (size_t iri = 0; iri < d_number_refine_items; ++iri) { const RefineClasses::Data * const ref_item = d_refine_items[iri]; if (ref_item->d_oprefine) { @@ -2739,11 +2748,14 @@ RefineSchedule::refineScratchData( ref_item->d_oprefine->refine(*fine_patch, *crse_patch, scratch_id, scratch_id, *refine_overlap, local_ratio); + need_sync = true; } } #if defined(HAVE_RAJA) - tbox::parallel_synchronize(); + if (need_sync) { + tbox::parallel_synchronize(); + } #endif if (d_refine_patch_strategy) { @@ -2752,7 +2764,9 @@ RefineSchedule::refineScratchData( fill_boxes, local_ratio); #if 
defined(HAVE_RAJA) - tbox::parallel_synchronize(); + if (d_refine_patch_strategy->needSynchronize()) { + tbox::parallel_synchronize(); + } #endif } @@ -2788,7 +2802,9 @@ RefineSchedule::refineScratchData( fill_boxes, local_ratio); #if defined(HAVE_RAJA) - tbox::parallel_synchronize(); + if (d_refine_patch_strategy->needSynchronize()) { + tbox::parallel_synchronize(); + } #endif } @@ -2818,7 +2834,9 @@ RefineSchedule::refineScratchData( fill_boxes, local_ratio); #if defined(HAVE_RAJA) - tbox::parallel_synchronize(); + if (d_refine_patch_strategy->needSynchronize()) { + tbox::parallel_synchronize(); + } #endif } @@ -2854,7 +2872,9 @@ RefineSchedule::refineScratchData( coarse_to_fine, coarse_to_unfilled); #if defined(HAVE_RAJA) - tbox::parallel_synchronize(); + if (d_refine_patch_strategy->needSynchronize()) { + tbox::parallel_synchronize(); + } #endif } From 78c618dce074a921e29c1033f10fd5d7b0f0d924 Mon Sep 17 00:00:00 2001 From: "Noah S. Elliott" Date: Mon, 4 Apr 2022 16:39:34 -0700 Subject: [PATCH 21/34] Clarify some documentation comments --- source/SAMRAI/xfer/RefinePatchStrategy.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/source/SAMRAI/xfer/RefinePatchStrategy.h b/source/SAMRAI/xfer/RefinePatchStrategy.h index f5f1e9db3e..fed5441b9d 100644 --- a/source/SAMRAI/xfer/RefinePatchStrategy.h +++ b/source/SAMRAI/xfer/RefinePatchStrategy.h @@ -295,7 +295,10 @@ class RefinePatchStrategy } /*! - * @brief Check if a synchronization is required. + * @brief Check flag for if host-device synchronization is needed. + * + * Returns current value of the flag while setting the flag back to + * the default value of true. */ bool needSynchronize() @@ -308,11 +311,13 @@ class RefinePatchStrategy protected: /*! - * @brief Set flag to indicate synchronization is needed. + * @brief Set flag indicating if device synchronization is needed after + * a child class operation. 
* - * Implementations of RefinePatchStrategy should call this to set - * the flag to false if they have not launched RAJA kernels that require - * a synchronization call afterward. + * This allows implementations of methods such as preprocessRefine and + * postprocessRefine to set the flag to false if they have done nothing + * that requires host-device synchronization and do not need + * RefineSchedule to call the synchronize routine. */ void setNeedRefineSynchronize(bool flag) From 2b1a9e1e8aaa4af2c2636402543b09e06621c770 Mon Sep 17 00:00:00 2001 From: "Noah S. Elliott" Date: Wed, 13 Apr 2022 09:05:19 -0700 Subject: [PATCH 22/34] Add option to set a synchronize between refine and postprocessRefine --- source/SAMRAI/tbox/KernelFuser.h | 6 ++++-- source/SAMRAI/xfer/CoarsenSchedule.cpp | 2 ++ source/SAMRAI/xfer/RefinePatchStrategy.h | 6 ++++++ source/SAMRAI/xfer/RefineSchedule.cpp | 11 ++++++----- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/source/SAMRAI/tbox/KernelFuser.h b/source/SAMRAI/tbox/KernelFuser.h index 7cf6588a06..dbda13de00 100644 --- a/source/SAMRAI/tbox/KernelFuser.h +++ b/source/SAMRAI/tbox/KernelFuser.h @@ -49,8 +49,10 @@ class KernelFuser } #ifdef HAVE_RAJA - d_workgroup = d_workpool.instantiate(); - d_worksite = d_workgroup.run(); + if (d_workpool.num_loops() > 0) { + d_workgroup = d_workpool.instantiate(); + d_worksite = d_workgroup.run(); + } d_launched = true; #endif } diff --git a/source/SAMRAI/xfer/CoarsenSchedule.cpp b/source/SAMRAI/xfer/CoarsenSchedule.cpp index 67fa2953f9..6e5775ecaa 100644 --- a/source/SAMRAI/xfer/CoarsenSchedule.cpp +++ b/source/SAMRAI/xfer/CoarsenSchedule.cpp @@ -1043,9 +1043,11 @@ CoarsenSchedule::coarsenSourceData( need_sync = true; } } +#if defined(HAVE_RAJA) if (need_sync) { tbox::parallel_synchronize(); } +#endif if (patch_strategy) { patch_strategy->postprocessCoarsen(*temp_patch, diff --git a/source/SAMRAI/xfer/RefinePatchStrategy.h b/source/SAMRAI/xfer/RefinePatchStrategy.h index 
fed5441b9d..8ac8069264 100644 --- a/source/SAMRAI/xfer/RefinePatchStrategy.h +++ b/source/SAMRAI/xfer/RefinePatchStrategy.h @@ -294,6 +294,12 @@ class RefinePatchStrategy setNeedRefineSynchronize(false); } + virtual void + setPostRefineSyncFlag() + { + setNeedRefineSynchronize(true); + } + /*! * @brief Check flag for if host-device synchronization is needed. * diff --git a/source/SAMRAI/xfer/RefineSchedule.cpp b/source/SAMRAI/xfer/RefineSchedule.cpp index 782722d1ca..d030334474 100644 --- a/source/SAMRAI/xfer/RefineSchedule.cpp +++ b/source/SAMRAI/xfer/RefineSchedule.cpp @@ -2752,13 +2752,14 @@ RefineSchedule::refineScratchData( } } -#if defined(HAVE_RAJA) - if (need_sync) { - tbox::parallel_synchronize(); - } -#endif if (d_refine_patch_strategy) { + d_refine_patch_strategy->setPostRefineSyncFlag(); +#if defined(HAVE_RAJA) + if (d_refine_patch_strategy->needSynchronize()) { + tbox::parallel_synchronize(); + } +#endif d_refine_patch_strategy->postprocessRefineBoxes(*fine_patch, *crse_patch, fill_boxes, From 1eb8cf5547284c7e9da3862ab5b27d61b4342214 Mon Sep 17 00:00:00 2001 From: "Noah S. 
Elliott" Date: Thu, 14 Apr 2022 13:41:05 -0700 Subject: [PATCH 23/34] Add KernelFuserStages as a singleton to hold and use KernelFuser --- source/SAMRAI/pdat/ArrayData.cpp | 11 +-- .../pdat/ArrayDataOperationUtilities.cpp | 4 +- source/SAMRAI/tbox/CMakeLists.txt | 2 + source/SAMRAI/tbox/KernelFuser.cpp | 33 ------- source/SAMRAI/tbox/KernelFuser.h | 10 --- source/SAMRAI/tbox/KernelFuserStages.cpp | 63 +++++++++++++ source/SAMRAI/tbox/KernelFuserStages.h | 89 +++++++++++++++++++ source/SAMRAI/tbox/Schedule.cpp | 13 +-- 8 files changed, 169 insertions(+), 56 deletions(-) create mode 100644 source/SAMRAI/tbox/KernelFuserStages.cpp create mode 100644 source/SAMRAI/tbox/KernelFuserStages.h diff --git a/source/SAMRAI/pdat/ArrayData.cpp b/source/SAMRAI/pdat/ArrayData.cpp index d3a1a3b6da..e1ef228661 100644 --- a/source/SAMRAI/pdat/ArrayData.cpp +++ b/source/SAMRAI/pdat/ArrayData.cpp @@ -12,6 +12,7 @@ #define included_pdat_ArrayData_C #include "SAMRAI/tbox/KernelFuser.h" +#include "SAMRAI/tbox/KernelFuserStages.h" #include "SAMRAI/tbox/MessageStream.h" #include "SAMRAI/tbox/Utilities.h" #include "SAMRAI/tbox/MathUtilities.h" @@ -302,7 +303,7 @@ void ArrayData::copy( const size_t n = d_offset * d_depth; #if defined(HAVE_RAJA) tbox::KernelFuser* fuser = d_use_fuser ? - tbox::KernelFuser::getFuser() : nullptr; + tbox::KernelFuserStages::getFuserStages()->getDefaultFuser() : nullptr; if (d_on_host) { hier::host_parallel_for_all(0, n, [=] (int i) { copyop(dst_ptr[i], src_ptr[i]); @@ -498,7 +499,7 @@ void ArrayData::copyDepth( #if defined(HAVE_RAJA) tbox::KernelFuser* fuser = d_use_fuser ? - tbox::KernelFuser::getFuser() : nullptr; + tbox::KernelFuserStages::getFuserStages()->getDefaultFuser() : nullptr; if (d_on_host) { hier::host_parallel_for_all(0, d_offset, [=] (int i) { @@ -1018,7 +1019,7 @@ void ArrayData::fillAll( const size_t n = d_depth * d_offset; #if defined(HAVE_RAJA) tbox::KernelFuser* fuser = d_use_fuser ? 
- tbox::KernelFuser::getFuser() : nullptr; + tbox::KernelFuserStages::getFuserStages()->getDefaultFuser() : nullptr; if (d_on_host) { hier::host_parallel_for_all(0, n, [=] (int i) { @@ -1063,7 +1064,7 @@ void ArrayData::fill( if (!d_box.empty()) { #if defined(HAVE_RAJA) tbox::KernelFuser* fuser = d_use_fuser ? - tbox::KernelFuser::getFuser() : nullptr; + tbox::KernelFuserStages::getFuserStages()->getDefaultFuser() : nullptr; if (d_on_host) { hier::host_parallel_for_all(0, n, [=] (int i) { @@ -1097,7 +1098,7 @@ void ArrayData::fill( if (!ispace.empty()) { #if defined(HAVE_RAJA) tbox::KernelFuser* fuser = d_use_fuser ? - tbox::KernelFuser::getFuser() : nullptr; + tbox::KernelFuserStages::getFuserStages()->getDefaultFuser() : nullptr; switch (ispace.getDim().getValue()) { case 1: { diff --git a/source/SAMRAI/pdat/ArrayDataOperationUtilities.cpp b/source/SAMRAI/pdat/ArrayDataOperationUtilities.cpp index 9643fcdccd..61df70982d 100644 --- a/source/SAMRAI/pdat/ArrayDataOperationUtilities.cpp +++ b/source/SAMRAI/pdat/ArrayDataOperationUtilities.cpp @@ -115,7 +115,7 @@ void ArrayDataOperationUtilities::doArrayDataOperationOnBox( #if defined(HAVE_RAJA) bool use_fuser = dst.useFuser(); tbox::KernelFuser* fuser = use_fuser ? - tbox::KernelFuser::getFuser() : nullptr; + tbox::KernelFuserStages::getFuserStages()->getDefaultFuser() : nullptr; #endif /* @@ -321,7 +321,7 @@ void ArrayDataOperationUtilities::doArrayDataBufferOperationOnBox( #if defined(HAVE_RAJA) bool use_fuser = arraydata.useFuser(); tbox::KernelFuser* fuser = use_fuser ? 
- tbox::KernelFuser::getFuser() : nullptr; + tbox::KernelFuserStages::getFuserStages()->getDefaultFuser() : nullptr; #endif /* diff --git a/source/SAMRAI/tbox/CMakeLists.txt b/source/SAMRAI/tbox/CMakeLists.txt index bd4a47044f..35489daba9 100644 --- a/source/SAMRAI/tbox/CMakeLists.txt +++ b/source/SAMRAI/tbox/CMakeLists.txt @@ -28,6 +28,7 @@ set ( tbox_headers InputManager.h IOStream.h KernelFuser.h + KernelFuserStages.h Logger.h MathUtilities.h MathUtilities.cpp @@ -91,6 +92,7 @@ set (tbox_sources IEEE.cpp InputManager.cpp KernelFuser.cpp + KernelFuserStages.cpp Logger.cpp MathUtilitiesSpecial.cpp MemoryDatabase.cpp diff --git a/source/SAMRAI/tbox/KernelFuser.cpp b/source/SAMRAI/tbox/KernelFuser.cpp index ff7f85c59f..6f77f3639f 100644 --- a/source/SAMRAI/tbox/KernelFuser.cpp +++ b/source/SAMRAI/tbox/KernelFuser.cpp @@ -14,39 +14,6 @@ namespace SAMRAI { namespace tbox { -KernelFuser* KernelFuser::s_kernel_fuser_instance(nullptr); - -StartupShutdownManager::Handler -KernelFuser::s_startup_handler( - 0, - KernelFuser::startupCallback, - 0, - 0, - tbox::StartupShutdownManager::priorityArenaManager); - -void -KernelFuser::startupCallback() -{ - KernelFuser::getFuser()->initialize(); -} - -void -KernelFuser::shutdownCallback() -{ - if (s_kernel_fuser_instance) { - delete s_kernel_fuser_instance; - } - s_kernel_fuser_instance = nullptr; -} - -KernelFuser * -KernelFuser::getFuser() -{ - if (!s_kernel_fuser_instance) { - s_kernel_fuser_instance = new KernelFuser(); - } - return s_kernel_fuser_instance; -} KernelFuser::~KernelFuser() { diff --git a/source/SAMRAI/tbox/KernelFuser.h b/source/SAMRAI/tbox/KernelFuser.h index dbda13de00..8b187cae6e 100644 --- a/source/SAMRAI/tbox/KernelFuser.h +++ b/source/SAMRAI/tbox/KernelFuser.h @@ -29,7 +29,6 @@ class KernelFuser { public: - static KernelFuser* getFuser(); #ifdef HAVE_RAJA template @@ -74,7 +73,6 @@ class KernelFuser void initialize(); -protected: KernelFuser() : #ifdef HAVE_RAJA 
d_workpool(AllocatorDatabase::getDatabase()->getKernelFuserAllocator()), @@ -103,14 +101,6 @@ class KernelFuser using WorkSite = RAJA::WorkSite , Allocator>; #endif - static void startupCallback(); - static void shutdownCallback(); - - static KernelFuser* s_kernel_fuser_instance; - - static StartupShutdownManager::Handler - s_startup_handler; - #ifdef HAVE_RAJA WorkPool d_workpool; WorkGroup d_workgroup; diff --git a/source/SAMRAI/tbox/KernelFuserStages.cpp b/source/SAMRAI/tbox/KernelFuserStages.cpp new file mode 100644 index 0000000000..142d5ae296 --- /dev/null +++ b/source/SAMRAI/tbox/KernelFuserStages.cpp @@ -0,0 +1,63 @@ +/************************************************************************* + * + * This file is part of the SAMRAI distribution. For full copyright + * information, see COPYRIGHT and LICENSE. + * + * Copyright: (c) 1997-2021 Lawrence Livermore National Security, LLC + * Description: Singleton kernel fuser + * + ************************************************************************/ + +#include "SAMRAI/tbox/KernelFuserStages.h" + + +namespace SAMRAI { +namespace tbox { + +KernelFuserStages* KernelFuserStages::s_kernel_fuser_stages_instance(nullptr); + +StartupShutdownManager::Handler +KernelFuserStages::s_startup_handler( + 0, + KernelFuserStages::startupCallback, + 0, + 0, + tbox::StartupShutdownManager::priorityArenaManager); + +void +KernelFuserStages::startupCallback() +{ + KernelFuserStages::getFuserStages()->initialize(); +} + +void +KernelFuserStages::shutdownCallback() +{ + if (s_kernel_fuser_stages_instance) { + delete s_kernel_fuser_stages_instance; + } + s_kernel_fuser_stages_instance = nullptr; +} + +KernelFuserStages * +KernelFuserStages::getFuserStages() +{ + if (!s_kernel_fuser_stages_instance) { + s_kernel_fuser_stages_instance = new KernelFuserStages(); + } + return s_kernel_fuser_stages_instance; +} + +KernelFuserStages::~KernelFuserStages() +{ +} + +void +KernelFuserStages::initialize() +{ +} + + +} +} + diff --git 
a/source/SAMRAI/tbox/KernelFuserStages.h b/source/SAMRAI/tbox/KernelFuserStages.h new file mode 100644 index 0000000000..d2a861ea11 --- /dev/null +++ b/source/SAMRAI/tbox/KernelFuserStages.h @@ -0,0 +1,89 @@ +#ifndef included_tbox_KernelFuserStages +#define included_tbox_KernelFuserStages + +#include "SAMRAI/SAMRAI_config.h" + +#include "SAMRAI/tbox/KernelFuser.h" + +#ifdef HAVE_RAJA +#include "RAJA/RAJA.hpp" +#endif + + +namespace SAMRAI { +namespace tbox { + +class KernelFuserStages +{ +public: + + static KernelFuserStages* getFuserStages(); + +#ifdef HAVE_RAJA + template + void enqueue(int stage, int begin, int end, Kernel&& kernel) { + d_kernel_fusers[stage].enqueue(begin, end, kernel); + } +#endif + + void launch() + { + for (auto& fuser : d_kernel_fusers) { + fuser.second.launch(); + } + } + + void cleanup() + { + for (auto& fuser : d_kernel_fusers) { + fuser.second.cleanup(); + } + } + + KernelFuser* getDefaultFuser() + { + return &d_kernel_fusers[-99999]; + } + + KernelFuser* getFuser(int stage) + { + return &d_kernel_fusers[stage]; + } + + void clearKernelFuser(int stage) + { + d_kernel_fusers.erase(stage); + } + + void clearAllFusers() + { + d_kernel_fusers.clear(); + } + + void initialize(); + +protected: + KernelFuserStages() + { + } + + virtual ~KernelFuserStages(); + + +private: + + static void startupCallback(); + static void shutdownCallback(); + + static KernelFuserStages* s_kernel_fuser_stages_instance; + + static StartupShutdownManager::Handler + s_startup_handler; + + std::map d_kernel_fusers; +}; + +} +} + +#endif diff --git a/source/SAMRAI/tbox/Schedule.cpp b/source/SAMRAI/tbox/Schedule.cpp index e931c28bf3..3df0417f7a 100644 --- a/source/SAMRAI/tbox/Schedule.cpp +++ b/source/SAMRAI/tbox/Schedule.cpp @@ -10,6 +10,7 @@ #include "SAMRAI/tbox/Schedule.h" #include "SAMRAI/tbox/AllocatorDatabase.h" #include "SAMRAI/tbox/InputManager.h" +#include "SAMRAI/tbox/KernelFuserStages.h" #include "SAMRAI/tbox/PIO.h" #include "SAMRAI/tbox/SAMRAIManager.h" 
#include "SAMRAI/tbox/SAMRAI_MPI.h" @@ -110,7 +111,7 @@ Schedule::addTransaction( if ((d_mpi.getRank() == src_id) && (d_mpi.getRank() == dst_id)) { if (fuseable_transaction) { if (!d_local_fuser) { - d_local_fuser = KernelFuser::getFuser(); + d_local_fuser = KernelFuserStages::getFuserStages()->getDefaultFuser(); } fuseable_transaction->setKernelFuser(d_local_fuser); d_local_set_fuseable.push_front(fuseable_transaction); @@ -121,7 +122,7 @@ Schedule::addTransaction( if (d_mpi.getRank() == dst_id) { if (fuseable_transaction) { if (!d_recv_fuser) { - d_recv_fuser = KernelFuser::getFuser(); + d_recv_fuser = KernelFuserStages::getFuserStages()->getDefaultFuser(); } fuseable_transaction->setKernelFuser(d_recv_fuser); d_recv_sets_fuseable[src_id].push_front(fuseable_transaction); @@ -131,7 +132,7 @@ Schedule::addTransaction( } else if (d_mpi.getRank() == src_id) { if (fuseable_transaction) { if (!d_send_fuser) { - d_send_fuser = KernelFuser::getFuser(); + d_send_fuser = KernelFuserStages::getFuserStages()->getDefaultFuser(); } fuseable_transaction->setKernelFuser(d_send_fuser); d_send_sets_fuseable[dst_id].push_front(fuseable_transaction); @@ -162,7 +163,7 @@ Schedule::appendTransaction( if ((d_mpi.getRank() == src_id) && (d_mpi.getRank() == dst_id)) { if (fuseable_transaction) { if (!d_local_fuser) { - d_local_fuser = KernelFuser::getFuser(); + d_local_fuser = KernelFuserStages::getFuserStages()->getDefaultFuser(); } fuseable_transaction->setKernelFuser(d_local_fuser); d_local_set_fuseable.push_back(fuseable_transaction); @@ -173,7 +174,7 @@ Schedule::appendTransaction( if (d_mpi.getRank() == dst_id) { if (fuseable_transaction) { if (!d_recv_fuser) { - d_recv_fuser = KernelFuser::getFuser(); + d_recv_fuser = KernelFuserStages::getFuserStages()->getDefaultFuser(); } fuseable_transaction->setKernelFuser(d_recv_fuser); d_recv_sets_fuseable[src_id].push_back(fuseable_transaction); @@ -183,7 +184,7 @@ Schedule::appendTransaction( } else if (d_mpi.getRank() == src_id) { if 
(fuseable_transaction) { if (!d_send_fuser) { - d_send_fuser = KernelFuser::getFuser(); + d_send_fuser = KernelFuserStages::getFuserStages()->getDefaultFuser(); } fuseable_transaction->setKernelFuser(d_send_fuser); d_send_sets_fuseable[dst_id].push_back(transaction); From 5a62605b76d6b26066b555e27b223f9f5b1580bb Mon Sep 17 00:00:00 2001 From: "Noah S. Elliott" Date: Thu, 14 Apr 2022 17:15:38 -0700 Subject: [PATCH 24/34] Add ScheduleKernelFuser and change KernelFuserStages to StagedKernelFusers --- source/SAMRAI/pdat/ArrayData.cpp | 12 ++-- .../pdat/ArrayDataOperationUtilities.cpp | 4 +- source/SAMRAI/tbox/CMakeLists.txt | 6 +- source/SAMRAI/tbox/KernelFuserStages.cpp | 63 ----------------- source/SAMRAI/tbox/Schedule.cpp | 14 ++-- source/SAMRAI/tbox/ScheduleKernelFuser.cpp | 66 +++++++++++++++++ source/SAMRAI/tbox/ScheduleKernelFuser.h | 70 +++++++++++++++++++ source/SAMRAI/tbox/StagedKernelFusers.cpp | 63 +++++++++++++++++ ...rnelFuserStages.h => StagedKernelFusers.h} | 19 ++--- 9 files changed, 225 insertions(+), 92 deletions(-) delete mode 100644 source/SAMRAI/tbox/KernelFuserStages.cpp create mode 100644 source/SAMRAI/tbox/ScheduleKernelFuser.cpp create mode 100644 source/SAMRAI/tbox/ScheduleKernelFuser.h create mode 100644 source/SAMRAI/tbox/StagedKernelFusers.cpp rename source/SAMRAI/tbox/{KernelFuserStages.h => StagedKernelFusers.h} (75%) diff --git a/source/SAMRAI/pdat/ArrayData.cpp b/source/SAMRAI/pdat/ArrayData.cpp index e1ef228661..042bd473e2 100644 --- a/source/SAMRAI/pdat/ArrayData.cpp +++ b/source/SAMRAI/pdat/ArrayData.cpp @@ -12,7 +12,7 @@ #define included_pdat_ArrayData_C #include "SAMRAI/tbox/KernelFuser.h" -#include "SAMRAI/tbox/KernelFuserStages.h" +#include "SAMRAI/tbox/ScheduleKernelFuser.h" #include "SAMRAI/tbox/MessageStream.h" #include "SAMRAI/tbox/Utilities.h" #include "SAMRAI/tbox/MathUtilities.h" @@ -303,7 +303,7 @@ void ArrayData::copy( const size_t n = d_offset * d_depth; #if defined(HAVE_RAJA) tbox::KernelFuser* fuser = d_use_fuser ? 
- tbox::KernelFuserStages::getFuserStages()->getDefaultFuser() : nullptr; + tbox::ScheduleKernelFuser::getInstance()->getFuser() : nullptr; if (d_on_host) { hier::host_parallel_for_all(0, n, [=] (int i) { copyop(dst_ptr[i], src_ptr[i]); @@ -499,7 +499,7 @@ void ArrayData::copyDepth( #if defined(HAVE_RAJA) tbox::KernelFuser* fuser = d_use_fuser ? - tbox::KernelFuserStages::getFuserStages()->getDefaultFuser() : nullptr; + tbox::ScheduleKernelFuser::getInstance()->getFuser() : nullptr; if (d_on_host) { hier::host_parallel_for_all(0, d_offset, [=] (int i) { @@ -1019,7 +1019,7 @@ void ArrayData::fillAll( const size_t n = d_depth * d_offset; #if defined(HAVE_RAJA) tbox::KernelFuser* fuser = d_use_fuser ? - tbox::KernelFuserStages::getFuserStages()->getDefaultFuser() : nullptr; + tbox::ScheduleKernelFuser::getInstance()->getFuser() : nullptr; if (d_on_host) { hier::host_parallel_for_all(0, n, [=] (int i) { @@ -1064,7 +1064,7 @@ void ArrayData::fill( if (!d_box.empty()) { #if defined(HAVE_RAJA) tbox::KernelFuser* fuser = d_use_fuser ? - tbox::KernelFuserStages::getFuserStages()->getDefaultFuser() : nullptr; + tbox::ScheduleKernelFuser::getInstance()->getFuser() : nullptr; if (d_on_host) { hier::host_parallel_for_all(0, n, [=] (int i) { @@ -1098,7 +1098,7 @@ void ArrayData::fill( if (!ispace.empty()) { #if defined(HAVE_RAJA) tbox::KernelFuser* fuser = d_use_fuser ? 
- tbox::KernelFuserStages::getFuserStages()->getDefaultFuser() : nullptr; + tbox::ScheduleKernelFuser::getInstance()->getFuser() : nullptr; switch (ispace.getDim().getValue()) { case 1: { diff --git a/source/SAMRAI/pdat/ArrayDataOperationUtilities.cpp b/source/SAMRAI/pdat/ArrayDataOperationUtilities.cpp index 61df70982d..6b8210e4fb 100644 --- a/source/SAMRAI/pdat/ArrayDataOperationUtilities.cpp +++ b/source/SAMRAI/pdat/ArrayDataOperationUtilities.cpp @@ -115,7 +115,7 @@ void ArrayDataOperationUtilities::doArrayDataOperationOnBox( #if defined(HAVE_RAJA) bool use_fuser = dst.useFuser(); tbox::KernelFuser* fuser = use_fuser ? - tbox::KernelFuserStages::getFuserStages()->getDefaultFuser() : nullptr; + tbox::ScheduleKernelFuser::getInstance()->getFuser() : nullptr; #endif /* @@ -321,7 +321,7 @@ void ArrayDataOperationUtilities::doArrayDataBufferOperationOnBox( #if defined(HAVE_RAJA) bool use_fuser = arraydata.useFuser(); tbox::KernelFuser* fuser = use_fuser ? - tbox::KernelFuserStages::getFuserStages()->getDefaultFuser() : nullptr; + tbox::ScheduleKernelFuser::getInstance()->getFuser() : nullptr; #endif /* diff --git a/source/SAMRAI/tbox/CMakeLists.txt b/source/SAMRAI/tbox/CMakeLists.txt index 35489daba9..8027d882f5 100644 --- a/source/SAMRAI/tbox/CMakeLists.txt +++ b/source/SAMRAI/tbox/CMakeLists.txt @@ -28,7 +28,6 @@ set ( tbox_headers InputManager.h IOStream.h KernelFuser.h - KernelFuserStages.h Logger.h MathUtilities.h MathUtilities.cpp @@ -50,9 +49,11 @@ set ( tbox_headers SAMRAI_MPI.h SAMRAIManager.h Schedule.h + ScheduleKernelFuser.h Serializable.h SiloDatabase.h SiloDatabaseFactory.h + StagedKernelFusers.h StartupShutdownManager.h Statistic.h Statistician.h @@ -92,7 +93,6 @@ set (tbox_sources IEEE.cpp InputManager.cpp KernelFuser.cpp - KernelFuserStages.cpp Logger.cpp MathUtilitiesSpecial.cpp MemoryDatabase.cpp @@ -111,9 +111,11 @@ set (tbox_sources SAMRAI_MPI.cpp Scanner.cpp Schedule.cpp + ScheduleKernelFuser.cpp Serializable.cpp SiloDatabase.cpp 
SiloDatabaseFactory.cpp + StagedKernelFusers.cpp StartupShutdownManager.cpp StatTransaction.cpp Statistic.cpp diff --git a/source/SAMRAI/tbox/KernelFuserStages.cpp b/source/SAMRAI/tbox/KernelFuserStages.cpp deleted file mode 100644 index 142d5ae296..0000000000 --- a/source/SAMRAI/tbox/KernelFuserStages.cpp +++ /dev/null @@ -1,63 +0,0 @@ -/************************************************************************* - * - * This file is part of the SAMRAI distribution. For full copyright - * information, see COPYRIGHT and LICENSE. - * - * Copyright: (c) 1997-2021 Lawrence Livermore National Security, LLC - * Description: Singleton kernel fuser - * - ************************************************************************/ - -#include "SAMRAI/tbox/KernelFuserStages.h" - - -namespace SAMRAI { -namespace tbox { - -KernelFuserStages* KernelFuserStages::s_kernel_fuser_stages_instance(nullptr); - -StartupShutdownManager::Handler -KernelFuserStages::s_startup_handler( - 0, - KernelFuserStages::startupCallback, - 0, - 0, - tbox::StartupShutdownManager::priorityArenaManager); - -void -KernelFuserStages::startupCallback() -{ - KernelFuserStages::getFuserStages()->initialize(); -} - -void -KernelFuserStages::shutdownCallback() -{ - if (s_kernel_fuser_stages_instance) { - delete s_kernel_fuser_stages_instance; - } - s_kernel_fuser_stages_instance = nullptr; -} - -KernelFuserStages * -KernelFuserStages::getFuserStages() -{ - if (!s_kernel_fuser_stages_instance) { - s_kernel_fuser_stages_instance = new KernelFuserStages(); - } - return s_kernel_fuser_stages_instance; -} - -KernelFuserStages::~KernelFuserStages() -{ -} - -void -KernelFuserStages::initialize() -{ -} - - -} -} - diff --git a/source/SAMRAI/tbox/Schedule.cpp b/source/SAMRAI/tbox/Schedule.cpp index 3df0417f7a..84d32d7b9c 100644 --- a/source/SAMRAI/tbox/Schedule.cpp +++ b/source/SAMRAI/tbox/Schedule.cpp @@ -10,7 +10,7 @@ #include "SAMRAI/tbox/Schedule.h" #include "SAMRAI/tbox/AllocatorDatabase.h" #include 
"SAMRAI/tbox/InputManager.h" -#include "SAMRAI/tbox/KernelFuserStages.h" +#include "SAMRAI/tbox/ScheduleKernelFuser.h" #include "SAMRAI/tbox/PIO.h" #include "SAMRAI/tbox/SAMRAIManager.h" #include "SAMRAI/tbox/SAMRAI_MPI.h" @@ -111,7 +111,7 @@ Schedule::addTransaction( if ((d_mpi.getRank() == src_id) && (d_mpi.getRank() == dst_id)) { if (fuseable_transaction) { if (!d_local_fuser) { - d_local_fuser = KernelFuserStages::getFuserStages()->getDefaultFuser(); + d_local_fuser = ScheduleKernelFuser::getInstance()->getFuser(); } fuseable_transaction->setKernelFuser(d_local_fuser); d_local_set_fuseable.push_front(fuseable_transaction); @@ -122,7 +122,7 @@ Schedule::addTransaction( if (d_mpi.getRank() == dst_id) { if (fuseable_transaction) { if (!d_recv_fuser) { - d_recv_fuser = KernelFuserStages::getFuserStages()->getDefaultFuser(); + d_recv_fuser = ScheduleKernelFuser::getInstance()->getFuser(); } fuseable_transaction->setKernelFuser(d_recv_fuser); d_recv_sets_fuseable[src_id].push_front(fuseable_transaction); @@ -132,7 +132,7 @@ Schedule::addTransaction( } else if (d_mpi.getRank() == src_id) { if (fuseable_transaction) { if (!d_send_fuser) { - d_send_fuser = KernelFuserStages::getFuserStages()->getDefaultFuser(); + d_send_fuser = ScheduleKernelFuser::getInstance()->getFuser(); } fuseable_transaction->setKernelFuser(d_send_fuser); d_send_sets_fuseable[dst_id].push_front(fuseable_transaction); @@ -163,7 +163,7 @@ Schedule::appendTransaction( if ((d_mpi.getRank() == src_id) && (d_mpi.getRank() == dst_id)) { if (fuseable_transaction) { if (!d_local_fuser) { - d_local_fuser = KernelFuserStages::getFuserStages()->getDefaultFuser(); + d_local_fuser = ScheduleKernelFuser::getInstance()->getFuser(); } fuseable_transaction->setKernelFuser(d_local_fuser); d_local_set_fuseable.push_back(fuseable_transaction); @@ -174,7 +174,7 @@ Schedule::appendTransaction( if (d_mpi.getRank() == dst_id) { if (fuseable_transaction) { if (!d_recv_fuser) { - d_recv_fuser = 
KernelFuserStages::getFuserStages()->getDefaultFuser(); + d_recv_fuser = ScheduleKernelFuser::getInstance()->getFuser(); } fuseable_transaction->setKernelFuser(d_recv_fuser); d_recv_sets_fuseable[src_id].push_back(fuseable_transaction); @@ -184,7 +184,7 @@ Schedule::appendTransaction( } else if (d_mpi.getRank() == src_id) { if (fuseable_transaction) { if (!d_send_fuser) { - d_send_fuser = KernelFuserStages::getFuserStages()->getDefaultFuser(); + d_send_fuser = ScheduleKernelFuser::getInstance()->getFuser(); } fuseable_transaction->setKernelFuser(d_send_fuser); d_send_sets_fuseable[dst_id].push_back(transaction); diff --git a/source/SAMRAI/tbox/ScheduleKernelFuser.cpp b/source/SAMRAI/tbox/ScheduleKernelFuser.cpp new file mode 100644 index 0000000000..3b2883c9bb --- /dev/null +++ b/source/SAMRAI/tbox/ScheduleKernelFuser.cpp @@ -0,0 +1,66 @@ +/************************************************************************* + * + * This file is part of the SAMRAI distribution. For full copyright + * information, see COPYRIGHT and LICENSE. 
+ * + * Copyright: (c) 1997-2021 Lawrence Livermore National Security, LLC + * Description: Singleton kernel fuser + * + ************************************************************************/ + +#include "SAMRAI/tbox/ScheduleKernelFuser.h" + + +namespace SAMRAI { +namespace tbox { + +ScheduleKernelFuser* ScheduleKernelFuser::s_schedule_kernel_fuser_instance(nullptr); + +StartupShutdownManager::Handler +ScheduleKernelFuser::s_startup_handler( + 0, + ScheduleKernelFuser::startupCallback, + 0, + 0, + tbox::StartupShutdownManager::priorityArenaManager); + +void +ScheduleKernelFuser::startupCallback() +{ + ScheduleKernelFuser::getInstance()->initialize(); +} + +void +ScheduleKernelFuser::shutdownCallback() +{ + if (s_schedule_kernel_fuser_instance) { + delete s_schedule_kernel_fuser_instance; + } + s_schedule_kernel_fuser_instance = nullptr; +} + +ScheduleKernelFuser * +ScheduleKernelFuser::getInstance() +{ + if (!s_schedule_kernel_fuser_instance) { + s_schedule_kernel_fuser_instance = new ScheduleKernelFuser(); + } + return s_schedule_kernel_fuser_instance; +} + +ScheduleKernelFuser::~ScheduleKernelFuser() +{ + delete d_kernel_fuser; + d_kernel_fuser = nullptr; +} + +void +ScheduleKernelFuser::initialize() +{ + d_kernel_fuser = new KernelFuser(); +} + + +} +} + diff --git a/source/SAMRAI/tbox/ScheduleKernelFuser.h b/source/SAMRAI/tbox/ScheduleKernelFuser.h new file mode 100644 index 0000000000..9e07f0cd27 --- /dev/null +++ b/source/SAMRAI/tbox/ScheduleKernelFuser.h @@ -0,0 +1,70 @@ +#ifndef included_tbox_ScheduleKernelFuser +#define included_tbox_ScheduleKernelFuser + +#include "SAMRAI/SAMRAI_config.h" + +#include "SAMRAI/tbox/KernelFuser.h" + +#ifdef HAVE_RAJA +#include "RAJA/RAJA.hpp" +#endif + + +namespace SAMRAI { +namespace tbox { + +class ScheduleKernelFuser +{ +public: + + static ScheduleKernelFuser* getInstance(); + +#ifdef HAVE_RAJA + template + void enqueue(int stage, int begin, int end, Kernel&& kernel) { + d_kernel_fuser->enqueue(begin, end, kernel); + } 
+#endif + + void launch() + { + d_kernel_fuser->launch(); + } + + void cleanup() + { + d_kernel_fuser->cleanup(); + } + + KernelFuser* getFuser() + { + return d_kernel_fuser; + } + + void initialize(); + +protected: + ScheduleKernelFuser() + { + } + + virtual ~ScheduleKernelFuser(); + + +private: + + static void startupCallback(); + static void shutdownCallback(); + + static ScheduleKernelFuser* s_schedule_kernel_fuser_instance; + + static StartupShutdownManager::Handler + s_startup_handler; + + KernelFuser* d_kernel_fuser = nullptr; +}; + +} +} + +#endif diff --git a/source/SAMRAI/tbox/StagedKernelFusers.cpp b/source/SAMRAI/tbox/StagedKernelFusers.cpp new file mode 100644 index 0000000000..57d885227c --- /dev/null +++ b/source/SAMRAI/tbox/StagedKernelFusers.cpp @@ -0,0 +1,63 @@ +/************************************************************************* + * + * This file is part of the SAMRAI distribution. For full copyright + * information, see COPYRIGHT and LICENSE. + * + * Copyright: (c) 1997-2021 Lawrence Livermore National Security, LLC + * Description: Singleton kernel fuser + * + ************************************************************************/ + +#include "SAMRAI/tbox/StagedKernelFusers.h" + + +namespace SAMRAI { +namespace tbox { + +StagedKernelFusers* StagedKernelFusers::s_staged_kernel_fusers_instance(nullptr); + +StartupShutdownManager::Handler +StagedKernelFusers::s_startup_handler( + 0, + StagedKernelFusers::startupCallback, + 0, + 0, + tbox::StartupShutdownManager::priorityArenaManager); + +void +StagedKernelFusers::startupCallback() +{ + StagedKernelFusers::getInstance()->initialize(); +} + +void +StagedKernelFusers::shutdownCallback() +{ + if (s_staged_kernel_fusers_instance) { + delete s_staged_kernel_fusers_instance; + } + s_staged_kernel_fusers_instance = nullptr; +} + +StagedKernelFusers * +StagedKernelFusers::getInstance() +{ + if (!s_staged_kernel_fusers_instance) { + s_staged_kernel_fusers_instance = new StagedKernelFusers(); + } + 
return s_staged_kernel_fusers_instance; +} + +StagedKernelFusers::~StagedKernelFusers() +{ +} + +void +StagedKernelFusers::initialize() +{ +} + + +} +} + diff --git a/source/SAMRAI/tbox/KernelFuserStages.h b/source/SAMRAI/tbox/StagedKernelFusers.h similarity index 75% rename from source/SAMRAI/tbox/KernelFuserStages.h rename to source/SAMRAI/tbox/StagedKernelFusers.h index d2a861ea11..c77b6be17c 100644 --- a/source/SAMRAI/tbox/KernelFuserStages.h +++ b/source/SAMRAI/tbox/StagedKernelFusers.h @@ -1,5 +1,5 @@ -#ifndef included_tbox_KernelFuserStages -#define included_tbox_KernelFuserStages +#ifndef included_tbox_StagedKernelFusers +#define included_tbox_StagedKernelFusers #include "SAMRAI/SAMRAI_config.h" @@ -13,11 +13,11 @@ namespace SAMRAI { namespace tbox { -class KernelFuserStages +class StagedKernelFusers { public: - static KernelFuserStages* getFuserStages(); + static StagedKernelFusers* getInstance(); #ifdef HAVE_RAJA template @@ -40,11 +40,6 @@ class KernelFuserStages } } - KernelFuser* getDefaultFuser() - { - return &d_kernel_fusers[-99999]; - } - KernelFuser* getFuser(int stage) { return &d_kernel_fusers[stage]; @@ -63,11 +58,11 @@ class KernelFuserStages void initialize(); protected: - KernelFuserStages() + StagedKernelFusers() { } - virtual ~KernelFuserStages(); + virtual ~StagedKernelFusers(); private: @@ -75,7 +70,7 @@ class KernelFuserStages static void startupCallback(); static void shutdownCallback(); - static KernelFuserStages* s_kernel_fuser_stages_instance; + static StagedKernelFusers* s_staged_kernel_fusers_instance; static StartupShutdownManager::Handler s_startup_handler; From 879e75ebf0c214fab7e243324fb4ee4afcd35a85 Mon Sep 17 00:00:00 2001 From: "Noah S. 
Elliott" Date: Mon, 25 Apr 2022 10:08:51 -0700 Subject: [PATCH 25/34] Add StagedKernelFusers launch/cleanup in RefineSchedule --- source/SAMRAI/xfer/RefineSchedule.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/source/SAMRAI/xfer/RefineSchedule.cpp b/source/SAMRAI/xfer/RefineSchedule.cpp index d030334474..7c75b1bd3a 100644 --- a/source/SAMRAI/xfer/RefineSchedule.cpp +++ b/source/SAMRAI/xfer/RefineSchedule.cpp @@ -29,6 +29,7 @@ #include "SAMRAI/tbox/MathUtilities.h" #include "SAMRAI/tbox/InputManager.h" #include "SAMRAI/tbox/OpenMPUtilities.h" +#include "SAMRAI/tbox/StagedKernelFusers.h" #include "SAMRAI/tbox/StartupShutdownManager.h" #include "SAMRAI/tbox/TimerManager.h" #include "SAMRAI/tbox/Utilities.h" @@ -2866,6 +2867,12 @@ RefineSchedule::refineScratchData( } } + tbox::StagedKernelFusers::getInstance()->launch(); +#if defined(HAVE_RAJA) + tbox::parallel_synchronize(); +#endif + tbox::StagedKernelFusers::getInstance()->cleanup(); + if (d_refine_patch_strategy) { d_refine_patch_strategy->postprocessRefineLevel( *fine_level, From 67ef4610cfd2b84484a30c7797b917cb26f91a51 Mon Sep 17 00:00:00 2001 From: "Noah S. Elliott" Date: Tue, 26 Apr 2022 14:21:58 -0700 Subject: [PATCH 26/34] Stop some synchronize calls between the communicate and refine steps --- source/SAMRAI/xfer/RefineSchedule.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/source/SAMRAI/xfer/RefineSchedule.cpp b/source/SAMRAI/xfer/RefineSchedule.cpp index 7c75b1bd3a..e3908b29d9 100644 --- a/source/SAMRAI/xfer/RefineSchedule.cpp +++ b/source/SAMRAI/xfer/RefineSchedule.cpp @@ -2161,9 +2161,10 @@ RefineSchedule::recursiveFill( */ d_coarse_priority_level_schedule->communicate(); #if defined(HAVE_RAJA) - if (d_coarse_priority_level_schedule->completedTransactions()) { - tbox::parallel_synchronize(); - } +// TODO: Be sure that this sync isn't needed. 
+// if (d_coarse_priority_level_schedule->completedTransactions()) { +// tbox::parallel_synchronize(); +// } #endif /* @@ -2224,7 +2225,8 @@ RefineSchedule::recursiveFill( do_physical_boundary_fill); #if defined(HAVE_RAJA) - tbox::parallel_synchronize(); +// TODO: This sync probably isn't necessary, but keep an eye on it. +// tbox::parallel_synchronize(); #endif /* @@ -2344,9 +2346,10 @@ RefineSchedule::recursiveFill( */ d_fine_priority_level_schedule->communicate(); #if defined(HAVE_RAJA) - if (d_fine_priority_level_schedule->completedTransactions()) { - tbox::parallel_synchronize(); - } +// TODO: Be sure that this sync isn't needed. +// if (d_fine_priority_level_schedule->completedTransactions()) { +// tbox::parallel_synchronize(); +// } #endif /* From 8a8a98e5b8c992cfd90f7f5603a2791ed7c09975 Mon Sep 17 00:00:00 2001 From: "Noah S. Elliott" Date: Tue, 3 May 2022 13:49:26 -0700 Subject: [PATCH 27/34] Change tbox::Schedule to use StagedKernelFusers --- source/SAMRAI/pdat/ArrayData.cpp | 18 ++--- .../pdat/ArrayDataOperationUtilities.cpp | 7 +- source/SAMRAI/tbox/Schedule.cpp | 68 +++++++++---------- source/SAMRAI/tbox/Schedule.h | 7 +- source/SAMRAI/tbox/TransactionFuseable.cpp | 6 +- source/SAMRAI/tbox/TransactionFuseable.h | 10 +-- 6 files changed, 59 insertions(+), 57 deletions(-) diff --git a/source/SAMRAI/pdat/ArrayData.cpp b/source/SAMRAI/pdat/ArrayData.cpp index 042bd473e2..f9e397eb86 100644 --- a/source/SAMRAI/pdat/ArrayData.cpp +++ b/source/SAMRAI/pdat/ArrayData.cpp @@ -12,7 +12,7 @@ #define included_pdat_ArrayData_C #include "SAMRAI/tbox/KernelFuser.h" -#include "SAMRAI/tbox/ScheduleKernelFuser.h" +#include "SAMRAI/tbox/StagedKernelFusers.h" #include "SAMRAI/tbox/MessageStream.h" #include "SAMRAI/tbox/Utilities.h" #include "SAMRAI/tbox/MathUtilities.h" @@ -116,7 +116,7 @@ ArrayData::ArrayData( } #endif -#ifdef DEBUG_INITIALIZE_UNDEFINED +#ifdef DEBUG_IINITIALIZE_UNDEFINED undefineData(); #endif } @@ -149,7 +149,7 @@ ArrayData::ArrayData( } #endif -#ifdef 
DEBUG_INITIALIZE_UNDEFINED +#ifdef DEBUG_IINITIALIZE_UNDEFINED undefineData(); #endif } @@ -303,7 +303,7 @@ void ArrayData::copy( const size_t n = d_offset * d_depth; #if defined(HAVE_RAJA) tbox::KernelFuser* fuser = d_use_fuser ? - tbox::ScheduleKernelFuser::getInstance()->getFuser() : nullptr; + tbox::StagedKernelFusers::getInstance()->getFuser(0) : nullptr; if (d_on_host) { hier::host_parallel_for_all(0, n, [=] (int i) { copyop(dst_ptr[i], src_ptr[i]); @@ -499,7 +499,7 @@ void ArrayData::copyDepth( #if defined(HAVE_RAJA) tbox::KernelFuser* fuser = d_use_fuser ? - tbox::ScheduleKernelFuser::getInstance()->getFuser() : nullptr; + tbox::StagedKernelFusers::getInstance()->getFuser(0) : nullptr; if (d_on_host) { hier::host_parallel_for_all(0, d_offset, [=] (int i) { @@ -1019,7 +1019,7 @@ void ArrayData::fillAll( const size_t n = d_depth * d_offset; #if defined(HAVE_RAJA) tbox::KernelFuser* fuser = d_use_fuser ? - tbox::ScheduleKernelFuser::getInstance()->getFuser() : nullptr; + tbox::StagedKernelFusers::getInstance()->getFuser(0) : nullptr; if (d_on_host) { hier::host_parallel_for_all(0, n, [=] (int i) { @@ -1030,7 +1030,7 @@ void ArrayData::fillAll( ptr[i] = t; }); } -#if defined(DEBUG_INITIALIZE_UNDEFINED) +#if defined(DEBUG_IINITIALIZE_UNDEFINED) tbox::parallel_synchronize(); #endif #else @@ -1064,7 +1064,7 @@ void ArrayData::fill( if (!d_box.empty()) { #if defined(HAVE_RAJA) tbox::KernelFuser* fuser = d_use_fuser ? - tbox::ScheduleKernelFuser::getInstance()->getFuser() : nullptr; + tbox::StagedKernelFusers::getInstance()->getFuser(0) : nullptr; if (d_on_host) { hier::host_parallel_for_all(0, n, [=] (int i) { @@ -1098,7 +1098,7 @@ void ArrayData::fill( if (!ispace.empty()) { #if defined(HAVE_RAJA) tbox::KernelFuser* fuser = d_use_fuser ? 
- tbox::ScheduleKernelFuser::getInstance()->getFuser() : nullptr; + tbox::StagedKernelFusers::getInstance()->getFuser(0) : nullptr; switch (ispace.getDim().getValue()) { case 1: { diff --git a/source/SAMRAI/pdat/ArrayDataOperationUtilities.cpp b/source/SAMRAI/pdat/ArrayDataOperationUtilities.cpp index 6b8210e4fb..0e8fd6db96 100644 --- a/source/SAMRAI/pdat/ArrayDataOperationUtilities.cpp +++ b/source/SAMRAI/pdat/ArrayDataOperationUtilities.cpp @@ -14,10 +14,11 @@ #include "SAMRAI/pdat/ArrayDataOperationUtilities.h" #include "SAMRAI/pdat/ArrayData.h" #include "SAMRAI/hier/ForAll.h" -#include "SAMRAI/tbox/Utilities.h" #include "SAMRAI/pdat/SumOperation.h" #include "SAMRAI/tbox/Collectives.h" #include "SAMRAI/tbox/NVTXUtilities.h" +#include "SAMRAI/tbox/StagedKernelFusers.h" +#include "SAMRAI/tbox/Utilities.h" namespace SAMRAI { @@ -115,7 +116,7 @@ void ArrayDataOperationUtilities::doArrayDataOperationOnBox( #if defined(HAVE_RAJA) bool use_fuser = dst.useFuser(); tbox::KernelFuser* fuser = use_fuser ? - tbox::ScheduleKernelFuser::getInstance()->getFuser() : nullptr; + tbox::StagedKernelFusers::getInstance()->getFuser(0) : nullptr; #endif /* @@ -321,7 +322,7 @@ void ArrayDataOperationUtilities::doArrayDataBufferOperationOnBox( #if defined(HAVE_RAJA) bool use_fuser = arraydata.useFuser(); tbox::KernelFuser* fuser = use_fuser ? 
- tbox::ScheduleKernelFuser::getInstance()->getFuser() : nullptr; + tbox::StagedKernelFusers::getInstance()->getFuser(0) : nullptr; #endif /* diff --git a/source/SAMRAI/tbox/Schedule.cpp b/source/SAMRAI/tbox/Schedule.cpp index 84d32d7b9c..0e2cfa61da 100644 --- a/source/SAMRAI/tbox/Schedule.cpp +++ b/source/SAMRAI/tbox/Schedule.cpp @@ -10,7 +10,7 @@ #include "SAMRAI/tbox/Schedule.h" #include "SAMRAI/tbox/AllocatorDatabase.h" #include "SAMRAI/tbox/InputManager.h" -#include "SAMRAI/tbox/ScheduleKernelFuser.h" +#include "SAMRAI/tbox/StagedKernelFusers.h" #include "SAMRAI/tbox/PIO.h" #include "SAMRAI/tbox/SAMRAIManager.h" #include "SAMRAI/tbox/SAMRAI_MPI.h" @@ -110,10 +110,10 @@ Schedule::addTransaction( if ((d_mpi.getRank() == src_id) && (d_mpi.getRank() == dst_id)) { if (fuseable_transaction) { - if (!d_local_fuser) { - d_local_fuser = ScheduleKernelFuser::getInstance()->getFuser(); + if (!d_local_fusers) { + d_local_fusers = StagedKernelFusers::getInstance(); } - fuseable_transaction->setKernelFuser(d_local_fuser); + fuseable_transaction->setKernelFuser(d_local_fusers); d_local_set_fuseable.push_front(fuseable_transaction); } else { d_local_set.push_front(transaction); @@ -121,20 +121,20 @@ Schedule::addTransaction( } else { if (d_mpi.getRank() == dst_id) { if (fuseable_transaction) { - if (!d_recv_fuser) { - d_recv_fuser = ScheduleKernelFuser::getInstance()->getFuser(); + if (!d_recv_fusers) { + d_recv_fusers = StagedKernelFusers::getInstance(); } - fuseable_transaction->setKernelFuser(d_recv_fuser); + fuseable_transaction->setKernelFuser(d_recv_fusers); d_recv_sets_fuseable[src_id].push_front(fuseable_transaction); } else { d_recv_sets[src_id].push_front(transaction); } } else if (d_mpi.getRank() == src_id) { if (fuseable_transaction) { - if (!d_send_fuser) { - d_send_fuser = ScheduleKernelFuser::getInstance()->getFuser(); + if (!d_send_fusers) { + d_send_fusers = StagedKernelFusers::getInstance(); } - fuseable_transaction->setKernelFuser(d_send_fuser); + 
fuseable_transaction->setKernelFuser(d_send_fusers); d_send_sets_fuseable[dst_id].push_front(fuseable_transaction); } else { d_send_sets[dst_id].push_front(transaction); @@ -162,10 +162,10 @@ Schedule::appendTransaction( if ((d_mpi.getRank() == src_id) && (d_mpi.getRank() == dst_id)) { if (fuseable_transaction) { - if (!d_local_fuser) { - d_local_fuser = ScheduleKernelFuser::getInstance()->getFuser(); + if (!d_local_fusers) { + d_local_fusers = StagedKernelFusers::getInstance(); } - fuseable_transaction->setKernelFuser(d_local_fuser); + fuseable_transaction->setKernelFuser(d_local_fusers); d_local_set_fuseable.push_back(fuseable_transaction); } else { d_local_set.push_back(transaction); @@ -173,20 +173,20 @@ Schedule::appendTransaction( } else { if (d_mpi.getRank() == dst_id) { if (fuseable_transaction) { - if (!d_recv_fuser) { - d_recv_fuser = ScheduleKernelFuser::getInstance()->getFuser(); + if (!d_recv_fusers) { + d_recv_fusers = StagedKernelFusers::getInstance(); } - fuseable_transaction->setKernelFuser(d_recv_fuser); + fuseable_transaction->setKernelFuser(d_recv_fusers); d_recv_sets_fuseable[src_id].push_back(fuseable_transaction); } else { d_recv_sets[src_id].push_back(transaction); } } else if (d_mpi.getRank() == src_id) { if (fuseable_transaction) { - if (!d_send_fuser) { - d_send_fuser = ScheduleKernelFuser::getInstance()->getFuser(); + if (!d_send_fusers) { + d_send_fusers = StagedKernelFusers::getInstance(); } - fuseable_transaction->setKernelFuser(d_send_fuser); + fuseable_transaction->setKernelFuser(d_send_fusers); d_send_sets_fuseable[dst_id].push_back(transaction); } else { d_send_sets[dst_id].push_back(transaction); @@ -473,8 +473,8 @@ Schedule::postSends() for (const auto& transaction : d_send_sets_fuseable[peer_rank]) { transaction->packStream(outgoing_stream); } - if (d_send_fuser && have_fuseable) { - d_send_fuser->launch(); + if (d_send_fusers && have_fuseable) { + d_send_fusers->launch(); } for (const auto& transaction : 
d_send_sets[peer_rank]) { @@ -487,7 +487,7 @@ Schedule::postSends() d_completed_transactions = true; #if defined(HAVE_RAJA) parallel_synchronize(); - if (d_send_fuser) d_send_fuser->cleanup(); + if (d_send_fusers) d_send_fusers->cleanup(); #endif } @@ -546,8 +546,8 @@ Schedule::postSends() for (const auto& transaction : d_send_sets_fuseable[peer_rank]) { transaction->packStream(outgoing_stream); } - if (d_send_fuser && have_fuseable) { - d_send_fuser->launch(); + if (d_send_fusers && have_fuseable) { + d_send_fusers->launch(); } for (const auto& transaction : d_send_sets[peer_rank]) { @@ -559,7 +559,7 @@ Schedule::postSends() d_completed_transactions = true; #if defined(HAVE_RAJA) parallel_synchronize(); - if (d_send_fuser) d_send_fuser->cleanup(); + if (d_send_fusers) d_send_fusers->cleanup(); #endif } @@ -596,8 +596,8 @@ Schedule::performLocalCopies() for (const auto& local : d_local_set_fuseable) { local->copyLocalData(); } - if (d_local_fuser && have_fuseable) { - d_local_fuser->launch(); + if (d_local_fusers && have_fuseable) { + d_local_fusers->launch(); } for (const auto& local : d_local_set) { @@ -610,7 +610,7 @@ Schedule::performLocalCopies() d_completed_transactions = true; #if defined(HAVE_RAJA) parallel_synchronize(); - if (d_local_fuser) d_local_fuser->cleanup(); + if (d_local_fusers) d_local_fusers->cleanup(); #endif } @@ -661,13 +661,13 @@ Schedule::processCompletedCommunications() for (const auto& transaction : d_recv_sets_fuseable[sender]) { transaction->unpackStream(incoming_stream); } - if (d_recv_fuser || have_fuseable) { - d_recv_fuser->launch(); + if (d_recv_fusers && have_fuseable) { + d_recv_fusers->launch(); } #if defined(HAVE_RAJA) if (have_fuseable) { parallel_synchronize(); - if (d_recv_fuser) d_recv_fuser->cleanup(); + if (d_recv_fusers) d_recv_fusers->cleanup(); } #endif for (const auto& transaction : d_recv_sets[sender]) { @@ -725,13 +725,13 @@ Schedule::processCompletedCommunications() for (const auto& transaction :
d_recv_sets_fuseable[sender]) { transaction->unpackStream(incoming_stream); } - if (d_recv_fuser && have_fuseable) { - d_recv_fuser->launch(); + if (d_recv_fusers && have_fuseable) { + d_recv_fusers->launch(); } #if defined(HAVE_RAJA) if (have_fuseable) { parallel_synchronize(); - if (d_recv_fuser) d_recv_fuser->cleanup(); + if (d_recv_fusers) d_recv_fusers->cleanup(); } #endif for (const auto& transaction : d_recv_sets[sender]) { diff --git a/source/SAMRAI/tbox/Schedule.h b/source/SAMRAI/tbox/Schedule.h index 1c232cee50..bd436bef5a 100644 --- a/source/SAMRAI/tbox/Schedule.h +++ b/source/SAMRAI/tbox/Schedule.h @@ -19,6 +19,7 @@ #include "SAMRAI/tbox/Transaction.h" #include "SAMRAI/tbox/TransactionFuseable.h" #include "SAMRAI/tbox/KernelFuser.h" +#include "SAMRAI/tbox/StagedKernelFusers.h" #include #include @@ -352,8 +353,8 @@ class Schedule TransactionSets d_send_sets_fuseable; TransactionSets d_recv_sets_fuseable; - KernelFuser* d_send_fuser{nullptr}; - KernelFuser* d_recv_fuser{nullptr}; + StagedKernelFusers* d_send_fusers{nullptr}; + StagedKernelFusers* d_recv_fusers{nullptr}; bool d_completed_transactions = false; @@ -369,7 +370,7 @@ class Schedule */ std::list > d_local_set_fuseable; - KernelFuser* d_local_fuser{nullptr}; + StagedKernelFusers* d_local_fusers{nullptr}; //@{ @name High-level asynchronous messages passing objects diff --git a/source/SAMRAI/tbox/TransactionFuseable.cpp b/source/SAMRAI/tbox/TransactionFuseable.cpp index aeec68e8f1..f70d83f42f 100644 --- a/source/SAMRAI/tbox/TransactionFuseable.cpp +++ b/source/SAMRAI/tbox/TransactionFuseable.cpp @@ -4,16 +4,16 @@ namespace SAMRAI { namespace tbox { void -TransactionFuseable::setKernelFuser(KernelFuser* fuser) +TransactionFuseable::setKernelFuser(StagedKernelFusers* fuser) { d_fuser = fuser; } -KernelFuser* +StagedKernelFusers* TransactionFuseable::getKernelFuser() { return d_fuser; } } -} \ No newline at end of file +} diff --git a/source/SAMRAI/tbox/TransactionFuseable.h 
b/source/SAMRAI/tbox/TransactionFuseable.h index 79265713cb..686268680b 100644 --- a/source/SAMRAI/tbox/TransactionFuseable.h +++ b/source/SAMRAI/tbox/TransactionFuseable.h @@ -14,7 +14,7 @@ #include "SAMRAI/SAMRAI_config.h" #include "SAMRAI/tbox/Transaction.h" -#include "SAMRAI/tbox/KernelFuser.h" +#include "SAMRAI/tbox/StagedKernelFusers.h" #include @@ -25,14 +25,14 @@ class TransactionFuseable : public Transaction { public: - void setKernelFuser(KernelFuser* fuser); - KernelFuser* getKernelFuser(); + void setKernelFuser(StagedKernelFusers* fuser); + StagedKernelFusers* getKernelFuser(); private: - KernelFuser* d_fuser{nullptr}; + StagedKernelFusers* d_fuser{nullptr}; }; } } -#endif \ No newline at end of file +#endif From f2230e9153a4dbc5bb1b4ff58e9f557849278fb8 Mon Sep 17 00:00:00 2001 From: "Noah S. Elliott" Date: Tue, 3 May 2022 14:03:12 -0700 Subject: [PATCH 28/34] Add optional synchronization around boundary conditions. --- source/SAMRAI/xfer/RefineSchedule.cpp | 27 ++++++++++++++++++- source/SAMRAI/xfer/SingularityPatchStrategy.h | 21 +++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/source/SAMRAI/xfer/RefineSchedule.cpp b/source/SAMRAI/xfer/RefineSchedule.cpp index e3908b29d9..656c284fbd 100644 --- a/source/SAMRAI/xfer/RefineSchedule.cpp +++ b/source/SAMRAI/xfer/RefineSchedule.cpp @@ -2384,6 +2384,9 @@ RefineSchedule::fillPhysicalBoundaries( d_dst_level->setBoundaryBoxes(); if (d_refine_patch_strategy) { +#if defined(HAVE_RAJA) + bool bdry_is_filled = false; +#endif for (hier::PatchLevel::iterator p(d_dst_level->begin()); p != d_dst_level->end(); ++p) { const std::shared_ptr& patch(*p); @@ -2392,8 +2395,17 @@ RefineSchedule::fillPhysicalBoundaries( setPhysicalBoundaryConditions(*patch, fill_time, d_boundary_fill_ghost_width); +#if defined(HAVE_RAJA) + bdry_is_filled = true; +#endif } } +#if defined(HAVE_RAJA) + if (bdry_is_filled && d_refine_patch_strategy->needSynchronize()) { + tbox::parallel_synchronize(); + } +#endif + } 
t_fill_physical_boundaries->stop(); } @@ -2427,7 +2439,9 @@ RefineSchedule::fillSingularityBoundaries( const hier::IntVector& ratio = d_dst_level->getRatioToLevelZero(); if (d_singularity_patch_strategy) { - +#if defined(HAVE_RAJA) + bool sing_is_filled = false; +#endif for (hier::BlockId::block_t bn = 0; bn < grid_geometry->getNumberBlocks(); ++bn) { hier::BlockId block_id(bn); @@ -2476,6 +2490,9 @@ RefineSchedule::fillSingularityBoundaries( d_dst_to_encon, fill_box, nboxes[bb], grid_geometry); +#if defined(HAVE_RAJA) + sing_is_filled = true; +#endif } } } @@ -2503,6 +2520,9 @@ RefineSchedule::fillSingularityBoundaries( d_dst_to_encon, fill_box, eboxes[bb], grid_geometry); +#if defined(HAVE_RAJA) + sing_is_filled = true; +#endif } } } @@ -2511,6 +2531,11 @@ RefineSchedule::fillSingularityBoundaries( } } } +#if defined(HAVE_RAJA) + if (sing_is_filled && d_singularity_patch_strategy->needSynchronize()) { + tbox::parallel_synchronize(); + } +#endif } } t_fill_singularity_boundaries->stop(); diff --git a/source/SAMRAI/xfer/SingularityPatchStrategy.h b/source/SAMRAI/xfer/SingularityPatchStrategy.h index 144a656b05..0124f258f5 100644 --- a/source/SAMRAI/xfer/SingularityPatchStrategy.h +++ b/source/SAMRAI/xfer/SingularityPatchStrategy.h @@ -90,6 +90,27 @@ class SingularityPatchStrategy const hier::BoundaryBox& boundary_box, const std::shared_ptr& grid_geometry) = 0; + bool + needSynchronize() + { + bool flag = d_need_synchronize; + d_need_synchronize = true; + return flag; + } + +protected: + + void + setNeedSingularitySynchronize(bool flag) + { + d_need_synchronize = flag; + } + +private: + + bool d_need_synchronize = true; + + }; } From 8df72609a345c93512da80b570ea22b33b654b05 Mon Sep 17 00:00:00 2001 From: "Noah S. 
Elliott" Date: Fri, 13 May 2022 10:50:57 -0700 Subject: [PATCH 29/34] Add StagedKernelFusers calls to PatchLevel allocate` --- source/SAMRAI/hier/CMakeLists.txt | 4 ++++ source/SAMRAI/hier/PatchLevel.cpp | 2 ++ source/SAMRAI/hier/PatchLevel.h | 11 +++++++++++ source/SAMRAI/tbox/StagedKernelFusers.h | 25 +++++++++++++++++++++++++ 4 files changed, 42 insertions(+) diff --git a/source/SAMRAI/hier/CMakeLists.txt b/source/SAMRAI/hier/CMakeLists.txt index 8cdff48343..1342c6251e 100644 --- a/source/SAMRAI/hier/CMakeLists.txt +++ b/source/SAMRAI/hier/CMakeLists.txt @@ -140,6 +140,10 @@ if (ENABLE_MPI) set (hier_depends ${hier_depends} mpi) endif () +if (ENABLE_CUDA) + set (hier_depends ${hier_depends} cuda) +endif () + blt_add_library( NAME SAMRAI_hier SOURCES ${hier_sources} diff --git a/source/SAMRAI/hier/PatchLevel.cpp b/source/SAMRAI/hier/PatchLevel.cpp index 30f29c3b66..671ba9f5ac 100644 --- a/source/SAMRAI/hier/PatchLevel.cpp +++ b/source/SAMRAI/hier/PatchLevel.cpp @@ -9,7 +9,9 @@ ************************************************************************/ #include "SAMRAI/hier/PatchLevel.h" +#include "SAMRAI/tbox/Collectives.h" #include "SAMRAI/tbox/MathUtilities.h" +#include "SAMRAI/tbox/StagedKernelFusers.h" #include "SAMRAI/tbox/TimerManager.h" #include "SAMRAI/hier/BaseGridGeometry.h" #include "SAMRAI/hier/BoxContainer.h" diff --git a/source/SAMRAI/hier/PatchLevel.h b/source/SAMRAI/hier/PatchLevel.h index eba972eb5a..81f5534d13 100644 --- a/source/SAMRAI/hier/PatchLevel.h +++ b/source/SAMRAI/hier/PatchLevel.h @@ -17,6 +17,7 @@ #include "SAMRAI/hier/BoxLevel.h" #include "SAMRAI/hier/PatchFactory.h" #include "SAMRAI/hier/ProcessorMapping.h" +#include "SAMRAI/tbox/StagedKernelFusers.h" #include "SAMRAI/tbox/Utilities.h" #include @@ -758,6 +759,11 @@ class PatchLevel for (Iterator ip(begin()); ip != end(); ++ip) { ip->allocatePatchData(id, timestamp); } + +#if defined(HAVE_RAJA) + tbox::StagedKernelFusers::getInstance()->launchAndCleanup(); +#endif + } /*! 
@@ -775,6 +781,11 @@ class PatchLevel for (Iterator ip(begin()); ip != end(); ++ip) { ip->allocatePatchData(components, timestamp); } + +#if defined(HAVE_RAJA) + tbox::StagedKernelFusers::getInstance()->launchAndCleanup(); +#endif + } /*! diff --git a/source/SAMRAI/tbox/StagedKernelFusers.h b/source/SAMRAI/tbox/StagedKernelFusers.h index c77b6be17c..42cb239954 100644 --- a/source/SAMRAI/tbox/StagedKernelFusers.h +++ b/source/SAMRAI/tbox/StagedKernelFusers.h @@ -3,6 +3,7 @@ #include "SAMRAI/SAMRAI_config.h" +#include "SAMRAI/tbox/Collectives.h" #include "SAMRAI/tbox/KernelFuser.h" #ifdef HAVE_RAJA @@ -23,6 +24,7 @@ class StagedKernelFusers template void enqueue(int stage, int begin, int end, Kernel&& kernel) { d_kernel_fusers[stage].enqueue(begin, end, kernel); + d_active = true; } #endif @@ -38,6 +40,7 @@ class StagedKernelFusers for (auto& fuser : d_kernel_fusers) { fuser.second.cleanup(); } + d_active = false; } KernelFuser* getFuser(int stage) @@ -48,15 +51,35 @@ class StagedKernelFusers void clearKernelFuser(int stage) { d_kernel_fusers.erase(stage); + if (d_kernel_fusers.empty()) { + d_active = false; + } } void clearAllFusers() { d_kernel_fusers.clear(); + d_active = false; + } + + bool isActive() const + { + return d_active; } void initialize(); + void launchAndCleanup() + { + if (d_active) { + launch(); +#ifdef HAVE_RAJA + tbox::parallel_synchronize(); +#endif + cleanup(); + } + } + protected: StagedKernelFusers() { @@ -76,6 +99,8 @@ class StagedKernelFusers s_startup_handler; std::map d_kernel_fusers; + + bool d_active = false; }; } From 6f6c6bdb63cc6601681be6a654a2d6ec128b75e2 Mon Sep 17 00:00:00 2001 From: "Noah S. 
Elliott" Date: Wed, 18 May 2022 16:11:08 -0700 Subject: [PATCH 30/34] Add cuda dependency in some tests --- source/test/MappedBoxLevelConnectorUtilsTests/CMakeLists.txt | 3 ++- source/test/MappingConnector/CMakeLists.txt | 3 ++- source/test/mblktree/CMakeLists.txt | 3 ++- source/test/sparsedata/CMakeLists.txt | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/source/test/MappedBoxLevelConnectorUtilsTests/CMakeLists.txt b/source/test/MappedBoxLevelConnectorUtilsTests/CMakeLists.txt index 2d329d7d69..a524991ca4 100644 --- a/source/test/MappedBoxLevelConnectorUtilsTests/CMakeLists.txt +++ b/source/test/MappedBoxLevelConnectorUtilsTests/CMakeLists.txt @@ -8,7 +8,8 @@ blt_add_executable( SAMRAI_hier SAMRAI_geom SAMRAI_mesh - SAMRAI_tbox) + SAMRAI_tbox + cuda) target_compile_definitions(mblcu PUBLIC TESTING=1) diff --git a/source/test/MappingConnector/CMakeLists.txt b/source/test/MappingConnector/CMakeLists.txt index 4359563198..2b7e2386eb 100644 --- a/source/test/MappingConnector/CMakeLists.txt +++ b/source/test/MappingConnector/CMakeLists.txt @@ -8,7 +8,8 @@ blt_add_executable( SAMRAI_hier SAMRAI_geom SAMRAI_mesh - SAMRAI_tbox) + SAMRAI_tbox + cuda) target_compile_definitions(mapping-connector PUBLIC TESTING=1) diff --git a/source/test/mblktree/CMakeLists.txt b/source/test/mblktree/CMakeLists.txt index 4abe346d13..88e6349abe 100644 --- a/source/test/mblktree/CMakeLists.txt +++ b/source/test/mblktree/CMakeLists.txt @@ -5,7 +5,8 @@ set (mblktree_depends_on SAMRAI_hier SAMRAI_geom SAMRAI_mesh - SAMRAI_tbox) + SAMRAI_tbox + cuda) blt_add_executable( NAME mblktree diff --git a/source/test/sparsedata/CMakeLists.txt b/source/test/sparsedata/CMakeLists.txt index f14417c13c..f61867c5ab 100644 --- a/source/test/sparsedata/CMakeLists.txt +++ b/source/test/sparsedata/CMakeLists.txt @@ -23,7 +23,8 @@ blt_add_executable( SAMRAI_tbox SAMRAI_hier SAMRAI_pdat - SAMRAI_geom) + SAMRAI_geom + cuda) target_compile_definitions(sparse PUBLIC TESTING=1) From 
639767749020a8167ef699a581cedcd70c2676ea Mon Sep 17 00:00:00 2001 From: "Noah S. Elliott" Date: Tue, 24 May 2022 09:25:26 -0700 Subject: [PATCH 31/34] Add kernel fusion calls to PatchLevel deallocate and CoarsenSchedule --- source/SAMRAI/hier/PatchLevel.h | 8 ++++++ source/SAMRAI/xfer/CoarsenPatchStrategy.h | 6 ++++ source/SAMRAI/xfer/CoarsenSchedule.cpp | 35 ++++++++++++++++------- source/SAMRAI/xfer/RefineSchedule.cpp | 4 --- 4 files changed, 38 insertions(+), 15 deletions(-) diff --git a/source/SAMRAI/hier/PatchLevel.h b/source/SAMRAI/hier/PatchLevel.h index 81f5534d13..595db02210 100644 --- a/source/SAMRAI/hier/PatchLevel.h +++ b/source/SAMRAI/hier/PatchLevel.h @@ -823,6 +823,10 @@ class PatchLevel for (Iterator ip(begin()); ip != end(); ++ip) { ip->deallocatePatchData(id); } + +#if defined(HAVE_RAJA) + tbox::StagedKernelFusers::getInstance()->launchAndCleanup(); +#endif } /*! @@ -840,6 +844,10 @@ class PatchLevel for (Iterator ip(begin()); ip != end(); ++ip) { ip->deallocatePatchData(components); } + +#if defined(HAVE_RAJA) + tbox::StagedKernelFusers::getInstance()->launchAndCleanup(); +#endif } /*! diff --git a/source/SAMRAI/xfer/CoarsenPatchStrategy.h b/source/SAMRAI/xfer/CoarsenPatchStrategy.h index 03f20ee206..b282db4d46 100644 --- a/source/SAMRAI/xfer/CoarsenPatchStrategy.h +++ b/source/SAMRAI/xfer/CoarsenPatchStrategy.h @@ -139,6 +139,12 @@ class CoarsenPatchStrategy const hier::Box& coarse_box, const hier::IntVector& ratio) = 0; + virtual void + setPostCoarsenSyncFlag() + { + setNeedCoarsenSynchronize(true); + } + /*! * @brief Check flag for if host-device synchronization is needed. 
* diff --git a/source/SAMRAI/xfer/CoarsenSchedule.cpp b/source/SAMRAI/xfer/CoarsenSchedule.cpp index 6e5775ecaa..378e21bc09 100644 --- a/source/SAMRAI/xfer/CoarsenSchedule.cpp +++ b/source/SAMRAI/xfer/CoarsenSchedule.cpp @@ -313,9 +313,9 @@ CoarsenSchedule::coarsenData() const d_schedule->communicate(); #if defined(HAVE_RAJA) - if (d_schedule->completedTransactions()) { - tbox::parallel_synchronize(); - } +// if (d_schedule->completedTransactions()) { +// tbox::parallel_synchronize(); +// } #endif /* @@ -1043,24 +1043,37 @@ CoarsenSchedule::coarsenSourceData( need_sync = true; } } + #if defined(HAVE_RAJA) - if (need_sync) { - tbox::parallel_synchronize(); - } +// if (need_sync) { +// tbox::parallel_synchronize(); +// } #endif if (patch_strategy) { + d_coarsen_patch_strategy->setPostCoarsenSyncFlag(); +#if defined(HAVE_RAJA) + if (d_coarsen_patch_strategy->needSynchronize()) { + printf("POST COARSEN SYNCHRONIZE\n"); + tbox::parallel_synchronize(); + } +#endif + patch_strategy->postprocessCoarsen(*temp_patch, *fine_patch, box, block_ratio); -#if defined(HAVE_RAJA) - if (patch_strategy->needSynchronize()) { - tbox::parallel_synchronize(); - } -#endif } } + + tbox::StagedKernelFusers::getInstance()->launch(); +#if defined(HAVE_RAJA) + if (patch_strategy->needSynchronize() || tbox::StagedKernelFusers::getInstance()->isActive()) { + tbox::parallel_synchronize(); + } +#endif + tbox::StagedKernelFusers::getInstance()->cleanup(); + } /* diff --git a/source/SAMRAI/xfer/RefineSchedule.cpp b/source/SAMRAI/xfer/RefineSchedule.cpp index 656c284fbd..023f1b4b3d 100644 --- a/source/SAMRAI/xfer/RefineSchedule.cpp +++ b/source/SAMRAI/xfer/RefineSchedule.cpp @@ -2763,8 +2763,6 @@ RefineSchedule::refineScratchData( #endif } - bool need_sync = false; - for (size_t iri = 0; iri < d_number_refine_items; ++iri) { const RefineClasses::Data * const ref_item = d_refine_items[iri]; if (ref_item->d_oprefine) { @@ -2777,8 +2775,6 @@ RefineSchedule::refineScratchData( 
ref_item->d_oprefine->refine(*fine_patch, *crse_patch, scratch_id, scratch_id, *refine_overlap, local_ratio); - need_sync = true; - } } From 41a65a697c55b7ae109b45410ff2a5688a023964 Mon Sep 17 00:00:00 2001 From: "Noah S. Elliott" Date: Thu, 7 Jul 2022 13:09:29 -0700 Subject: [PATCH 32/34] Revise StagedKernelFusers, remove isActive, add check on whether it launched kernels. --- source/SAMRAI/tbox/StagedKernelFusers.h | 24 +++++++++++------------- source/SAMRAI/xfer/CoarsenSchedule.cpp | 2 +- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/source/SAMRAI/tbox/StagedKernelFusers.h b/source/SAMRAI/tbox/StagedKernelFusers.h index 42cb239954..919886b33d 100644 --- a/source/SAMRAI/tbox/StagedKernelFusers.h +++ b/source/SAMRAI/tbox/StagedKernelFusers.h @@ -24,7 +24,6 @@ class StagedKernelFusers template void enqueue(int stage, int begin, int end, Kernel&& kernel) { d_kernel_fusers[stage].enqueue(begin, end, kernel); - d_active = true; } #endif @@ -32,6 +31,7 @@ class StagedKernelFusers { for (auto& fuser : d_kernel_fusers) { fuser.second.launch(); + d_launched = (d_launched || fuser.second.launched()); } } @@ -40,7 +40,7 @@ class StagedKernelFusers for (auto& fuser : d_kernel_fusers) { fuser.second.cleanup(); } - d_active = false; + d_launched = false; } KernelFuser* getFuser(int stage) @@ -51,33 +51,30 @@ class StagedKernelFusers void clearKernelFuser(int stage) { d_kernel_fusers.erase(stage); - if (d_kernel_fusers.empty()) { - d_active = false; - } } void clearAllFusers() { d_kernel_fusers.clear(); - d_active = false; + d_launched = false; } - bool isActive() const + bool launched() { - return d_active; + return d_launched; } void initialize(); void launchAndCleanup() { - if (d_active) { - launch(); + launch(); #ifdef HAVE_RAJA + if (d_launched) { tbox::parallel_synchronize(); -#endif - cleanup(); } +#endif + cleanup(); } protected: @@ -100,7 +97,8 @@ class StagedKernelFusers std::map d_kernel_fusers; - bool d_active = false; + bool d_launched = false; + 
}; } diff --git a/source/SAMRAI/xfer/CoarsenSchedule.cpp b/source/SAMRAI/xfer/CoarsenSchedule.cpp index 378e21bc09..5b2f590c91 100644 --- a/source/SAMRAI/xfer/CoarsenSchedule.cpp +++ b/source/SAMRAI/xfer/CoarsenSchedule.cpp @@ -1068,7 +1068,7 @@ CoarsenSchedule::coarsenSourceData( tbox::StagedKernelFusers::getInstance()->launch(); #if defined(HAVE_RAJA) - if (patch_strategy->needSynchronize() || tbox::StagedKernelFusers::getInstance()->isActive()) { + if (patch_strategy->needSynchronize() || tbox::StagedKernelFusers::getInstance()->launched()) { tbox::parallel_synchronize(); } #endif From cb6fa43d4e81d3fa39315d8d6e2d2c139165a4a8 Mon Sep 17 00:00:00 2001 From: Noah Elliott Date: Thu, 21 Jul 2022 16:23:15 -0700 Subject: [PATCH 33/34] Remove stray printf --- source/SAMRAI/xfer/CoarsenSchedule.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/source/SAMRAI/xfer/CoarsenSchedule.cpp b/source/SAMRAI/xfer/CoarsenSchedule.cpp index 5b2f590c91..a07a0c6471 100644 --- a/source/SAMRAI/xfer/CoarsenSchedule.cpp +++ b/source/SAMRAI/xfer/CoarsenSchedule.cpp @@ -1054,7 +1054,6 @@ CoarsenSchedule::coarsenSourceData( d_coarsen_patch_strategy->setPostCoarsenSyncFlag(); #if defined(HAVE_RAJA) if (d_coarsen_patch_strategy->needSynchronize()) { - printf("POST COARSEN SYNCHRONIZE\n"); tbox::parallel_synchronize(); } #endif From 4837d89d875db902d6a6459e3668bc19250c928a Mon Sep 17 00:00:00 2001 From: "Noah S. 
Elliott" Date: Wed, 9 Nov 2022 16:05:45 -0800 Subject: [PATCH 34/34] Small fixes to work with RAJA 2022.03 --- source/SAMRAI/pdat/ArrayData.cpp | 6 +++--- source/SAMRAI/pdat/ArrayView.h | 12 ++++++------ source/SAMRAI/xfer/CoarsenSchedule.cpp | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/source/SAMRAI/pdat/ArrayData.cpp b/source/SAMRAI/pdat/ArrayData.cpp index f9e397eb86..33733c826b 100644 --- a/source/SAMRAI/pdat/ArrayData.cpp +++ b/source/SAMRAI/pdat/ArrayData.cpp @@ -116,7 +116,7 @@ ArrayData::ArrayData( } #endif -#ifdef DEBUG_IINITIALIZE_UNDEFINED +#ifdef DEBUG_INITIALIZE_UNDEFINED undefineData(); #endif } @@ -149,7 +149,7 @@ ArrayData::ArrayData( } #endif -#ifdef DEBUG_IINITIALIZE_UNDEFINED +#ifdef DEBUG_INITIALIZE_UNDEFINED undefineData(); #endif } @@ -1030,7 +1030,7 @@ void ArrayData::fillAll( ptr[i] = t; }); } -#if defined(DEBUG_IINITIALIZE_UNDEFINED) +#if defined(DEBUG_INITIALIZE_UNDEFINED) tbox::parallel_synchronize(); #endif #else diff --git a/source/SAMRAI/pdat/ArrayView.h b/source/SAMRAI/pdat/ArrayView.h index d649e817d3..8f3c6d2bcf 100644 --- a/source/SAMRAI/pdat/ArrayView.h +++ b/source/SAMRAI/pdat/ArrayView.h @@ -70,7 +70,7 @@ struct ArrayView<1, TYPE> : public RAJA::View{ {box.lower()[0]} }, - std::array{ {box.upper()[0]} }, + std::array{ {box.upper()[0]+1} }, RAJA::as_array::get())){} }; @@ -84,7 +84,7 @@ struct ArrayView<2, TYPE> : public RAJA::View{ {box.lower()[0], box.lower()[1]} }, - std::array{ {box.upper()[0], box.upper()[1]} }, + std::array{ {box.upper()[0]+1, box.upper()[1]+1} }, RAJA::as_array::get())){} }; @@ -98,7 +98,7 @@ struct ArrayView<3, TYPE> : public RAJA::View{ {box.lower()[0], box.lower()[1], box.lower()[2]} }, - std::array{ {box.upper()[0], box.upper()[1], box.upper()[2]} }, + std::array{ {box.upper()[0]+1, box.upper()[1]+1, box.upper()[2]+1} }, RAJA::as_array::get())){}; }; @@ -112,7 +112,7 @@ struct ArrayView<1, const TYPE> : public RAJA::View{ {box.lower()[0]} }, - std::array{ {box.upper()[0]} }, + 
std::array{ {box.upper()[0]+1} }, RAJA::as_array::get())){} }; @@ -126,7 +126,7 @@ struct ArrayView<2, const TYPE> : public RAJA::View{ {box.lower()[0], box.lower()[1]} }, - std::array{ {box.upper()[0], box.upper()[1]} }, + std::array{ {box.upper()[0]+1, box.upper()[1]+1} }, RAJA::as_array::get())){} }; @@ -141,7 +141,7 @@ struct ArrayView<3, const TYPE> : public RAJA::View{ {box.lower()[0], box.lower()[1], box.lower()[2]} }, - std::array{ {box.upper()[0], box.upper()[1], box.upper()[2]} }, + std::array{ {box.upper()[0]+1, box.upper()[1]+1, box.upper()[2]+1} }, RAJA::as_array::get())){}; }; diff --git a/source/SAMRAI/xfer/CoarsenSchedule.cpp b/source/SAMRAI/xfer/CoarsenSchedule.cpp index a07a0c6471..c7c19d7f0d 100644 --- a/source/SAMRAI/xfer/CoarsenSchedule.cpp +++ b/source/SAMRAI/xfer/CoarsenSchedule.cpp @@ -1067,7 +1067,7 @@ CoarsenSchedule::coarsenSourceData( tbox::StagedKernelFusers::getInstance()->launch(); #if defined(HAVE_RAJA) - if (patch_strategy->needSynchronize() || tbox::StagedKernelFusers::getInstance()->launched()) { + if (!patch_strategy || patch_strategy->needSynchronize() || tbox::StagedKernelFusers::getInstance()->launched()) { tbox::parallel_synchronize(); } #endif