Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resilience features #2149

Draft
wants to merge 6 commits into
base: develop
Choose a base branch
from
Draft
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Checkpoint bugfixes, more asynchrony in recovery, support checkpointing objgroups
Matthew-Whitlock committed May 16, 2023
commit ffef26255cf7c3b8fab3ef652cc36a65727d9379
4 changes: 3 additions & 1 deletion src/vt/elm/elm_lb_data.h
Original file line number Diff line number Diff line change
@@ -47,6 +47,7 @@
#include "vt/elm/elm_id.h"
#include "vt/elm/elm_comm.h"
#include "vt/timing/timing.h"
#include "vt/vrt/vrt_common.h"

namespace vt { namespace vrt { namespace collection { namespace balance {

@@ -103,7 +104,8 @@ struct ElementLBData {
void serialize(Serializer& s) {
s | cur_time_started_;
s | cur_time_;
s | cur_phase_;
if(!s.hasTraits(vt::vrt::CheckpointInternalTrait()))
s | cur_phase_;
s | phase_timings_;
s | phase_comm_;
s | cur_subphase_;
22 changes: 22 additions & 0 deletions src/vt/objgroup/proxy/proxy_objgroup.h
Original file line number Diff line number Diff line change
@@ -60,6 +60,8 @@
#include "vt/messaging/pending_send.h"
#include "vt/utils/fntraits/fntraits.h"

#include "vt/vrt/vrt_common.h"

namespace vt { namespace objgroup { namespace proxy {

/**
@@ -365,7 +367,27 @@ struct Proxy {

template <typename Serializer>
void serialize(Serializer& s) {
auto old_proxy = proxy_;
s | proxy_;

using vt::vrt::CheckpointTrait;
using vt::vrt::CheckpointInternalTrait;

if constexpr(s.hasTraits(CheckpointTrait())){
vtAssert(old_proxy != no_obj_group, "ObjGroups must be pre-instantiated to be checkpointed or restored");
vtAssert(old_proxy == proxy_, "The proxy ID bits of this ObjGroup do not match the ID found in the checkpoint!" \
" Varying IDs is not yet supported.");
auto objPtr = get();

bool null = objPtr == nullptr;
s | null;

if(!null){
auto newS = s.withoutTraits(CheckpointTrait())
.withTraits(CheckpointInternalTrait());
newS | *objPtr;
}
}
}

private:
4 changes: 2 additions & 2 deletions src/vt/vrt/collection/collection_builder.impl.h
Original file line number Diff line number Diff line change
@@ -193,10 +193,10 @@ void CollectionManager::makeCollectionImpl(param::ConstructParams<ColT>& po) {
makeCollectionElement<ColT>(proxy, idx, this_node, std::move(c));
}

if (global_constructed_elms != 0) {
//if (global_constructed_elms != 0) {
// Construct an underlying group for the collection
constructGroup<ColT>(proxy);
}
//}
}

template <typename ColT, typename Callable>
19 changes: 2 additions & 17 deletions src/vt/vrt/collection/manager.h
Original file line number Diff line number Diff line change
@@ -62,6 +62,7 @@
#include "vt/vrt/collection/dispatch/registry.h"
#include "vt/vrt/collection/listener/listen_events.h"
#include "vt/vrt/proxy/collection_proxy.h"
#include "vt/vrt/proxy/collection_elm_proxy.h"
#include "vt/topos/mapping/mapping_headers.h"
#include "vt/messaging/message.h"
#include "vt/messaging/pending_send.h"
@@ -1640,23 +1641,6 @@ struct CollectionManager
*
* \brief Migrate local element, potentially requested by remote location
*/
template <
typename ColT,
typename MsgT = vt::Message
>
struct MigrateRequestMsg : MsgT {
using ElmT = VrtElmProxy<ColT, typename ColT::IndexType>;

MigrateRequestMsg() = default;
MigrateRequestMsg(
ElmT proxy_elm, NodeType to_node
) : to_node_(to_node),
proxy_elm_(proxy_elm)
{ }

NodeType to_node_ = uninitialized_destination;
ElmT proxy_elm_;
};

/**
* \brief Migrate a remote proxy element to a node, by messaging that
@@ -1834,6 +1818,7 @@ struct CollectionManager
#include "vt/vrt/collection/types/base.impl.h"
#include "vt/rdmahandle/manager.collection.impl.h"
#include "vt/vrt/proxy/collection_proxy.impl.h"
#include "vt/vrt/proxy/collection_elm_proxy.impl.h"
#include "vt/context/runnable_context/lb_data.impl.h"
#include "vt/context/runnable_context/collection.impl.h"

14 changes: 6 additions & 8 deletions src/vt/vrt/collection/manager.impl.h
Original file line number Diff line number Diff line change
@@ -2217,26 +2217,24 @@ void CollectionManager::checkpointToFile(
namespace detail {
template <typename ColT>
inline void MigrateRequestHandler (
CollectionManager::MigrateRequestMsg<ColT>* msg, ColT*
ColT*, VrtElmProxy<ColT, typename ColT::IndexType> proxy_elm, NodeType dest
) {
auto node = msg->to_node_;
auto proxy_elm = msg->proxy_elm_;
theCollection()->migrate(proxy_elm, node);
theCollection()->migrate(proxy_elm, dest);
}
} /* end namespace detail */

template <typename ColT>
EpochType CollectionManager::requestMigrateDeferred(
VrtElmProxy<ColT, typename ColT::IndexType> proxy_elem, NodeType destination
VrtElmProxy<ColT, typename ColT::IndexType> proxy_elm, NodeType destination
) {
auto ep = theTerm()->makeEpochRooted(
"Request element migration", term::UseDS{true}
);
theMsg()->pushEpoch(ep);

proxy_elem.template send<
MigrateRequestMsg<ColT>, detail::MigrateRequestHandler<ColT>
>(proxy_elem, destination);
proxy_elm.template send<detail::MigrateRequestHandler<ColT>>(
proxy_elm, destination
);

theMsg()->popEpoch(ep);
theTerm()->finishedEpoch(ep);
1 change: 1 addition & 0 deletions src/vt/vrt/collection/types/migratable.impl.h
Original file line number Diff line number Diff line change
@@ -45,6 +45,7 @@
#define INCLUDED_VT_VRT_COLLECTION_TYPES_MIGRATABLE_IMPL_H

#include "vt/config.h"
#include "vt/vrt/vrt_common.h"
#include "vt/vrt/collection/types/migratable.h"

namespace vt { namespace vrt { namespace collection {
17 changes: 11 additions & 6 deletions src/vt/vrt/proxy/collection_elm_proxy.h
Original file line number Diff line number Diff line change
@@ -50,6 +50,7 @@
#include "vt/vrt/collection/send/sendable.h"
#include "vt/vrt/collection/insert/insertable.h"
#include "vt/vrt/proxy/base_elm_proxy.h"
#include "vt/vrt/vrt_common.h"

#include <iosfwd>

@@ -90,12 +91,16 @@ struct VrtElmProxy : ProxyCollectionElmTraits<ColT, IndexT> {
std::ostream& os, VrtElmProxy<ColU,IndexU> const& vrt
);

template <typename Ser>
void serialize(DefaultSerializer<Ser>& s);
template <typename Ser>
void serialize(CheckpointSerializer<Ser>& s);
template <typename Ser>
std::unique_ptr<ColT> deserializeToElm(Ser& s);


template <typename SerT, typename SerT::has_not_traits_t<CheckpointTrait>* = nullptr>
void serialize(SerT& s);

template <typename SerT, typename SerT::has_traits_t<CheckpointTrait>* = nullptr>
void serialize(SerT& s);

template <typename SerT>
std::unique_ptr<ColT> deserializeToElm(SerT& s);
};


81 changes: 50 additions & 31 deletions src/vt/vrt/proxy/collection_elm_proxy.impl.h
Original file line number Diff line number Diff line change
@@ -41,8 +41,8 @@
//@HEADER
*/

#if !defined INCLUDED_VT_VRT_PROXY_COLLECTION_ELM_PROXY_H
#define INCLUDED_VT_VRT_PROXY_COLLECTION_ELM_PROXY_H
#if !defined INCLUDED_VT_VRT_PROXY_COLLECTION_ELM_PROXY_IMPL_H
#define INCLUDED_VT_VRT_PROXY_COLLECTION_ELM_PROXY_IMPL_H

#include "vt/config.h"
#include "vt/vrt/collection/manager.h"
@@ -52,57 +52,76 @@ namespace vt { namespace vrt { namespace collection {

//Standard serialize, just pass along to base.
template <typename ColT, typename IndexT>
template <typename Ser>
void VrtElmProxy<ColT, IndexT>::serialize(DefaultSerializer<Ser>& s) {
template <typename SerT, typename SerT::has_not_traits_t<CheckpointTrait>*>
void VrtElmProxy<ColT, IndexT>::serialize(SerT& s) {
ProxyCollectionElmTraits<ColT, IndexT>::serialize(s);
}

//Checkpoint serialize, actually serialize the element itself.
template <typename ColT, typename IndexT>
template <typename Ser>
void VrtElmProxy<ColT, IndexT>::serialize(CheckpointSerializer<Ser>& s) {
template <typename SerT, typename SerT::has_traits_t<CheckpointTrait>*>
void VrtElmProxy<ColT, IndexT>::serialize(SerT& s) {
ProxyCollectionElmTraits<ColT, IndexT>::serialize(s);

//Make sure proxies within the element don't also try recovering
auto elm_serializer = checkpoint::withoutTrait<vt::vrt::CheckpointTrait>(s);

auto local_elm_ptr = this->tryGetLocalPtr();
if(local_elm_ptr != nullptr){
local_elm_ptr | elm_serializer;
} else {
//The element is somewhere else so we'll need to request a migration to here.
vtAssert(!s.isUnpacking(), "Must serialize elements from the node they are at");

//Avoid delaying the serializer though, we want to enable asynchronous progress.
std::unique_ptr<ColT> new_elm_ptr;
new_elm_ptr | elm_serializer;
vtAssert(local_elm_ptr != nullptr || s.isUnpacking(), "Must serialize/size elements from the node they are at");

//Traits for nested serialize/deserialize
using CheckpointlessTraits = typename SerT::Traits::without<CheckpointTrait>::with<CheckpointInternalTrait>;

//Weird nested serialization to enable asynchronous deserializing w/o changing semantics.
if(!(s.isPacking() || s.isUnpacking())){
int size = checkpoint::getSize(*local_elm_ptr);
s.contiguousBytes(nullptr, 1, size);
} else if(s.isPacking()){
auto serialized_elm = checkpoint::serialize<CheckpointlessTraits>(*local_elm_ptr);
int size = serialized_elm->getSize();
s | size;
s.contiguousBytes(serialized_elm->getBuffer(), 1, size);
} else if(s.isUnpacking()){
int size;
s | size;

auto ep = theCollection()->requestMigrateDeferred(*this, theContext()->getNode());
auto buf = std::make_unique<char[]>(size);
s.contiguousBytes(buf.get(), 1, size);

if(local_elm_ptr != nullptr){
checkpoint::deserializeInPlace<CheckpointlessTraits>(buf.get(), local_elm_ptr);
} else {
//The element is somewhere else so we'll need to request a migration to here.
auto ep = theCollection()->requestMigrateDeferred(*this, theContext()->getNode());

theTerm()->addAction(ep, [*this, new_elm_ptr = std::move(new_elm_ptr)]{
auto local_elm_ptr = *this.tryGetLocalPtr();
assert(local_elm_ptr != nullptr);
local_elm_ptr = std::move(new_elm_ptr);
});
theTerm()->addActionUnique(ep, std::move([elm_proxy = *this, buffer = std::move(buf)]{
auto elm_ptr = elm_proxy.tryGetLocalPtr();
assert(elm_ptr != nullptr);
checkpoint::deserializeInPlace<CheckpointlessTraits>(buffer.get(), elm_ptr);
}));
}
}
}

//Deserialize without placing values into the runtime,
//just return the element pointer.
template <typename ColT, typename IndexT>
template <typename Ser>
template <typename SerT>
std::unique_ptr<ColT>
VrtElmProxy<ColT, IndexT>::deserializeToElm(Ser& s) {
//Still have to hit data in order.
VrtElmProxy<ColT, IndexT>::deserializeToElm(SerT& s) {
//Still have to hit data in the same order.
ProxyCollectionElmTraits<ColT, IndexT>::serialize(s);

//Make sure proxies within the element don't also try recovering
auto elm_serializer = checkpoint::withoutTrait<vt::vrt::CheckpointTrait>(s);
int size;
s | size;
auto buf = std::make_unique<char[]>(size);
s.contiguousBytes(buf.get(), 1, size);

std::unique_ptr<ColT> elm(new ColT());

using CheckpointlessTraits = typename SerT::Traits::without<CheckpointTrait>::with<CheckpointInternalTrait>;
checkpoint::deserializeInPlace<CheckpointlessTraits>(buf.get(), elm.get());

std::unique_ptr<ColT> elm;
elm | elm_serializer;
return elm;
}

}}} /* end namespace vt::vrt::collection */

#endif /*INCLUDED_VT_VRT_PROXY_COLLECTION_ELM_PROXY_IMPL_H*/
10 changes: 1 addition & 9 deletions src/vt/vrt/proxy/collection_proxy.h
Original file line number Diff line number Diff line change
@@ -118,15 +118,7 @@ struct CollectionProxy : ProxyCollectionTraits<ColT, IndexT> {

//Serialize normally
template <typename SerializerT>
using DefaultSerializer = vt::vrt::DefaultSerializer<SerializerT>;
template <typename T>
void serialize(typename DefaultSerializer<T>::type& s);

//Serialize for checkpoint/recovery
template <typename SerializerT>
using CheckpointSerializer = vt::vrt::CheckpointSerializer<SerializerT>;
template <typename T>
void serialize(typename CheckpointSerializer<T>::type& s);
void serialize(SerializerT& s);
};

}}} /* end namespace vt::vrt::collection */
Loading