Skip to content

Commit

Permalink
lighthouse/quorum: avoid split brain and add shrink_only support (#71)
Browse files (browse the repository at this point in the history)
  • Loading branch information
d4l3k authored Jan 15, 2025
1 parent 97ad397 commit 79572e6
Show file tree
Hide file tree
Showing 7 changed files with 261 additions and 61 deletions.
2 changes: 2 additions & 0 deletions proto/torchft.proto
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ message QuorumMember {
string store_address = 3;
int64 step = 4;
uint64 world_size = 5;
bool shrink_only = 6;
}

message Quorum {
Expand Down Expand Up @@ -72,6 +73,7 @@ message ManagerQuorumRequest {
int64 rank = 1;
int64 step = 2;
string checkpoint_server_addr = 3;
bool shrink_only = 4;
}

message ManagerQuorumResponse {
Expand Down
4 changes: 3 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,20 +105,22 @@ impl ManagerClient {
})
}

#[pyo3(signature = (rank, step, checkpoint_server_addr, timeout=None))]
#[pyo3(signature = (rank, step, checkpoint_server_addr, shrink_only, timeout=None))]
fn quorum(
&mut self,
py: Python<'_>,
rank: i64,
step: i64,
checkpoint_server_addr: String,
shrink_only: bool,
timeout: Option<Duration>,
) -> Result<(i64, i64, i64, String, String, i64, Option<i64>, i64, bool), StatusError> {
py.allow_threads(move || {
let mut request = tonic::Request::new(ManagerQuorumRequest {
rank: rank,
step: step,
checkpoint_server_addr: checkpoint_server_addr,
shrink_only: shrink_only,
});
// This notifies the server about the timeout but doesn't affect the
// endpoint timeout which we set on client creation.
Expand Down
Loading

0 comments on commit 79572e6

Please sign in to comment.