From c7e2c27c97bbf12033db89ed713166c0936141d5 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Thu, 26 Oct 2023 17:03:33 +0800 Subject: [PATCH 1/2] checker: add disconnected check when fix orphan peers (#7240) close tikv/pd#7249 Signed-off-by: lhy1024 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> Signed-off-by: lhy1024 --- server/schedule/checker/rule_checker.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/server/schedule/checker/rule_checker.go b/server/schedule/checker/rule_checker.go index af3f28839ba..1898012c19d 100644 --- a/server/schedule/checker/rule_checker.go +++ b/server/schedule/checker/rule_checker.go @@ -422,6 +422,13 @@ loopFits: hasUnhealthyFit = true break loopFits } + // avoid to meet down store when fix orpahn peers, + // Isdisconnected is more strictly than IsUnhealthy. + if c.cluster.GetStore(p.GetStoreId()).IsDisconnected() { + hasUnhealthyFit = true + pinDownPeer = p + break loopFits + } } } @@ -434,6 +441,9 @@ loopFits: // try to use orphan peers to replace unhealthy down peers. for _, orphanPeer := range fit.OrphanPeers { if pinDownPeer != nil { + if pinDownPeer.GetId() == orphanPeer.GetId() { + continue + } // make sure the orphan peer is healthy. if isUnhealthyPeer(orphanPeer.GetId()) { continue @@ -457,6 +467,9 @@ loopFits: return operator.CreatePromoteLearnerOperatorAndRemovePeer("replace-down-peer-with-orphan-peer", c.cluster, region, orphanPeer, pinDownPeer) case orphanPeerRole == metapb.PeerRole_Voter && destRole == metapb.PeerRole_Learner: return operator.CreateDemoteLearnerOperatorAndRemovePeer("replace-down-peer-with-orphan-peer", c.cluster, region, orphanPeer, pinDownPeer) + case orphanPeerRole == metapb.PeerRole_Voter && destRole == metapb.PeerRole_Voter && + c.cluster.GetStore(pinDownPeer.GetStoreId()).IsDisconnected() && !dstStore.IsDisconnected(): + return operator.CreateRemovePeerOperator("remove-replaced-orphan-peer", c.cluster, 0, region, pinDownPeer.GetStoreId()) default: // destRole should not same with orphanPeerRole. if role is same, it fit with orphanPeer should be better than now. // destRole never be leader, so we not consider it. From cf5147f8be37c32318931a73703df2a9bca56d73 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Mon, 30 Oct 2023 14:54:41 +0800 Subject: [PATCH 2/2] pick test Signed-off-by: lhy1024 --- server/cluster/coordinator_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/cluster/coordinator_test.go b/server/cluster/coordinator_test.go index b21a7910a98..abddba3d4ce 100644 --- a/server/cluster/coordinator_test.go +++ b/server/cluster/coordinator_test.go @@ -512,7 +512,7 @@ func TestReplica(t *testing.T) { re.NoError(tc.addLeaderRegion(2, 1, 2, 3, 4)) region = tc.GetRegion(2) re.NoError(dispatchHeartbeat(co, region, stream)) - region = waitRemovePeer(re, stream, region, 4) + region = waitRemovePeer(re, stream, region, 3) // store3 is down, we should remove it firstly. re.NoError(dispatchHeartbeat(co, region, stream)) waitNoResponse(re, stream)