From 542fde90c7ac6ebaeb4027a09a703d19b264a1a6 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 20 Dec 2023 16:48:23 +0800 Subject: [PATCH] check: remove orphan peer only when the peers is greater than the rule count (#7581) (#7591) close tikv/pd#7584 The healthy orphan peer should be the last one to be removed only if there are extra peers to keep the high availablility. Signed-off-by: ti-chi-bot Signed-off-by: bufferflies <1045931706@qq.com> Co-authored-by: tongjian <1045931706@qq.com> Co-authored-by: bufferflies <1045931706@qq.com> --- server/schedule/checker/rule_checker.go | 4 ++- server/schedule/checker/rule_checker_test.go | 36 ++++++++++++++++++++ server/schedule/placement/fit.go | 9 +++++ 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/server/schedule/checker/rule_checker.go b/server/schedule/checker/rule_checker.go index f9602ebc892..443da023d70 100644 --- a/server/schedule/checker/rule_checker.go +++ b/server/schedule/checker/rule_checker.go @@ -506,6 +506,7 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg } } + extra := fit.ExtraCount() // If hasUnhealthyFit is true, try to remove unhealthy orphan peers only if number of OrphanPeers is >= 2. // Ref https://github.com/tikv/pd/issues/4045 if len(fit.OrphanPeers) >= 2 { @@ -522,7 +523,8 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg checkerCounter.WithLabelValues("rule_checker", "remove-orphan-peer").Inc() return operator.CreateRemovePeerOperator("remove-unhealthy-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId) } - if hasHealthPeer { + // The healthy orphan peer can be removed to keep the high availability only if the peer count is greater than the rule requirement. + if hasHealthPeer && extra > 0 { // there already exists a healthy orphan peer, so we can remove other orphan Peers. checkerCounter.WithLabelValues("rule_checker", "remove-orphan-peer").Inc() // if there exists a disconnected orphan peer, we will pick it to remove firstly. diff --git a/server/schedule/checker/rule_checker_test.go b/server/schedule/checker/rule_checker_test.go index 902e972e50f..85b9830cd99 100644 --- a/server/schedule/checker/rule_checker_test.go +++ b/server/schedule/checker/rule_checker_test.go @@ -1883,3 +1883,39 @@ func (suite *ruleCheckerTestSuite) TestTiFlashLocationLabels() { op := suite.rc.Check(suite.cluster.GetRegion(1)) suite.Nil(op) } + +func (suite *ruleCheckerTestSuite) TestRemoveOrphanPeer() { + suite.cluster.AddLabelsStore(1, 1, map[string]string{"zone": "z1", "host": "h1"}) + suite.cluster.AddLabelsStore(2, 1, map[string]string{"zone": "z1", "host": "h1"}) + suite.cluster.AddLabelsStore(3, 1, map[string]string{"zone": "z1", "host": "h1"}) + suite.cluster.AddLabelsStore(4, 1, map[string]string{"zone": "z2", "host": "h1"}) + suite.cluster.AddLabelsStore(5, 1, map[string]string{"zone": "z2", "host": "h2"}) + suite.cluster.AddLabelsStore(6, 1, map[string]string{"zone": "z2", "host": "h2"}) + rule := &placement.Rule{ + GroupID: "pd", + ID: "test2", + Role: placement.Voter, + Count: 3, + LabelConstraints: []placement.LabelConstraint{ + { + Key: "zone", + Op: placement.In, + Values: []string{"z2"}, + }, + }, + } + suite.ruleManager.SetRule(rule) + suite.ruleManager.DeleteRule("pd", "default") + + // case1: regionA has 3 peers but not extra peer can be removed, so it needs to add peer first + suite.cluster.AddLeaderRegionWithRange(1, "200", "300", 1, 2, 3) + op := suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("add-rule-peer", op.Desc()) + + // case2: regionB has 4 peers and one extra peer can be removed, so it needs to remove extra peer first + suite.cluster.AddLeaderRegionWithRange(2, "300", "400", 1, 2, 3, 4) + op = suite.rc.Check(suite.cluster.GetRegion(2)) + suite.NotNil(op) + suite.Equal("remove-orphan-peer", op.Desc()) +} diff --git a/server/schedule/placement/fit.go b/server/schedule/placement/fit.go index 454715cdc8e..4ac9492a193 100644 --- a/server/schedule/placement/fit.go +++ b/server/schedule/placement/fit.go @@ -112,6 +112,15 @@ func (f *RegionFit) IsSatisfied() bool { return len(f.OrphanPeers) == 0 } +// ExtraCount return the extra count. +func (f *RegionFit) ExtraCount() int { + desired := 0 + for _, r := range f.RuleFits { + desired += r.Rule.Count + } + return len(f.regionStores) - desired +} + // GetRuleFit returns the RuleFit that contains the peer. func (f *RegionFit) GetRuleFit(peerID uint64) *RuleFit { for _, rf := range f.RuleFits {