From 95eab05797e383417ffbb6e8a7fc4ca93664adc7 Mon Sep 17 00:00:00 2001 From: Andres Morey Date: Fri, 21 May 2021 21:16:31 +0300 Subject: [PATCH 1/5] added steps to de-register/register node from external load balancers before/after restart, added delay to wait for de-registration to finish before rebooting --- cmd/kured/main.go | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 9964434df..29fc5b021 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -87,6 +87,8 @@ const ( KuredMostRecentRebootNeededAnnotation string = "weave.works/kured-most-recent-reboot-needed" ) +const ExcludeFromLoadBalancersLabel = "node.kubernetes.io~1exclude-from-external-load-balancers" + func init() { prometheus.MustRegister(rebootRequiredGauge) } @@ -345,6 +347,28 @@ func release(lock *daemonsetlock.DaemonSetLock) { } } +func deregisterLB(client *kubernetes.Clientset, nodeID string) { + log.Infof("De-registering node from external load balancers") + + // add "exclude-from-external-load-balancers" node label + labelPatch := fmt.Sprintf(`[{"op":"add","path":"/metadata/labels/%s","value":"" }]`, ExcludeFromLoadBalancersLabel) + _, err := client.CoreV1().Nodes().Patch(context.TODO(), nodeID, types.JSONPatchType, []byte(labelPatch), metav1.PatchOptions{}) + if err != nil { + log.Warnf("Unable to add \"%s\" label to node", ExcludeFromLoadBalancersLabel) + } +} + +func registerLB(client *kubernetes.Clientset, nodeID string) { + log.Infof("Registering node with external load balancers") + + // remove "exclude-from-external-load-balancers" node label + labelPatch := fmt.Sprintf(`[{"op":"remove","path":"/metadata/labels/%s"}]`, ExcludeFromLoadBalancersLabel) + _, err := client.CoreV1().Nodes().Patch(context.TODO(), nodeID, types.JSONPatchType, []byte(labelPatch), metav1.PatchOptions{}) + if err != nil { + log.Warnf("Unable to remove \"%s\" label from node", ExcludeFromLoadBalancersLabel) + } +} + func drain(client *kubernetes.Clientset, node *v1.Node) { nodename := node.GetName() @@ -510,6 +534,9 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s deleteNodeAnnotation(client, nodeID, KuredRebootInProgressAnnotation) } } + + registerLB(client, nodeID) + throttle(releaseDelay) release(lock) } @@ -580,7 +607,12 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s continue } + deregisterLB(client, nodeID) drain(client, node) + + log.Infof("Waiting for node to finish de-registering from load balancer (105 sec.)") + time.Sleep(105 * time.Second) + invokeReboot(nodeID, rebootCommand) for { log.Infof("Waiting for reboot") From 9d268d208d8ff5be2f2c480a4048925494a89290 Mon Sep 17 00:00:00 2001 From: Andres Morey Date: Fri, 25 Jun 2021 15:19:26 +0300 Subject: [PATCH 2/5] Added command line flag, improved handling of ExcludeFromELBs label removal --- cmd/kured/main.go | 67 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 49 insertions(+), 18 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 29fc5b021..bb529b3e6 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -41,6 +41,7 @@ var ( // Command line flags forceReboot bool drainTimeout time.Duration + rebootDelay time.Duration period time.Duration drainGracePeriod int skipWaitForDeleteTimeoutSeconds int @@ -87,7 +88,11 @@ const ( KuredMostRecentRebootNeededAnnotation string = "weave.works/kured-most-recent-reboot-needed" ) -const ExcludeFromLoadBalancersLabel = "node.kubernetes.io~1exclude-from-external-load-balancers" +const ( + ExcludeFromELBsLabelKey = "node.kubernetes.io/exclude-from-external-load-balancers" + ExcludeFromELBsLabelVal = "kured-remove-after-reboot" + ExcludeFromELBsLabelKeyEscaped = "node.kubernetes.io~1exclude-from-external-load-balancers" +) func init() { prometheus.MustRegister(rebootRequiredGauge) @@ -108,6 +113,8 @@ func main() { "when seconds is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)") rootCmd.PersistentFlags().DurationVar(&drainTimeout, "drain-timeout", 0, "timeout after which the drain is aborted (default: 0, infinite time)") + rootCmd.PersistentFlags().DurationVar(&rebootDelay, "reboot-delay", 0, + "delay reboot for this duration (default: 0, disabled)") rootCmd.PersistentFlags().DurationVar(&period, "period", time.Minute*60, "sentinel check period") rootCmd.PersistentFlags().StringVar(&dsNamespace, "ds-namespace", "kube-system", @@ -347,25 +354,46 @@ func release(lock *daemonsetlock.DaemonSetLock) { } } -func deregisterLB(client *kubernetes.Clientset, nodeID string) { - log.Infof("De-registering node from external load balancers") +func enableExcludeFromELBs(client *kubernetes.Clientset, nodeID string) { + log.Infof("Adding ExcludeFromELBs label to node") - // add "exclude-from-external-load-balancers" node label - labelPatch := fmt.Sprintf(`[{"op":"add","path":"/metadata/labels/%s","value":"" }]`, ExcludeFromLoadBalancersLabel) - _, err := client.CoreV1().Nodes().Patch(context.TODO(), nodeID, types.JSONPatchType, []byte(labelPatch), metav1.PatchOptions{}) + // Add ExcludeFromELBs node label + labelPatch := fmt.Sprintf(`[{"op":"add","path":"/metadata/labels/%s","value":"%s" }]`, ExcludeFromELBsLabelKeyEscaped, ExcludeFromELBsLabelVal) + _, err := client.CoreV1().Nodes().Patch(context.Background(), nodeID, types.JSONPatchType, []byte(labelPatch), metav1.PatchOptions{}) if err != nil { - log.Warnf("Unable to add \"%s\" label to node", ExcludeFromLoadBalancersLabel) + log.Errorf("Unable to add ExcludeFromELBs label to node: %s" , err.Error()) } } -func registerLB(client *kubernetes.Clientset, nodeID string) { - log.Infof("Registering node with external load balancers") +func disableExcludeFromELBs(client *kubernetes.Clientset, nodeID string) { + ctx := context.Background() - // remove "exclude-from-external-load-balancers" node label - labelPatch := fmt.Sprintf(`[{"op":"remove","path":"/metadata/labels/%s"}]`, ExcludeFromLoadBalancersLabel) - _, err := client.CoreV1().Nodes().Patch(context.TODO(), nodeID, types.JSONPatchType, []byte(labelPatch), metav1.PatchOptions{}) + // Get node + node, err := client.CoreV1().Nodes().Get(ctx, nodeID, metav1.GetOptions{}) if err != nil { - log.Warnf("Unable to remove \"%s\" label from node", ExcludeFromLoadBalancersLabel) + log.Warnf("Unable to find node: %s", nodeID) + return + } + + // Check ExcludeFromELBs node label + labelVal, ok := node.Labels[ExcludeFromELBsLabelKey] + if !ok { + return + } + + // Different label value found + if labelVal != ExcludeFromELBsLabelVal { + log.Warnf("Found ExcludeFromELBs label on node with value: %s (no action taken)", labelVal) + return + } + + // Remove ExcludeFromELBs node label + labelPatch := fmt.Sprintf(`[{"op":"remove","path":"/metadata/labels/%s"}]`, ExcludeFromELBsLabelKeyEscaped) + _, err = client.CoreV1().Nodes().Patch(ctx, nodeID, types.JSONPatchType, []byte(labelPatch), metav1.PatchOptions{}) + if err != nil { + log.Errorf("Unable to remove ExcludeFromELBs label from node: %s", err.Error()) + } else { + log.Infof("Removed ExcludeFromELBs label from node") } } @@ -534,8 +562,6 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s deleteNodeAnnotation(client, nodeID, KuredRebootInProgressAnnotation) } } - - registerLB(client, nodeID) throttle(releaseDelay) release(lock) @@ -546,6 +572,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s // Remove taint immediately during startup to quickly allow scheduling again. if !rebootRequired(sentinelCommand) { preferNoScheduleTaint.Disable() + disableExcludeFromELBs(client, nodeID) } // instantiate prometheus client @@ -560,12 +587,14 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s if !window.Contains(time.Now()) { // Remove taint outside the reboot time window to allow for normal operation. preferNoScheduleTaint.Disable() + disableExcludeFromELBs(client, nodeID) continue } if !rebootRequired(sentinelCommand) { log.Infof("Reboot not required") preferNoScheduleTaint.Disable() + disableExcludeFromELBs(client, nodeID) continue } log.Infof("Reboot required") @@ -607,11 +636,13 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s continue } - deregisterLB(client, nodeID) + enableExcludeFromELBs(client, nodeID) drain(client, node) - log.Infof("Waiting for node to finish de-registering from load balancer (105 sec.)") - time.Sleep(105 * time.Second) + if rebootDelay > 0 { + log.Infof("Delaying reboot for %v", rebootDelay) + time.Sleep(rebootDelay) + } invokeReboot(nodeID, rebootCommand) for { From dd1a940eca29771c1b40b4293191abe61f0bdcf6 Mon Sep 17 00:00:00 2001 From: Andres Morey Date: Fri, 25 Jun 2021 15:32:45 +0300 Subject: [PATCH 3/5] improved debugging messages in disableExcludeFromELBs --- cmd/kured/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index bb529b3e6..58c828346 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -383,7 +383,7 @@ func disableExcludeFromELBs(client *kubernetes.Clientset, nodeID string) { // Different label value found if labelVal != ExcludeFromELBsLabelVal { - log.Warnf("Found ExcludeFromELBs label on node with value: %s (no action taken)", labelVal) + log.Debugf("Found ExcludeFromELBs label on node with value: '%s' (no action taken)", labelVal) return } From a136b7232f72ce6b6496aaf4d5a6759f305eba42 Mon Sep 17 00:00:00 2001 From: Andres Morey Date: Sat, 26 Jun 2021 00:09:54 +0300 Subject: [PATCH 4/5] placed disableExcludeFromELBs in kured startup sequence --- cmd/kured/main.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index 58c828346..d8d017e8a 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -541,6 +541,9 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s log.Fatal(err) } + // Remove ExcludeFromELBs label immediately to allow ELB registration + disableExcludeFromELBs(client, nodeID) + lock := daemonsetlock.New(client, nodeID, dsNamespace, dsName, lockAnnotation) nodeMeta := nodeMeta{} @@ -549,6 +552,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s if err != nil { log.Fatalf("Error retrieving node object via k8s API: %v", err) } + if !nodeMeta.Unschedulable { uncordon(client, node) } @@ -572,7 +576,6 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s // Remove taint immediately during startup to quickly allow scheduling again. if !rebootRequired(sentinelCommand) { preferNoScheduleTaint.Disable() - disableExcludeFromELBs(client, nodeID) } // instantiate prometheus client @@ -587,14 +590,12 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s if !window.Contains(time.Now()) { // Remove taint outside the reboot time window to allow for normal operation. preferNoScheduleTaint.Disable() - disableExcludeFromELBs(client, nodeID) continue } if !rebootRequired(sentinelCommand) { log.Infof("Reboot not required") preferNoScheduleTaint.Disable() - disableExcludeFromELBs(client, nodeID) continue } log.Infof("Reboot required") From 84dc7bda73c4e92f3ca4f71e5964eef6c8783d92 Mon Sep 17 00:00:00 2001 From: Andres Morey Date: Mon, 28 Jun 2021 16:30:23 +0300 Subject: [PATCH 5/5] added comments to exported constants, fixes linting issue --- cmd/kured/main.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index d8d017e8a..faff87a15 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -89,8 +89,11 @@ const ( ) const ( + // ExcludeFromELBsLabelKey is a label key that tells the K8S control plane to exclude a node from external load balancers ExcludeFromELBsLabelKey = "node.kubernetes.io/exclude-from-external-load-balancers" + // ExcludeFromELBsLabelVal is a label value used to track label placement by kured ExcludeFromELBsLabelVal = "kured-remove-after-reboot" + // ExcludeFromELBsLabelKeyEscaped is the escaped label key value passed to the Patch() function ExcludeFromELBsLabelKeyEscaped = "node.kubernetes.io~1exclude-from-external-load-balancers" )