-
Notifications
You must be signed in to change notification settings - Fork 211
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add support for excluding/including nodes from external load balancers before/after reboot #378
Changes from all commits
95eab05
9d268d2
dd1a940
a136b72
84dc7bd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -41,6 +41,7 @@ var ( | |
// Command line flags | ||
forceReboot bool | ||
drainTimeout time.Duration | ||
rebootDelay time.Duration | ||
period time.Duration | ||
drainGracePeriod int | ||
skipWaitForDeleteTimeoutSeconds int | ||
|
@@ -87,6 +88,15 @@ const ( | |
KuredMostRecentRebootNeededAnnotation string = "weave.works/kured-most-recent-reboot-needed" | ||
) | ||
|
||
const ( | ||
// ExcludeFromELBsLabelKey is a label key that tells the K8S control plane to exclude a node from external load balancers | ||
ExcludeFromELBsLabelKey = "node.kubernetes.io/exclude-from-external-load-balancers" | ||
// ExcludeFromELBsLabelVal is a label value used to track label placement by kured | ||
ExcludeFromELBsLabelVal = "kured-remove-after-reboot" | ||
// ExcludeFromELBsLabelKeyEscaped is the escaped label key value passed to the Patch() function | ||
ExcludeFromELBsLabelKeyEscaped = "node.kubernetes.io~1exclude-from-external-load-balancers" | ||
) | ||
|
||
func init() { | ||
prometheus.MustRegister(rebootRequiredGauge) | ||
} | ||
|
@@ -106,6 +116,8 @@ func main() { | |
"when seconds is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)") | ||
rootCmd.PersistentFlags().DurationVar(&drainTimeout, "drain-timeout", 0, | ||
"timeout after which the drain is aborted (default: 0, infinite time)") | ||
rootCmd.PersistentFlags().DurationVar(&rebootDelay, "reboot-delay", 0, | ||
"delay reboot for this duration (default: 0, disabled)") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "delay reboot for this duration after drain and exclusion from any ELB"? I feel this will be more understandable by someone not reading the code. |
||
rootCmd.PersistentFlags().DurationVar(&period, "period", time.Minute*60, | ||
"sentinel check period") | ||
rootCmd.PersistentFlags().StringVar(&dsNamespace, "ds-namespace", "kube-system", | ||
|
@@ -345,6 +357,49 @@ func release(lock *daemonsetlock.DaemonSetLock) { | |
} | ||
} | ||
|
||
func enableExcludeFromELBs(client *kubernetes.Clientset, nodeID string) { | ||
log.Infof("Adding ExcludeFromELBs label to node") | ||
|
||
// Add ExcludeFromELBs node label | ||
labelPatch := fmt.Sprintf(`[{"op":"add","path":"/metadata/labels/%s","value":"%s" }]`, ExcludeFromELBsLabelKeyEscaped, ExcludeFromELBsLabelVal) | ||
_, err := client.CoreV1().Nodes().Patch(context.Background(), nodeID, types.JSONPatchType, []byte(labelPatch), metav1.PatchOptions{}) | ||
if err != nil { | ||
log.Errorf("Unable to add ExcludeFromELBs label to node: %s" , err.Error()) | ||
} | ||
} | ||
|
||
func disableExcludeFromELBs(client *kubernetes.Clientset, nodeID string) { | ||
ctx := context.Background() | ||
|
||
// Get node | ||
node, err := client.CoreV1().Nodes().Get(ctx, nodeID, metav1.GetOptions{}) | ||
if err != nil { | ||
log.Warnf("Unable to find node: %s", nodeID) | ||
return | ||
} | ||
|
||
// Check ExcludeFromELBs node label | ||
labelVal, ok := node.Labels[ExcludeFromELBsLabelKey] | ||
if !ok { | ||
return | ||
} | ||
|
||
// Different label value found | ||
if labelVal != ExcludeFromELBsLabelVal { | ||
log.Debugf("Found ExcludeFromELBs label on node with value: '%s' (no action taken)", labelVal) | ||
return | ||
} | ||
|
||
// Remove ExcludeFromELBs node label | ||
labelPatch := fmt.Sprintf(`[{"op":"remove","path":"/metadata/labels/%s"}]`, ExcludeFromELBsLabelKeyEscaped) | ||
_, err = client.CoreV1().Nodes().Patch(ctx, nodeID, types.JSONPatchType, []byte(labelPatch), metav1.PatchOptions{}) | ||
if err != nil { | ||
log.Errorf("Unable to remove ExcludeFromELBs label from node: %s", err.Error()) | ||
} else { | ||
log.Infof("Removed ExcludeFromELBs label from node") | ||
} | ||
} | ||
|
||
func drain(client *kubernetes.Clientset, node *v1.Node) { | ||
nodename := node.GetName() | ||
|
||
|
@@ -489,6 +544,9 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s | |
log.Fatal(err) | ||
} | ||
|
||
// Remove ExcludeFromELBs label immediately to allow ELB registration | ||
disableExcludeFromELBs(client, nodeID) | ||
|
||
lock := daemonsetlock.New(client, nodeID, dsNamespace, dsName, lockAnnotation) | ||
|
||
nodeMeta := nodeMeta{} | ||
|
@@ -497,6 +555,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s | |
if err != nil { | ||
log.Fatalf("Error retrieving node object via k8s API: %v", err) | ||
} | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please do not add those spaces changes in this commit. If you want to change the formatting to look better for you, please do it in a different commit/PR. |
||
if !nodeMeta.Unschedulable { | ||
uncordon(client, node) | ||
} | ||
|
@@ -510,6 +569,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s | |
deleteNodeAnnotation(client, nodeID, KuredRebootInProgressAnnotation) | ||
} | ||
} | ||
|
||
throttle(releaseDelay) | ||
release(lock) | ||
} | ||
|
@@ -580,7 +640,14 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s | |
continue | ||
} | ||
|
||
enableExcludeFromELBs(client, nodeID) | ||
drain(client, node) | ||
|
||
if rebootDelay > 0 { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Next to that, I would probably use the existing throttle function, maybe rename and refactor it so that it's clearer. I am worried the users might not understand the difference between all those options we have, tbh, so I think it might be worth clarifying in the CLI arguments. |
||
log.Infof("Delaying reboot for %v", rebootDelay) | ||
time.Sleep(rebootDelay) | ||
} | ||
|
||
invokeReboot(nodeID, rebootCommand) | ||
for { | ||
log.Infof("Waiting for reboot") | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it would be best toremove this from this PR, as it's effectively not linked to the PR title.
Maybe we need to fix the current ELB feature to not require the addition of this delay. This way, the intent of the rebootDelay would would be more clear.
If we don't add this feature in a different PR, we should at least make the boundary very clear by having only 2 commits: One for ELB support, one for the reboot-delay. (You will need to squash your changes appropriately).