Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for excluding/including nodes from external load balancers before/after reboot #378

Closed
wants to merge 5 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions cmd/kured/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ var (
// Command line flags
forceReboot bool
drainTimeout time.Duration
rebootDelay time.Duration
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be best toremove this from this PR, as it's effectively not linked to the PR title.

Maybe we need to fix the current ELB feature to not require the addition of this delay. This way, the intent of the rebootDelay would would be more clear.

If we don't add this feature in a different PR, we should at least make the boundary very clear by having only 2 commits: One for ELB support, one for the reboot-delay. (You will need to squash your changes appropriately).

period time.Duration
drainGracePeriod int
skipWaitForDeleteTimeoutSeconds int
Expand Down Expand Up @@ -87,6 +88,15 @@ const (
KuredMostRecentRebootNeededAnnotation string = "weave.works/kured-most-recent-reboot-needed"
)

const (
// ExcludeFromELBsLabelKey is a label key that tells the K8S control plane to exclude a node from external load balancers
ExcludeFromELBsLabelKey = "node.kubernetes.io/exclude-from-external-load-balancers"
// ExcludeFromELBsLabelVal is a label value used to track label placement by kured
ExcludeFromELBsLabelVal = "kured-remove-after-reboot"
// ExcludeFromELBsLabelKeyEscaped is the escaped label key value passed to the Patch() function
ExcludeFromELBsLabelKeyEscaped = "node.kubernetes.io~1exclude-from-external-load-balancers"
)

func init() {
prometheus.MustRegister(rebootRequiredGauge)
}
Expand All @@ -106,6 +116,8 @@ func main() {
"when seconds is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)")
rootCmd.PersistentFlags().DurationVar(&drainTimeout, "drain-timeout", 0,
"timeout after which the drain is aborted (default: 0, infinite time)")
rootCmd.PersistentFlags().DurationVar(&rebootDelay, "reboot-delay", 0,
"delay reboot for this duration (default: 0, disabled)")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"delay reboot for this duration after drain and exclusion from any ELB"? I feel this will be more understandable by someone not reading the code.

rootCmd.PersistentFlags().DurationVar(&period, "period", time.Minute*60,
"sentinel check period")
rootCmd.PersistentFlags().StringVar(&dsNamespace, "ds-namespace", "kube-system",
Expand Down Expand Up @@ -345,6 +357,49 @@ func release(lock *daemonsetlock.DaemonSetLock) {
}
}

func enableExcludeFromELBs(client *kubernetes.Clientset, nodeID string) {
log.Infof("Adding ExcludeFromELBs label to node")

// Add ExcludeFromELBs node label
labelPatch := fmt.Sprintf(`[{"op":"add","path":"/metadata/labels/%s","value":"%s" }]`, ExcludeFromELBsLabelKeyEscaped, ExcludeFromELBsLabelVal)
_, err := client.CoreV1().Nodes().Patch(context.Background(), nodeID, types.JSONPatchType, []byte(labelPatch), metav1.PatchOptions{})
if err != nil {
log.Errorf("Unable to add ExcludeFromELBs label to node: %s" , err.Error())
}
}

func disableExcludeFromELBs(client *kubernetes.Clientset, nodeID string) {
ctx := context.Background()

// Get node
node, err := client.CoreV1().Nodes().Get(ctx, nodeID, metav1.GetOptions{})
if err != nil {
log.Warnf("Unable to find node: %s", nodeID)
return
}

// Check ExcludeFromELBs node label
labelVal, ok := node.Labels[ExcludeFromELBsLabelKey]
if !ok {
return
}

// Different label value found
if labelVal != ExcludeFromELBsLabelVal {
log.Debugf("Found ExcludeFromELBs label on node with value: '%s' (no action taken)", labelVal)
return
}

// Remove ExcludeFromELBs node label
labelPatch := fmt.Sprintf(`[{"op":"remove","path":"/metadata/labels/%s"}]`, ExcludeFromELBsLabelKeyEscaped)
_, err = client.CoreV1().Nodes().Patch(ctx, nodeID, types.JSONPatchType, []byte(labelPatch), metav1.PatchOptions{})
if err != nil {
log.Errorf("Unable to remove ExcludeFromELBs label from node: %s", err.Error())
} else {
log.Infof("Removed ExcludeFromELBs label from node")
}
}

func drain(client *kubernetes.Clientset, node *v1.Node) {
nodename := node.GetName()

Expand Down Expand Up @@ -489,6 +544,9 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
log.Fatal(err)
}

// Remove ExcludeFromELBs label immediately to allow ELB registration
disableExcludeFromELBs(client, nodeID)

lock := daemonsetlock.New(client, nodeID, dsNamespace, dsName, lockAnnotation)

nodeMeta := nodeMeta{}
Expand All @@ -497,6 +555,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
if err != nil {
log.Fatalf("Error retrieving node object via k8s API: %v", err)
}

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please do not add those spaces changes in this commit. If you want to change the formatting to look better for you, please do it in a different commit/PR.

if !nodeMeta.Unschedulable {
uncordon(client, node)
}
Expand All @@ -510,6 +569,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
deleteNodeAnnotation(client, nodeID, KuredRebootInProgressAnnotation)
}
}

throttle(releaseDelay)
release(lock)
}
Expand Down Expand Up @@ -580,7 +640,14 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
continue
}

enableExcludeFromELBs(client, nodeID)
drain(client, node)

if rebootDelay > 0 {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Next to that, I would probably use the existing throttle function, maybe rename and refactor it so that it's clearer.

I am worried the users might not understand the difference between all those options we have, tbh, so I think it might be worth clarifying in the CLI arguments.

log.Infof("Delaying reboot for %v", rebootDelay)
time.Sleep(rebootDelay)
}

invokeReboot(nodeID, rebootCommand)
for {
log.Infof("Waiting for reboot")
Expand Down