From fa525278877b82990e943938c7ade7947dd13c47 Mon Sep 17 00:00:00 2001
From: Joshua Hesketh
Date: Tue, 13 Aug 2024 14:12:46 +1000
Subject: [PATCH] Only scale up zone after all leader zone replicas are ready
 (#164)

* Only scale up zone after leader zone replicas are ready
* Update CHANGELOG
* Change to only scaling once all replicas are ready
* Rename config annotation
* Add log line
* remove redundant test
* Update changelog
---
 CHANGELOG.md                    |  2 ++
 README.md                       | 10 ++++--
 pkg/config/config.go            |  3 ++
 pkg/controller/replicas.go      | 32 +++++++++++++------
 pkg/controller/replicas_test.go | 54 +++++++++++++++++++++++++++++++++
 5 files changed, 89 insertions(+), 12 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0491833a5..8c5db0007 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,8 @@

 ## main / unreleased

+* [FEATURE] Optionally scale up a `StatefulSet` only once all of the leader `StatefulSet` replicas are ready. Enable by setting the `grafana.com/rollout-upscale-only-when-leader-ready` annotation to `true`. #164
+
 ## v0.17.1

 * [ENHANCEMENT] prepare-downscale admission webhook: undo prepare-shutdown calls if adding the `last-downscale` annotation fails. #151
diff --git a/README.md b/README.md
index 71c82476a..557806b97 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,11 @@ For each **rollout group**, the operator **guarantees**:

 The operator can also optionally coordinate scaling up and down of `StatefulSets` that are part of the same `rollout-group` based on the `grafana.com/rollout-downscale-leader` annotation. When using this feature, the `grafana.com/min-time-between-zones-downscale` label must also be set on each `StatefulSet`.

-This can be useful for automating the tedious scaling of stateful services like Mimir ingesters. Making use of this feature requires adding a few annotations and labels to configure how it works. Examples for a multi-AZ ingester group are given below.
+This can be useful for automating the tedious scaling of stateful services like Mimir ingesters. Making use of this feature requires adding a few annotations and labels to configure how it works.
+
+If the `grafana.com/rollout-upscale-only-when-leader-ready` annotation is set to `true` on a follower `StatefulSet`, the operator will only scale up the follower once all replicas in the leader `StatefulSet` are `ready`. This ensures that the follower zone does not scale up until the leader zone is fully ready.
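+
+As a minimal sketch, a follower's manifest metadata for this mode would look like the following (the `ingester-zone-a`/`ingester-zone-b` names and the `12h` duration are the illustrative values from the example below):
+
+```yaml
+# Illustrative names and values; see the multi-AZ example below.
+metadata:
+  name: ingester-zone-b
+  labels:
+    grafana.com/min-time-between-zones-downscale: "12h"
+  annotations:
+    grafana.com/rollout-downscale-leader: ingester-zone-a
+    grafana.com/rollout-upscale-only-when-leader-ready: "true"
+```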
+
+Example usage for a multi-AZ ingester group:

 - For `ingester-zone-a`, add the following:
   - Labels:
@@ -47,7 +51,8 @@ This can be useful for automating the tedious scaling of stateful services like
     - `grafana.com/min-time-between-zones-downscale=12h` (change the value here to an appropriate duration)
     - `grafana.com/prepare-downscale=true` (to allow the service to be notified when it will be scaled down)
   - Annotations:
-    - `grafana.com/rollout-downscale-leader=ingester-zone-a` (zone `b` will follow zone `a`, after a delay)
+    - `grafana.com/rollout-downscale-leader=ingester-zone-a` (zone `b` will follow zone `a`, after a delay)
+    - `grafana.com/rollout-upscale-only-when-leader-ready=true` (zone `b` will only scale up once all replicas in zone `a` are ready)
     - `grafana.com/prepare-downscale-http-path=ingester/prepare-shutdown` (to call a specific endpoint on the service)
     - `grafana.com/prepare-downscale-http-port=80` (to call a specific endpoint on the service)
 - For `ingester-zone-c`, add the following:
@@ -56,6 +61,7 @@ This can be useful for automating the tedious scaling of stateful services like
     - `grafana.com/prepare-downscale=true` (to allow the service to be notified when it will be scaled down)
   - Annotations:
     - `grafana.com/rollout-downscale-leader=ingester-zone-b` (zone `c` will follow zone `b`, after a delay)
+    - `grafana.com/rollout-upscale-only-when-leader-ready=true` (zone `c` will only scale up once all replicas in zone `b` are ready)
     - `grafana.com/prepare-downscale-http-path=ingester/prepare-shutdown` (to call a specific endpoint on the service)
     - `grafana.com/prepare-downscale-http-port=80` (to call a specific endpoint on the service)

diff --git a/pkg/config/config.go b/pkg/config/config.go
index fcabb1e5d..3a39760f1 100644
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -28,6 +28,9 @@ const (
 	// RolloutDownscaleLeaderAnnotationKey is the name of the leader statefulset that should be used to determine
 	// the number of replicas in a follower statefulset.
 	RolloutDownscaleLeaderAnnotationKey = "grafana.com/rollout-downscale-leader"
+	// RolloutLeaderReadyAnnotationKey configures whether a follower should only scale up once the leader's `ready` replicas match its desired replicas.
+	RolloutLeaderReadyAnnotationKey   = "grafana.com/rollout-upscale-only-when-leader-ready"
+	RolloutLeaderReadyAnnotationValue = "true"

 	rolloutMirrorReplicasFromResourceAnnotationKeyPrefix = "grafana.com/rollout-mirror-replicas-from-resource"
 	// RolloutMirrorReplicasFromResourceNameAnnotationKey -- when set (together with "kind" and optionally "api-version" annotations), rollout-operator sets number of
diff --git a/pkg/controller/replicas.go b/pkg/controller/replicas.go
index 0340d4da0..f9186dec2 100644
--- a/pkg/controller/replicas.go
+++ b/pkg/controller/replicas.go
@@ -20,23 +20,35 @@ func desiredStsReplicas(group string, sts *v1.StatefulSet, all []*v1.StatefulSet
 	}

 	leaderReplicas := *leader.Spec.Replicas
+	leaderReadyReplicas := leader.Status.ReadyReplicas
+
+	logger = log.With(
+		logger,
+		"follower", sts.GetName(),
+		"follower_replicas", followerReplicas,
+		"leader", leader.GetName(),
+		"leader_replicas", leaderReplicas,
+		"leader_ready_replicas", leaderReadyReplicas,
+		"group", group,
+	)
+
 	if leaderReplicas > followerReplicas {
-		// Scale up is always allowed immediately
+		// Handle scale-up scenarios
+		annotations := sts.GetAnnotations()
+		onlyWhenReady, ok := annotations[config.RolloutLeaderReadyAnnotationKey]
+		if ok && onlyWhenReady == config.RolloutLeaderReadyAnnotationValue {
+			// Only scale up once all of the leader's pods are ready; otherwise leave the follower unchanged.
+			if leaderReplicas != leaderReadyReplicas {
+				level.Debug(logger).Log("msg", "not all leader replicas are ready, follower replicas not changed")
+				return followerReplicas, nil
+			}
+		}
 		return leaderReplicas, nil
 	}

 	if leaderReplicas < followerReplicas {
 		// Add all relevant key-value pairs before checking if the minimum time between
 		// scale downs has elapsed.
-		logger = log.With(
-			logger,
-			"follower", sts.GetName(),
-			"follower_replicas", followerReplicas,
-			"leader", leader.GetName(),
-			"leader_replicas", leaderReplicas,
-			"group", group,
-		)
-
 		minTimeElapsed, err := minimumTimeHasElapsed(sts, all, logger)
 		if err != nil {
 			return followerReplicas, err
diff --git a/pkg/controller/replicas_test.go b/pkg/controller/replicas_test.go
index a29e93b0b..ca31420d4 100644
--- a/pkg/controller/replicas_test.go
+++ b/pkg/controller/replicas_test.go
@@ -211,6 +211,60 @@ func TestReconcileStsReplicas(t *testing.T) {
 		require.Equal(t, int32(4), replicas)
 	})

+	t.Run("scale up only when all leader replicas are ready", func(t *testing.T) {
+		sts1 := mockStatefulSet("test-zone-a", withReplicas(4, 3))
+		sts2 := mockStatefulSet("test-zone-b", withReplicas(2, 2), withAnnotations(map[string]string{
+			config.RolloutDownscaleLeaderAnnotationKey: "test-zone-a",
+			config.RolloutLeaderReadyAnnotationKey:     config.RolloutLeaderReadyAnnotationValue,
+		}))
+		sts3 := mockStatefulSet("test-zone-c", withReplicas(1, 1), withAnnotations(map[string]string{
+			config.RolloutDownscaleLeaderAnnotationKey: "test-zone-b",
+			config.RolloutLeaderReadyAnnotationKey:     config.RolloutLeaderReadyAnnotationValue,
+		}))
+
+		replicas, err := desiredStsReplicas("test", sts2, []*v1.StatefulSet{sts1, sts2, sts3}, log.NewNopLogger())
+		require.NoError(t, err)
+		require.Equal(t, int32(2), replicas, "no change in replicas because leader isn't ready yet")
+
+		sts1 = mockStatefulSet("test-zone-a", withReplicas(4, 4))
+		replicasB, err := desiredStsReplicas("test", sts2, []*v1.StatefulSet{sts1, sts2, sts3}, log.NewNopLogger())
+		require.NoError(t, err)
+		require.Equal(t, int32(4), replicasB, "ready to scale zone-b")
+
+		sts2 = mockStatefulSet("test-zone-b", withReplicas(replicasB, 2))
+		replicasC, err := desiredStsReplicas("test", sts3, []*v1.StatefulSet{sts1, sts2, sts3}, log.NewNopLogger())
+		require.NoError(t, err)
+		require.Equal(t, int32(1), replicasC, "no change in replicas because zone-b isn't ready yet")
+
+		sts2 = mockStatefulSet("test-zone-b", withReplicas(replicasB, replicasB))
+		replicasC, err = desiredStsReplicas("test", sts3, []*v1.StatefulSet{sts1, sts2, sts3}, log.NewNopLogger())
+		require.NoError(t, err)
+		require.Equal(t, int32(4), replicasC, "ready to scale zone-c")
+	})
+
+	t.Run("scale up ignoring ready replicas", func(t *testing.T) {
+		sts1 := mockStatefulSet("test-zone-a", withReplicas(4, 2))
+		sts2 := mockStatefulSet("test-zone-b", withReplicas(3, 3), withAnnotations(map[string]string{
+			config.RolloutDownscaleLeaderAnnotationKey: "test-zone-a",
+		}))
+
+		replicas, err := desiredStsReplicas("test", sts2, []*v1.StatefulSet{sts1, sts2}, log.NewNopLogger())
+		require.NoError(t, err)
+		require.Equal(t, int32(4), replicas)
+	})
+
+	t.Run("scaling down always occurs to desired replicas", func(t *testing.T) {
+		sts1 := mockStatefulSet("test-zone-a", withReplicas(2, 1))
+		sts2 := mockStatefulSet("test-zone-b", withReplicas(3, 3), withAnnotations(map[string]string{
+			config.RolloutDownscaleLeaderAnnotationKey: "test-zone-a",
+			config.RolloutLeaderReadyAnnotationKey:     config.RolloutLeaderReadyAnnotationValue,
+		}))
+
+		replicas, err := desiredStsReplicas("test", sts2, []*v1.StatefulSet{sts1, sts2}, log.NewNopLogger())
+		require.NoError(t, err)
+		require.Equal(t, int32(2), replicas)
+	})
+
 	t.Run("scale down min time error", func(t *testing.T) {
 		downscale1 := time.Now().UTC().Round(time.Second).Add(-72 * time.Hour)
 		downscale2 := time.Now().UTC().Round(time.Second).Add(-60 * time.Hour)