Implement zoneTracker for coordinating downscales across statefulsets using a file in object storage for the prepare_downscale admission webhook #96
```diff
@@ -23,24 +23,33 @@ import (
 	"github.com/grafana/rollout-operator/pkg/config"
 	"github.com/grafana/rollout-operator/pkg/util"
+	"github.com/thanos-io/objstore"
 )

 const (
 	PrepareDownscaleWebhookPath = "/admission/prepare-downscale"
 )

-func PrepareDownscale(ctx context.Context, logger log.Logger, ar v1.AdmissionReview, api *kubernetes.Clientset) *v1.AdmissionResponse {
+func PrepareDownscale(ctx context.Context, logger log.Logger, ar v1.AdmissionReview, api *kubernetes.Clientset, useZoneTracker bool) *v1.AdmissionResponse {
 	client := &http.Client{
 		Timeout: 5 * time.Second,
 	}
-	return prepareDownscale(ctx, logger, ar, api, client)
+
+	var zt *zoneTracker
+	if useZoneTracker {
+		// TODO(jordanrushing): fix bucket creation semantics and wire-in config supporting multiple CSPs
+		bkt := objstore.NewInMemBucket()
+		zt = newZoneTracker(bkt, "testkey")
+	}
+
+	return prepareDownscale(ctx, logger, ar, api, client, zt)
 }

 type httpClient interface {
 	Post(url, contentType string, body io.Reader) (resp *http.Response, err error)
 }

-func prepareDownscale(ctx context.Context, logger log.Logger, ar v1.AdmissionReview, api kubernetes.Interface, client httpClient) *v1.AdmissionResponse {
+func prepareDownscale(ctx context.Context, logger log.Logger, ar v1.AdmissionReview, api kubernetes.Interface, client httpClient, zt *zoneTracker) *v1.AdmissionResponse {
 	logger = log.With(logger, "name", ar.Request.Name, "resource", ar.Request.Resource.Resource, "namespace", ar.Request.Namespace)

 	oldObj, oldGVK, err := codecs.UniversalDeserializer().Decode(ar.Request.OldObject.Raw, nil, nil)
```
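The TODO above leaves the bucket as an in-memory placeholder. As a rough illustration of where that could go (and of the backend-string idea raised at the end of this review), here is a hedged sketch; `newBucketForBackend` and its backend names are hypothetical, not part of this PR:

```go
package admission

import (
	"fmt"

	"github.com/thanos-io/objstore"
)

// newBucketForBackend is a hypothetical helper, not part of the PR; only
// objstore.NewInMemBucket is taken from the diff above.
func newBucketForBackend(backend string) (objstore.Bucket, error) {
	switch backend {
	case "memory":
		// What the PR uses as a placeholder while multi-CSP config is unresolved.
		return objstore.NewInMemBucket(), nil
	case "gcs", "s3", "azure":
		// A real implementation would construct the matching provider from
		// github.com/thanos-io/objstore here.
		return nil, fmt.Errorf("backend %q not wired up in this sketch", backend)
	default:
		return nil, fmt.Errorf("unsupported zone tracker backend %q", backend)
	}
}
```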
```diff
@@ -155,27 +164,54 @@ func prepareDownscale(ctx context.Context, logger log.Logger, ar v1.AdmissionRev
 		)
 	}

-	foundSts, err := findDownscalesDoneMinTimeAgo(stsList, ar.Request.Name)
-	if err != nil {
-		level.Warn(logger).Log("msg", "downscale not allowed due to error while parsing downscale annotations", "err", err)
-		return deny(
-			"downscale of %s/%s in %s from %d to %d replicas is not allowed because parsing downscale annotations failed.",
-			ar.Request.Resource.Resource, ar.Request.Name, ar.Request.Namespace, *oldReplicas, *newReplicas,
-		)
-	}
-	if foundSts != nil {
-		msg := fmt.Sprintf("downscale of %s/%s in %s from %d to %d replicas is not allowed because statefulset %v was downscaled at %v and is labelled to wait %s between zone downscales",
-			ar.Request.Resource.Resource, ar.Request.Name, ar.Request.Namespace, *oldReplicas, *newReplicas, foundSts.name, foundSts.lastDownscaleTime, foundSts.waitTime)
-		level.Warn(logger).Log("msg", msg, "err", err)
-		return deny(msg)
-	}
+	// If using zoneTracker instead of downscale annotations, check the zone file for recent downscaling.
+	// Otherwise, check the downscale annotations.
+	if zt != nil {
```

Review comment (on the new `if zt != nil` branch): You could make two types here (one with the zones file and one with the existing logic) that implement the same interface. That way the code for them is not intermingled as it is here.

Reply: Good suggestion for a follow-up PR IMO.
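A minimal sketch of that suggestion, with hypothetical names (`downscaleTracker`, `downscaleRecord`, and the two wrapper types are not defined in this PR):

```go
package admission

import (
	"context"
	"time"

	appsv1 "k8s.io/api/apps/v1"
	"k8s.io/client-go/kubernetes"
)

// downscaleRecord mirrors the fields used in the deny messages below
// (name, lastDownscaleTime, waitTime); the PR's own type is not shown here.
type downscaleRecord struct {
	name              string
	lastDownscaleTime time.Time
	waitTime          time.Duration
}

// downscaleTracker is a hypothetical interface covering both code paths:
// "was a sibling zone downscaled too recently?" and "record this downscale".
type downscaleTracker interface {
	recentDownscale(ctx context.Context, stsList *appsv1.StatefulSetList, name string) (*downscaleRecord, error)
	recordDownscale(ctx context.Context, namespace, name string) error
}

// annotationTracker would wrap the existing annotation-based helpers, and
// zoneFileTracker would wrap this PR's zoneTracker; prepareDownscale would
// then take a downscaleTracker instead of branching on zt != nil.
type annotationTracker struct{ api kubernetes.Interface }
type zoneFileTracker struct{ /* wraps the zoneTracker from this PR */ }
```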
The diff continues:

```diff
+		if err := zt.loadZones(ctx, stsList); err != nil {
+			level.Warn(logger).Log("msg", "downscale not allowed due to error while loading zones", "err", err)
+			return deny(
+				"downscale of %s/%s in %s from %d to %d replicas is not allowed because loading zones failed.",
+				ar.Request.Resource.Resource, ar.Request.Name, ar.Request.Namespace, *oldReplicas, *newReplicas,
+			)
+		}
+		// Check if the zone has been downscaled recently.
+		foundSts, err := zt.findDownscalesDoneMinTimeAgo(stsList, ar.Request.Name)
+		if err != nil {
+			level.Warn(logger).Log("msg", "downscale not allowed due to error while parsing downscale timestamps from the zone file", "err", err)
+			return deny(
+				"downscale of %s/%s in %s from %d to %d replicas is not allowed because parsing downscale timestamps from the zone file failed.",
```

jhalterman marked a conversation on the loadZones call as resolved.

Review comment (on lines +181 to +183): Do we need a log line here and the returning of a warning? Couldn't it just return the result, or an empty result and an error? Then we'd only deal with logging once, wherever it is we're calling from.

Reply: I think that's fair; generally I'm just trying to follow the pattern we've implemented elsewhere in this webhook. I don't think the extra logging has any real negative trade-offs as far as I can tell, and this way, whether you're looking at the operator directly or at the admission responses, you have visibility into the root cause of the failure.
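A small, self-contained illustration of the pattern the reviewer is describing (stand-in function names, not the webhook's real helpers):

```go
package main

import (
	"errors"
	"fmt"
)

// loadState and lastDownscale stand in for zt.loadZones and
// zt.findDownscalesDoneMinTimeAgo; they only return errors upward.
func loadState() error               { return errors.New("zone file unreadable") }
func lastDownscale() (string, error) { return "", nil }

// checkRecentDownscale wraps both steps, adding context to errors and
// leaving logging (and the deny response) to a single call site.
func checkRecentDownscale() (string, error) {
	if err := loadState(); err != nil {
		return "", fmt.Errorf("loading zones: %w", err)
	}
	return lastDownscale()
}

func main() {
	if _, err := checkRecentDownscale(); err != nil {
		// The one place that logs and denies.
		fmt.Println("downscale denied:", err)
	}
}
```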
The hunk continues:

```diff
+				ar.Request.Resource.Resource, ar.Request.Name, ar.Request.Namespace, *oldReplicas, *newReplicas,
+			)
+		}
+		if foundSts != nil {
+			msg := fmt.Sprintf("downscale of %s/%s in %s from %d to %d replicas is not allowed because statefulset %v was downscaled at %v and is labelled to wait %s between zone downscales",
+				ar.Request.Resource.Resource, ar.Request.Name, ar.Request.Namespace, *oldReplicas, *newReplicas, foundSts.name, foundSts.lastDownscaleTime, foundSts.waitTime)
+			level.Warn(logger).Log("msg", msg, "err", err)
+			return deny(msg)
+		}
+	} else {
+		foundSts, err := findDownscalesDoneMinTimeAgo(stsList, ar.Request.Name)
+		if err != nil {
+			level.Warn(logger).Log("msg", "downscale not allowed due to error while parsing downscale annotations", "err", err)
```

Review comment (on the `else` fallback to findDownscalesDoneMinTimeAgo): Do we still need the old way? If the initial load of the zones adds the waiting time to the initial times, won't that be enough to migrate?

Reply: The use of the two conditionals is to maintain compatibility for users of the …

Review comment (on the warn log line): Same here.
The diff resumes:

```diff
+			return deny(
+				"downscale of %s/%s in %s from %d to %d replicas is not allowed because parsing downscale annotations failed.",
+				ar.Request.Resource.Resource, ar.Request.Name, ar.Request.Namespace, *oldReplicas, *newReplicas,
+			)
+		}
+		if foundSts != nil {
+			msg := fmt.Sprintf("downscale of %s/%s in %s from %d to %d replicas is not allowed because statefulset %v was downscaled at %v and is labelled to wait %s between zone downscales",
+				ar.Request.Resource.Resource, ar.Request.Name, ar.Request.Namespace, *oldReplicas, *newReplicas, foundSts.name, foundSts.lastDownscaleTime, foundSts.waitTime)
+			level.Warn(logger).Log("msg", msg, "err", err)
+			return deny(msg)
+		}

-	foundSts = findStatefulSetWithNonUpdatedReplicas(ctx, api, ar.Request.Namespace, stsList)
-	if foundSts != nil {
-		msg := fmt.Sprintf("downscale of %s/%s in %s from %d to %d replicas is not allowed because statefulset %v has %d non-updated replicas and %d non-ready replicas",
-			ar.Request.Resource.Resource, ar.Request.Name, ar.Request.Namespace, *oldReplicas, *newReplicas, foundSts.name, foundSts.nonUpdatedReplicas, foundSts.nonReadyReplicas)
-		level.Warn(logger).Log("msg", msg)
-		return deny(msg)
-	}
+		foundSts = findStatefulSetWithNonUpdatedReplicas(ctx, api, ar.Request.Namespace, stsList)
+		if foundSts != nil {
+			msg := fmt.Sprintf("downscale of %s/%s in %s from %d to %d replicas is not allowed because statefulset %v has %d non-updated replicas and %d non-ready replicas",
+				ar.Request.Resource.Resource, ar.Request.Name, ar.Request.Namespace, *oldReplicas, *newReplicas, foundSts.name, foundSts.nonUpdatedReplicas, foundSts.nonReadyReplicas)
+			level.Warn(logger).Log("msg", msg)
+			return deny(msg)
+		}
+	}
```
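The zoneTracker methods called above (loadZones, findDownscalesDoneMinTimeAgo) are defined elsewhere in this PR and not shown in the diff. For orientation, a rough sketch of what a tracker keyed on a zone file in object storage could look like; the struct fields, JSON layout, and signatures are assumptions (for example, the real loadZones also takes the statefulset list, presumably to seed entries for zones that aren't in the file yet):

```go
package admission

import (
	"context"
	"encoding/json"
	"time"

	"github.com/thanos-io/objstore"
)

// zoneTracker as sketched here is inferred from the call sites in this diff;
// the PR's real struct, file format, and method signatures may differ.
type zoneTracker struct {
	bkt   objstore.Bucket
	key   string            // object name of the shared zone file, e.g. "testkey"
	zones map[string]string // zone (statefulset) name -> RFC3339 last-downscale time
}

func newZoneTracker(bkt objstore.Bucket, key string) *zoneTracker {
	return &zoneTracker{bkt: bkt, key: key, zones: map[string]string{}}
}

// loadZones reads and decodes the shared zone file from object storage.
func (z *zoneTracker) loadZones(ctx context.Context) error {
	r, err := z.bkt.Get(ctx, z.key)
	if err != nil {
		return err
	}
	defer r.Close()
	return json.NewDecoder(r).Decode(&z.zones)
}

// lastDownscaled reports when a zone was last downscaled, if recorded.
func (z *zoneTracker) lastDownscaled(zone string) (time.Time, bool) {
	s, ok := z.zones[zone]
	if !ok {
		return time.Time{}, false
	}
	t, err := time.Parse(time.RFC3339, s)
	return t, err == nil
}
```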
```diff
@@ -241,13 +277,23 @@ func prepareDownscale(ctx context.Context, logger log.Logger, ar v1.AdmissionRev
 		)
 	}

-	err = addDownscaledAnnotationToStatefulSet(ctx, api, ar.Request.Namespace, ar.Request.Name)
-	if err != nil {
-		level.Error(logger).Log("msg", "downscale not allowed due to error while adding annotation", "err", err)
-		return deny(
-			"downscale of %s/%s in %s from %d to %d replicas is not allowed because adding an annotation to the statefulset failed.",
-			ar.Request.Resource.Resource, ar.Request.Name, ar.Request.Namespace, *oldReplicas, *newReplicas,
-		)
+	if zt != nil {
+		if err := zt.setDownscaled(ctx, ar.Request.Name); err != nil {
+			level.Error(logger).Log("msg", "downscale not allowed due to error while setting downscale timestamp in the zone file", "err", err)
+			return deny(
+				"downscale of %s/%s in %s from %d to %d replicas is not allowed because setting downscale timestamp in the zone file failed.",
+				ar.Request.Resource.Resource, ar.Request.Name, ar.Request.Namespace, *oldReplicas, *newReplicas,
+			)
+		}
+	} else {
+		err := addDownscaledAnnotationToStatefulSet(ctx, api, ar.Request.Namespace, ar.Request.Name)
+		if err != nil {
+			level.Error(logger).Log("msg", "downscale not allowed due to error while adding annotation", "err", err)
+			return deny(
+				"downscale of %s/%s in %s from %d to %d replicas is not allowed because adding an annotation to the statefulset failed.",
+				ar.Request.Resource.Resource, ar.Request.Name, ar.Request.Namespace, *oldReplicas, *newReplicas,
+			)
+		}
 	}

 	// Down-scale operation is allowed because all pods successfully prepared for shutdown
```
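Continuing the earlier sketch for the setDownscaled call above; again an assumption-laden outline, not the PR's implementation:

```go
// Continuing the zoneTracker sketch (add "bytes" to its imports):
// setDownscaled records the current time for a zone and rewrites the whole
// zone file. The PR's actual encoding and read-modify-write handling may differ.
func (z *zoneTracker) setDownscaled(ctx context.Context, zone string) error {
	z.zones[zone] = time.Now().UTC().Format(time.RFC3339)
	data, err := json.Marshal(z.zones)
	if err != nil {
		return err
	}
	return z.bkt.Upload(ctx, z.key, bytes.NewReader(data))
}
```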
Review comment (on the zoneTracker wiring in PrepareDownscale): Like I mentioned in our 1:1, I think the easiest option for now is to turn the zoneTracker param into a string. If it's not nil, require that it be one of gcs, s3, whatever azure storage is called, etc. That's probably easier than trying to wire up a config file, a ConfigMap that populates that config file, etc., so that there could be a zoneTracker storage config in the file that specifies the storage backend.

Reply: I ended up adding the necessary config with flags passed to the zoneTracker instantiation before passing zt through to prepareDownscale.
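A minimal sketch of flag-based wiring like that reply describes; the flag names and defaults are hypothetical, and the real rollout-operator flags may differ:

```go
package main

import (
	"flag"
	"fmt"
)

func main() {
	// Hypothetical flags; the real operator's names and defaults may differ.
	useZoneTracker := flag.Bool("use-zone-tracker", false,
		"coordinate downscales via a zone file in object storage instead of statefulset annotations")
	zoneTrackerBackend := flag.String("zone-tracker.backend", "memory",
		"object storage backend for the zone file (e.g. gcs, s3, azure)")
	zoneTrackerKey := flag.String("zone-tracker.key", "testkey",
		"object name of the shared zone file")
	flag.Parse()

	fmt.Println(*useZoneTracker, *zoneTrackerBackend, *zoneTrackerKey)
}
```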