From 8460f72283f115088bdba0d3e17767a43e52d706 Mon Sep 17 00:00:00 2001
From: Sascha Schwarze <schwarzs@de.ibm.com>
Date: Thu, 16 Jan 2025 08:59:41 +0100
Subject: [PATCH] Ensure ContainerHealthy condition is set back to True

Co-authored-by: Matthias Diester <matthias.diester@de.ibm.com>
Signed-off-by: Sascha Schwarze <schwarzs@de.ibm.com>
---
 pkg/reconciler/autoscaling/hpa/hpa_test.go |  2 +-
 .../revision/reconcile_resources.go        |  9 ++-
 pkg/reconciler/revision/table_test.go      | 79 ++++++++++++++++++-
 pkg/testing/functional.go                  | 26 +++++-
 pkg/testing/v1/revision.go                 | 15 ++++
 5 files changed, 124 insertions(+), 7 deletions(-)

diff --git a/pkg/reconciler/autoscaling/hpa/hpa_test.go b/pkg/reconciler/autoscaling/hpa/hpa_test.go
index bcbca4983a0e..c7793c99e1b5 100644
--- a/pkg/reconciler/autoscaling/hpa/hpa_test.go
+++ b/pkg/reconciler/autoscaling/hpa/hpa_test.go
@@ -331,7 +331,7 @@ func TestReconcile(t *testing.T) {
 		Name: "nop deletion reconcile",
 		// Test that with a DeletionTimestamp we do nothing.
 		Objects: []runtime.Object{
-			pa(testNamespace, testRevision, WithHPAClass, WithPADeletionTimestamp),
+			WithDeletionTimestamp(pa(testNamespace, testRevision, WithHPAClass)),
 			deploy(testNamespace, testRevision),
 		},
 		Key: key(testNamespace, testRevision),
diff --git a/pkg/reconciler/revision/reconcile_resources.go b/pkg/reconciler/revision/reconcile_resources.go
index 3fa6f07d24fe..cdb90c553794 100644
--- a/pkg/reconciler/revision/reconcile_resources.go
+++ b/pkg/reconciler/revision/reconcile_resources.go
@@ -76,7 +76,10 @@ func (c *Reconciler) reconcileDeployment(ctx context.Context, rev *v1.Revision)
 	// If a container keeps crashing (no active pods in the deployment although we want some)
 	if *deployment.Spec.Replicas > 0 && deployment.Status.AvailableReplicas == 0 {
-		pods, err := c.kubeclient.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{LabelSelector: metav1.FormatLabelSelector(deployment.Spec.Selector)})
+		pods, err := c.kubeclient.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{
+			LabelSelector: metav1.FormatLabelSelector(deployment.Spec.Selector),
+			Limit:         1,
+		})
 		if err != nil {
 			logger.Errorw("Error getting pods", zap.Error(err))
 			return nil
 		}
@@ -117,6 +120,10 @@ func (c *Reconciler) reconcileDeployment(ctx context.Context, rev *v1.Revision)
 		}
 	}
 
+	if deployment.Status.ReadyReplicas > 0 {
+		rev.Status.MarkContainerHealthyTrue()
+	}
+
 	return nil
 }
 
diff --git a/pkg/reconciler/revision/table_test.go b/pkg/reconciler/revision/table_test.go
index 57f020700096..9ef47b0b5338 100644
--- a/pkg/reconciler/revision/table_test.go
+++ b/pkg/reconciler/revision/table_test.go
@@ -41,6 +41,7 @@ import (
 	tracingconfig "knative.dev/pkg/tracing/config"
 	autoscalingv1alpha1 "knative.dev/serving/pkg/apis/autoscaling/v1alpha1"
 	defaultconfig "knative.dev/serving/pkg/apis/config"
+	"knative.dev/serving/pkg/apis/serving"
 	v1 "knative.dev/serving/pkg/apis/serving/v1"
 	"knative.dev/serving/pkg/autoscaler/config/autoscalerconfig"
 	servingclient "knative.dev/serving/pkg/client/injection/client"
@@ -544,7 +545,7 @@ func TestReconcile(t *testing.T) {
 			WithRoutingState(v1.RoutingStateActive, fc),
 		),
 		pa("foo", "pull-backoff", WithReachabilityReachable), // pa can't be ready since deployment times out.
-		pod(t, "foo", "pull-backoff", WithWaitingContainer("pull-backoff", "ImagePullBackoff", "can't pull it")),
+		pod(t, "foo", "pull-backoff", WithPodCondition(corev1.PodReady, corev1.ConditionFalse, "ContainersNotReady"), WithWaitingContainer("pull-backoff", "ImagePullBackoff", "can't pull it")),
 		timeoutDeploy(deploy(t, "foo", "pull-backoff"), "Timed out!"),
 		image("foo", "pull-backoff"),
 	},
@@ -570,7 +571,7 @@ func TestReconcile(t *testing.T) {
 			WithRoutingState(v1.RoutingStateActive, fc),
 			WithLogURL, allUnknownConditions, MarkActive),
 		pa("foo", "pod-error", WithReachabilityReachable), // PA can't be ready, since no traffic.
-		pod(t, "foo", "pod-error", WithFailingContainer("pod-error", 5, "I failed man!")),
+		pod(t, "foo", "pod-error", WithPodCondition(corev1.PodReady, corev1.ConditionFalse, "ContainersNotReady"), WithFailingContainer("pod-error", 5, "I failed man!")),
 		deploy(t, "foo", "pod-error"),
 		image("foo", "pod-error"),
 	},
@@ -744,6 +745,60 @@ func TestReconcile(t *testing.T) {
 			PodSpecPersistentVolumeClaim: defaultconfig.Enabled,
 			PodSpecPersistentVolumeWrite: defaultconfig.Enabled,
 		}}),
+	}, {
+		Name: "revision's ContainerHealthy turns back to True if the deployment is healthy",
+		Objects: []runtime.Object{
+			Revision("foo", "container-unhealthy",
+				WithLogURL,
+				MarkRevisionReady,
+				withDefaultContainerStatuses(),
+				WithRevisionLabel(serving.RoutingStateLabelKey, "active"),
+				MarkContainerHealthyFalse("ExitCode137"),
+			),
+			pa("foo", "container-unhealthy",
+				WithPASKSReady,
+				WithScaleTargetInitialized,
+				WithTraffic,
+				WithReachabilityReachable,
+				WithPAStatusService("something"),
+			),
+			readyDeploy(deploy(t, "foo", "container-unhealthy", withReplicas(1))),
+			image("foo", "container-unhealthy"),
+		},
+		Key: "foo/container-unhealthy",
+		WantStatusUpdates: []clientgotesting.UpdateActionImpl{{
+			Object: Revision("foo", "container-unhealthy",
+				WithLogURL,
+				MarkRevisionReady,
+				withDefaultContainerStatuses(),
+				WithRevisionLabel(serving.RoutingStateLabelKey, "active"),
+			),
+		}},
+		WantEvents: []string{
+			Eventf(corev1.EventTypeNormal, "RevisionReady", "Revision becomes ready upon all resources being ready"),
+		},
+	}, {
+		Name: "revision's ContainerHealthy stays True even if the last Pod with restarts terminates",
+		Objects: []runtime.Object{
+			Revision("foo", "container-healthy",
+				WithLogURL,
+				MarkRevisionReady,
+				withDefaultContainerStatuses(),
+				WithRevisionLabel(serving.RoutingStateLabelKey, "active"),
+				MarkContainerHealthyTrue(),
+			),
+			pa("foo", "container-healthy",
+				WithPASKSReady,
+				WithScaleTargetInitialized,
+				WithTraffic,
+				WithReachabilityReachable,
+				WithPAStatusService("something"),
+			),
+			WithDeletionTimestamp(pod(t, "foo", "container-healthy", WithPodCondition(corev1.PodReady, corev1.ConditionFalse, "ContainersNotReady"), WithFailingContainer("crashed-at-some-point", 128, "OOMKilled"))),
+			readyDeploy(deploy(t, "foo", "container-healthy", withReplicas(0))),
+			image("foo", "container-healthy"),
+		},
+		Key: "foo/container-healthy",
 	}}

 	table.Test(t, MakeFactory(func(ctx context.Context, listers *Listers, _ configmap.Watcher) controller.Reconciler {
@@ -853,6 +908,19 @@ func allUnknownConditions(r *v1.Revision) {
 
 type configOption func(*config.Config)
 
+type deploymentOption func(*appsv1.Deployment)
+
+// withReplicas configures the number of replicas on the Deployment
+func withReplicas(replicas int32) deploymentOption {
+	return func(d *appsv1.Deployment) {
+		d.Spec.Replicas = &replicas
+		d.Status.AvailableReplicas = replicas
+		d.Status.ReadyReplicas = replicas
+		d.Status.Replicas = replicas
+		d.Status.UpdatedReplicas = replicas
+	}
+}
+
 func deploy(t *testing.T, namespace, name string, opts ...interface{}) *appsv1.Deployment {
 	t.Helper()
 	cfg := reconcilerTestConfig()
@@ -878,6 +946,13 @@ func deploy(t *testing.T, namespace, name string, opts ...interface{}) *appsv1.D
 	if err != nil {
 		t.Fatal("failed to create deployment")
 	}
+
+	for _, opt := range opts {
+		if deploymentOpt, ok := opt.(deploymentOption); ok {
+			deploymentOpt(deployment)
+		}
+	}
+
 	return deployment
 }
 
diff --git a/pkg/testing/functional.go b/pkg/testing/functional.go
index f0ee41eb7658..176d1e73abf2 100644
--- a/pkg/testing/functional.go
+++ b/pkg/testing/functional.go
@@ -132,10 +132,11 @@ func WithNoTraffic(reason, message string) PodAutoscalerOption {
 	}
 }
 
-// WithPADeletionTimestamp will set the DeletionTimestamp on the PodAutoscaler.
-func WithPADeletionTimestamp(r *autoscalingv1alpha1.PodAutoscaler) {
+// WithDeletionTimestamp will set the DeletionTimestamp on the object.
+func WithDeletionTimestamp[T metav1.Object](obj T) T {
 	t := metav1.NewTime(time.Unix(1e9, 0))
-	r.ObjectMeta.SetDeletionTimestamp(&t)
+	obj.SetDeletionTimestamp(&t)
+	return obj
 }
 
 // WithHPAClass updates the PA to add the hpa class annotation.
@@ -288,6 +289,25 @@ func WithEndpointsOwnersRemoved(eps *corev1.Endpoints) {
 // PodOption enables further configuration of a Pod.
 type PodOption func(*corev1.Pod)
 
+// WithPodCondition sets a condition in the status
+func WithPodCondition(conditionType corev1.PodConditionType, status corev1.ConditionStatus, reason string) PodOption {
+	return func(pod *corev1.Pod) {
+		for i, condition := range pod.Status.Conditions {
+			if condition.Type == conditionType {
+				pod.Status.Conditions[i].Status = status
+				pod.Status.Conditions[i].Reason = reason
+				return
+			}
+		}
+
+		pod.Status.Conditions = append(pod.Status.Conditions, corev1.PodCondition{
+			Type:   conditionType,
+			Status: status,
+			Reason: reason,
+		})
+	}
+}
+
 // WithFailingContainer sets the .Status.ContainerStatuses on the pod to
 // include a container named accordingly to fail with the given state.
 func WithFailingContainer(name string, exitCode int, message string) PodOption {
diff --git a/pkg/testing/v1/revision.go b/pkg/testing/v1/revision.go
index a475bad0aa40..a31c3b218f9f 100644
--- a/pkg/testing/v1/revision.go
+++ b/pkg/testing/v1/revision.go
@@ -152,12 +152,27 @@ func MarkDeploying(reason string) RevisionOption {
 	}
 }
 
+// MarkContainerHealthyUnknown changes the ContainerHealthy condition to Unknown with the given reason
 func MarkContainerHealthyUnknown(reason string) RevisionOption {
 	return func(r *v1.Revision) {
 		r.Status.MarkContainerHealthyUnknown(reason, "")
 	}
 }
 
+// MarkContainerHealthyFalse changes the ContainerHealthy condition to False with the given reason
+func MarkContainerHealthyFalse(reason string) RevisionOption {
+	return func(r *v1.Revision) {
+		r.Status.MarkContainerHealthyFalse(reason, "")
+	}
+}
+
+// MarkContainerHealthyTrue changes the ContainerHealthy condition to True
+func MarkContainerHealthyTrue() RevisionOption {
+	return func(r *v1.Revision) {
+		r.Status.MarkContainerHealthyTrue()
+	}
+}
+
 // MarkProgressDeadlineExceeded calls the method of the same name on the Revision
 // with the message we expect the Revision Reconciler to pass.
 func MarkProgressDeadlineExceeded(message string) RevisionOption {