diff --git a/src/k8s/pkg/k8sd/app/hooks_bootstrap.go b/src/k8s/pkg/k8sd/app/hooks_bootstrap.go index ec93418a9e..ab8436843d 100644 --- a/src/k8s/pkg/k8sd/app/hooks_bootstrap.go +++ b/src/k8s/pkg/k8sd/app/hooks_bootstrap.go @@ -283,6 +283,17 @@ func (a *App) onBootstrapWorkerNode(ctx context.Context, s state.State, encodedT return fmt.Errorf("failed after retry: %w", err) } + log.Info("Checking worker node services") + if err := control.RetryFor(ctx, 3, 5*time.Second, func() error { + if err := snaputil.CheckWorkerServicesStates(ctx, snap, "active"); err != nil { + return fmt.Errorf("failed to ensure all worker services are active: %w", err) + } + return nil + }); err != nil { + return fmt.Errorf("failed after retry: %w", err) + } + + log.Info("Worker node services are ready") return nil } @@ -508,8 +519,18 @@ func (a *App) onBootstrapControlPlane(ctx context.Context, s state.State, bootst if err := waitApiServerReady(ctx, snap); err != nil { return fmt.Errorf("kube-apiserver did not become ready in time: %w", err) } - log.Info("API server is ready - notify controllers") + log.Info("API server is ready - checking control plane services") + if err := control.RetryFor(ctx, 3, 5*time.Second, func() error { + if err := snaputil.CheckControlPlaneServicesStates(ctx, snap, "active"); err != nil { + return fmt.Errorf("failed to ensure all control plane services are active: %w", err) + } + return nil + }); err != nil { + return fmt.Errorf("failed after retry: %w", err) + } + + log.Info("Control plane services are ready - notify controllers") a.NotifyFeatureController( cfg.Network.GetEnabled(), cfg.Gateway.GetEnabled(), diff --git a/src/k8s/pkg/k8sd/app/hooks_join.go b/src/k8s/pkg/k8sd/app/hooks_join.go index 5bf692f057..546907fa1d 100644 --- a/src/k8s/pkg/k8sd/app/hooks_join.go +++ b/src/k8s/pkg/k8sd/app/hooks_join.go @@ -11,6 +11,7 @@ import ( "github.com/canonical/k8s/pkg/k8sd/setup" "github.com/canonical/k8s/pkg/k8sd/types" "github.com/canonical/k8s/pkg/log" + snaputil "github.com/canonical/k8s/pkg/snap/util" "github.com/canonical/k8s/pkg/utils" "github.com/canonical/k8s/pkg/utils/control" "github.com/canonical/k8s/pkg/utils/experimental/snapdconfig" @@ -236,5 +237,16 @@ func (a *App) onPostJoin(ctx context.Context, s state.State, initConfig map[stri return fmt.Errorf("failed to wait for kube-apiserver to become ready: %w", err) } + log.Info("API server is ready - checking control plane services") + if err := control.RetryFor(ctx, 3, 5*time.Second, func() error { + if err := snaputil.CheckControlPlaneServicesStates(ctx, snap, "active"); err != nil { + return fmt.Errorf("failed to ensure all control plane services are active: %w", err) + } + return nil + }); err != nil { + return fmt.Errorf("failed after retry: %w", err) + } + + log.Info("Control plane node services are ready") return nil } diff --git a/src/k8s/pkg/snap/interface.go b/src/k8s/pkg/snap/interface.go index ed1d8d2233..ef6a27320b 100644 --- a/src/k8s/pkg/snap/interface.go +++ b/src/k8s/pkg/snap/interface.go @@ -19,9 +19,10 @@ type Snap interface { GID() int // GID is the group ID to set on config files. Hostname() string // Hostname is the name of the node. - StartService(ctx context.Context, serviceName string) error // snapctl start $service - StopService(ctx context.Context, serviceName string) error // snapctl stop $service - RestartService(ctx context.Context, serviceName string) error // snapctl restart $service + StartService(ctx context.Context, serviceName string) error // snapctl start $service + StopService(ctx context.Context, serviceName string) error // snapctl stop $service + RestartService(ctx context.Context, serviceName string) error // snapctl restart $service + GetServiceState(ctx context.Context, serviceName string) (string, error) // snapctl services $service SnapctlGet(ctx context.Context, args ...string) ([]byte, error) // snapctl get $args... SnapctlSet(ctx context.Context, args ...string) error // snapctl set $args... diff --git a/src/k8s/pkg/snap/mock/mock.go b/src/k8s/pkg/snap/mock/mock.go index 22abd0c6e7..2363cb8b95 100644 --- a/src/k8s/pkg/snap/mock/mock.go +++ b/src/k8s/pkg/snap/mock/mock.go @@ -98,6 +98,10 @@ func (s *Snap) RestartService(ctx context.Context, name string) error { return s.RestartServiceErr } +func (s *Snap) GetServiceState(ctx context.Context, name string) (string, error) { + return "active", nil +} + func (s *Snap) Refresh(ctx context.Context, opts types.RefreshOpts) (string, error) { if len(s.RefreshCalledWith) == 0 { s.RefreshCalledWith = []types.RefreshOpts{opts} diff --git a/src/k8s/pkg/snap/pebble.go b/src/k8s/pkg/snap/pebble.go index c4a67019e8..5c2d93fdfb 100644 --- a/src/k8s/pkg/snap/pebble.go +++ b/src/k8s/pkg/snap/pebble.go @@ -1,10 +1,12 @@ package snap import ( + "bytes" "context" "fmt" "os/exec" "path/filepath" + "strings" "time" "github.com/canonical/k8s/pkg/k8sd/types" @@ -56,6 +58,31 @@ func (s *pebble) RestartService(ctx context.Context, name string) error { return s.runCommand(ctx, []string{filepath.Join(s.snapDir, "bin", "pebble"), "restart", name}) } +// GetServiceState returns a k8s service state. The name can be either prefixed or not. +func (s *pebble) GetServiceState(ctx context.Context, name string) (string, error) { + var b bytes.Buffer + err := s.runCommand(ctx, []string{filepath.Join(s.snapDir, "bin", "pebble"), "services", name}, func(c *exec.Cmd) { c.Stdout = &b }) + if err != nil { + return "", err + } + + output := b.String() + // We're expecting output like this: + // Service Startup Current Since + // kubelet enabled inactive - + lines := strings.Split(output, "\n") + if len(lines) < 2 { + return "", fmt.Errorf("Unexpected output when checking service %s state", name) + } + + fields := strings.Fields(lines[1]) + if len(fields) < 3 || (!strings.EqualFold(stateActive, fields[2]) && !strings.EqualFold(stateInactive, fields[2])) { + return "", fmt.Errorf("Unexpected output when checking service %s state", name) + } + + return fields[2], nil +} + func (s *pebble) Refresh(ctx context.Context, to types.RefreshOpts) (string, error) { switch { case to.Revision != "": diff --git a/src/k8s/pkg/snap/snap.go b/src/k8s/pkg/snap/snap.go index ea61ec8621..a42c8ddbbd 100644 --- a/src/k8s/pkg/snap/snap.go +++ b/src/k8s/pkg/snap/snap.go @@ -24,6 +24,11 @@ import ( "k8s.io/cli-runtime/pkg/genericclioptions" ) +const ( + stateActive = "active" + stateInactive = "inactive" +) + type SnapOpts struct { SnapInstanceName string SnapDir string @@ -85,6 +90,33 @@ func (s *snap) RestartService(ctx context.Context, name string) error { return s.runCommand(ctx, []string{"snapctl", "restart", serviceName(name)}) } +// GetServiceState returns a k8s service state. The name can be either prefixed or not. +func (s *snap) GetServiceState(ctx context.Context, name string) (string, error) { + log.FromContext(ctx).V(2).WithCallDepth(1).Info("Getting service state", "service", name) + + var b bytes.Buffer + err := s.runCommand(ctx, []string{"snapctl", "services", serviceName(name)}, func(c *exec.Cmd) { c.Stdout = &b }) + if err != nil { + return "", err + } + + output := b.String() + // We're expecting output like this: + // Service Startup Current Notes + // k8s.kubelet enabled inactive - + lines := strings.Split(output, "\n") + if len(lines) < 2 { + return "", fmt.Errorf("Unexpected output when checking service %s state", name) + } + + fields := strings.Fields(lines[1]) + if len(fields) < 3 || (!strings.EqualFold(stateActive, fields[2]) && !strings.EqualFold(stateInactive, fields[2])) { + return "", fmt.Errorf("Unexpected output when checking service %s state", name) + } + + return fields[2], nil +} + // Refresh refreshes the snap to a different track, revision or custom snap. func (s *snap) Refresh(ctx context.Context, to types.RefreshOpts) (string, error) { if s.Strict() { diff --git a/src/k8s/pkg/snap/util/services.go b/src/k8s/pkg/snap/util/services.go index f6ae568624..c8a7d9edca 100644 --- a/src/k8s/pkg/snap/util/services.go +++ b/src/k8s/pkg/snap/util/services.go @@ -3,6 +3,7 @@ package snaputil import ( "context" "fmt" + "strings" "github.com/canonical/k8s/pkg/snap" ) @@ -98,6 +99,34 @@ func StopK8sDqliteServices(ctx context.Context, snap snap.Snap) error { return nil } +// CheckWorkerServicesStates checks if the worker services are in the expected state. +// CheckWorkerServicesStates will return on the first error or service state mismatch. +func CheckWorkerServicesStates(ctx context.Context, snap snap.Snap, state string) error { + return checkServicesStates(ctx, snap, workerServices, state) +} + +// CheckControlPlaneServicesStates checks if the control plane services are in the expected state. +// CheckControlPlaneServicesStates will return on the first error or service state mismatch. +func CheckControlPlaneServicesStates(ctx context.Context, snap snap.Snap, state string) error { + return checkServicesStates(ctx, snap, controlPlaneServices, state) +} + +// CheckK8sDqliteServices checks if the k8s-dqlite datastore service is in the expected state. +func CheckK8sDqliteServices(ctx context.Context, snap snap.Snap, state string) error { + return checkServicesStates(ctx, snap, []string{"k8s-dqlite"}, state) +} + +func checkServicesStates(ctx context.Context, snap snap.Snap, services []string, state string) error { + for _, service := range services { + if actualState, err := snap.GetServiceState(ctx, service); err != nil { + return fmt.Errorf("failed to check service %s: %w", service, err) + } else if !strings.EqualFold(state, actualState) { + return fmt.Errorf("expected %s to be in state %s, but it is %s", service, state, actualState) + } + } + return nil +} + // ServiceArgsFromMap processes a map of string pointers and categorizes them into update and delete lists. // - If the value pointer is nil, it adds the argument name to the delete list. // - If the value pointer is not nil, it adds the argument and its value to the update map.