From 1b22df48a41578d19fb512bd8111a481b64011e2 Mon Sep 17 00:00:00 2001 From: Noel Georgi Date: Mon, 19 Aug 2024 23:39:57 +0530 Subject: [PATCH] chore: support debug shell for advanced development Support dropping into a very minimal debug shell. ```bash sudo -E --preserve-env=HOME _out/talosctl-linux-amd64 cluster create --provisioner=qemu $REGISTRY_MIRROR_FLAGS --controlplanes=1 --workers=0 --with-bootloader=false --with-debug-shell ``` Co-authored-by: Dmitry Sharshakov Signed-off-by: Noel Georgi Signed-off-by: Dmitry Sharshakov --- Dockerfile | 34 +++++++++++++++++++ Makefile | 8 +++++ cmd/talosctl/cmd/mgmt/cluster/create.go | 26 ++++++++++++++ internal/pkg/mount/switchroot/switchroot.go | 11 ++++++ pkg/provision/options.go | 11 ++++++ pkg/provision/providers/qemu/launch.go | 9 +++++ pkg/provision/providers/qemu/node.go | 5 +++ .../content/v1.9/advanced/developing-talos.md | 4 +++ 8 files changed, 108 insertions(+) diff --git a/Dockerfile b/Dockerfile index 0eebd8b39e..cb91f8cca9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,6 +6,7 @@ ARG TOOLS ARG PKGS ARG EXTRAS ARG INSTALLER_ARCH +ARG DEBUG_TOOLS_SOURCE ARG PKGS_PREFIX ARG PKG_FHS @@ -42,6 +43,8 @@ ARG PKG_CNI ARG PKG_FLANNEL_CNI ARG PKG_TALOSCTL_CNI_BUNDLE_INSTALL +ARG DEBUG_TOOLS_SOURCE + # Resolve package images using ${PKGS} to be used later in COPY --from=. 
FROM ${PKG_FHS} AS pkg-fhs @@ -140,6 +143,29 @@ FROM ${PKG_KERNEL} AS pkg-kernel FROM --platform=amd64 ${PKG_KERNEL} AS pkg-kernel-amd64 FROM --platform=arm64 ${PKG_KERNEL} AS pkg-kernel-arm64 +FROM --platform=amd64 ${TOOLS} as tools-amd64 +FROM --platform=arm64 ${TOOLS} as tools-arm64 + +FROM scratch as pkg-debug-tools-scratch-amd64 +FROM scratch as pkg-debug-tools-scratch-arm64 + +FROM scratch as pkg-debug-tools-bash-minimal-amd64 +COPY --from=tools-amd64 /toolchain/bin/bash /toolchain/bin/bash +COPY --from=tools-amd64 /toolchain/lib/ld-musl-x86_64.so.1 /toolchain/toolchain/lib/ld-musl-x86_64.so.1 +COPY --from=tools-amd64 /toolchain/bin/cat /toolchain/bin/cat +COPY --from=tools-amd64 /toolchain/bin/ls /toolchain/bin/ls +COPY --from=tools-amd64 /toolchain/bin/tee /toolchain/bin/tee + +FROM scratch as pkg-debug-tools-bash-minimal-arm64 +COPY --from=tools-arm64 /toolchain/bin/bash /toolchain/bin/bash +COPY --from=tools-arm64 /toolchain/lib/ld-musl-aarch64.so.1 /toolchain/toolchain/lib/ld-musl-aarch64.so.1 +COPY --from=tools-arm64 /toolchain/bin/cat /toolchain/bin/cat +COPY --from=tools-arm64 /toolchain/bin/ls /toolchain/bin/ls +COPY --from=tools-arm64 /toolchain/bin/tee /toolchain/bin/tee + +FROM pkg-debug-tools-${DEBUG_TOOLS_SOURCE}-amd64 as pkg-debug-tools-amd64 +FROM pkg-debug-tools-${DEBUG_TOOLS_SOURCE}-arm64 as pkg-debug-tools-arm64 + # Strip CNI package. 
FROM scratch AS pkg-cni-stripped-amd64 @@ -651,6 +677,10 @@ COPY --link --from=pkg-kmod-amd64 /usr/lib/libkmod.* /rootfs/lib/ COPY --link --from=pkg-kmod-amd64 /usr/bin/kmod /rootfs/sbin/modprobe COPY --link --from=modules-amd64 /lib/modules /rootfs/lib/modules COPY --link --from=machined-build-amd64 /machined /rootfs/sbin/init + +# this is a no-op as it copies from a scratch image when WITH_DEBUG_SHELL is not set +COPY --link --from=pkg-debug-tools-amd64 * /rootfs/ + RUN </dev/nu ARTIFACTS := _out TOOLS ?= ghcr.io/siderolabs/tools:v1.9.0-alpha.0-4-g2058296 +DEBUG_TOOLS_SOURCE := scratch + PKGS_PREFIX ?= ghcr.io/siderolabs PKGS ?= v1.9.0-alpha.0-24-gbe92da0 EXTRAS ?= v1.9.0-alpha.0-1-geab6e58 @@ -147,6 +149,11 @@ else GO_LDFLAGS += -s -w endif +ifneq (, $(filter $(WITH_DEBUG_SHELL), t true TRUE y yes 1)) +# bash-minimal is a Dockerfile target that copies over the bash from siderolabs tools +DEBUG_TOOLS_SOURCE := bash-minimal +endif + GO_BUILDFLAGS_TALOSCTL := $(GO_BUILDFLAGS) -tags "$(GO_BUILDTAGS_TALOSCTL)" GO_BUILDFLAGS += -tags "$(GO_BUILDTAGS)" @@ -161,6 +168,7 @@ COMMON_ARGS += --progress=$(PROGRESS) COMMON_ARGS += --platform=$(PLATFORM) COMMON_ARGS += --push=$(PUSH) COMMON_ARGS += --build-arg=TOOLS=$(TOOLS) +COMMON_ARGS += --build-arg=DEBUG_TOOLS_SOURCE=$(DEBUG_TOOLS_SOURCE) COMMON_ARGS += --build-arg=PKGS=$(PKGS) COMMON_ARGS += --build-arg=EXTRAS=$(EXTRAS) COMMON_ARGS += --build-arg=GOFUMPT_VERSION=$(GOFUMPT_VERSION) diff --git a/cmd/talosctl/cmd/mgmt/cluster/create.go b/cmd/talosctl/cmd/mgmt/cluster/create.go index 33872039e2..a97b1cd9dd 100644 --- a/cmd/talosctl/cmd/mgmt/cluster/create.go +++ b/cmd/talosctl/cmd/mgmt/cluster/create.go @@ -87,6 +87,7 @@ const ( controlPlanePortFlag = "control-plane-port" firewallFlag = "with-firewall" tpm2EnabledFlag = "with-tpm2" + withDebugShellFlag = "with-debug-shell" // The following flags are the gen options - the options that are only used in machine configuration (i.e., not during the qemu/docker provisioning). 
// They are not applicable when no machine configuration is generated, hence mutually exclusive with the --input-dir flag. @@ -190,6 +191,7 @@ var ( withUUIDHostnames bool withSiderolinkAgent agentFlag withJSONLogs bool + debugShellEnabled bool ) // createCmd represents the cluster up command. @@ -470,6 +472,7 @@ func create(ctx context.Context) error { provision.WithBootlader(bootloaderEnabled), provision.WithUEFI(uefiEnabled), provision.WithTPM2(tpm2Enabled), + provision.WithDebugShell(debugShellEnabled), provision.WithExtraUEFISearchPaths(extraUEFISearchPaths), provision.WithTargetArch(targetArch), provision.WithSiderolinkAgent(withSiderolinkAgent.IsEnabled()), @@ -477,6 +480,12 @@ func create(ctx context.Context) error { var configBundleOpts []bundle.Option + if debugShellEnabled { + if provisionerName != "qemu" { + return errors.New("debug shell only supported with qemu provisioner") + } + } + if ports != "" { if provisionerName != docker { return errors.New("exposed-ports flag only supported with docker provisioner") @@ -968,6 +977,21 @@ func create(ctx context.Context) error { return err } + if debugShellEnabled { + fmt.Println("You can now connect to debug shell on any node using these commands:") + + for _, node := range request.Nodes { + talosDir, err := clientconfig.GetTalosDirectory() + if err != nil { + return nil + } + + fmt.Printf("socat - UNIX-CONNECT:%s\n", filepath.Join(talosDir, "clusters", clusterName, node.Name+".serial")) + } + + return nil + } + // No talosconfig in the bundle - skip the operations below if bundleTalosconfig == nil { return nil @@ -1206,6 +1230,8 @@ func init() { createCmd.Flags().BoolVar(&bootloaderEnabled, bootloaderEnabledFlag, true, "enable bootloader to load kernel and initramfs from disk image after install") createCmd.Flags().BoolVar(&uefiEnabled, "with-uefi", true, "enable UEFI on x86_64 architecture") createCmd.Flags().BoolVar(&tpm2Enabled, tpm2EnabledFlag, false, "enable TPM2 emulation support using swtpm") + 
createCmd.Flags().BoolVar(&debugShellEnabled, withDebugShellFlag, false, "drop talos into a maintenance shell on boot, this is for advanced debugging for developers only") + createCmd.Flags().MarkHidden("with-debug-shell") //nolint:errcheck createCmd.Flags().StringSliceVar(&extraUEFISearchPaths, "extra-uefi-search-paths", []string{}, "additional search paths for UEFI firmware (only applies when UEFI is enabled)") createCmd.Flags().StringSliceVar(&registryMirrors, registryMirrorFlag, []string{}, "list of registry mirrors to use in format: <registry host>=<mirror URL>") createCmd.Flags().StringSliceVar(&registryInsecure, registryInsecureFlag, []string{}, "list of registry hostnames to skip TLS verification for") diff --git a/internal/pkg/mount/switchroot/switchroot.go b/internal/pkg/mount/switchroot/switchroot.go index aa39fb9beb..61ba10377d 100644 --- a/internal/pkg/mount/switchroot/switchroot.go +++ b/internal/pkg/mount/switchroot/switchroot.go @@ -11,6 +11,7 @@ import ( "path/filepath" "github.com/siderolabs/go-debug" + "github.com/siderolabs/go-procfs/procfs" "golang.org/x/sys/unix" "github.com/siderolabs/talos/internal/pkg/mount" @@ -28,6 +29,8 @@ var preservedPaths = map[string]struct{}{ // Switch moves the rootfs to a specified directory. See // https://github.com/karelzak/util-linux/blob/master/sys-utils/switch_root.c. 
+// +//nolint:gocyclo func Switch(prefix string, mountpoints *mount.Points) (err error) { log.Println("moving mounts to the new rootfs") @@ -88,6 +91,14 @@ func Switch(prefix string, mountpoints *mount.Points) (err error) { log.Printf("race detection enabled with halt_on_error=1") } + if val := procfs.ProcCmdline().Get("talos.debugshell"); val != nil { + if err = unix.Exec("/bin/bash", []string{"/bin/bash"}, envv); err != nil { + return fmt.Errorf("error executing /bin/bash: %w", err) + } + + return nil + } + if err = unix.Exec("/sbin/init", []string{"/sbin/init"}, envv); err != nil { return fmt.Errorf("error executing /sbin/init: %w", err) } diff --git a/pkg/provision/options.go b/pkg/provision/options.go index f52441bd0b..dd7ef74afd 100644 --- a/pkg/provision/options.go +++ b/pkg/provision/options.go @@ -79,6 +79,15 @@ func WithTPM2(enabled bool) Option { } } +// WithDebugShell drops into debug shell in initramfs. +func WithDebugShell(enabled bool) Option { + return func(o *Options) error { + o.WithDebugShell = enabled + + return nil + } +} + // WithExtraUEFISearchPaths configures additional search paths to look for UEFI firmware. func WithExtraUEFISearchPaths(extraUEFISearchPaths []string) Option { return func(o *Options) error { @@ -166,6 +175,8 @@ type Options struct { UEFIEnabled bool // Enable TPM2 emulation using swtpm. TPM2Enabled bool + // Enable debug shell in the bootloader. + WithDebugShell bool // Configure additional search paths to look for UEFI firmware. 
ExtraUEFISearchPaths []string diff --git a/pkg/provision/providers/qemu/launch.go b/pkg/provision/providers/qemu/launch.go index 509fb338a8..88a6e05583 100644 --- a/pkg/provision/providers/qemu/launch.go +++ b/pkg/provision/providers/qemu/launch.go @@ -56,6 +56,7 @@ type LaunchConfig struct { NodeUUID uuid.UUID BadRTC bool ArchitectureData Arch + WithDebugShell bool // Talos config Config string @@ -320,6 +321,14 @@ func launchVM(config *LaunchConfig) error { "pause", } + if config.WithDebugShell { + args = append( + args, + "-serial", + fmt.Sprintf("unix:%s/%s.serial,server,nowait", config.StatePath, config.Hostname), + ) + } + var ( scsiAttached, ahciAttached, nvmeAttached bool ahciBus int diff --git a/pkg/provision/providers/qemu/node.go b/pkg/provision/providers/qemu/node.go index e7f60e725e..472a40b990 100644 --- a/pkg/provision/providers/qemu/node.go +++ b/pkg/provision/providers/qemu/node.go @@ -89,6 +89,10 @@ func (p *provisioner) createNode(state *vm.State, clusterReq provision.ClusterRe } } + if opts.WithDebugShell { + cmdline.Append("talos.debugshell", "") + } + var nodeConfig string if !nodeReq.SkipInjectingConfig { @@ -157,6 +161,7 @@ func (p *provisioner) createNode(state *vm.State, clusterReq provision.ClusterRe TFTPServer: nodeReq.TFTPServer, IPXEBootFileName: nodeReq.IPXEBootFilename, APIPort: apiPort, + WithDebugShell: opts.WithDebugShell, } if clusterReq.IPXEBootScript != "" { diff --git a/website/content/v1.9/advanced/developing-talos.md b/website/content/v1.9/advanced/developing-talos.md index 26b931cada..66a288ca62 100644 --- a/website/content/v1.9/advanced/developing-talos.md +++ b/website/content/v1.9/advanced/developing-talos.md @@ -177,6 +177,10 @@ Specfic tests can be run with `-test.run=TestIntegration/api.ResetSuite`. `make WITH_DEBUG=1` enables Go profiling and other debug features, useful for local development. +`make initramfs WITH_DEBUG_SHELL=true` adds bash and minimal utilities for debugging purposes. 
+Combine it with the `--with-debug-shell` flag when creating a cluster to obtain shell access. +This is rarely needed, because in this mode the bash shell runs in place of machined. + ## Destroying Cluster ```bash