diff --git a/templates/configmap-host-support-bundle.yaml b/templates/configmap-host-support-bundle.yaml
new file mode 100644
index 0000000..9a33372
--- /dev/null
+++ b/templates/configmap-host-support-bundle.yaml
@@ -0,0 +1,10 @@
+# Publishes troubleshoot/embedded-cluster-host-support-bundle.yaml as a
+# ConfigMap labelled troubleshoot.sh/kind=support-bundle.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: kotsadm-host-support-bundle
+  labels:
+    troubleshoot.sh/kind: support-bundle
+data:
+  support-bundle-spec: {{ .Files.Get "troubleshoot/embedded-cluster-host-support-bundle.yaml" | quote }}
diff --git a/troubleshoot/embedded-cluster-host-support-bundle.yaml b/troubleshoot/embedded-cluster-host-support-bundle.yaml
new file mode 100644
index 0000000..25e69cf
--- /dev/null
+++ b/troubleshoot/embedded-cluster-host-support-bundle.yaml
@@ -0,0 +1,442 @@
+# Host-level collectors and analyzers for embedded-cluster (k0s) nodes.
+apiVersion: troubleshoot.sh/v1beta2
+kind: SupportBundle
+metadata:
+  name: embedded-cluster-host-support-bundle
+  runHostCollectorsInPod: true  # default is false
+spec:
+  uri: https://raw.githubusercontent.com/replicatedhq/kots-helm/main/troubleshoot/embedded-cluster-host-support-bundle.yaml
+  hostCollectors:
+    - ipv4Interfaces: {}
+    - hostServices: {}
+    - cpu: {}
+    - hostOS: {}
+    - memory: {}
+    - blockDevices: {}
+    - time: {}
+    - certificate:
+        collectorName: k8s-api-keypair
+        certificatePath: /var/lib/k0s/pki/k0s-api.crt
+        keyPath: /var/lib/k0s/pki/k0s-api.key
+    - certificate:
+        collectorName: etcd-keypair
+        certificatePath: /var/lib/k0s/pki/etcd/server.crt
+        keyPath: /var/lib/k0s/pki/etcd/server.key
+    # Disk usage for commonly used directories
+    - diskUsage:
+        collectorName: root-disk-usage
+        path: /
+    - diskUsage:
+        collectorName: openebs-disk-usage
+        path: /var/openebs/local
+    - diskUsage:
+        collectorName: embedded-cluster-path-usage
+        path: /var/lib/embedded-cluster
+    - diskUsage:
+        collectorName: k0s-path-usage
+        path: /var/lib/k0s
+    - diskUsage:
+        collectorName: openebs-path-usage
+        path: /opt/openebs
+    - diskUsage:
+        collectorName: tmp-path-usage
+        path: /tmp
+    # APIserver and etcd health endpoints
+    - run:
+        collectorName: k8s-api-healthz-6443
+        command: 'curl'
+        args:
+          [
+            '--cert',
+            '/var/lib/k0s/pki/admin.crt',
+            '--key',
+            '/var/lib/k0s/pki/admin.key',
+            '--cacert',
+            '/var/lib/k0s/pki/ca.crt',
+            '-i',
+            'https://localhost:6443/healthz?verbose',
+          ]
+    - run:
+        collectorName: etcd-healthz-2379
+        command: 'curl'
+        args:
+          [
+            '--cert',
+            '/var/lib/k0s/pki/apiserver-etcd-client.crt',
+            '--key',
+            '/var/lib/k0s/pki/apiserver-etcd-client.key',
+            '--cacert',
+            '/var/lib/k0s/pki/etcd/ca.crt',
+            '-i',
+            'https://localhost:2379/health',
+          ]
+    # Run collectors for system information & metrics
+    - run:
+        collectorName: free
+        command: free
+        args: ['-h']
+    - run:
+        collectorName: top
+        command: top
+        args: ['-b', '-n', '1']
+    - run:
+        collectorName: df
+        command: df
+        args: ['-h']
+    - run:
+        collectorName: iostat
+        command: iostat
+        args: ['-x']
+    - run:
+        collectorName: vmstat
+        command: vmstat
+        args: ['1', '5']
+    - run:
+        collectorName: uptime
+        command: uptime
+    - run:
+        collectorName: k0s-version
+        command: /usr/local/bin/k0s
+        args: ['version']
+    - run:
+        collectorName: k0s-status
+        command: /usr/local/bin/k0s
+        args: [ "status" ]
+    - run:
+        collectorName: k0s-issue-template
+        command: sh
+        args: [ "-c", "uname -srvmo; cat /etc/os-release || lsb_release -a" ]
+    - run:
+        collectorName: k0s-sysinfo
+        command: /usr/local/bin/k0s
+        args: [ "sysinfo" ]
+    - copy:
+        collectorName: installer-logs
+        path: /var/lib/embedded-cluster/logs/*.log
+    - copy:
+        collectorName: installer-support-files
+        path: /var/lib/embedded-cluster/support/*
+    - run:
+        collectorName: network-manager-logs
+        command: journalctl
+        args: [ "--since", "10 minutes ago", "--no-pager", "-u", "NetworkManager" ]
+    - run:
+        collectorName: k0scontroller-logs
+        command: journalctl
+        args: [ "--since", "2 days ago", "--no-pager", "-u", "k0scontroller.service" ]
+    - run:
+        collectorName: k0s-images-dir
+        command: ls
+        args: [ "-alh", "/var/lib/k0s/images" ]
+    # External k0s runtime dependencies
+    # https://docs.k0sproject.io/stable/external-runtime-deps/
+    - kernelConfigs: {}
+    - cgroups: {}
+    - run:
+        collectorName: 'check-proc-filesystem'
+        command: 'sh'
+        args: ['-c', 'stat -f -c "%T" /proc']
+    - run:
+        collectorName: 'check-modprobe'
+        command: 'sh'
+        args: ['-c', 'command -v modprobe']
+    - run:
+        collectorName: 'check-mount'
+        command: 'sh'
+        args: ['-c', 'command -v mount']
+    - run:
+        collectorName: 'check-umount'
+        command: 'sh'
+        args: ['-c', 'command -v umount']
+  hostAnalyzers:
+    - ipv4Interfaces:
+        outcomes:
+          - fail:
+              when: "count == 0"
+              message: No IPv4 interfaces detected
+          - pass:
+              when: "count >= 1"
+              message: IPv4 interface detected
+    - memory:
+        checkName: Amount of Memory
+        outcomes:
+          - fail:
+              when: "< 2G"
+              message: At least 2G of memory is recommended
+          - pass:
+              message: The system has at least 2G of memory
+    - diskUsage:
+        checkName: Root disk usage
+        collectorName: root-disk-usage
+        outcomes:
+          - fail:
+              when: "total < 40Gi"
+              message: The disk containing directory / has less than 40Gi of total space
+          - fail:
+              when: "used/total > 80%"
+              message: The disk containing directory / is more than 80% full
+          - fail:
+              when: "available < 10Gi"
+              message: The disk containing directory / has less than 10Gi of disk space available
+          - pass:
+              message: The disk containing directory / has sufficient space
+    - diskUsage:
+        checkName: Embedded Cluster Disk Space
+        collectorName: embedded-cluster-path-usage
+        outcomes:
+          - fail:
+              when: 'total < 40Gi'
+              message: The filesystem at /var/lib/embedded-cluster has less than 40Gi of total space
+          - pass:
+              message: The filesystem at /var/lib/embedded-cluster has sufficient space
+    - diskUsage:
+        checkName: k0s Disk Space
+        collectorName: k0s-path-usage
+        outcomes:
+          - fail:
+              when: 'total < 40Gi'
+              message: The filesystem at /var/lib/k0s has less than 40Gi of total space
+          - fail:
+              when: 'used/total > 80%'
+              message: The filesystem at /var/lib/k0s is more than 80% full
+          - pass:
+              message: The filesystem at /var/lib/k0s has sufficient space
+    - diskUsage:
+        checkName: OpenEBS disk usage
+        collectorName: openebs-disk-usage
+        outcomes:
+          - fail:
+              when: "total < 40Gi"
+              message: The disk containing OpenEBS volumes has less than 40Gi of space
+          - fail:
+              when: "used/total > 80%"
+              message: The disk containing OpenEBS volumes is more than 80% full
+          - fail:
+              when: "available < 10Gi"
+              message: The disk containing OpenEBS volumes has less than 10Gi of disk space available
+          - pass:
+              message: The disk containing OpenEBS volumes has sufficient space
+    - diskUsage:
+        checkName: tmp Disk Space
+        collectorName: tmp-path-usage
+        outcomes:
+          - fail:
+              when: 'total < 5Gi'
+              message: The filesystem at /tmp has less than 5Gi of total space
+          - pass:
+              message: The filesystem at /tmp has sufficient space
+    - textAnalyze:
+        checkName: Kubernetes API probing
+        fileName: host-collectors/run-host/k0s-status.txt
+        regex: 'Kube-api probing successful: true'
+        outcomes:
+          - fail:
+              when: "false"
+              message: Kubernetes API probing is reporting a failure
+          - pass:
+              when: "true"
+              message: Kubernetes API probing is reporting success
+    - textAnalyze:
+        checkName: NetworkManager managing calico interfaces
+        fileName: host-collectors/run-host/network-manager-logs.txt
+        regex: 'device .*cali.+: state change: config'
+        outcomes:
+          - fail:
+              when: "true"
+              message: NetworkManager seems to be managing calico interfaces
+          - pass:
+              when: "false"
+              message: NetworkManager isn't managing calico interfaces
+    - hostServices:
+        checkName: "Local Artifact Mirror"
+        outcomes:
+          - fail:
+              when: "local-artifact-mirror != active"
+              message: Local Artifact Mirror isn't active
+          - pass:
+              when: "local-artifact-mirror = active"
+              message: Local Artifact Mirror is active
+    - time:
+        checkName: System Clock
+        outcomes:
+          - fail:
+              when: 'ntp == unsynchronized+inactive'
+              message: 'System clock is not synchronized'
+          - fail:
+              when: 'ntp == unsynchronized+active'
+              message: System clock is not yet synchronized
+          - pass:
+              when: 'ntp == synchronized+active'
+              message: 'System clock is synchronized'
+          - fail:
+              message: 'Unable to determine system clock status'
+    - jsonCompare:
+        checkName: Check if either cgroup v1 or v2 is enabled
+        fileName: host-collectors/system/cgroups.json
+        path: 'cgroup-enabled'
+        value: |
+          true
+        outcomes:
+          - fail:
+              when: 'false'
+              message: 'Neither cgroup v1 nor v2 is enabled'
+          - pass:
+              when: 'true'
+              message: 'One of cgroup v1 or v2 is enabled'
+    - jsonCompare:
+        checkName: Check if cpu cgroup controller is enabled
+        fileName: host-collectors/system/cgroups.json
+        jsonPath: "{$.allControllers[?(@ == 'cpu')]}"
+        value: |
+          "cpu"
+        outcomes:
+          - fail:
+              when: 'false'
+              message: "'cpu' cgroup controller is not enabled"
+          - pass:
+              when: 'true'
+              message: "'cpu' cgroup controller is enabled"
+    - jsonCompare:
+        checkName: Check if cpuacct cgroup controller is enabled
+        fileName: host-collectors/system/cgroups.json
+        jsonPath: "{$.allControllers[?(@ == 'cpuacct')]}"
+        value: |
+          "cpuacct"
+        outcomes:
+          - fail:
+              when: 'false'
+              message: "'cpuacct' cgroup controller is not enabled"
+          - pass:
+              when: 'true'
+              message: "'cpuacct' cgroup controller is enabled"
+    - jsonCompare:
+        checkName: Check if cpuset cgroup controller is enabled
+        fileName: host-collectors/system/cgroups.json
+        jsonPath: "{$.allControllers[?(@ == 'cpuset')]}"
+        value: |
+          "cpuset"
+        outcomes:
+          - fail:
+              when: 'false'
+              message: "'cpuset' cgroup controller is not enabled"
+          - pass:
+              when: 'true'
+              message: "'cpuset' cgroup controller is enabled"
+    - jsonCompare:
+        checkName: Check if memory cgroup controller is enabled
+        fileName: host-collectors/system/cgroups.json
+        jsonPath: "{$.allControllers[?(@ == 'memory')]}"
+        value: |
+          "memory"
+        outcomes:
+          - fail:
+              when: 'false'
+              message: "'memory' cgroup controller is not enabled"
+          - pass:
+              when: 'true'
+              message: "'memory' cgroup controller is enabled"
+    - jsonCompare:
+        checkName: Check if devices cgroup controller is enabled
+        fileName: host-collectors/system/cgroups.json
+        jsonPath: "{$.allControllers[?(@ == 'devices')]}"
+        value: |
+          "devices"
+        outcomes:
+          - fail:
+              when: 'false'
+              message: "'devices' cgroup controller is not enabled"
+          - pass:
+              when: 'true'
+              message: "'devices' cgroup controller is enabled"
+    - jsonCompare:
+        checkName: Check if freezer cgroup controller is enabled
+        fileName: host-collectors/system/cgroups.json
+        jsonPath: "{$.allControllers[?(@ == 'freezer')]}"
+        value: |
+          "freezer"
+        outcomes:
+          - fail:
+              when: 'false'
+              message: "'freezer' cgroup controller is not enabled"
+          - pass:
+              when: 'true'
+              message: "'freezer' cgroup controller is enabled"
+    - jsonCompare:
+        checkName: Check if pids cgroup controller is enabled
+        fileName: host-collectors/system/cgroups.json
+        jsonPath: "{$.allControllers[?(@ == 'pids')]}"
+        value: |
+          "pids"
+        outcomes:
+          - fail:
+              when: 'false'
+              message: "'pids' cgroup controller is not enabled"
+          - pass:
+              when: 'true'
+              message: "'pids' cgroup controller is enabled"
+    - textAnalyze:
+        checkName: Check if /proc filesystem is mounted
+        fileName: host-collectors/run-host/check-proc-filesystem.txt
+        regex: 'proc'
+        outcomes:
+          - pass:
+              when: "true"
+              message: "/proc filesystem is mounted"
+          - fail:
+              when: "false"
+              message: "/proc filesystem is not mounted"
+    # The next three checks analyze the check-* run collectors, which execute
+    # `command -v <tool>`. Match on the binary name only: the directory it
+    # resolves to differs between hosts (/sbin, /bin, /usr/sbin, /usr/bin).
+    - textAnalyze:
+        checkName: Check if 'modprobe' command exists in PATH
+        fileName: host-collectors/run-host/check-modprobe.txt
+        regex: '/modprobe'
+        outcomes:
+          - pass:
+              when: "true"
+              message: "'modprobe' command exists in PATH"
+          - fail:
+              when: "false"
+              message: "'modprobe' command does not exist in PATH"
+    - textAnalyze:
+        checkName: Check if 'mount' command exists in PATH
+        fileName: host-collectors/run-host/check-mount.txt
+        regex: '/mount'
+        outcomes:
+          - pass:
+              when: "true"
+              message: "'mount' command exists in PATH"
+          - fail:
+              when: "false"
+              message: "'mount' command does not exist in PATH"
+    - textAnalyze:
+        checkName: Check if 'umount' command exists in PATH
+        fileName: host-collectors/run-host/check-umount.txt
+        regex: '/umount'
+        outcomes:
+          - pass:
+              when: "true"
+              message: "'umount' command exists in PATH"
+          - fail:
+              when: "false"
+              message: "'umount' command does not exist in PATH"
+    - hostOS:
+        checkName: Check minimum kernel version
+        outcomes:
+          - pass:
+              when: "kernelVersion >= 3.10"
+              message: "Minimum kernel version of 3.10 has been met"
+          - fail:
+              message: "Minimum kernel version of 3.10 has not been met"
+    - textAnalyze:
+        checkName: Hostname Mismatch
+        fileName: host-collectors/run-host/k0scontroller-logs.txt
+        regex: ".*can only access node lease with the same name as the requesting node.*"
+        outcomes:
+          - fail:
+              when: "true"
+              message: "Possible hostname change. Verify that the current hostname matches what's expected by the k8s control plane"
+          - pass:
+              when: "false"
+              message: "No signs of hostname changes found"
diff --git a/troubleshoot/embedded-cluster-support-bundle.yaml b/troubleshoot/embedded-cluster-support-bundle.yaml
index a0a1326..a06064e 100644
--- a/troubleshoot/embedded-cluster-support-bundle.yaml
+++ b/troubleshoot/embedded-cluster-support-bundle.yaml
@@ -4,6 +4,7 @@ metadata:
   name: embedded-cluster-kotsadm-support-bundle
   labels:
     troubleshoot.io/kind: support-bundle
+  runHostCollectorsInPod: true  # default is false
 spec:
   uri: https://raw.githubusercontent.com/replicatedhq/kots-helm/main/troubleshoot/embedded-cluster-support-bundle.yaml
   collectors: