From b9f8745f12ff6c9dadddf6db5ef687735b65c6e0 Mon Sep 17 00:00:00 2001 From: Pete Wall Date: Tue, 12 Nov 2024 14:10:51 -0700 Subject: [PATCH] Add KubeDNS/CoreDNS support in cluster metrics (#892) Signed-off-by: Pete Wall --- charts/feature-cluster-metrics/README.md | 14 +- .../templates/_api_server.alloy.tpl | 10 +- .../templates/_kepler.alloy.tpl | 2 +- .../templates/_kube_dns.alloy.tpl | 27 ++ .../templates/_module.alloy.tpl | 10 +- .../tests/control_plane_test.yaml | 339 ++++++++++++++++++ .../values.schema.json | 31 ++ charts/feature-cluster-metrics/values.yaml | 45 ++- charts/k8s-monitoring/Chart.lock | 2 +- .../modules/databases/kv/etcd/metrics.alloy | 10 +- .../kubernetes/cert-manager/metrics.alloy | 10 +- .../modules/kubernetes/core/metrics.alloy | 10 +- .../system/node-exporter/metrics.alloy | 10 +- ...feature-annotation-autodiscovery-1.0.0.tgz | Bin 3895 -> 3897 bytes ...eature-application-observability-1.0.0.tgz | Bin 5339 -> 5339 bytes ...onitoring-feature-cluster-events-1.0.0.tgz | Bin 1887 -> 1886 bytes ...nitoring-feature-cluster-metrics-1.0.0.tgz | Bin 58015 -> 58297 bytes ...-monitoring-feature-integrations-1.0.0.tgz | Bin 12581 -> 12581 bytes .../k8s-monitoring-feature-pod-logs-1.0.0.tgz | Bin 3878 -> 3878 bytes ...k8s-monitoring-feature-profiling-1.0.0.tgz | Bin 3572 -> 3572 bytes ...ture-prometheus-operator-objects-1.0.0.tgz | Bin 378443 -> 382168 bytes .../examples/auth/sigv4/alloy-metrics.alloy | 2 +- .../docs/examples/auth/sigv4/output.yaml | 50 ++- .../examples/autoscaling/alloy-metrics.alloy | 2 +- .../docs/examples/autoscaling/output.yaml | 50 ++- .../collector-storage/alloy-metrics.alloy | 2 +- .../examples/collector-storage/output.yaml | 50 ++- .../examples/extra-rules/alloy-metrics.alloy | 2 +- .../docs/examples/extra-rules/output.yaml | 50 ++- .../alloy-metrics.alloy | 9 +- .../control-plane-monitoring/output.yaml | 67 ++-- .../default/alloy-metrics.alloy | 2 +- .../cluster-metrics/default/output.yaml | 50 ++- 
.../integrations/cert-manager/output.yaml | 10 +- .../features/integrations/etcd/output.yaml | 10 +- .../metrics-tuning/alloy-metrics.alloy | 2 +- .../docs/examples/metrics-tuning/output.yaml | 50 ++- .../platforms/azure-aks/alloy-metrics.alloy | 2 +- .../examples/platforms/azure-aks/output.yaml | 50 ++- .../platforms/eks-fargate/alloy-metrics.alloy | 2 +- .../platforms/eks-fargate/output.yaml | 40 ++- .../gke-autopilot/alloy-metrics.alloy | 2 +- .../platforms/gke-autopilot/output.yaml | 40 ++- .../platforms/openshift/alloy-metrics.alloy | 2 +- .../examples/platforms/openshift/output.yaml | 50 ++- .../alloy-metrics.alloy | 2 +- .../private-image-registries/output.yaml | 50 ++- .../docs/examples/proxies/alloy-metrics.alloy | 2 +- .../docs/examples/proxies/output.yaml | 50 ++- .../control-plane-monitoring/test-values.yaml | 4 +- charts/k8s-monitoring/vendir.lock.yml | 6 +- charts/k8s-monitoring/vendir.yml | 2 +- 52 files changed, 944 insertions(+), 288 deletions(-) create mode 100644 charts/feature-cluster-metrics/templates/_kube_dns.alloy.tpl create mode 100644 charts/feature-cluster-metrics/tests/control_plane_test.yaml diff --git a/charts/feature-cluster-metrics/README.md b/charts/feature-cluster-metrics/README.md index 1d26aac4f..02926c13a 100644 --- a/charts/feature-cluster-metrics/README.md +++ b/charts/feature-cluster-metrics/README.md @@ -201,7 +201,7 @@ Actual integration testing in a live environment should be done in the main [k8s | Key | Type | Default | Description | |-----|------|---------|-------------| -| controlPlane.enabled | bool | `false` | enable all Kubernetes Control Plane metrics sources. This includes api-server, kube-scheduler, kube-controller-manager, and etcd. | +| controlPlane.enabled | bool | `false` | enable all Kubernetes Control Plane metrics sources. This includes api-server, kube-scheduler, kube-controller-manager, and KubeDNS. 
| ### General settings @@ -263,6 +263,18 @@ Actual integration testing in a live environment should be done in the main [k8s | kubeControllerManager.port | int | `10257` | Port number used by the Kube Controller Manager, set by `--secure-port.` | | kubeControllerManager.scrapeInterval | string | 60s | How frequently to scrape metrics from the Kube Controller Manager Overrides metrics.scrapeInterval | +### KubeDNS + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| kubeDNS.enabled | bool | `false` | Scrape metrics from KubeDNS | +| kubeDNS.extraDiscoveryRules | string | `""` | Rule blocks to be added to the discovery.relabel component for KubeDNS. These relabeling rules are applied pre-scrape against the targets from service discovery. Before the scrape, any remaining target labels that start with `__` (i.e. `__meta_kubernetes*`) are dropped. ([docs](https://grafana.com/docs/alloy/latest/reference/components/discovery/discovery.relabel/#rule-block)) | +| kubeDNS.extraMetricProcessingRules | string | `""` | Rule blocks to be added to the prometheus.relabel component for KubeDNS. These relabeling rules are applied post-scrape against the metrics returned from the scraped target, no `__meta*` labels are present. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus/prometheus.relabel/#rule-block)) | +| kubeDNS.maxCacheSize | string | `nil` | Sets the max_cache_size for the KubeDNS prometheus.relabel component. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus/prometheus.relabel/#arguments)) Overrides metrics.maxCacheSize | +| kubeDNS.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. | +| kubeDNS.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. An empty list means keep all. 
| +| kubeDNS.scrapeInterval | string | 60s | How frequently to scrape metrics from KubeDNS Overrides metrics.scrapeInterval | + ### Kube Proxy | Key | Type | Default | Description | diff --git a/charts/feature-cluster-metrics/templates/_api_server.alloy.tpl b/charts/feature-cluster-metrics/templates/_api_server.alloy.tpl index 04e3d673f..9a411b4e1 100644 --- a/charts/feature-cluster-metrics/templates/_api_server.alloy.tpl +++ b/charts/feature-cluster-metrics/templates/_api_server.alloy.tpl @@ -1,9 +1,3 @@ -{{ define "feature.clusterMetrics.apiServer.allowList" }} -{{ if .Values.apiServer.metricsTuning.includeMetrics }} -{{ .Values.apiServer.metricsTuning.includeMetrics | toYaml }} -{{ end }} -{{ end }} - {{- define "feature.clusterMetrics.apiServer.alloy" }} {{- if or .Values.apiServer.enabled (and .Values.controlPlane.enabled (not (eq .Values.apiServer.enabled false))) }} {{- $metricAllowList := .Values.apiServer.metricsTuning.includeMetrics }} @@ -17,8 +11,8 @@ kubernetes.apiserver "scrape" { {{- if $metricDenyList }} drop_metrics = {{ $metricDenyList | join "|" | quote }} {{- end }} - scrape_interval = {{ .Values.cadvisor.scrapeInterval | default .Values.global.scrapeInterval | int }} - max_cache_size = {{ .Values.cadvisor.maxCacheSize | default .Values.global.maxCacheSize | int }} + scrape_interval = {{ .Values.apiServer.scrapeInterval | default .Values.global.scrapeInterval | quote }} + max_cache_size = {{ .Values.apiServer.maxCacheSize | default .Values.global.maxCacheSize | int }} {{- if .Values.apiServer.extraMetricProcessingRules }} forward_to = [prometheus.relabel.apiServer.receiver] } diff --git a/charts/feature-cluster-metrics/templates/_kepler.alloy.tpl b/charts/feature-cluster-metrics/templates/_kepler.alloy.tpl index 2231d0e9f..4189f2801 100644 --- a/charts/feature-cluster-metrics/templates/_kepler.alloy.tpl +++ b/charts/feature-cluster-metrics/templates/_kepler.alloy.tpl @@ -1,5 +1,5 @@ {{ define "feature.clusterMetrics.kepler.allowList" }} -{{ if 
.Values.cadvisor.metricsTuning.useDefaultAllowList }} +{{ if .Values.kepler.metricsTuning.useDefaultAllowList }} {{ "default-allow-lists/kepler.yaml" | .Files.Get }} {{ end }} {{ if .Values.kepler.metricsTuning.includeMetrics }} diff --git a/charts/feature-cluster-metrics/templates/_kube_dns.alloy.tpl b/charts/feature-cluster-metrics/templates/_kube_dns.alloy.tpl new file mode 100644 index 000000000..eff313719 --- /dev/null +++ b/charts/feature-cluster-metrics/templates/_kube_dns.alloy.tpl @@ -0,0 +1,27 @@ +{{- define "feature.clusterMetrics.kubeDNS.alloy" }} +{{- if or .Values.kubeDNS.enabled (and .Values.controlPlane.enabled (not (eq .Values.kubeDNS.enabled false))) }} +{{- $metricAllowList := .Values.kubeDNS.metricsTuning.includeMetrics }} +{{- $metricDenyList := .Values.kubeDNS.metricsTuning.excludeMetrics }} + +kubernetes.kube_dns "scrape" { + clustering = true +{{- if $metricAllowList }} + keep_metrics = "up|{{ $metricAllowList | join "|" }}" +{{- end }} +{{- if $metricDenyList }} + drop_metrics = {{ $metricDenyList | join "|" | quote }} +{{- end }} + scrape_interval = {{ .Values.kubeDNS.scrapeInterval | default .Values.global.scrapeInterval | quote }} + max_cache_size = {{ .Values.kubeDNS.maxCacheSize | default .Values.global.maxCacheSize | int }} +{{- if .Values.kubeDNS.extraMetricProcessingRules }} + forward_to = [prometheus.relabel.kube_dns.receiver] +} + +prometheus.relabel "kube_dns" { + max_cache_size = {{ .Values.kubeDNS.maxCacheSize | default .Values.global.maxCacheSize | int }} + {{ .Values.kubeDNS.extraMetricProcessingRules | indent 2 }} +{{- end }} + forward_to = argument.metrics_destinations.value +} +{{- end }} +{{- end }} diff --git a/charts/feature-cluster-metrics/templates/_module.alloy.tpl b/charts/feature-cluster-metrics/templates/_module.alloy.tpl index 28ac90cdc..a0ac6b456 100644 --- a/charts/feature-cluster-metrics/templates/_module.alloy.tpl +++ b/charts/feature-cluster-metrics/templates/_module.alloy.tpl @@ -1,10 +1,17 @@ {{- define 
"feature.clusterMetrics.module" }} +{{- $includeKubernetesModule := false }} +{{- $includeKubernetesModule = or $includeKubernetesModule .Values.cadvisor.enabled }} +{{- $includeKubernetesModule = or $includeKubernetesModule .Values.kubelet.enabled }} +{{- $includeKubernetesModule = or $includeKubernetesModule .Values.kubeletResource.enabled }} +{{- $includeKubernetesModule = or $includeKubernetesModule .Values.apiServer.enabled }} +{{- $includeKubernetesModule = or $includeKubernetesModule .Values.kubeDNS.enabled }} +{{- $includeKubernetesModule = or $includeKubernetesModule (and .Values.controlPlane.enabled (or (not (eq .Values.apiServer.enabled false)) (not (eq .Values.kubeDNS.enabled false)))) }} declare "cluster_metrics" { argument "metrics_destinations" { comment = "Must be a list of metric destinations where collected metrics should be forwarded to" } - {{- if or .Values.cadvisor.enabled .Values.kubelet.enabled .Values.kubeletResource.enabled (or .Values.apiServer.enabled (and .Values.controlPlane.enabled (not (eq .Values.apiServer.enabled false)))) }} + {{- if $includeKubernetesModule }} {{- include "alloyModules.load" (deepCopy $ | merge (dict "name" "kubernetes" "path" "modules/kubernetes/core/metrics.alloy")) | nindent 2 }} {{- end }} {{- include "feature.clusterMetrics.kubelet.alloy" . | indent 2 }} @@ -12,6 +19,7 @@ declare "cluster_metrics" { {{- include "feature.clusterMetrics.cadvisor.alloy" . | indent 2 }} {{- include "feature.clusterMetrics.apiServer.alloy" . | indent 2 }} {{- include "feature.clusterMetrics.kubeControllerManager.alloy" . | indent 2 }} + {{- include "feature.clusterMetrics.kubeDNS.alloy" . | indent 2 }} {{- include "feature.clusterMetrics.kubeProxy.alloy" . | indent 2 }} {{- include "feature.clusterMetrics.kubeScheduler.alloy" . | indent 2 }} {{- include "feature.clusterMetrics.kube_state_metrics.alloy" . 
| indent 2 }} diff --git a/charts/feature-cluster-metrics/tests/control_plane_test.yaml b/charts/feature-cluster-metrics/tests/control_plane_test.yaml new file mode 100644 index 000000000..8f0b12d2b --- /dev/null +++ b/charts/feature-cluster-metrics/tests/control_plane_test.yaml @@ -0,0 +1,339 @@ +# yamllint disable rule:document-start rule:line-length rule:trailing-spaces +suite: Test control plane monitoring +templates: + - configmap.yaml +tests: + - it: should render the configuration with control plane components included + set: + deployAsConfigMap: true + controlPlane: + enabled: true + asserts: + - isKind: + of: ConfigMap + - equal: + path: data["module.alloy"] + value: |- + declare "cluster_metrics" { + argument "metrics_destinations" { + comment = "Must be a list of metric destinations where collected metrics should be forwarded to" + } + + import.git "kubernetes" { + repository = "https://github.com/grafana/alloy-modules.git" + revision = "main" + path = "modules/kubernetes/core/metrics.alloy" + pull_frequency = "15m" + } + + kubernetes.kubelet "scrape" { + clustering = true + keep_metrics = 
"up|container_cpu_usage_seconds_total|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_certificate_manager_client_ttl_seconds|kubelet_certificate_manager_server_ttl_seconds|kubelet_cgroup_manager_duration_seconds_bucket|kubelet_cgroup_manager_duration_seconds_count|kubelet_node_config_error|kubelet_node_name|kubelet_pleg_relist_duration_seconds_bucket|kubelet_pleg_relist_duration_seconds_count|kubelet_pleg_relist_interval_seconds_bucket|kubelet_pod_start_duration_seconds_bucket|kubelet_pod_start_duration_seconds_count|kubelet_pod_worker_duration_seconds_bucket|kubelet_pod_worker_duration_seconds_count|kubelet_running_container_count|kubelet_running_containers|kubelet_running_pod_count|kubelet_running_pods|kubelet_runtime_operations_errors_total|kubelet_runtime_operations_total|kubelet_server_expiration_renew_errors|kubelet_volume_stats_available_bytes|kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_inodes|kubelet_volume_stats_inodes_used|kubernetes_build_info|namespace_workload_pod|rest_client_requests_total|storage_operation_duration_seconds_count|storage_operation_errors_total|volume_manager_total_volumes" + scrape_interval = "60s" + max_cache_size = 100000 + forward_to = argument.metrics_destinations.value + } + + kubernetes.resources "scrape" { + clustering = true + job_label = "integrations/kubernetes/resources" + keep_metrics = "up|node_cpu_usage_seconds_total|node_memory_working_set_bytes" + scrape_interval = "60s" + max_cache_size = 100000 + forward_to = argument.metrics_destinations.value + } + + kubernetes.cadvisor "scrape" { + clustering = true + keep_metrics = 
"up|container_cpu_cfs_periods_total|container_cpu_cfs_throttled_periods_total|container_cpu_usage_seconds_total|container_fs_reads_bytes_total|container_fs_reads_total|container_fs_writes_bytes_total|container_fs_writes_total|container_memory_cache|container_memory_rss|container_memory_swap|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_receive_packets_dropped_total|container_network_receive_packets_total|container_network_transmit_bytes_total|container_network_transmit_packets_dropped_total|container_network_transmit_packets_total|machine_memory_bytes" + scrape_interval = "60s" + max_cache_size = 100000 + forward_to = [prometheus.relabel.cadvisor.receiver] + } + + prometheus.relabel "cadvisor" { + max_cache_size = 100000 + // Drop empty container labels, addressing https://github.com/google/cadvisor/issues/2688 + rule { + source_labels = ["__name__","container"] + separator = "@" + regex = "(container_cpu_.*|container_fs_.*|container_memory_.*)@" + action = "drop" + } + // Drop empty image labels, addressing https://github.com/google/cadvisor/issues/2688 + rule { + source_labels = ["__name__","image"] + separator = "@" + regex = "(container_cpu_.*|container_fs_.*|container_memory_.*|container_network_.*)@" + action = "drop" + } + // Normalizing unimportant labels (not deleting to continue satisfying