Skip to content

Commit

Permalink
multi-cluster-diagnostics pod (#2780)
Browse files Browse the repository at this point in the history
* diagnostics deployment

Signed-off-by: chipzoller <[email protected]>
Co-authored-by: thomasvn <[email protected]>
Co-authored-by: Chip Zoller <[email protected]>
  • Loading branch information
3 people authored Dec 12, 2023
1 parent 6dd8064 commit ec23beb
Show file tree
Hide file tree
Showing 6 changed files with 297 additions and 4 deletions.
20 changes: 20 additions & 0 deletions cost-analyzer/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,14 @@ If release name contains chart name it will be used as a full name.
{{- end -}}
{{- end -}}

{{/*
Fully qualified name for the diagnostics resources. A non-empty
.Values.diagnosticsFullnameOverride takes precedence; otherwise the name is
"<release name>-diagnostics". Either way the result is truncated to 63
characters and a trailing "-" is removed.
*/}}
{{- define "diagnostics.fullname" -}}
{{- $fallback := printf "%s-%s" .Release.Name "diagnostics" -}}
{{- .Values.diagnosticsFullnameOverride | default $fallback | trunc 63 | trimSuffix "-" -}}
{{- end -}}

{{/*
Fully qualified federator name: "<release name>-federator", truncated to 63
characters with a trailing "-" removed.
*/}}
{{- define "federator.fullname" -}}
{{- list .Release.Name "federator" | join "-" | trunc 63 | trimSuffix "-" -}}
{{- end -}}
Expand Down Expand Up @@ -109,6 +117,9 @@ Create the fully qualified name for Prometheus alertmanager service.
{{- printf "%s-%s" .Release.Name "query-service-load-balancer" | trunc 63 | trimSuffix "-" -}}
{{- end -}}

{{/*
Name of the diagnostics Service. Delegates to "diagnostics.fullname" because
that is the name the diagnostics Service is created with and the name the
nginx upstream resolves, so .Values.diagnosticsFullnameOverride is honoured
here as well. Without an override the result is unchanged:
"<release name>-diagnostics", truncated to 63 characters.
*/}}
{{- define "diagnostics.serviceName" -}}
{{- include "diagnostics.fullname" . -}}
{{- end -}}
{{/*
Aggregator service name: "<release name>-aggregator", truncated to 63
characters with a trailing "-" removed.
*/}}
{{- define "aggregator.serviceName" -}}
{{- list .Release.Name "aggregator" | join "-" | trunc 63 | trimSuffix "-" -}}
{{- end -}}
Expand Down Expand Up @@ -233,6 +244,10 @@ app: federator
{{ include "cost-analyzer.chartLabels" . }}
app: aggregator
{{- end -}}
{{/*
Common labels for diagnostics resources: the shared chart labels followed by
the "app: diagnostics" label.
*/}}
{{- define "diagnostics.commonLabels" -}}
{{ template "cost-analyzer.chartLabels" . }}
app: diagnostics
{{- end -}}
{{- define "cloudCost.commonLabels" -}}
{{ include "cost-analyzer.chartLabels" . }}
{{ include "cloudCost.selectorLabels" . }}
Expand All @@ -255,6 +270,11 @@ app: {{ template "cost-analyzer.networkCostsName" . }}
{{/*
Selector labels for network costs: a single "app" label whose value is the
rendered "cost-analyzer.networkCostsName" template.
*/}}
{{- define "networkcosts.selectorLabels" -}}
app: {{ include "cost-analyzer.networkCostsName" . }}
{{- end }}
{{/*
Selector labels shared by the diagnostics Deployment, its pod template and the
diagnostics Service selector. The release name is quoted so that a release
called e.g. "123" or "true" still renders as a YAML string, which is what
Kubernetes requires for label values.
*/}}
{{- define "diagnostics.selectorLabels" -}}
app.kubernetes.io/name: diagnostics
app.kubernetes.io/instance: {{ .Release.Name | quote }}
app: diagnostics
{{- end }}
{{/*
{{- end -}}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,15 @@ data:
server {{ template "cloudCost.fullname" . }}.{{ .Release.Namespace }}:9005;
}
{{- end }}

{{- /*
  Upstream for the diagnostics API (port 9007). Rendered only on a diagnostics
  primary that has Thanos or a federated storage config secret configured.
*/}}
{{- if and .Values.diagnostics.enabled .Values.diagnostics.isDiagnosticsPrimary.enabled }}
{{- if or .Values.global.thanos.enabled .Values.kubecostModel.federatedStorageConfigSecret }}
upstream multi-cluster-diagnostics {
    server {{ printf "%s.%s:9007" (include "diagnostics.fullname" .) .Release.Namespace }};
}
{{- end }}
{{- end }}

server {
server_name _;
root /var/www;
Expand Down Expand Up @@ -712,14 +721,48 @@ data:
}
{{- end }}



{{- /* When .Values.kubecostFrontend.trendsDisabled is set, respond to /model/allocation/trends with HTTP 204. */}}
{{- if .Values.kubecostFrontend.trendsDisabled }}
location /model/allocation/trends {
return 204 'endpoint disabled';
}
{{ end }}

location /model/multi-cluster-diagnostics-enabled {
    {{- /*
      Reports whether the multi-cluster diagnostics API is served by this
      instance. The answer is "true" only when diagnostics is enabled, this
      cluster is the diagnostics primary, and Thanos or a federated storage
      config secret is configured. Every other combination now explicitly
      returns "false"; previously no `return` was rendered when diagnostics or
      the primary flag was off, so the endpoint never produced the JSON body.
    */}}
    default_type 'application/json';
    {{- if and .Values.diagnostics.enabled .Values.diagnostics.isDiagnosticsPrimary.enabled }}
    {{- if or .Values.global.thanos.enabled (not (empty .Values.kubecostModel.federatedStorageConfigSecret )) }}
    return 200 '{"multi-cluster-diagnostics-enabled": "true"}';
    {{- else }}
    return 200 '{"multi-cluster-diagnostics-enabled": "false"}';
    {{- end }}
    {{- else }}
    return 200 '{"multi-cluster-diagnostics-enabled": "false"}';
    {{- end }}
}
{{- /*
  Proxy routes to the diagnostics upstream. Rendered under the same conditions
  as the upstream itself: diagnostics enabled, this cluster is the diagnostics
  primary, and Thanos or a federated storage config secret is configured.
*/}}
{{- if and .Values.diagnostics.enabled .Values.diagnostics.isDiagnosticsPrimary.enabled }}
{{- if or .Values.global.thanos.enabled .Values.kubecostModel.federatedStorageConfigSecret }}
location /model/multi-cluster-diagnostics {
    proxy_pass http://multi-cluster-diagnostics/status;
    proxy_read_timeout 300;
    proxy_redirect off;
    default_type 'application/json';
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header Connection "";
}
# simple alias for support
location /model/mcd {
    proxy_pass http://multi-cluster-diagnostics/status?window=7d;
    proxy_read_timeout 300;
    proxy_redirect off;
    default_type 'application/json';
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header Connection "";
}
{{- end }}
{{- end }}

location /model/aggregatorEnabled {
default_type 'application/json';
{{- if .Values.kubecostAggregator.enabled }}
Expand Down
182 changes: 182 additions & 0 deletions cost-analyzer/templates/diagnostics-deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
{{- /*
  Diagnostics Deployment.

  Rendered only when .Values.diagnostics.enabled is set AND either Thanos is
  enabled or a federated storage config secret is configured (the pod mounts
  that secret at /var/configs/etl).

  Rendering fails when cluster_id is still the default "cluster-one", unless
  .Values.prometheus.server.clusterIDConfigmap is set; in that case CLUSTER_ID
  is read from the ConfigMap and the values-file cluster_id is not used.
*/}}
{{- if .Values.diagnostics.enabled }}
{{- if or .Values.global.thanos.enabled (not (empty .Values.kubecostModel.federatedStorageConfigSecret )) -}}

{{- if and (not .Values.prometheus.server.clusterIDConfigmap) (eq .Values.prometheus.server.global.external_labels.cluster_id "cluster-one") }}
{{- fail "Error: The 'cluster_id' is set to default 'cluster-one'. Please update so that the diagnostics service can uniquely identify data coming from this cluster." }}
{{- end }}

apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ template "diagnostics.fullname" . }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "diagnostics.selectorLabels" . | nindent 4 }}
    {{- if and .Values.diagnostics .Values.diagnostics.labels }}
    {{- toYaml .Values.diagnostics.labels | nindent 4 }}
    {{- end }}
spec:
  replicas: 1
  selector:
    matchLabels:
      {{- include "diagnostics.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      labels:
        {{- include "diagnostics.selectorLabels" . | nindent 8 }}
      annotations:
        # Generates a unique annotation upon each `helm upgrade`, forcing a redeployment
        helm.sh/pod-restarter: {{ randNumeric 3 | quote }}
        {{- with .Values.global.podAnnotations }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    spec:
      restartPolicy: Always
      {{- /* Pod securityContext: the diagnostics-specific value wins over the global one. */}}
      {{- if .Values.diagnostics.securityContext }}
      securityContext:
        {{- toYaml .Values.diagnostics.securityContext | nindent 8 }}
      {{- else if .Values.global.securityContext }}
      securityContext:
        {{- toYaml .Values.global.securityContext | nindent 8 }}
      {{- end }}
      serviceAccountName: {{ template "cost-analyzer.serviceAccountName" . }}
      {{- /* imagePullSecrets is a PodSpec field, so it is rendered here rather than inside the container. */}}
      {{- with .Values.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      volumes:
        {{- /* An explicit federated storage secret takes precedence over the Thanos store secret. */}}
        {{- if .Values.kubecostModel.federatedStorageConfigSecret }}
        - name: federated-storage-config
          secret:
            defaultMode: 420
            secretName: {{ .Values.kubecostModel.federatedStorageConfigSecret }}
        {{- else if .Values.global.thanos.enabled }}
        - name: federated-storage-config
          secret:
            defaultMode: 420
            secretName: {{ .Values.thanos.storeSecretName }}
            # Expose the Thanos key under the file name FEDERATED_STORE_CONFIG points at.
            items:
              - key: object-store.yaml
                path: federated-store.yaml
        {{- end }}
        - name: config-db
          {{- /* #TODO: make pv? */}}
          emptyDir: {}
      containers:
        - name: diagnostics
          args: ["diagnostics"]
          {{- if .Values.kubecostModel }}
          {{- if .Values.kubecostModel.openSourceOnly }}
          image: quay.io/kubecost1/kubecost-cost-model:{{ .Values.imageVersion }}
          {{- else if .Values.kubecostModel.fullImageName }}
          image: {{ .Values.kubecostModel.fullImageName }}
          {{- else if .Values.imageVersion }}
          image: {{ .Values.kubecostModel.image }}:{{ .Values.imageVersion }}
          {{- else }}
          image: {{ .Values.kubecostModel.image }}:prod-{{ $.Chart.AppVersion }}
          {{- end }}
          {{- else }}
          image: gcr.io/kubecost1/cost-model:prod-{{ $.Chart.AppVersion }}
          {{- end }}
          {{- if .Values.kubecostModel.imagePullPolicy }}
          imagePullPolicy: {{ .Values.kubecostModel.imagePullPolicy }}
          {{- else }}
          imagePullPolicy: Always
          {{- end }}
          {{- /* Container securityContext: the diagnostics-specific value wins over the global one. */}}
          {{- if .Values.diagnostics.containerSecurityContext }}
          securityContext:
            {{- toYaml .Values.diagnostics.containerSecurityContext | nindent 12 }}
          {{- else if .Values.global.containerSecurityContext }}
          securityContext:
            {{- toYaml .Values.global.containerSecurityContext | nindent 12 }}
          {{- end }}
          volumeMounts:
            - name: config-db
              mountPath: /var/configs/db
              readOnly: false
            - name: federated-storage-config
              mountPath: /var/configs/etl
              readOnly: true
          env:
            {{- /*
              Env values are quoted throughout: Kubernetes requires strings, and
              an unquoted numeric or boolean-looking value would render as a
              non-string scalar.
            */}}
            {{- if .Values.prometheus.server.clusterIDConfigmap }}
            - name: CLUSTER_ID
              valueFrom:
                configMapKeyRef:
                  name: {{ .Values.prometheus.server.clusterIDConfigmap }}
                  key: CLUSTER_ID
            {{- else if .Values.prometheus.server.global.external_labels.cluster_id }}
            - name: CLUSTER_ID
              value: {{ .Values.prometheus.server.global.external_labels.cluster_id | quote }}
            {{- end }}
            - name: FEDERATED_STORE_CONFIG
              value: /var/configs/etl/federated-store.yaml
            - name: DIAGNOSTICS_KUBECOST_FQDN
              value: {{ include "cost-analyzer.serviceName" . | quote }}
            - name: DIAGNOSTICS_KUBECOST_NAMESPACE
              value: {{ .Release.Namespace | quote }}
            - name: DIAGNOSTICS_POLLING_INTERVAL
              value: {{ .Values.diagnostics.pollingInterval | default "300s" | quote }}
            - name: DIAGNOSTICS_PRIMARY
              {{- if .Values.diagnostics.isDiagnosticsPrimary.enabled }}
              value: "true"
              {{- else }}
              value: "false"
              {{- end }}
            - name: DIAGNOSTICS_COLLECT_HELM_VALUES
              {{- /* Helm values are collected only when values reporting is also enabled. */}}
              {{- if and .Values.reporting.valuesReporting .Values.diagnostics.collectHelmValues }}
              value: "true"
              {{- else }}
              value: "false"
              {{- end }}
            - name: DIAGNOSTICS_KEEP_HISTORY
              {{- if .Values.diagnostics.keepDiagnosticHistory }}
              value: "true"
              {{- else }}
              value: "false"
              {{- end }}
            {{- if .Values.systemProxy.enabled }}
            - name: HTTP_PROXY
              value: {{ .Values.systemProxy.httpProxyUrl | quote }}
            - name: http_proxy
              value: {{ .Values.systemProxy.httpProxyUrl | quote }}
            - name: HTTPS_PROXY
              value: {{ .Values.systemProxy.httpsProxyUrl | quote }}
            - name: https_proxy
              value: {{ .Values.systemProxy.httpsProxyUrl | quote }}
            - name: NO_PROXY
              value: {{ .Values.systemProxy.noProxy | quote }}
            - name: no_proxy
              value: {{ .Values.systemProxy.noProxy | quote }}
            {{- end }}
            {{- range $key, $value := .Values.diagnostics.env }}
            - name: {{ $key | quote }}
              value: {{ $value | quote }}
            {{- end }}
          {{- /* TODO: healthcheck that validates the diagnostics pod is healthy */}}
          {{- /* The API port and its readiness probe are only declared on the diagnostics primary. */}}
          {{- if .Values.diagnostics.isDiagnosticsPrimary.enabled }}
          readinessProbe:
            httpGet:
              path: /healthz
              port: 9007
          ports:
            - name: diagnostics-api
              containerPort: 9007
              protocol: TCP
          {{- end }}
          resources:
            {{- toYaml .Values.diagnostics.resources | nindent 12 }}
      {{- with .Values.diagnostics.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.diagnostics.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.diagnostics.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
{{- end }}
{{- end }}
22 changes: 22 additions & 0 deletions cost-analyzer/templates/diagnostics-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{{- /*
  ClusterIP Service in front of the diagnostics API (port 9007). Only rendered
  on a diagnostics primary with diagnostics enabled and Thanos or a federated
  storage config secret configured.
*/}}
{{- if and .Values.diagnostics.isDiagnosticsPrimary.enabled .Values.diagnostics.enabled }}
{{- if or .Values.global.thanos.enabled .Values.kubecostModel.federatedStorageConfigSecret -}}
apiVersion: v1
kind: Service
metadata:
  name: {{ include "diagnostics.fullname" . }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "diagnostics.selectorLabels" . | nindent 4 }}
spec:
  type: ClusterIP
  selector:
    {{- include "diagnostics.selectorLabels" . | nindent 4 }}
  ports:
    # targetPort refers to the container port by its name.
    - name: diagnostics-api
      port: 9007
      targetPort: diagnostics-api
      protocol: TCP
{{- end }}
{{- end }}
4 changes: 2 additions & 2 deletions cost-analyzer/values-eks-cost-monitoring.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ pricingCsv:
enabled: false
location:
provider: "AWS"
region: "us-east-1"
region: "us-east-1"
URI: s3://kc-csv-test/pricing_schema.csv # a valid file URI
csvAccessCredentials: pricing-schema-access-secret

Expand Down Expand Up @@ -46,7 +46,7 @@ kubecostFrontend:
#limits:
# cpu: "100m"
# memory: "256Mi"

kubecostModel:
image: public.ecr.aws/kubecost/cost-model
imagePullPolicy: Always
Expand Down
26 changes: 26 additions & 0 deletions cost-analyzer/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1078,6 +1078,32 @@ kubecostAggregator:
# containerPort: 40000
securityContext: {} # Define a securityContext for the aggregator pod. This will take highest precedence.

## Kubecost Diagnostics Pod (beta). Enables the Kubecost primary to report
## health/diagnostics of all agent clusters. Each agent cluster sends its
## health/diagnostic data to a storage bucket. Future versions may include
## repairing & alerting from the primary.
## NOTE: `enabled` defaults to true, but the pod is only deployed when Thanos
## or a federated storage config secret is also configured.
## Ref: https://docs.kubecost.com/install-and-configure/install/diagnostics
##
diagnostics:
  ## Turns the diagnostics pod on or off.
  enabled: true

  ## Primary role. Only the primary cluster needs this: when enabled, the pod
  ## downloads the diagnostic files from the bucket and serves HTTP queries.
  isDiagnosticsPrimary:
    enabled: true

  ## Interval between diagnostics runs/pushes (5 minutes).
  pollingInterval: '300s'
  ## When true, every run creates a new diagnostic file in the bucket.
  keepDiagnosticHistory: true
  ## When true, pushes this cluster's Kubecost Helm values to the bucket once
  ## at startup. May contain sensitive information; roughly 30kb per cluster.
  collectHelmValues: false

  ## Resource requests for the diagnostics container.
  resources:
    requests:
      cpu: '10m'
      memory: '20Mi'
  ## Pod-level securityContext for the diagnostics pod.
  securityContext: {}

# Kubecost Cluster Controller for Right Sizing and Cluster Turndown
clusterController:
enabled: false
Expand Down

0 comments on commit ec23beb

Please sign in to comment.