Skip to content

Commit

Permalink
Merge pull request #1744 from dgrisonnet/improve-burnrate
Browse files Browse the repository at this point in the history
OCPBUGS-49764: bindata/alerts/slo: improve burnrate calculation
  • Loading branch information
openshift-merge-bot[bot] authored Feb 5, 2025
2 parents 1537626 + 275f05d commit f90c0d9
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 50 deletions.
60 changes: 32 additions & 28 deletions bindata/assets/alerts/kube-apiserver-slos-basic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ spec:
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/KubeAPIErrorBudgetBurn.md
summary: The API server is burning too much error budget.
expr: |
sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
sum:apiserver_request:burnrate1h > (14.40 * 0.01000)
and
sum(apiserver_request:burnrate5m) > (14.40 * 0.01000)
sum:apiserver_request:burnrate5m > (14.40 * 0.01000)
for: 2m
labels:
long: 1h
Expand All @@ -28,9 +28,9 @@ spec:
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/KubeAPIErrorBudgetBurn.md
summary: The API server is burning too much error budget.
expr: |
sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
sum:apiserver_request:burnrate6h > (6.00 * 0.01000)
and
sum(apiserver_request:burnrate30m) > (6.00 * 0.01000)
sum:apiserver_request:burnrate30m > (6.00 * 0.01000)
for: 15m
labels:
long: 6h
Expand Down Expand Up @@ -61,11 +61,9 @@ spec:
# errors
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
labels:
verb: read
record: apiserver_request:burnrate5m
record: apiserver_request:burn5m
- expr: |
(
(
Expand All @@ -88,11 +86,9 @@ spec:
# errors
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m]))
labels:
verb: read
record: apiserver_request:burnrate30m
record: apiserver_request:burn30m
- expr: |
(
(
Expand All @@ -115,11 +111,9 @@ spec:
# errors
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h]))
labels:
verb: read
record: apiserver_request:burnrate1h
record: apiserver_request:burn1h
- expr: |
(
(
Expand All @@ -142,11 +136,9 @@ spec:
# errors
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h]))
labels:
verb: read
record: apiserver_request:burnrate6h
record: apiserver_request:burn6h
- expr: |
(
(
Expand All @@ -158,11 +150,9 @@ spec:
+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
labels:
verb: write
record: apiserver_request:burnrate1h
record: apiserver_request:burn1h
- expr: |
(
(
Expand All @@ -174,11 +164,9 @@ spec:
+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
labels:
verb: write
record: apiserver_request:burnrate30m
record: apiserver_request:burn30m
- expr: |
(
(
Expand All @@ -190,11 +178,9 @@ spec:
+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
labels:
verb: write
record: apiserver_request:burnrate5m
record: apiserver_request:burn5m
- expr: |
(
(
Expand All @@ -206,11 +192,29 @@ spec:
+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
labels:
verb: write
record: apiserver_request:burnrate6h
record: apiserver_request:burn6h
# Combined (read + write) 5m burn rate: the summed apiserver_request:burn5m
# series divided by the request rate over 5m for all listed verbs.
# NOTE(review): numerator is aggregated without `by (cluster)` while the
# denominator keeps `cluster` - confirm the label sets match for the division.
- expr: |
sum(apiserver_request:burn5m)
/
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE"}[5m]))
record: sum:apiserver_request:burnrate5m
# Combined (read + write) 30m burn rate: the summed apiserver_request:burn30m
# series divided by the request rate over 30m for all listed verbs.
# NOTE(review): numerator is aggregated without `by (cluster)` while the
# denominator keeps `cluster` - confirm the label sets match for the division.
- expr: |
sum(apiserver_request:burn30m)
/
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE"}[30m]))
record: sum:apiserver_request:burnrate30m
# Combined (read + write) 1h burn rate: the summed apiserver_request:burn1h
# series divided by the request rate over 1h for all listed verbs.
# NOTE(review): numerator is aggregated without `by (cluster)` while the
# denominator keeps `cluster` - confirm the label sets match for the division.
- expr: |
sum(apiserver_request:burn1h)
/
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE"}[1h]))
record: sum:apiserver_request:burnrate1h
# Combined (read + write) 6h burn rate: the summed apiserver_request:burn6h
# series divided by the request rate over 6h for all listed verbs.
# The record name must carry the 6h window: the KubeAPIErrorBudgetBurn alerts
# query sum:apiserver_request:burnrate6h, and the 5m name is already produced
# by the 5m rule.
# NOTE(review): numerator is aggregated without `by (cluster)` while the
# denominator keeps `cluster` - confirm the label sets match for the division.
- expr: |
sum(apiserver_request:burn6h)
/
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE"}[6h]))
record: sum:apiserver_request:burnrate6h
- expr: |
sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
labels:
Expand Down
47 changes: 25 additions & 22 deletions bindata/assets/alerts/kube-apiserver-slos-extended.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ spec:
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/KubeAPIErrorBudgetBurn.md
summary: The API server is burning too much error budget.
expr: |
sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
sum:apiserver_request:burnrate1d > (3.00 * 0.01000)
and
sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
sum:apiserver_request:burnrate2h > (3.00 * 0.01000)
for: 1h
labels:
long: 1d
Expand All @@ -28,9 +28,9 @@ spec:
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/KubeAPIErrorBudgetBurn.md
summary: The API server is burning too much error budget.
expr: |
sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
sum:apiserver_request:burnrate3d > (1.00 * 0.01000)
and
sum(apiserver_request:burnrate6h) > (1.00 * 0.01000)
sum:apiserver_request:burnrate6h > (1.00 * 0.01000)
for: 3h
labels:
long: 3d
Expand Down Expand Up @@ -61,11 +61,9 @@ spec:
# errors
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h]))
labels:
verb: read
record: apiserver_request:burnrate2h
record: apiserver_request:burn2h
- expr: |
(
(
Expand All @@ -88,11 +86,9 @@ spec:
# errors
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d]))
labels:
verb: read
record: apiserver_request:burnrate1d
record: apiserver_request:burn1d
- expr: |
(
(
Expand All @@ -115,11 +111,9 @@ spec:
# errors
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d]))
labels:
verb: read
record: apiserver_request:burnrate3d
record: apiserver_request:burn3d
- expr: |
(
(
Expand All @@ -131,11 +125,9 @@ spec:
+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
labels:
verb: write
record: apiserver_request:burnrate1d
record: apiserver_request:burn1d
- expr: |
(
(
Expand All @@ -147,11 +139,9 @@ spec:
+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
labels:
verb: write
record: apiserver_request:burnrate2h
record: apiserver_request:burn2h
- expr: |
(
(
Expand All @@ -163,8 +153,21 @@ spec:
+
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
labels:
verb: write
record: apiserver_request:burnrate3d
record: apiserver_request:burn3d
# Combined (read + write) 2h burn rate: the summed apiserver_request:burn2h
# series divided by the request rate over 2h for all listed verbs.
# NOTE(review): numerator is aggregated without `by (cluster)` while the
# denominator keeps `cluster` - confirm the label sets match for the division.
- expr: |
sum(apiserver_request:burn2h)
/
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE"}[2h]))
record: sum:apiserver_request:burnrate2h
# Combined (read + write) 1d burn rate: the summed apiserver_request:burn1d
# series divided by the request rate over 1d for all listed verbs.
# NOTE(review): numerator is aggregated without `by (cluster)` while the
# denominator keeps `cluster` - confirm the label sets match for the division.
- expr: |
sum(apiserver_request:burn1d)
/
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE"}[1d]))
record: sum:apiserver_request:burnrate1d
# Combined (read + write) 3d burn rate: the summed apiserver_request:burn3d
# series divided by the request rate over 3d for all listed verbs.
# NOTE(review): numerator is aggregated without `by (cluster)` while the
# denominator keeps `cluster` - confirm the label sets match for the division.
- expr: |
sum(apiserver_request:burn3d)
/
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE"}[3d]))
record: sum:apiserver_request:burnrate3d

0 comments on commit f90c0d9

Please sign in to comment.