diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml index 99a7f7cf5d8..916c9354eae 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml @@ -1181,27 +1181,35 @@ spec: runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildernocycleprocessing expr: | max by(cluster, namespace, pod) (histogram_count(increase(cortex_blockbuilder_consume_cycle_duration_seconds[60m]))) == 0 - for: 5m + for: 20m labels: - severity: warning + severity: critical - alert: MimirBlockBuilderLagging annotations: - message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}%. + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuilderlagging expr: | max by(cluster, namespace, pod) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6 for: 75m labels: severity: warning + - alert: MimirBlockBuilderLagging + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuilderlagging + expr: | + max by(cluster, namespace, pod) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6 + for: 140m + labels: + severity: critical - alert: MimirBlockBuilderCompactAndUploadFailed annotations: message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to compact and upload blocks. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildercompactanduploadfailed expr: | sum by (cluster, namespace, pod) (rate(cortex_blockbuilder_tsdb_compact_and_upload_failed_total[1m])) > 0 - for: 5m labels: - severity: warning + severity: critical - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index ae02e2a1e65..85570d950c4 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -1155,27 +1155,35 @@ groups: runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildernocycleprocessing expr: | max by(cluster, namespace, instance) (histogram_count(increase(cortex_blockbuilder_consume_cycle_duration_seconds[60m]))) == 0 - for: 5m + for: 20m labels: - severity: warning + severity: critical - alert: MimirBlockBuilderLagging annotations: - message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}%. + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuilderlagging expr: | max by(cluster, namespace, instance) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6 for: 75m labels: severity: warning + - alert: MimirBlockBuilderLagging + annotations: + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuilderlagging + expr: | + max by(cluster, namespace, instance) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6 + for: 140m + labels: + severity: critical - alert: MimirBlockBuilderCompactAndUploadFailed annotations: message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to compact and upload blocks. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildercompactanduploadfailed expr: | sum by (cluster, namespace, instance) (rate(cortex_blockbuilder_tsdb_compact_and_upload_failed_total[1m])) > 0 - for: 5m labels: - severity: warning + severity: critical - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index be4057ddb7a..63e205a4293 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -1169,27 +1169,35 @@ groups: runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildernocycleprocessing expr: | max by(cluster, namespace, pod) (histogram_count(increase(cortex_blockbuilder_consume_cycle_duration_seconds[60m]))) == 0 - for: 5m + for: 20m labels: - severity: warning + severity: critical - alert: MimirBlockBuilderLagging annotations: - message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}%. + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuilderlagging expr: | max by(cluster, namespace, pod) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6 for: 75m labels: severity: warning + - alert: MimirBlockBuilderLagging + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuilderlagging + expr: | + max by(cluster, namespace, pod) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6 + for: 140m + labels: + severity: critical - alert: MimirBlockBuilderCompactAndUploadFailed annotations: message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to compact and upload blocks. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildercompactanduploadfailed expr: | sum by (cluster, namespace, pod) (rate(cortex_blockbuilder_tsdb_compact_and_upload_failed_total[1m])) > 0 - for: 5m labels: - severity: warning + severity: critical - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet index b223a0514b7..d92d9279e67 100644 --- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet +++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet @@ -230,12 +230,12 @@ // Alert if block-builder didn't process cycles in the past hour. { alert: $.alertName('BlockBuilderNoCycleProcessing'), - 'for': '5m', + 'for': '20m', expr: ||| max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (histogram_count(increase(cortex_blockbuilder_consume_cycle_duration_seconds[60m]))) == 0 ||| % $._config, labels: { - severity: 'warning', + severity: 'critical', }, annotations: { message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s has not processed cycles in the past hour.' % $._config, @@ -255,19 +255,31 @@ severity: 'warning', }, annotations: { - message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s reports partition lag of {{ printf "%%.2f" $value }}%%.' % $._config, + message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s reports partition lag of {{ printf "%%.2f" $value }}.' % $._config, + }, + }, + { + alert: $.alertName('BlockBuilderLagging'), + 'for': '140m', // 2h20m. Indicating the lag did not come down for ~2 consumption cycles. + expr: ||| + max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6 + ||| % $._config, + labels: { + severity: 'critical', + }, + annotations: { + message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s reports partition lag of {{ printf "%%.2f" $value }}.' % $._config, }, }, - // Alert if block-builder is failing to compact and upload any blocks. + // Alert immediately if block-builder is failing to compact and upload any blocks. { alert: $.alertName('BlockBuilderCompactAndUploadFailed'), - 'for': '5m', expr: ||| sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_blockbuilder_tsdb_compact_and_upload_failed_total[1m])) > 0 ||| % $._config, labels: { - severity: 'warning', + severity: 'critical', }, annotations: { message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s fails to compact and upload blocks.' % $._config,