Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(PoC) Alertmanager: Strict initialization mode for the Alertmanager #10511

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions cmd/mimir/config-descriptor.json
Original file line number Diff line number Diff line change
Expand Up @@ -15597,6 +15597,17 @@
"fieldType": "string",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "strict_initialization_mode",
"required": false,
"desc": "Skip starting the Alertmanager for tenants without a non-default, non-empty configuration.",
"fieldValue": null,
"fieldDefaultValue": false,
"fieldFlag": "alertmanager.strict-initialization-mode",
"fieldType": "boolean",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "max_concurrent_get_requests_per_tenant",
Expand Down
2 changes: 2 additions & 0 deletions cmd/mimir/help-all.txt.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,8 @@ Usage of ./cmd/mimir/mimir:
Directory to store Alertmanager state and temporary configuration files. The content of this directory is not required to be persisted between restarts unless Alertmanager replication has been disabled. (default "./data-alertmanager/")
-alertmanager.storage.retention duration
How long should we store stateful data (notification logs and silences). For notification log entries, refers to how long should we keep entries before they expire and are deleted. For silences, refers to how long should tenants view silences after they expire and are deleted. (default 120h0m0s)
-alertmanager.strict-initialization-mode
[experimental] Skip starting the Alertmanager for tenants without a non-default, non-empty configuration.
-alertmanager.utf8-migration-logging-enabled
[experimental] Enable logging of tenant configurations that are incompatible with UTF-8 strict mode.
-alertmanager.utf8-strict-mode-enabled
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2377,6 +2377,11 @@ sharding_ring:
# CLI flag: -alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix
[grafana_alertmanager_conditionally_skip_tenant_suffix: <string> | default = ""]

# (experimental) Skip starting the Alertmanager for tenants without a
# non-default, non-empty configuration.
# CLI flag: -alertmanager.strict-initialization-mode
[strict_initialization_mode: <boolean> | default = false]

# (advanced) Maximum number of concurrent GET requests allowed per tenant. The
# zero value (and negative values) result in a limit of GOMAXPROCS or 8,
# whichever is larger. Status code 503 is served for GET requests that would
Expand Down
88 changes: 85 additions & 3 deletions pkg/alertmanager/multitenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ type MultitenantAlertmanagerConfig struct {

GrafanaAlertmanagerCompatibilityEnabled bool `yaml:"grafana_alertmanager_compatibility_enabled" category:"experimental"`
GrafanaAlertmanagerTenantSuffix string `yaml:"grafana_alertmanager_conditionally_skip_tenant_suffix" category:"experimental"`
StrictInitializationMode bool `yaml:"strict_initialization_mode" category:"experimental"`

MaxConcurrentGetRequestsPerTenant int `yaml:"max_concurrent_get_requests_per_tenant" category:"advanced"`

Expand Down Expand Up @@ -129,6 +130,7 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet, logger
f.BoolVar(&cfg.EnableAPI, "alertmanager.enable-api", true, "Enable the alertmanager config API.")
f.BoolVar(&cfg.GrafanaAlertmanagerCompatibilityEnabled, "alertmanager.grafana-alertmanager-compatibility-enabled", false, "Enable routes to support the migration and operation of the Grafana Alertmanager.")
f.StringVar(&cfg.GrafanaAlertmanagerTenantSuffix, "alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix", "", "Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration.")
f.BoolVar(&cfg.StrictInitializationMode, "alertmanager.strict-initialization-mode", false, "Skip starting the Alertmanager for tenants without a non-default, non-empty configuration.")
f.IntVar(&cfg.MaxConcurrentGetRequestsPerTenant, "alertmanager.max-concurrent-get-requests-per-tenant", 0, "Maximum number of concurrent GET requests allowed per tenant. The zero value (and negative values) result in a limit of GOMAXPROCS or 8, whichever is larger. Status code 503 is served for GET requests that would exceed the concurrency limit.")

f.BoolVar(&cfg.EnableStateCleanup, "alertmanager.enable-state-cleanup", true, "Enables periodic cleanup of alertmanager stateful data (notification logs and silences) from object storage. When enabled, data is removed for any tenant that does not have a configuration.")
Expand Down Expand Up @@ -324,6 +326,9 @@ type MultitenantAlertmanager struct {
tenantsDiscovered prometheus.Gauge
syncTotal *prometheus.CounterVec
syncFailures *prometheus.CounterVec

lolMtx sync.RWMutex
receivingAlerts map[string]struct{}
}

// NewMultitenantAlertmanager creates a new MultitenantAlertmanager.
Expand Down Expand Up @@ -397,6 +402,7 @@ func createMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, fallbackC
registry: registerer,
limits: limits,
features: features,
receivingAlerts: map[string]struct{}{},
ringCheckErrors: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
Name: "cortex_alertmanager_ring_check_errors_total",
Help: "Number of errors that have occurred when checking the ring for ownership.",
Expand Down Expand Up @@ -677,7 +683,7 @@ func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[s
}

if !startAM {
level.Debug(am.logger).Log("msg", "not initializing alertmanager for grafana tenant without a promoted, non-default configuration", "user", user)
level.Debug(am.logger).Log("msg", "not initializing alertmanager for tenant", "user", user)
amInitSkipped[user] = struct{}{}
continue
}
Expand Down Expand Up @@ -723,20 +729,33 @@ func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[s
// computeConfig takes an AlertConfigDescs struct containing Mimir and Grafana configurations.
// It returns the final configuration and a bool indicating whether the Alertmanager should be started for the tenant.
func (am *MultitenantAlertmanager) computeConfig(cfgs alertspb.AlertConfigDescs) (amConfig, bool, error) {
isGrafanaCfgUsable := cfgs.Grafana.Promoted && !cfgs.Grafana.Default
isMimirCfgUsable := cfgs.Mimir.RawConfig != "" && cfgs.Mimir.RawConfig != am.fallbackConfig
if am.cfg.StrictInitializationMode && !isGrafanaCfgUsable && !isMimirCfgUsable {
// Skip starting the Alertmanager if we have no usable configurations.
am.lolMtx.RLock()
_, ok := am.receivingAlerts[cfgs.Mimir.User]
am.lolMtx.RUnlock()
if !ok {
return amConfig{}, false, nil
}
level.Debug(am.logger).Log("msg", "user has no usable config but is receiving alerts, starting Alertmanager", "user", cfgs.Mimir.User)
}

cfg := amConfig{
AlertConfigDesc: cfgs.Mimir,
tmplExternalURL: am.cfg.ExternalURL.URL,
}

// If the Grafana configuration is either default, not promoted, or empty, use the Mimir configuration.
if !cfgs.Grafana.Promoted || cfgs.Grafana.Default || cfgs.Grafana.RawConfig == "" {
if !isGrafanaCfgUsable || cfgs.Grafana.RawConfig == "" {
level.Debug(am.logger).Log("msg", "using mimir config", "user", cfgs.Mimir.User)
isGrafanaTenant := am.cfg.GrafanaAlertmanagerTenantSuffix != "" && strings.HasSuffix(cfgs.Mimir.User, am.cfg.GrafanaAlertmanagerTenantSuffix)
return cfg, !isGrafanaTenant, nil
}

// If the Mimir configuration is either default or empty, use the Grafana configuration.
if cfgs.Mimir.RawConfig == am.fallbackConfig || cfgs.Mimir.RawConfig == "" {
if !isMimirCfgUsable {
level.Debug(am.logger).Log("msg", "using grafana config with the default globals", "user", cfgs.Mimir.User)
cfg, err := createUsableGrafanaConfig(cfgs.Grafana, am.fallbackConfig)
return cfg, true, err
Expand Down Expand Up @@ -1005,6 +1024,22 @@ func (am *MultitenantAlertmanager) serveRequest(w http.ResponseWriter, req *http
return
}

if req.URL.Path == "/alertmanager/api/v2/alerts" && req.Method == http.MethodPost {
am.lolMtx.Lock()
am.receivingAlerts[userID] = struct{}{}
am.lolMtx.Unlock()
userAM, err = am.startAlertmanager(req.Context(), userID)
if err != nil {
level.Error(am.logger).Log("msg", "unable to initialize the Alertmanager", "user", userID, "err", err)
http.Error(w, "Failed to initialize the Alertmanager", http.StatusInternalServerError)
return
}

level.Debug(am.logger).Log("msg", "alerts received, Alertmanager initialized", "user", userID, "err", err)
userAM.mux.ServeHTTP(w, req)
return
}

if am.fallbackConfig != "" {
userAM, err = am.alertmanagerFromFallbackConfig(req.Context(), userID)
if errors.Is(err, errNotUploadingFallback) {
Expand All @@ -1025,6 +1060,53 @@ func (am *MultitenantAlertmanager) serveRequest(w http.ResponseWriter, req *http
http.Error(w, "the Alertmanager is not configured", http.StatusPreconditionFailed)
}

// startAlertmanager applies the stored configuration of userID through
// setConfig and returns the tenant's Alertmanager from am.alertmanagers.
//
// When the store reports alertspb.ErrNotFound for the tenant, an empty
// configuration is uploaded first so that the Alertmanager is not de-activated
// in the next poll. Any other store lookup error is wrapped and returned.
//
// It returns an error wrapping errNotUploadingFallback when the tenant is not
// owned by this instance, and a plain error when no Alertmanager is registered
// for the tenant after the configuration has been applied. On any error the
// returned *Alertmanager is nil.
func (am *MultitenantAlertmanager) startAlertmanager(ctx context.Context, userID string) (*Alertmanager, error) {
	if !am.isUserOwned(userID) {
		return nil, errors.Wrap(errNotUploadingFallback, "user not owned by this instance")
	}

	cfgDesc, err := am.store.GetAlertConfig(ctx, userID)
	if err != nil {
		if !errors.Is(err, alertspb.ErrNotFound) {
			return nil, errors.Wrap(err, "failed to check for existing configuration")
		}

		level.Warn(am.logger).Log("msg", "no configuration exists for user; uploading fallback configuration", "user", userID)

		// Upload an empty config so that the Alertmanager is not de-activated in the next poll.
		cfgDesc = alertspb.ToProto("", nil, userID)
		if err := am.store.SetAlertConfig(ctx, cfgDesc); err != nil {
			return nil, err
		}
	}

	// Calling setConfig with an empty configuration will use the fallback config.
	userCfg := amConfig{
		AlertConfigDesc: cfgDesc,
		tmplExternalURL: am.cfg.ExternalURL.URL,
	}
	if err := am.setConfig(userCfg); err != nil {
		return nil, err
	}

	// Hold the lock only for the map lookup.
	am.alertmanagersMtx.Lock()
	userAM := am.alertmanagers[userID]
	am.alertmanagersMtx.Unlock()

	// A missing map entry yields a nil pointer. Report it as an error instead of
	// handing the caller a nil *Alertmanager together with a nil error.
	if userAM == nil {
		return nil, errors.New("alertmanager is not available after applying the configuration")
	}
	return userAM, nil
}

func (am *MultitenantAlertmanager) alertmanagerFromFallbackConfig(ctx context.Context, userID string) (*Alertmanager, error) {
// Make sure we never create fallback instances for a user not owned by this instance.
// This check is not strictly necessary as the configuration polling loop will deactivate
Expand Down
Loading