-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmonitor.go
186 lines (155 loc) · 4.63 KB
/
monitor.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
package main
import (
"math"
"os/exec"
"time"
"git.iamthefij.com/iamthefij/slog"
)
// Monitor represents a particular periodic check of a command
type Monitor struct { //nolint:maligned
// Config values
AlertAfter int16 `yaml:"alert_after"`
AlertEvery *int16 `yaml:"alert_every"`
CheckInterval SecondsOrDuration `yaml:"check_interval"`
Name string
AlertDown []string `yaml:"alert_down"`
AlertUp []string `yaml:"alert_up"`
Command CommandOrShell
// Other values
alertCount int16
failureCount int16
lastCheck time.Time
lastSuccess time.Time
lastOutput string
lastCheckDuration time.Duration
}
// IsValid returns a boolean indicating if the Monitor has been correctly
// configured
func (monitor Monitor) IsValid() bool {
return (!monitor.Command.Empty() &&
monitor.getAlertAfter() > 0 &&
monitor.AlertDown != nil)
}
// ShouldCheck returns a boolean indicating if the Monitor is ready to be
// be checked again
func (monitor Monitor) ShouldCheck() bool {
if monitor.lastCheck.IsZero() {
return true
}
sinceLastCheck := time.Since(monitor.lastCheck)
return sinceLastCheck >= monitor.CheckInterval.Value()
}
// Check will run the command configured by the Monitor and return a status
// and a possible AlertNotice
func (monitor *Monitor) Check() (bool, *AlertNotice) {
var cmd *exec.Cmd
if monitor.Command.Command != nil {
cmd = exec.Command(monitor.Command.Command[0], monitor.Command.Command[1:]...)
} else {
cmd = ShellCommand(monitor.Command.ShellCommand)
}
checkStartTime := time.Now()
output, err := cmd.CombinedOutput()
monitor.lastCheck = time.Now()
monitor.lastOutput = string(output)
monitor.lastCheckDuration = monitor.lastCheck.Sub(checkStartTime)
var alertNotice *AlertNotice
isSuccess := (err == nil)
if isSuccess {
alertNotice = monitor.success()
} else {
alertNotice = monitor.failure()
}
slog.Debugf("Command output: %s", monitor.lastOutput)
slog.OnErrWarnf(err, "Command result: %v", err)
slog.Infof(
"%s success=%t, alert=%t",
monitor.Name,
isSuccess,
alertNotice != nil,
)
return isSuccess, alertNotice
}
// IsUp returns the status of the current monitor
func (monitor Monitor) IsUp() bool {
return monitor.alertCount == 0
}
// LastCheckMilliseconds gives number of miliseconds the last check ran for
func (monitor Monitor) LastCheckMilliseconds() int64 {
return monitor.lastCheckDuration.Milliseconds()
}
func (monitor *Monitor) success() (notice *AlertNotice) {
if !monitor.IsUp() {
// Alert that we have recovered
notice = monitor.createAlertNotice(true)
}
monitor.failureCount = 0
monitor.alertCount = 0
monitor.lastSuccess = time.Now()
return
}
func (monitor *Monitor) failure() (notice *AlertNotice) {
monitor.failureCount++
// If we haven't hit the minimum failures, we can exit
if monitor.failureCount < monitor.getAlertAfter() {
slog.Debugf(
"%s failed but did not hit minimum failures. "+
"Count: %v alert after: %v",
monitor.Name,
monitor.failureCount,
monitor.getAlertAfter(),
)
return
}
// Take number of failures after minimum
failureCount := (monitor.failureCount - monitor.getAlertAfter())
// Use alert cadence to determine if we should alert
switch {
case monitor.AlertEvery == nil, *monitor.AlertEvery == 0:
// Handle alerting on first failure only
if failureCount == 0 {
notice = monitor.createAlertNotice(false)
}
case *monitor.AlertEvery > 0:
// Handle integer number of failures before alerting
if failureCount%*monitor.AlertEvery == 0 {
notice = monitor.createAlertNotice(false)
}
default:
// Handle negative numbers indicating an exponential backoff
if failureCount >= int16(math.Pow(2, float64(monitor.alertCount))-1) { //nolint:gomnd
notice = monitor.createAlertNotice(false)
}
}
// If we're going to alert, increment count
if notice != nil {
monitor.alertCount++
}
return notice
}
func (monitor Monitor) getAlertAfter() int16 {
// TODO: Come up with a better way than this method
// Zero is one!
if monitor.AlertAfter == 0 {
return 1
}
return monitor.AlertAfter
}
// GetAlertNames gives a list of alert names for a given monitor status
func (monitor Monitor) GetAlertNames(up bool) []string {
if up {
return monitor.AlertUp
}
return monitor.AlertDown
}
func (monitor Monitor) createAlertNotice(isUp bool) *AlertNotice {
// TODO: Maybe add something about recovery status here
return &AlertNotice{
MonitorName: monitor.Name,
AlertCount: monitor.alertCount,
FailureCount: monitor.failureCount,
LastCheckOutput: monitor.lastOutput,
LastSuccess: monitor.lastSuccess,
IsUp: isUp,
}
}