import os
import os.path
import sys
import time

from checks import AgentCheck, CheckException


# Modify this function to allow custom logic for determining whether the host
# is up. Note that this check is run in the main collector thread, so slow
# tasks performed in this function may reduce the agent's reporting frequency.
def is_up():
    return True
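
# Example (hypothetical) of customizing is_up(): treat the host as up only if
# a local service accepts TCP connections. The address, port, and timeout are
# illustrative assumptions, not part of this check, so the sketch is left
# commented out.
#
# import socket
#
# def is_up():
#     try:
#         socket.create_connection(('127.0.0.1', 8080), timeout=1).close()
#         return True
#     except (socket.error, OSError):
#         return False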


# See the following site for why this code is necessary on Windows.
# http://stupidpythonideas.blogspot.com/2014/07/getting-atomic-writes-right.html
replace = None
if sys.platform == 'win32':
    import win32api, win32con

    def replace(src, dst):
        win32api.MoveFileEx(src, dst, win32con.MOVEFILE_REPLACE_EXISTING)
else:
    replace = os.rename
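
# Note: on Python 3.3+, os.replace() offers the same atomic-overwrite
# semantics on both POSIX and Windows and could stand in for this shim.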


class UptimeMetricAggregator:
    """Accumulates uptime seconds observed within the window [start, end]."""

    def __init__(self, name, start, end):
        self.name = name
        self.start = start
        self.end = end
        self.value = 0


class UptimeInterval:
    """A span of continuous uptime, as a pair of Unix timestamps."""

    def __init__(self, start, end):
        self.start = start
        self.end = end


class UptimeCheck(AgentCheck):
    def __init__(self, name, init_config, agentConfig, instances=None):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
        self.instance_cleanup_times = {}

    def check(self, instance):
        uptime_log_directory = instance['uptime_log_directory']
        # Logs previous uptimes. Only updated if a downtime occurs, or if one
        # of the past uptimes becomes older than the retention period.
        uptime_log_path = os.path.join(uptime_log_directory, 'uptime.log')

        now_ref_ts = int(time.time())
        # The metrics array can be empty, but it can't be missing.
        metrics = [UptimeMetricAggregator(metric['name'], now_ref_ts - metric['timespan'], now_ref_ts)
                   for metric in instance['metrics']]

        # The default retention is one year.
        # One year in seconds: 60 * 60 * 24 * 365.25 = 31557600
        retention = instance.get('retention', 31557600)
        min_cleanup_period = instance.get('min_cleanup_period', 3600)
        last_cleanup_ts = self.instance_cleanup_times.get(_instance_key(instance))
        # Strip old entries at most once per min_cleanup_period.
        if not last_cleanup_ts or now_ref_ts - last_cleanup_ts > min_cleanup_period:
            strip_old_entries = True
            self.instance_cleanup_times[_instance_key(instance)] = now_ref_ts
            # The stripping below may still be skipped if the oldest entry
            # turns out to be retained.
        else:
            strip_old_entries = False

        last_interval = None
        tmp_uptime_log_path = uptime_log_path + ".tmp"
        if os.path.isfile(uptime_log_path):
            with open(uptime_log_path, 'r') as uptime_log_file:
                interval = line_to_interval(uptime_log_file.readline())
                # Each line in the file is processed below, so seek back to
                # the beginning.
                uptime_log_file.seek(0)
                if interval:
                    # The first line holds the oldest entry: if it is still
                    # within the retention window, there is nothing to strip.
                    if strip_old_entries and interval.end > now_ref_ts - retention:
                        strip_old_entries = False
                    if strip_old_entries:
                        with open(tmp_uptime_log_path, 'w') as tmp_uptime_log:
                            last_interval = process_uptime_log_file(
                                uptime_log_file, metrics, tmp_uptime_log, now_ref_ts - retention)
                            tmp_uptime_log.flush()
                            os.fsync(tmp_uptime_log)
                    else:
                        last_interval = process_uptime_log_file(uptime_log_file, metrics)
                else:
                    # An empty log has nothing to process or strip.
                    strip_old_entries = False
            # We must wait until both files are closed before replacing the
            # old log with the stripped copy.
            if strip_old_entries:
                replace(tmp_uptime_log_path, uptime_log_path)

        # Tracks the current uptime. Updated each time the check is run. A
        # separate file is used to avoid copying the entire log each time the
        # check runs.
        uptime_path = os.path.join(uptime_log_directory, 'uptime')
        downtime_threshold = instance['downtime_threshold']
        prev_interval = read_uptime_interval(uptime_path)
        current_interval = get_current_interval(now_ref_ts, prev_interval,
                                                downtime_threshold)
        # current_interval is None only when the host is down and no previous
        # interval exists.
        if current_interval:
            update_metrics_with_interval(metrics, current_interval)

        # The following conditions must be met for us to add an interval to
        # the uptime log:
        # - the previous interval must exist
        # - the start of the current interval must differ from the start of
        #   the previous interval
        # - the previous interval must have nonzero length (end > start)
        # - if the uptime log has an entry, the last entry must not be the
        #   same as the previous interval:
        #   If the system crashes after adding an entry to the uptime log,
        #   but before writing the current interval, we may find ourselves
        #   trying to add an entry to the uptime log that already exists the
        #   next time the check runs. Entries in the uptime log should be
        #   unique, so we must check that the entry hasn't already been
        #   added. We compare the second value on the line (the end
        #   timestamp) to guard against a partial write, which would corrupt
        #   the log file (fixable with manual intervention) but would not
        #   cause any uptime data to be lost.
        if (prev_interval and current_interval.start != prev_interval.start and
                prev_interval.start != prev_interval.end and
                (last_interval is None or
                 last_interval.end != prev_interval.end)):
            # The uptime log is updated relatively infrequently, so it isn't
            # worth adding extra code to track appended entries; the metrics
            # are simply updated with the archived interval here.
            update_metrics_with_interval(metrics, prev_interval)
            add_entry_to_uptime_log(prev_interval, uptime_log_path)

        if current_interval:
            write_current_interval(current_interval, uptime_path)

        for metric in metrics:
            self.gauge(metric.name, float(metric.value) / (metric.end - metric.start))
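
# A sketch of a matching instance configuration (e.g. conf.d/uptime.yaml);
# the directory, metric name, and values below are illustrative assumptions:
#
#   init_config:
#
#   instances:
#     - uptime_log_directory: /var/run/uptime-check  # must be unique per instance
#       downtime_threshold: 120      # gap in seconds counted as downtime
#       retention: 31557600          # optional, defaults to one year
#       min_cleanup_period: 3600     # optional, defaults to one hour
#       metrics:                     # may be empty, but must be present
#         - name: system.uptime.fraction.30d
#           timespan: 2592000        # 30 days in seconds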


def process_uptime_log_file(uptime_log_file, metrics, new_uptime_log_file=None, retention_min_ts=None):
    last_interval = None
    for line in uptime_log_file:
        interval = line_to_interval(line)
        if not interval:
            continue
        update_metrics_with_interval(metrics, interval)
        if new_uptime_log_file and interval.end > retention_min_ts:
            new_uptime_log_file.write(line)
        last_interval = interval
    # The last (newest) entry lets the caller detect an interval that was
    # already appended before a crash.
    return last_interval


def update_metrics_with_interval(metrics, interval):
    for metric in metrics:
        # Add the portion of the interval that overlaps the metric's window.
        # Intervals never extend past metric.end (the current time), so only
        # the start needs clamping.
        if interval.end > metric.start:
            if interval.start > metric.start:
                metric.value = metric.value + interval.end - interval.start
            else:
                metric.value = metric.value + interval.end - metric.start
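
# For example, a metric window of [1000, 2000] and an interval of (1500, 1800)
# contribute 300 seconds of uptime; an interval of (500, 1800) is clamped to
# the window start and contributes 800.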


def _instance_key(instance):
    """Returns a unique value for each valid instance of this check.

    The uptime_log_directory must be unique for each instance of the check,
    so we just use that.
    """
    return instance['uptime_log_directory']


# Generate the current uptime interval. The current interval depends on the
# previous interval (whether it's None, how long ago it ended) and on the
# downtime_threshold (did the previous interval end longer than
# downtime_threshold seconds ago).
def get_current_interval(current_time, prev_interval, downtime_threshold):
    if not is_up():
        # May be None if the host has never been observed up.
        return prev_interval
    if (not prev_interval or
            current_time - prev_interval.end > downtime_threshold):
        return UptimeInterval(current_time, current_time)
    else:
        return UptimeInterval(prev_interval.start, current_time)
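
# For example, with downtime_threshold=60, a previous interval that ended 30
# seconds ago is extended to (prev.start, now), while one that ended 120
# seconds ago counts as downtime, so a fresh (now, now) interval begins.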


def write_current_interval(current_interval, uptime_path):
    tmp_uptime_path = uptime_path + ".tmp"
    with open(tmp_uptime_path, 'w') as tmp_uptime_file:
        tmp_uptime_file.write(interval_to_line(current_interval))
        # We need to flush the file and call fsync to ensure that it is
        # written to disk immediately. Experiments have shown that closing
        # the file object and renaming it are not sufficient to ensure that
        # the file is written to disk right away.
        tmp_uptime_file.flush()
        os.fsync(tmp_uptime_file)
    replace(tmp_uptime_path, uptime_path)


def add_entry_to_uptime_log(prev_interval, uptime_log_path):
    with open(uptime_log_path, 'a') as uptime_log:
        uptime_log.write(interval_to_line(prev_interval))
        uptime_log.flush()
        os.fsync(uptime_log)


def interval_to_line(interval):
    return "{0} {1}\n".format(interval.start, interval.end)


def line_to_interval(line):
    entry = line.rstrip('\n').split(' ')
    # A blank line (for example, from a log whose every entry was stripped)
    # carries no interval.
    if entry == ['']:
        return None
    interval = UptimeInterval(int(entry[0]), int(entry[1]))
    if interval.start > interval.end:
        raise CheckException(
            "Invalid interval: first value ({0}) was greater than second ({1})"
            "".format(interval.start, interval.end))
    return interval
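
# Each line stores one interval as two space-separated Unix timestamps,
# e.g. "1500000000 1500003600\n" for an hour of uptime.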


def read_uptime_interval(path):
    if os.path.isfile(path):
        with open(path) as uptime_file:
            return line_to_interval(uptime_file.readline())
    else:
        return None
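
# A minimal sketch of exercising the file helpers outside the agent (the path
# is an assumption, and the `checks` import above means this file can't be
# run directly outside the agent, so it is left commented out):
#
# now = int(time.time())
# write_current_interval(UptimeInterval(now - 300, now), '/tmp/uptime')
# print(read_uptime_interval('/tmp/uptime').start)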