-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathhealth.sh
executable file
·112 lines (92 loc) · 3.13 KB
/
health.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/bin/bash
#
# Publish AutoPyFactory service metrics
#docs: http://itmon.web.cern.ch/itmon/recipes/how_to_publish_service_metrics.html
# http://itmon.web.cern.ch/itmon/recipes/how_to_create_a_service_xml.html
#######################################
# Log a timestamped message to stderr.
# Arguments: the message words; they are joined with single spaces.
# Outputs:   "[<local time with UTC offset>]: <message>" on stderr.
#######################################
function err() {
  # Join the arguments with a space regardless of the caller's IFS, which is
  # what the previous echo-based implementation produced.
  local IFS=' '
  printf '[%s]: %s\n' "$(date +'%Y-%m-%dT%H:%M:%S%z')" "$*" >&2
}
#######################################
# Print the number of seconds since a file was last modified.
# Arguments: $1 - path of the file to inspect
# Outputs:   elapsed seconds on stdout. If the modification time cannot be
#            read (or is reported as 0), an error is logged through err and
#            0 is printed, as before.
# Returns:   0 on success, 1 when the modification time was unusable.
#######################################
function age() {
  local filename=$1
  # Declared separately from the assignments so that 'local' does not mask
  # the exit status of the command substitutions.
  local now changed elapsed
  local rc=0

  now=$(date +%s)
  if ! changed=$(stat -c %Y -- "$filename") \
    || [[ ! $changed =~ ^[0-9]+$ ]] \
    || [[ $changed -eq 0 ]]; then
    err "Bad stat call"
    # Fall back to "just modified" so the printed age is 0.
    changed=$now
    rc=1
  fi

  elapsed=$((now - changed))
  echo "$elapsed"
  return "$rc"
}
# Create the file that will hold the XML report. It is not removed at the end
# of the run; the find command at the bottom of the script prunes old ones.
if ! tmpfile=$(mktemp /var/log/apf/health.XXXXX); then
  err "Unable to create report file in /var/log/apf"
  exit 1
fi
# Test 1
# Check that apf.log has recently been written to.

#######################################
# Map the age of apf.log onto a service status and message.
# Globals:   status, msg (written)
# Arguments: $1 - age of apf.log in seconds
#######################################
function classify_apflog_age() {
  local seconds=$1
  if [[ $seconds -lt 300 ]]; then
    status='available'
    msg='OK, activity seen in last 5 minutes in apf.log'
  elif [[ $seconds -lt 600 ]]; then
    status='degraded'
    msg='No activity seen for 5 minutes in apf.log'
  elif [[ $seconds -lt 1800 ]]; then
    status='unavailable'
    msg='No activity seen for 10 minutes in apf.log'
  else
    # Previously an age of 30 minutes or more fell through to the milder
    # default of 'degraded'.
    status='unavailable'
    msg='No activity seen for 30 minutes in apf.log'
  fi
}

logfile=/var/log/apf/apf.log
shortname=$(hostname -s)
timestamp=$(date +%Y-%m-%dT%H:%M:%S)
apflogage=$(age "$logfile")
# NOTE(review): age prints 0 when the file cannot be stat'ed, so a missing
# apf.log is classified as 'available' - confirm this is intended.
classify_apflog_age "$apflogage"
# Test 2
# 'condor_q | tail -1' example output:
# 8848 jobs; 10 completed, 0 removed, 1662 idle, 7176 running, 0 held, 0 suspended

#######################################
# Extract one job count from a condor_q summary line.
# The number is located by the label that follows it rather than by its
# position, so any text preceding the counts on the line is tolerated.
# Arguments: $1 - summary line
#            $2 - label that follows the number (e.g. 'jobs', 'completed')
# Outputs:   the number preceding the label; nothing if it is not found
# Returns:   0 if the label was found, 1 otherwise
#######################################
function condor_count() {
  local line=$1
  local label=$2
  # Kept in a variable so the regex is not subject to quoting surprises.
  local re="([0-9]+)[[:space:]]+${label}"
  if [[ $line =~ $re ]]; then
    echo "${BASH_REMATCH[1]}"
    return 0
  fi
  return 1
}

summary=$(condor_q | tail -1)
total=$(condor_count "$summary" jobs)
completed=$(condor_count "$summary" completed)
removed=$(condor_count "$summary" removed)
idle=$(condor_count "$summary" idle)
running=$(condor_count "$summary" running)

if [[ -z $completed || -z $removed ]]; then
  err "Unable to parse condor_q summary: $summary"
fi

# Do not let this check soften an 'unavailable' verdict reached earlier.
if [[ $status != 'unavailable' ]]; then
  if [[ $completed -gt 5000 ]]; then
    status='degraded'
    msg='Number of completed jobs too high (>5000)'
  fi
  if [[ $removed -gt 5000 ]]; then
    status='degraded'
    msg='Number of removed jobs too high (>5000)'
  fi
fi
# Test 3
# Check that the condor GridmanagerLog has recently been written to.
logfile=/var/log/condor/GridmanagerLog.apf
shortname=$(hostname -s)
timestamp=$(date +%Y-%m-%dT%H:%M:%S)
gridage=$(age "$logfile")
# NOTE(review): age prints 0 when the file cannot be stat'ed, so a missing
# GridmanagerLog passes this check - confirm this is intended.
if [[ $gridage -gt 1200 ]]; then
  status='unavailable'
  # The threshold is 1200 seconds, i.e. 20 minutes; the message used to say 30.
  msg='No activity seen for 20 minutes in GridmanagerLog'
fi
# Write the service update document. The heredoc delimiter is unquoted on
# purpose so the shell variables below are expanded.
# Each numericvalue carries its own name; the GridmanagerLog age used to share
# name="age" with the apf.log age.
# NOTE(review): the <contact> value looks like an obfuscated placeholder
# rather than a real address - verify before deploying.
cat <<EOF > "$tmpfile"
<?xml version="1.0" encoding="UTF-8"?>
<serviceupdate xmlns="http://sls.cern.ch/SLS/XML/update">
<id>PilotFactory_$shortname</id>
<status>$status</status>
<webpage>http://apfmon.lancs.ac.uk</webpage>
<contact>[email protected]</contact>
<availabilitydesc>Checks for recent activity in APF and condor logs</availabilitydesc>
<availabilityinfo>$msg</availabilityinfo>
<timestamp>$timestamp</timestamp>
<data>
<numericvalue desc="Age of apf.log in seconds" name="age">$apflogage</numericvalue>
<numericvalue desc="Age of GridmanagerLog in seconds" name="gridage">$gridage</numericvalue>
<numericvalue desc="Number of Completed jobs in condor" name="completed">$completed</numericvalue>
<numericvalue desc="Number of Removed jobs in condor" name="removed">$removed</numericvalue>
</data>
</serviceupdate>
EOF
#echo $tmpfile
# Upload the report as a multipart form field named 'file'.
# --fail makes curl exit non-zero on HTTP error responses (>= 400); without it
# only transport-level failures reached the error branch below.
if ! curl -i -s --fail -F "file=@${tmpfile}" xsls.cern.ch >/dev/null; then
  err "Error sending XML to xsls.cern.ch"
  exit 1
fi
# Remove report files older than 1440 minutes (24 hours).
# The pattern is quoted so the shell hands it to find unexpanded; unquoted, a
# matching file in the current directory would replace it before find runs.
find /var/log/apf/ -type f -name 'health.*' -mmin +1440 -delete
# check validity
#xmllint --noout --schema http://itmon.web.cern.ch/itmon/files/xsls_schema.xsd $tmpfile