Skip to content

Commit

Permalink
Update restartComponent.sh script to notify everyone in the WMCore team
Browse files Browse the repository at this point in the history
Watch every single component of the agent; fix some paths
  • Loading branch information
amaltaro committed Aug 26, 2024
1 parent 9e91b41 commit 01b6760
Showing 1 changed file with 22 additions and 15 deletions.
37 changes: 22 additions & 15 deletions deploy/restartComponent.sh
Original file line number Diff line number Diff line change
@@ -1,35 +1,42 @@
#!/bin/sh
## Pass the component names as command line arguments, e.g.:
## ./restartComponent.sh ErrorHandler JobSubmitter AgentStatusWatcher
### Script to check the tail of each WMAgent component and evaluate
# whether they are running or not, based on file meta-data (stat).
# Component is automatically restarted if deemed down.
# NOTE that this script may not catch multi-thread components down,
# when only one of the threads is down.
###
# NOTE(review): several statements below appear twice, once with backticks
# and once with $(...), and some lines exist in two variants (install path,
# loop header, mail recipient). This looks like two revisions of the script
# shown together -- confirm which variant is intended before running it.
# NOTE(review): the shebang is /bin/sh, but `source`, `[[ ... ]]`,
# `(( ... ))` and `echo -e` are not POSIX sh features; behaviour depends on
# which shell /bin/sh actually is -- confirm on the target hosts.

# Host name (used in the mail subject) and current time in seconds since the
# epoch (used to compute how long each ComponentLog has been quiet).
HOST=`hostname`
DATENOW=`date +%s`

# Get a few environment variables in, like $install and $manage
source /data/admin/wmagent/env.sh
HOST=$(hostname)
DATENOW=$(date +%s)
# Local part of the notification address; "@cern.ch" is appended when mailing.
DEST_NAME=cms-wmcore-team

# Figure whether it's a python2 or python3 agent
# If $install is unset or is not an existing directory, fall back to a
# hard-coded path. Two assignments follow; the second one overrides the first.
if [ ! -d "$install" ]; then
install="/data/srv/wmagent/current/install/wmagentpy3"
install="/data/srv/wmagent/current/install/"
fi

# NOTE(review): two `for ... do` headers follow (one over the command line
# arguments, one over the entries listed under $install) but only a single
# `done` closes them at the end of the script -- confirm which header is meant.
echo "List of components to be monitored: $@"
for comp in $@; do
echo -e "\n###Checking agent logs at: $(date)"
# Every entry found under $install is treated as a candidate component name.
comps=$(ls $install)
for comp in $comps; do
COMPLOG=$install/$comp/ComponentLog
# Entries without a regular ComponentLog file are reported and skipped.
if [ ! -f $COMPLOG ]; then
echo "Not a component or $COMPLOG does not exist"
continue
fi
echo "Checking logs from: $COMPLOG"
# Seconds elapsed between the log file's last modification time (stat %Y)
# and the moment the script started (DATENOW is taken once, at the top).
LASTCHANGE=`stat -c %Y $COMPLOG`
INTERVAL=`expr $DATENOW - $LASTCHANGE`
LASTCHANGE=$(stat -c %Y $COMPLOG)
INTERVAL=$(expr $DATENOW - $LASTCHANGE)
# A log untouched for 1800 seconds (30 minutes) or more triggers the restart path.
if (("$INTERVAL" >= 1800)); then
# Look for any process whose `ps aux` line contains "wmcore"; when none is
# found the whole script exits with status 1 and nothing is restarted.
OTHERS=`ps aux | grep wmcore | grep -v grep`
OTHERS=$(ps aux | grep wmcore | grep -v grep)
if [[ -z "$OTHERS" ]]; then
echo "Since the agent is not running, don't do anything ..."
exit 1
fi

# The last 100 log lines are captured before the restart command is issued;
# they are included in the notification mail body below.
TAIL_LOG=`tail -n100 $COMPLOG`
TAIL_LOG=$(tail -n100 $COMPLOG)
# Restart only this component; $manage is expected to come from env.sh.
$manage execute-agent wmcoreD --restart --components=$comp
# NOTE(review): the pipe below feeds only the first `mail` command; the second
# `mail` line is a separate command that does not receive the echoed body.
# The first recipient list also contains the literal text "[email protected]",
# which looks like a redacted address -- confirm the intended recipients.
echo -e "ComponentLog quiet for $INTERVAL secs\n\nTail of the log is:\n$TAIL_LOG" |
mail -s "$HOST : $comp restarted" [email protected],todor.trendafilov.ivanov@cern.ch
mail -s "$HOST : $comp restarted" $DEST_NAME@cern.ch
fi
done

0 comments on commit 01b6760

Please sign in to comment.