From 01b67607794866dcb7f09848f21dfd45996ee0ab Mon Sep 17 00:00:00 2001 From: Alan Malta Rodrigues Date: Thu, 22 Aug 2024 21:45:37 -0400 Subject: [PATCH] Update restartComponent.sh script to notify everyone in the WMCore team Watch every single component of the agent; fix some paths --- deploy/restartComponent.sh | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/deploy/restartComponent.sh b/deploy/restartComponent.sh index ec938b80f9..6e26447ad5 100644 --- a/deploy/restartComponent.sh +++ b/deploy/restartComponent.sh @@ -1,35 +1,42 @@ #!/bin/sh -## Pass the component names as command line arguments, e.g.: -## ./restartComponent.sh ErrorHandler JobSubmitter AgentStatusWatcher +### Script to check the tail of each WMAgent component and evaluate +# whether they are running or not, based on file meta-data (stat). +# Component is automatically restarted if deemed down. +# NOTE that this script may not catch multi-thread components down, +# when only one of the threads is down. +### -HOST=`hostname` -DATENOW=`date +%s` - -# Get a few environment variables in, like $install and $manage -source /data/admin/wmagent/env.sh +HOST=$(hostname) +DATENOW=$(date +%s) +DEST_NAME=cms-wmcore-team # Figure whether it's a python2 or python3 agent if [ ! -d "$install" ]; then - install="/data/srv/wmagent/current/install/wmagentpy3" + install="/data/srv/wmagent/current/install/" fi -echo "List of components to be monitored: $@" -for comp in $@; do +echo -e "\n###Checking agent logs at: $(date)" +comps=$(ls $install) +for comp in $comps; do COMPLOG=$install/$comp/ComponentLog + if [ ! -f $COMPLOG ]; then + echo "Not a component or $COMPLOG does not exist" + continue + fi echo "Checking logs from: $COMPLOG" - LASTCHANGE=`stat -c %Y $COMPLOG` - INTERVAL=`expr $DATENOW - $LASTCHANGE` + LASTCHANGE=$(stat -c %Y $COMPLOG) + INTERVAL=$(expr $DATENOW - $LASTCHANGE) if (("$INTERVAL" >= 1800)); then - OTHERS=`ps aux | grep wmcore | grep -v grep` + OTHERS=$(ps aux | grep wmcore | grep -v grep) if [[ -z "$OTHERS" ]]; then echo "Since the agent is not running, don't do anything ..." exit 1 fi - TAIL_LOG=`tail -n100 $COMPLOG` + TAIL_LOG=$(tail -n100 $COMPLOG) $manage execute-agent wmcoreD --restart --components=$comp echo -e "ComponentLog quiet for $INTERVAL secs\n\nTail of the log is:\n$TAIL_LOG" | - mail -s "$HOST : $comp restarted" alan.malta@cern.ch,todor.trendafilov.ivanov@cern.ch + mail -s "$HOST : $comp restarted" $DEST_NAME@cern.ch fi done