-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: reintroduce NHC config template (#27)
Co-authored-by: jamesbeedy <[email protected]>
- Loading branch information
1 parent
246f1da
commit ebc7f2f
Showing
5 changed files
with
176 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
src/templates/nhc.conf.tmpl |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
# Enforce short hostnames to match the node names as tracked by slurm. | ||
* || HOSTNAME="$HOSTNAME_S" | ||
|
||
####################################################################### | ||
### | ||
### NHC Configuration Variables | ||
### | ||
# Explicitly instruct NHC to assume PBS (TORQUE, PBSPro) is the Resource Manager | ||
# * || export NHC_RM=pbs | ||
|
||
# Do not mark nodes offline | ||
# * || export MARK_OFFLINE=0 | ||
|
||
# Activate debugging mode | ||
# * || export DEBUG=1 | ||
|
||
# Set watchdog timer to 15 seconds | ||
# * || export TIMEOUT=15 | ||
|
||
# In out-of-band contexts, enable all checks | ||
# * || export NHC_CHECK_ALL=1 | ||
|
||
# Make sure $PATH contains important directories for diagnostic commands | ||
# * || export MOABHOMEDIR="/opt/moab" | ||
# * || export PATH="$MOABHOMEDIR/bin:$PATH" | ||
|
||
|
||
####################################################################### | ||
### | ||
### Hardware checks | ||
### | ||
# Set these to your correct socket, core, and thread counts. | ||
# * || check_hw_cpuinfo 2 12 24 | ||
|
||
# Set these to the amount of physical RAM you have (leave the fudge factor). | ||
# * || check_hw_physmem 32gb 32gb 5% | ||
|
||
# Set these to the amount of swap you have (leave the fudge factor). | ||
# * || check_hw_swap 8g 8g 3% | ||
|
||
# If you prefer to use this instead of the previous two, you can. | ||
# * || check_hw_mem 40g 40g 5% | ||
|
||
# Check specifically for free physical memory. | ||
# * || check_hw_physmem_free 1MB | ||
|
||
# Same, but for swap space. | ||
# * || check_hw_swap_free 1MB | ||
|
||
# Check for some sort of free memory of either type. | ||
* || check_hw_mem_free 1mb | ||
|
||
# Checks that there's a QDR IB interface that's ACTIVE and shows LinkUp. | ||
# * || check_hw_ib 40 | ||
|
||
# Checks for an active Myrinet interface named "myri0." | ||
# * || check_hw_gm myri0 | ||
|
||
# Checks for an active ethernet interface named "eth1." | ||
# * || check_hw_eth eth1 | ||
|
||
# Make sure we're running the correct BIOS version on all nodes. | ||
# * || check_dmi_data_match "BIOS Information: Version: 2.0.1" | ||
|
||
# Make sure our RAM is running at the correct bus rate. | ||
# * || check_dmi_data_match -t "Memory Device" "*Speed: 1866 MHz" | ||
|
||
# Check the mcelog daemon for any pending errors. | ||
# * || check_hw_mcelog | ||
|
||
|
||
####################################################################### | ||
### | ||
### Filesystem checks | ||
### | ||
# All nodes should have their root filesystem mounted read/write. | ||
* || check_fs_mount_rw -f / | ||
|
||
# Assert that /tmp is a mounted filesystem of type "tmpfs." | ||
# * || check_fs_mount_rw -t tmpfs -f /tmp | ||
|
||
# Controlling TTYs are a good thing! | ||
# * || check_fs_mount_rw -t devpts -s '/(none|devpts)/' -f /dev/pts | ||
|
||
# Make sure the root filesystem doesn't get too full. | ||
# * || check_fs_free / 3% | ||
|
||
# Free inodes are also important. | ||
# * || check_fs_ifree / 1k | ||
|
||
# The following illustrates how to assert an NFSv3 mount (or any other specific mount option). | ||
# * || check_fs_mount -s bluearc0:/home -t nfs -o '/(^|,)vers=3(,|$)/' -f /home | ||
|
||
|
||
####################################################################### | ||
### | ||
### File/metadata checks | ||
### | ||
# These should always be directories and always be read/write/execute and sticky. | ||
* || check_file_test -r -w -x -d -k /tmp /var/tmp | ||
|
||
# These should always be readable and should never be empty. | ||
* || check_file_test -r -s /etc/passwd /etc/group | ||
|
||
# Assert common properties for /dev/null (which occasionally gets clobbered). | ||
* || check_file_test -c -r -w /dev/null /dev/zero | ||
# * || check_file_stat -m 0666 -u 0 -g 0 -t 1 -T 3 /dev/null | ||
|
||
# Make sure there's relatively recent activity from the syslog. | ||
# * || check_file_stat -n 7200 /var/log/messages | ||
|
||
# Validate a couple important accounts in the passwd file. | ||
# * || check_file_contents /etc/passwd "/^root:x:0:0:/" "sshd:*" | ||
|
||
|
||
####################################################################### | ||
### | ||
### Process checks | ||
### | ||
# Everybody needs sshd running, right? But don't use -r (restart)! | ||
# * || check_ps_service -u root -S sshd | ||
|
||
# Check for munge with the correct user | ||
* || check_ps_service -u munge munged | ||
|
||
# The cron daemon is another useful critter... | ||
# * || check_ps_service -r crond | ||
|
||
# Check for wulfd but don't manage it. | ||
# * || check_ps_daemon wulfd root | ||
|
||
# Make sure no users are SSH'd in, but don't kill them. | ||
# * || check_ps_blacklist sshd '!root' | ||
|
||
# Flag and kill any processes which are owned by unauthorized users. | ||
# * || check_ps_unauth_users log syslog kill | ||
|
||
# Flag any user processes not properly parented. | ||
# * || check_ps_userproc_lineage log syslog | ||
|
||
# Most systems also need NFS locking services. | ||
# * || check_ps_service -d rpc.statd -r nfslock | ||
|
||
# The audit daemon can sometimes disappear if things get hairy. | ||
# * || check_ps_service -r auditd | ||
|
||
# This is only valid for RHEL6 and similar/newer systems. | ||
# * || check_ps_service -d rsyslogd -r rsyslog | ||
|
||
# In the case of MySQL, it's typically better to cycle. | ||
# * || check_ps_service -c mysqld | ||
|
||
# Double your core count is a good rule of thumb for load average max. | ||
# * || check_ps_loadavg 24 | ||
# This should work if you place it after one of the check_hw_*() checks. | ||
# * || check_ps_loadavg $((2*HW_CORES)) | ||
|
||
|
||
####################################################################### | ||
### | ||
### Other checks | ||
### | ||
# Check to verify that SELinux is disabled. (Remove the "-r 1" to verify it's enabled.) | ||
# * || check_cmd_status -t 1 -r 1 selinuxenabled | ||
|
||
# Verify settings for an Ethernet interface. | ||
# * || check_cmd_output -m '/addr:10\.0\.0\.1/' -m '/Bcast:10\.0\.0\.255/' -m '/Mask:255\.255\.255\.0/' -m '/^[[:space:]]*UP /' /sbin/ifconfig eth3 | ||
|
||
# nVidia HealthMon GPU health checks (requires Tesla Development Kit) | ||
# * || check_nv_healthmon |