Skip to content

Commit

Permalink
fix: reintroduce NHC config template (#27)
Browse files Browse the repository at this point in the history
Co-authored-by: jamesbeedy <[email protected]>
  • Loading branch information
jedel1043 and jamesbeedy authored Sep 24, 2024
1 parent 246f1da commit ebc7f2f
Show file tree
Hide file tree
Showing 5 changed files with 176 additions and 6 deletions.
2 changes: 1 addition & 1 deletion charms/slurmctld/src/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"GresTypes": "gpu",
"HealthCheckInterval": "600",
"HealthCheckNodeState": "ANY,CYCLE",
"HealthCheckProgram": "/usr/sbin/omni-nhc-wrapper",
"HealthCheckProgram": "/usr/sbin/charmed-nhc-wrapper",
"MailProg": "/usr/bin/mail.mailutils",
"PluginDir": "/usr/lib/x86_64-linux-gnu/slurm-wlm",
"PlugStackConfig": "/etc/slurm/plugstack.conf.d/plugstack.conf",
Expand Down
2 changes: 0 additions & 2 deletions charms/slurmd/.gitignore

This file was deleted.

1 change: 1 addition & 0 deletions charms/slurmd/.wokeignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
src/templates/nhc.conf.tmpl
7 changes: 4 additions & 3 deletions charms/slurmd/src/slurmd_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ def install(self) -> bool:
return False

self.render_nhc_config()
self.render_nhc_wrapper("")

spool_dir = Path("/var/spool/slurmd")
spool_dir.mkdir()
Expand Down Expand Up @@ -271,10 +272,10 @@ def render_nhc_config(self, extra_configs=None) -> bool:
return True

def render_nhc_wrapper(self, params: str) -> None:
"""Render the /usr/sbin/omni-nhc-wrapper script."""
logger.debug(f"## rendering /usr/sbin/omni-nhc-wrapper: {params}")
"""Render the /usr/sbin/charmed-nhc-wrapper script."""
logger.debug(f"## rendering /usr/sbin/charmed-nhc-wrapper: {params}")

target = Path("/usr/sbin/omni-nhc-wrapper")
target = Path("/usr/sbin/charmed-nhc-wrapper")

target.write_text(
textwrap.dedent(
Expand Down
170 changes: 170 additions & 0 deletions charms/slurmd/src/templates/nhc.conf.tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
# Enforce short hostnames to match the node names as tracked by slurm.
* || HOSTNAME="$HOSTNAME_S"

#######################################################################
###
### NHC Configuration Variables
###
# Explicitly instruct NHC to assume PBS (TORQUE, PBSPro) is the Resource Manager
# * || export NHC_RM=pbs

# Do not mark nodes offline
# * || export MARK_OFFLINE=0

# Activate debugging mode
# * || export DEBUG=1

# Set watchdog timer to 15 seconds
# * || export TIMEOUT=15

# In out-of-band contexts, enable all checks
# * || export NHC_CHECK_ALL=1

# Make sure $PATH contains important directories for diagnostic commands
# * || export MOABHOMEDIR="/opt/moab"
# * || export PATH="$MOABHOMEDIR/bin:$PATH"


#######################################################################
###
### Hardware checks
###
# Set these to your correct socket, core, and thread counts.
# * || check_hw_cpuinfo 2 12 24

# Set these to the amount of physical RAM you have (leave the fudge factor).
# * || check_hw_physmem 32gb 32gb 5%

# Set these to the amount of swap you have (leave the fudge factor).
# * || check_hw_swap 8g 8g 3%

# If you prefer to use this instead of the previous two, you can.
# * || check_hw_mem 40g 40g 5%

# Check specifically for free physical memory.
# * || check_hw_physmem_free 1MB

# Same, but for swap space.
# * || check_hw_swap_free 1MB

# Check for some sort of free memory of either type.
* || check_hw_mem_free 1mb

# Checks that there's a QDR IB interface that's ACTIVE and shows LinkUp.
# * || check_hw_ib 40

# Checks for an active Myrinet interface named "myri0."
# * || check_hw_gm myri0

# Checks for an active ethernet interface named "eth1."
# * || check_hw_eth eth1

# Make sure we're running the correct BIOS version on all nodes.
# * || check_dmi_data_match "BIOS Information: Version: 2.0.1"

# Make sure our RAM is running at the correct bus rate.
# * || check_dmi_data_match -t "Memory Device" "*Speed: 1866 MHz"

# Check the mcelog daemon for any pending errors.
# * || check_hw_mcelog


#######################################################################
###
### Filesystem checks
###
# All nodes should have their root filesystem mounted read/write.
* || check_fs_mount_rw -f /

# Assert that /tmp is a mounted filesystem of type "tmpfs."
# * || check_fs_mount_rw -t tmpfs -f /tmp

# Controlling TTYs are a good thing!
# * || check_fs_mount_rw -t devpts -s '/(none|devpts)/' -f /dev/pts

# Make sure the root filesystem doesn't get too full.
# * || check_fs_free / 3%

# Free inodes are also important.
# * || check_fs_ifree / 1k

# The following illustrates how to assert an NFSv3 mount (or any other specific mount option).
# * || check_fs_mount -s bluearc0:/home -t nfs -o '/(^|,)vers=3(,|$)/' -f /home


#######################################################################
###
### File/metadata checks
###
# These should always be directories and always be read/write/execute and sticky.
* || check_file_test -r -w -x -d -k /tmp /var/tmp

# These should always be readable and should never be empty.
* || check_file_test -r -s /etc/passwd /etc/group

# Assert common properties for /dev/null (which occasionally gets clobbered).
* || check_file_test -c -r -w /dev/null /dev/zero
# * || check_file_stat -m 0666 -u 0 -g 0 -t 1 -T 3 /dev/null

# Make sure there's relatively recent activity from the syslog.
# * || check_file_stat -n 7200 /var/log/messages

# Validate a couple important accounts in the passwd file.
# * || check_file_contents /etc/passwd "/^root:x:0:0:/" "sshd:*"


#######################################################################
###
### Process checks
###
# Everybody needs sshd running, right? But don't use -r (restart)!
# * || check_ps_service -u root -S sshd

# Check for munge with the correct user
* || check_ps_service -u munge munged

# The cron daemon is another useful critter...
# * || check_ps_service -r crond

# Check for wulfd but don't manage it.
# * || check_ps_daemon wulfd root

# Make sure no users are SSH'd in, but don't kill them.
# * || check_ps_blacklist sshd '!root'

# Flag and kill any processes which are owned by unauthorized users.
# * || check_ps_unauth_users log syslog kill

# Flag any user processes not properly parented.
# * || check_ps_userproc_lineage log syslog

# Most systems also need NFS locking services.
# * || check_ps_service -d rpc.statd -r nfslock

# The audit daemon can sometimes disappear if things get hairy.
# * || check_ps_service -r auditd

# This is only valid for RHEL6 and similar/newer systems.
# * || check_ps_service -d rsyslogd -r rsyslog

# In the case of MySQL, it's typically better to cycle.
# * || check_ps_service -c mysqld

# Double your core count is a good rule of thumb for load average max.
# * || check_ps_loadavg 24
# This should work if you place it after one of the check_hw_*() checks.
# * || check_ps_loadavg $((2*HW_CORES))


#######################################################################
###
### Other checks
###
# Check to verify that SELinux is disabled. (Remove the "-r 1" to verify it's enabled.)
# * || check_cmd_status -t 1 -r 1 selinuxenabled

# Verify settings for an Ethernet interface.
# * || check_cmd_output -m '/addr:10\.0\.0\.1/' -m '/Bcast:10\.0\.0\.255/' -m '/Mask:255\.255\.255\.0/' -m '/^[[:space:]]*UP /' /sbin/ifconfig eth3

# nVidia HealthMon GPU health checks (requires Tesla Development Kit)
# * || check_nv_healthmon

0 comments on commit ebc7f2f

Please sign in to comment.