From ccefec72da2e0820ab81353ed20976946cbd7ad0 Mon Sep 17 00:00:00 2001 From: jamesbeedy Date: Tue, 25 Jun 2024 19:47:41 +0000 Subject: [PATCH] pr feedback integration --- src/charm.py | 91 +++--- src/constants.py | 99 +++++++ src/interface_slurmd.py | 18 +- src/interface_slurmdbd.py | 10 +- src/interface_slurmrestd.py | 12 +- src/slurm_conf_editor.py | 2 +- src/slurmctld_ops.py | 541 +++++++++--------------------------- tox.ini | 8 +- 8 files changed, 311 insertions(+), 470 deletions(-) create mode 100644 src/constants.py diff --git a/src/charm.py b/src/charm.py index b93ea27..56f96b0 100755 --- a/src/charm.py +++ b/src/charm.py @@ -9,7 +9,8 @@ import subprocess from typing import Any, Dict, List, Optional, Union -from charms.fluentbit.v0.fluentbit import FluentbitClient # type: ignore +from charms.fluentbit.v0.fluentbit import FluentbitClient +from constants import CHARM_MAINTAINED_SLURM_CONF_PARAMETERS, FLUENTBIT_CONFIG, SLURM_CONF_PATH from interface_slurmd import ( PartitionAvailableEvent, PartitionUnavailableEvent, @@ -40,7 +41,7 @@ main, ) from slurm_conf_editor import slurm_conf_as_string -from slurmctld_ops import SlurmctldManager +from slurmctld_ops import SlurmctldManager, is_container logger = logging.getLogger() @@ -65,13 +66,9 @@ def __init__(self, *args): user_supplied_slurm_conf_params=str(), ) - # Fluentbit relation - self._fluentbit = FluentbitClient(self, "fluentbit") - - # SlurmctldManager - self._slurmctld_manager = SlurmctldManager(self, "slurmctld") + self._slurmctld_manager = SlurmctldManager() - # Slurm components + self._fluentbit = FluentbitClient(self, "fluentbit") self._slurmd = Slurmd(self, "slurmd") self._slurmdbd = Slurmdbd(self, "slurmdbd") self._slurmrestd = Slurmrestd(self, "slurmrestd") @@ -91,7 +88,6 @@ def __init__(self, *args): self._slurmd.on.slurmd_departed: self._on_write_slurm_conf, # slurmrestd available self._slurmrestd.on.slurmrestd_available: self._on_slurmrestd_available, - # NOTE: a second slurmctld should get the jwt/munge keys and configure them # fluentbit self.on["fluentbit"].relation_created: self._on_fluentbit_relation_created, # actions @@ -111,29 +107,43 @@ def _on_install(self, event: InstallEvent) -> None: # Store the munge_key and jwt_rsa key in the stored state. # NOTE: Use secrets instead of stored state when secrets are supported the framework. if self.model.unit.is_leader(): - self._stored.jwt_rsa = self._slurmctld_manager.generate_jwt_rsa() - self._stored.munge_key = self._slurmctld_manager.get_munge_key() - self._slurmctld_manager.write_jwt_rsa(self.get_jwt_rsa()) - self._slurmctld_manager.restart_munged() + jwt_rsa = self._slurmctld_manager.generate_jwt_rsa() + self._stored.jwt_rsa = jwt_rsa + + munge_key = self._slurmctld_manager.generate_munge_key() + self._stored.munge_key = munge_key + + self._slurmctld_manager.stop_munged() + self._slurmctld_manager.write_munge_key(munge_key) + self._slurmctld_manager.start_munged() + + self._slurmctld_manager.stop_slurmctld() + self._slurmctld_manager.write_jwt_rsa(jwt_rsa) + self._slurmctld_manager.start_slurmctld() + self.unit.set_workload_version(self._slurmctld_manager.version()) self.slurm_installed = True else: + self.unit.status = BlockedStatus("Only singleton slurmctld is supported.") logger.debug("Secondary slurmctld not supported.") event.defer() else: self.unit.status = BlockedStatus("Error installing slurmctld") + logger.error("Cannot install slurmctld, please debug.") event.defer() self._on_write_slurm_conf(event) def _on_config_changed(self, event: ConfigChangedEvent) -> None: """Perform config-changed operations.""" - charm_config_nhc_params = self.config.get("health-check-params") - if (user_supplied_nhc_params := charm_config_nhc_params) != self._stored.nhc_params: + charm_config_nhc_params = str(self.config.get("health-check-params", "")) + if (charm_config_nhc_params != self._stored.nhc_params) and ( + charm_config_nhc_params != "" + ): logger.debug("## NHC user supplied params changed, sending to slurmd.") - self._stored.nhc_params = user_supplied_nhc_params + self._stored.nhc_params = charm_config_nhc_params # Send the custom NHC parameters to all slurmd. - self._slurmd.set_nhc_params(user_supplied_nhc_params) + self._slurmd.set_nhc_params(charm_config_nhc_params) write_slurm_conf = False if charm_config_default_partition := self.config.get("default-partition"): @@ -160,15 +170,13 @@ def _on_update_status(self, event: UpdateStatusEvent) -> None: def _on_show_current_config_action(self, event: ActionEvent) -> None: """Show current slurm.conf.""" - slurm_conf = self._slurmctld_manager.slurm_conf_path.read_text() + slurm_conf = SLURM_CONF_PATH.read_text() event.set_results({"slurm.conf": slurm_conf}) def _on_fluentbit_relation_created(self, event: RelationCreatedEvent) -> None: """Set up Fluentbit log forwarding.""" logger.debug("## Configuring fluentbit") - cfg = [] - cfg.extend(self._slurmctld_manager.fluentbit_config_slurm) - self._fluentbit.configure(cfg) + self._fluentbit.configure(FLUENTBIT_CONFIG) def _on_slurmrestd_available(self, event: SlurmrestdAvailableEvent) -> None: """Check that we have slurm_config when slurmrestd available otherwise defer the event.""" @@ -241,16 +249,15 @@ def _on_write_slurm_conf( return if slurm_config := self._assemble_slurm_conf(): + self._slurmctld_manager.stop_slurmctld() self._slurmctld_manager.write_slurm_conf(slurm_config) # Write out any user_supplied_cgroup_parameters to /etc/slurm/cgroup.conf. - if user_supplied_cgroup_parameters := self.config.get("cgroup-parameters"): - self._slurmctld_manager.write_cgroup_conf(user_supplied_cgroup_parameters) + if user_supplied_cgroup_parameters := self.config.get("cgroup-parameters", ""): + self._slurmctld_manager.write_cgroup_conf(str(user_supplied_cgroup_parameters)) + + self._slurmctld_manager.start_slurmctld() - # Restart is needed if nodes are added/removed from the cluster, but since we don't - # currently have a method of identifying if nodes are being added or removed, simply - # restart every time after writing slurm.conf. - self._slurmctld_manager.restart_slurmctld() self._slurmctld_manager.slurm_cmd("scontrol", "reconfigure") # Transitioning Nodes @@ -281,29 +288,28 @@ def _on_write_slurm_conf( logger.debug("## Should write slurm.conf, but we don't have it. " "Deferring.") event.defer() - def _assemble_slurm_conf(self) -> Dict[Any, Any]: + def _assemble_slurm_conf(self) -> Dict[str, Any]: """Return the slurm.conf parameters.""" - slurmctld_manager = self._slurmctld_manager - - charm_maintained_parameters = slurmctld_manager.charm_maintained_slurm_conf_parameters() user_supplied_parameters = self._get_user_supplied_parameters() slurmd_parameters = self._slurmd.get_new_nodes_and_nodes_and_partitions() def _assemble_slurmctld_parameters() -> str: # Preprocess merging slurmctld_parameters if they exist in the context - slurmctld_param_config = charm_maintained_parameters["SlurmctldParameters"].split(",") - if user_supplied_slurmctld_parameters := user_supplied_parameters.get( + slurmctld_param_config = CHARM_MAINTAINED_SLURM_CONF_PARAMETERS[ "SlurmctldParameters" - ): - slurmctld_param_config = list( - set( - slurmctld_param_config.extend( - user_supplied_slurmctld_parameters.split(",") - ) - ) + ].split(",") + user_config = [] + + if ( + user_supplied_slurmctld_parameters := user_supplied_parameters.get( + "SlurmctldParameters", "" ) - return ",".join(slurmctld_param_config) + != "" + ): + user_config.extend(user_supplied_slurmctld_parameters.split(",")) + + return ",".join(slurmctld_param_config + user_config) accounting_params = {} if (slurmdbd_host := self._stored.slurmdbd_host) != "": @@ -319,8 +325,9 @@ def _assemble_slurmctld_parameters() -> str: "SlurmctldAddr": self._slurmd_ingress_address, "SlurmctldHost": self.hostname, "SlurmctldParameters": _assemble_slurmctld_parameters(), + "ProctrackType": "proctrack/linuxproc" if is_container() else "proctrack/cgroup", **accounting_params, - **charm_maintained_parameters, + **CHARM_MAINTAINED_SLURM_CONF_PARAMETERS, **slurmd_parameters, **user_supplied_parameters, } diff --git a/src/constants.py b/src/constants.py new file mode 100644 index 0000000..c074c34 --- /dev/null +++ b/src/constants.py @@ -0,0 +1,99 @@ +# Copyright 2024 Omnivector, LLC. +# See LICENSE file for licensing details. +"""This module provides constants for the slurmctld-operator charm.""" +from pathlib import Path + +SLURM_CONF_PATH = Path("/etc/slurm/slurm.conf") +SLURM_USER = "slurm" +SLURM_GROUP = "slurm" + +CHARM_MAINTAINED_SLURM_CONF_PARAMETERS = { + "AuthAltParameters": "jwt_key=/var/spool/slurmctldjwt_hs256.key", + "AuthAltTypes": "auth/jwt", + "AuthInfo": "/var/run/munge/munge.socket.2", + "AuthType": "auth/munge", + "GresTypes": "gpu", + "HealthCheckInterval": "600", + "HealthCheckNodeState": "ANY,CYCLE", + "HealthCheckProgram": "/usr/sbin/omni-nhc-wrapper", + "MailProg": "/usr/bin/mail.mailutils", + "PluginDir": "/usr/lib/x86_64-linux-gnu/slurm-wlm", + "PlugStackConfig": "/etc/slurm/plugstack.conf.d/plugstack.conf", + "SelectType": "select/cons_tres", + "SlurmctldPort": "6817", + "SlurmdPort": "6818", + "StateSaveLocation": "/var/spool/slurmctld", + "SlurmdSpoolDir": "/var/spool/slurmd", + "SlurmctldParameters": "enable_configless", + "SlurmctldLogFile": "/var/log/slurm/slurmctld.log", + "SlurmdLogFile": "/var/log/slurm/slurmctld.log", + "SlurmdPidFile": "/var/run/slurmd.pid", + "SlurmctldPidFile": "/var/run/slurmctld.pid", + "SlurmUser": SLURM_USER, + "SlurmdUser": "root", + "RebootProgram": '"/usr/sbin/reboot --reboot"', +} + + +FLUENTBIT_CONFIG = [ + { + "input": [ + ("name", "tail"), + ("path", "/var/log/slurm/slurmctld.log"), + ("path_key", "filename"), + ("tag", "slurmctld"), + ("parser", "slurm"), + ] + }, + { + "parser": [ + ("name", "slurm"), + ("format", "regex"), + ("regex", r"^\[(?