From a5a84c487ad82d8343bb00a8595b4aa28c7c6b32 Mon Sep 17 00:00:00 2001 From: Dominic Sloan-Murphy Date: Fri, 20 Dec 2024 17:04:18 +0000 Subject: [PATCH 01/15] chore(all): bump `slurmutils` to `0.11.0` and add `jsonschema` dependency. --- charms/sackd/charmcraft.yaml | 1 + charms/slurmctld/charmcraft.yaml | 1 + charms/slurmctld/requirements.txt | 2 +- charms/slurmd/charmcraft.yaml | 1 + charms/slurmdbd/charmcraft.yaml | 1 + charms/slurmdbd/requirements.txt | 2 +- charms/slurmrestd/charmcraft.yaml | 1 + charms/slurmrestd/requirements.txt | 2 +- test-requirements.txt | 2 +- 9 files changed, 9 insertions(+), 4 deletions(-) diff --git a/charms/sackd/charmcraft.yaml b/charms/sackd/charmcraft.yaml index 67bc0a1..21e30ca 100644 --- a/charms/sackd/charmcraft.yaml +++ b/charms/sackd/charmcraft.yaml @@ -34,6 +34,7 @@ parts: charm: charm-binary-python-packages: - cryptography ~= 44.0.0 + - jsonschema ~= 4.23.0 provides: slurmctld: diff --git a/charms/slurmctld/charmcraft.yaml b/charms/slurmctld/charmcraft.yaml index 0a0fc37..6909be7 100644 --- a/charms/slurmctld/charmcraft.yaml +++ b/charms/slurmctld/charmcraft.yaml @@ -54,6 +54,7 @@ parts: charm: charm-binary-python-packages: - cryptography ~= 44.0.0 + - jsonschema ~= 4.23.0 - pydantic config: diff --git a/charms/slurmctld/requirements.txt b/charms/slurmctld/requirements.txt index 15118f6..71fdf6b 100644 --- a/charms/slurmctld/requirements.txt +++ b/charms/slurmctld/requirements.txt @@ -1,2 +1,2 @@ ops==2.17.1 -slurmutils~=0.9.0 +slurmutils~=0.11.0 diff --git a/charms/slurmd/charmcraft.yaml b/charms/slurmd/charmcraft.yaml index 047d315..218d5e6 100644 --- a/charms/slurmd/charmcraft.yaml +++ b/charms/slurmd/charmcraft.yaml @@ -32,6 +32,7 @@ parts: charm: charm-binary-python-packages: - cryptography ~= 44.0.0 + - jsonschema ~= 4.23.0 nhc: plugin: nil build-packages: diff --git a/charms/slurmdbd/charmcraft.yaml b/charms/slurmdbd/charmcraft.yaml index 53a9cd9..0891094 100644 --- a/charms/slurmdbd/charmcraft.yaml +++ b/charms/slurmdbd/charmcraft.yaml @@ -34,6 +34,7 @@ parts: charm: charm-binary-python-packages: - cryptography ~= 44.0.0 + - jsonschema ~= 4.23.0 requires: database: diff --git a/charms/slurmdbd/requirements.txt b/charms/slurmdbd/requirements.txt index 15118f6..71fdf6b 100644 --- a/charms/slurmdbd/requirements.txt +++ b/charms/slurmdbd/requirements.txt @@ -1,2 +1,2 @@ ops==2.17.1 -slurmutils~=0.9.0 +slurmutils~=0.11.0 diff --git a/charms/slurmrestd/charmcraft.yaml b/charms/slurmrestd/charmcraft.yaml index fc900a1..4784aaa 100644 --- a/charms/slurmrestd/charmcraft.yaml +++ b/charms/slurmrestd/charmcraft.yaml @@ -31,6 +31,7 @@ parts: charm: charm-binary-python-packages: - cryptography ~= 44.0.0 + - jsonschema ~= 4.23.0 provides: slurmctld: diff --git a/charms/slurmrestd/requirements.txt b/charms/slurmrestd/requirements.txt index 15118f6..71fdf6b 100644 --- a/charms/slurmrestd/requirements.txt +++ b/charms/slurmrestd/requirements.txt @@ -1,2 +1,2 @@ ops==2.17.1 -slurmutils~=0.9.0 +slurmutils~=0.11.0 diff --git a/test-requirements.txt b/test-requirements.txt index 8a020ee..25a39a2 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -3,7 +3,7 @@ cryptography~=43.0.1 distro==1.9.0 python-dotenv~=1.0.1 pycryptodome==3.20.0 -slurmutils~=0.9.0 +slurmutils~=0.11.0 dbus-fast>=1.90.2 pyfakefs==5.7.1 From cd248fe2352a357e8ed79181f67f6d83a95df52e Mon Sep 17 00:00:00 2001 From: Dominic Sloan-Murphy Date: Fri, 20 Dec 2024 17:06:17 +0000 Subject: [PATCH 02/15] feat(slurmd): add GPU detection and driver installation. 
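
Editor's note, as an illustrative sketch only (node and model names below are
hypothetical, not part of the patch): with this change, a compute node with two
Tesla T4 cards at /dev/nvidia0 and /dev/nvidia1 would be expected to publish
node data over the slurmctld relation roughly of the form

    "node_parameters": {"NodeName": "node1", ..., "Gres": ["gpu:tesla_t4:2"]}
    "gres": [{"Name": "gpu", "Type": "tesla_t4", "File": "/dev/nvidia[0-1]"}]

matching the unit-test fixtures added later in this series. Model names are
lowercased with whitespace replaced by underscores ("Tesla T4" -> "tesla_t4"),
following the Slurm GRES autodetect convention, and subsequent patches have
slurmctld render this data into slurm.conf (Gres=gpu:tesla_t4:2) and gres.conf
(NodeName=node1 Name=gpu Type=tesla_t4 File=/dev/nvidia[0-1]) entries.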
--- charms/slurmd/src/charm.py | 81 ++++++++++++- charms/slurmd/src/utils/gpu.py | 157 +++++++++++++++++++++++++ charms/slurmd/src/utils/machine.py | 14 ++- charms/slurmd/tests/unit/test_charm.py | 1 + 4 files changed, 247 insertions(+), 6 deletions(-) create mode 100644 charms/slurmd/src/utils/gpu.py diff --git a/charms/slurmd/src/charm.py b/charms/slurmd/src/charm.py index c44837a..adde014 100755 --- a/charms/slurmd/src/charm.py +++ b/charms/slurmd/src/charm.py @@ -4,7 +4,9 @@ """Slurmd Operator Charm.""" +import itertools import logging +from pathlib import Path from typing import Any, Dict, cast from interface_slurmctld import Slurmctld, SlurmctldAvailableEvent @@ -21,7 +23,7 @@ main, ) from slurmutils.models.option import NodeOptionSet, PartitionOptionSet -from utils import machine, nhc, service +from utils import gpu, machine, nhc, service from charms.hpc_libs.v0.slurm_ops import SlurmdManager, SlurmOpsError from charms.operator_libs_linux.v0.juju_systemd_notices import ( # type: ignore[import-untyped] @@ -79,6 +81,7 @@ def _on_install(self, event: InstallEvent) -> None: try: self._slurmd.install() nhc.install() + gpu.autoinstall() self.unit.set_workload_version(self._slurmd.version()) # TODO: https://github.com/orgs/charmed-hpc/discussions/10 - # Evaluate if we should continue doing the service override here @@ -92,6 +95,7 @@ def _on_install(self, event: InstallEvent) -> None: event.defer() self._check_status() + self._reboot_if_required() def _on_config_changed(self, _: ConfigChangedEvent) -> None: """Handle charm configuration changes.""" @@ -214,7 +218,7 @@ def _on_show_nhc_config(self, event: ActionEvent) -> None: event.set_results({"nhc.conf": "/etc/nhc/nhc.conf not found."}) def _on_node_config_action_event(self, event: ActionEvent) -> None: - """Get or set the user_supplied_node_conifg. + """Get or set the user_supplied_node_config. Return the node config if the `node-config` parameter is not specified, otherwise parse, validate, and store the input of the `node-config` parameter in stored state. @@ -321,15 +325,86 @@ def _check_status(self) -> bool: return True + def _reboot_if_required(self) -> None: + """Perform a reboot of the unit if required, e.g. following a driver installation.""" + if Path("/var/run/reboot-required").exists(): + logger.info("unit rebooting") + self.unit.reboot() + + @staticmethod + def _ranges_and_strides(nums) -> str: + """Return ranges and strides for given iterable. + + Requires input elements to be unique and sorted ascending. + + Returns: + A square-bracketed string with comma-separated ranges of consecutive values. + + example_input = [0,1,2,3,4,5,6,8,9,10,12,14,15,16,18] + example_output = '[0-6,8-10,12,14-16,18]' + """ + out = "[" + + # The input is enumerate()-ed to produce a list of tuples of the elements and their indices. + # groupby() uses the lambda key function to group these tuples by the difference between the element and index. + # Consecutive values have equal difference between element and index, so are grouped together. + # Hence, the elements of the first and last members of each group give the range of consecutive values. + # If the group has only a single member, there are no consecutive values either side of it (a "stride"). + for _, group in itertools.groupby(enumerate(nums), lambda elems: elems[1] - elems[0]): + group = list(group) + + if len(group) == 1: + # Single member, this is a stride. + out += f"{group[0][1]}," + else: + # Range of consecutive values is first-last in group. 
+ out += f"{group[0][1]}-{group[-1][1]}," + + out = out.rstrip(",") + "]" + return out + def get_node(self) -> Dict[Any, Any]: """Get the node from stored state.""" + slurmd_info = machine.get_slurmd_info() + + # Get GPU info and build GRES configuration. + gres_info = [] + if gpus := gpu.get_gpus(): + for model, devices in gpus.items(): + # Build gres.conf line for this GPU model. + if len(devices) == 1: + device_suffix = next(iter(devices)) + else: + # For multi-gpu setups, "File" uses ranges and strides syntax, + # e.g. File=/dev/nvidia[0-3], File=/dev/nvidia[0,2-3] + device_suffix = self._ranges_and_strides(sorted(devices)) + gres_line = { + # NodeName included in node_parameters. + "Name": "gpu", + "Type": model, + "File": f"/dev/nvidia{device_suffix}", + } + # Append to list of GRES lines for all models + gres_info.append(gres_line) + + # Add to node parameters to ensure included in slurm.conf. + slurm_conf_gres = f"gpu:{model}:{len(devices)}" + try: + # Add to existing Gres line. + slurmd_info["Gres"].append(slurm_conf_gres) + except KeyError: + # Create a new Gres entry if none present + slurmd_info["Gres"] = [slurm_conf_gres] + node = { "node_parameters": { - **machine.get_slurmd_info(), + **slurmd_info, "MemSpecLimit": "1024", **self._user_supplied_node_parameters, }, "new_node": self._new_node, + # Do not include GRES configuration if no GPUs detected. + **({"gres": gres_info} if len(gres_info) > 0 else {}), } logger.debug(f"Node Configuration: {node}") return node diff --git a/charms/slurmd/src/utils/gpu.py b/charms/slurmd/src/utils/gpu.py new file mode 100644 index 0000000..c5d5ea6 --- /dev/null +++ b/charms/slurmd/src/utils/gpu.py @@ -0,0 +1,157 @@ +# Copyright 2024 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Manage GPU driver installation on compute node.""" + +import logging +from importlib import import_module + +import charms.operator_libs_linux.v0.apt as apt + +_logger = logging.getLogger(__name__) + + +class GPUInstallError(Exception): + """Exception raised when a GPU driver installation operation failed.""" + + +class GPUDriverDetector: + """Detects GPU driver and kernel packages appropriate for the current hardware.""" + + def __init__(self): + """Initialise detection attributes and interfaces.""" + # Install ubuntu-drivers tool and Python NVML bindings + pkgs = ["ubuntu-drivers-common", "python3-pynvml"] + try: + apt.add_package(pkgs, update_cache=True) + except (apt.PackageNotFoundError, apt.PackageError) as e: + raise GPUInstallError(f"failed to install {pkgs} reason: {e}") + + # ubuntu-drivers requires apt_pkg for package operations + self._detect = import_module("UbuntuDrivers.detect") + self._apt_pkg = import_module("apt_pkg") + self._apt_pkg.init() + + def _system_gpgpu_driver_packages(self) -> dict: + """Detect the available GPGPU drivers for this node.""" + return self._detect.system_gpgpu_driver_packages() + + def _get_linux_modules_metapackage(self, driver) -> str: + """Retrieve the modules metapackage for the combination of current kernel and given driver. + + e.g. linux-modules-nvidia-535-server-aws for driver nvidia-driver-535-server + """ + return self._detect.get_linux_modules_metapackage(self._apt_pkg.Cache(None), driver) + + def system_packages(self) -> list: + """Return a list of GPU drivers and kernel module packages for this node.""" + # Detect only GPGPU drivers. Not general purpose graphics drivers. + packages = self._system_gpgpu_driver_packages() + + # Gather list of driver and kernel modules to install. + install_packages = [] + for driver_package in packages: + + # Ignore drivers that are not recommended + if packages[driver_package].get("recommended"): + # Retrieve metapackage for this driver, + # e.g. nvidia-headless-no-dkms-535-server for nvidia-driver-535-server + driver_metapackage = packages[driver_package]["metapackage"] + + # Retrieve modules metapackage for combination of current kernel and recommended driver, + # e.g. linux-modules-nvidia-535-server-aws + modules_metapackage = self._get_linux_modules_metapackage(driver_package) + + # Add to list of packages to install + install_packages += [driver_metapackage, modules_metapackage] + + # TODO: do we want to check for nvidia here and add nvidia-fabricmanager-535 libnvidia-nscq-535 in case of nvlink? This is suggested as a manual step at https://documentation.ubuntu.com/server/how-to/graphics/install-nvidia-drivers/#optional-step. If so, how do we get the version number "-535" robustly? + + # TODO: what if drivers install but do not require a reboot? Should we "modprobe nvidia" manually? Just always reboot regardless? + + # Filter out any empty results as returning + return [p for p in install_packages if p] + + +def autoinstall() -> None: + """Autodetect available GPUs and install drivers. + + Raises: + GPUInstallError: Raised if error is encountered during package install. 
+ """ + _logger.info("detecting GPUs") + detector = GPUDriverDetector() + install_packages = detector.system_packages() + + if len(install_packages) == 0: + _logger.info("no GPUs detected") + return + + _logger.info(f"installing GPU driver packages: {install_packages}") + try: + apt.add_package(install_packages) + except (apt.PackageNotFoundError, apt.PackageError) as e: + raise GPUInstallError(f"failed to install packages {install_packages}. reason: {e}") + + +def get_gpus() -> dict: + """Get the GPU devices on this node. + + Returns: + A dict mapping model names to a list of device minor numbers. Model names are lowercase + with whitespace replaced by underscores. For example: + + {'tesla_t4': [0, 1], 'l40s': [2, 3]} + + represents a node with two Tesla T4 GPUs at /dev/nvidia0 and /dev/nvidia1, and two L40S + GPUs at /dev/nvidia2 and /dev/nvidia3. + """ + gpu_info = {} + + # Return immediately if pynvml not installed... + try: + pynvml = import_module("pynvml") + except ModuleNotFoundError: + return gpu_info + + # ...or Nvidia drivers not loaded. + try: + pynvml.nvmlInit() + except pynvml.NVMLError_DriverNotLoaded: + return gpu_info + + gpu_count = pynvml.nvmlDeviceGetCount() + # Loop over all detected GPUs, gathering info by model. + for i in range(gpu_count): + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + + # Make model name lowercase and replace whitespace with underscores to turn into GRES-compatible format, + # e.g. "Tesla T4" -> "tesla_t4", which can be added as "Gres=gpu:tesla_t4:1" + # Aims to follow convention set by Slurm autodetect: + # https://slurm.schedmd.com/gres.html#AutoDetect + model = pynvml.nvmlDeviceGetName(handle) + model = "_".join(model.split()).lower() + + # Number for device path, e.g. if device is /dev/nvidia0, returns 0 + minor_number = pynvml.nvmlDeviceGetMinorNumber(handle) + + try: + # Add minor number to set of existing numbers for this model. + gpu_info[model].add(minor_number) + except KeyError: + # This is the first time we've seen this model. Create a new entry. + gpu_info[model] = {minor_number} + + pynvml.nvmlShutdown() + return gpu_info diff --git a/charms/slurmd/src/utils/machine.py b/charms/slurmd/src/utils/machine.py index c5b4779..b448e84 100644 --- a/charms/slurmd/src/utils/machine.py +++ b/charms/slurmd/src/utils/machine.py @@ -16,12 +16,12 @@ import logging import subprocess -from typing import Dict +from typing import Any, Dict _logger = logging.getLogger(__name__) -def get_slurmd_info() -> Dict[str, str]: +def get_slurmd_info() -> Dict[str, Any]: """Get machine info as reported by `slurmd -C`.""" try: r = subprocess.check_output(["slurmd", "-C"], text=True).strip() @@ -29,4 +29,12 @@ def get_slurmd_info() -> Dict[str, str]: _logger.error(e) raise - return dict([opt.split("=") for opt in r.split()[:-1]]) + info = {} + for opt in r.split()[:-1]: + key, value = opt.split("=") + # Split comma-separated lists, e.g. 
Gres=gpu:model_a:1,gpu:model_b:1 + if "," in value: + info[key] = value.split(",") + else: + info[key] = value + return info diff --git a/charms/slurmd/tests/unit/test_charm.py b/charms/slurmd/tests/unit/test_charm.py index 77133be..1f9902f 100644 --- a/charms/slurmd/tests/unit/test_charm.py +++ b/charms/slurmd/tests/unit/test_charm.py @@ -50,6 +50,7 @@ def test_config_changed_success(self, defer) -> None: ) defer.assert_not_called() + @patch("utils.gpu.autoinstall") @patch("utils.nhc.install") @patch("utils.service.override_service") @patch("charms.operator_libs_linux.v0.juju_systemd_notices.SystemdNotices.subscribe") From 2c95313be5f87a16daa49268bf5e831a24fc5e98 Mon Sep 17 00:00:00 2001 From: Dominic Sloan-Murphy Date: Fri, 20 Dec 2024 17:07:56 +0000 Subject: [PATCH 03/15] feat(slurmctld): add GPU and other GRES device support. --- charms/slurmctld/src/charm.py | 58 ++++++++++++++++++++++-- charms/slurmctld/src/interface_slurmd.py | 44 +++++++++++++++++- 2 files changed, 98 insertions(+), 4 deletions(-) diff --git a/charms/slurmctld/src/charm.py b/charms/slurmctld/src/charm.py index cde9751..1920ea4 100755 --- a/charms/slurmctld/src/charm.py +++ b/charms/slurmctld/src/charm.py @@ -37,7 +37,7 @@ WaitingStatus, main, ) -from slurmutils.models import CgroupConfig, SlurmConfig +from slurmutils.models import CgroupConfig, GRESConfig, GRESNode, SlurmConfig from charms.grafana_agent.v0.cos_agent import COSAgentProvider from charms.hpc_libs.v0.is_container import is_container @@ -87,8 +87,8 @@ def __init__(self, *args): self._slurmdbd.on.slurmdbd_unavailable: self._on_slurmdbd_unavailable, self._slurmd.on.partition_available: self._on_write_slurm_conf, self._slurmd.on.partition_unavailable: self._on_write_slurm_conf, - self._slurmd.on.slurmd_available: self._on_write_slurm_conf, - self._slurmd.on.slurmd_departed: self._on_write_slurm_conf, + self._slurmd.on.slurmd_available: self._on_slurmd_available, + self._slurmd.on.slurmd_departed: self._on_slurmd_departed, self._slurmrestd.on.slurmrestd_available: self._on_slurmrestd_available, self.on.show_current_config_action: self._on_show_current_config_action, self.on.drain_action: self._on_drain_nodes_action, @@ -214,6 +214,58 @@ def _on_resume_nodes_action(self, event: ActionEvent) -> None: except subprocess.CalledProcessError as e: event.fail(message=f"Error resuming {nodes}: {e.output}") + def _on_slurmd_available(self, event: SlurmdAvailableEvent) -> None: + self._add_to_gres_conf(event) + self._on_write_slurm_conf(event) + + def _on_slurmd_departed(self, event: SlurmdDepartedEvent) -> None: + # Lack of map between departing unit and NodeName complicates removal of node from gres.conf. + # Instead, rewrite full gres.conf with data from remaining units. + self._write_gres_conf(event) + self._on_write_slurm_conf(event) + + def _add_to_gres_conf(self, event: SlurmdAvailableEvent) -> None: + """Write new nodes to gres.conf configuration file for Generic Resource scheduling.""" + # This function does not perform an "scontrol reconfigure". It is expected + # _on_write_slurm_conf() is called immediately following to do this. + + # Only the leader should write the config. 
+ if not self.model.unit.is_leader(): + return + + if not self._check_status(): + event.defer() + return + + if gres_info := event.gres_info: + # Build list of GRESNodes expected by slurmutils + gres_nodes = [] + for resource in gres_info: + node = GRESNode(NodeName=str(event.node_name), **resource) + gres_nodes.append(node) + + # Update gres.conf + with self._slurmctld.gres.edit() as config: + config.nodes[event.node_name] = gres_nodes + + def _write_gres_conf(self, event: SlurmdDepartedEvent) -> None: + """Write out current gres.conf configuration file for Generic Resource scheduling.""" + # This function does not perform an "scontrol reconfigure". It is expected + # _on_write_slurm_conf() is called immediately following to do this. + + # Only the leader should write the config. + if not self.model.unit.is_leader(): + return + + if not self._check_status(): + event.defer() + return + + # Get current GRES state for all available nodes and write to gres.conf. + gres_all_nodes = self._slurmd.get_gres() + gres_conf = GRESConfig(Nodes=gres_all_nodes) + self._slurmctld.gres.dump(gres_conf) + def _on_write_slurm_conf( self, event: Union[ diff --git a/charms/slurmctld/src/interface_slurmd.py b/charms/slurmctld/src/interface_slurmd.py index 84dcaae..e92abc0 100644 --- a/charms/slurmctld/src/interface_slurmd.py +++ b/charms/slurmctld/src/interface_slurmd.py @@ -31,6 +31,23 @@ class PartitionUnavailableEvent(EventBase): class SlurmdAvailableEvent(EventBase): """Emitted when the slurmd unit joins the relation.""" + def __init__(self, handle, node_name, gres_info=None): + super().__init__(handle) + self.node_name = node_name + self.gres_info = gres_info + + def snapshot(self): + """Snapshot the event data.""" + return { + "node_name": self.node_name, + "gres_info": self.gres_info, + } + + def restore(self, snapshot): + """Restore the snapshot of the event data.""" + self.node_name = snapshot.get("node_name") + self.gres_info = snapshot.get("gres_info") + class SlurmdDepartedEvent(EventBase): """Emitted when one slurmd departs.""" @@ -124,7 +141,10 @@ def _on_relation_changed(self, event: RelationChangedEvent) -> None: if node_config := node.get("node_parameters"): if node_name := node_config.get("NodeName"): self._charm.new_nodes = list(set(self._charm.new_nodes + [node_name])) - self.on.slurmd_available.emit() + self.on.slurmd_available.emit( + node_name=node_name, gres_info=node.get("gres") + ) + logger.debug(f"_on_relation_changed node_config = {node_config}") else: logger.debug(f"`node` data does not exist for unit: {unit}.") else: @@ -245,3 +265,25 @@ def get_new_nodes_and_nodes_and_partitions(self) -> Dict[str, Any]: else [] ) return {"DownNodes": new_node_down_nodes, "Nodes": nodes, "Partitions": partitions} + + def get_gres(self) -> Dict[str, Any]: + """Return GRES configuration for all currently related compute nodes.""" + # Loop over all relation units, gathering GRES info. + gres_info = {} + if relations := self.framework.model.relations.get(self._relation_name): + for relation in relations: + for unit in relation.units: + + if node := self._get_node_from_relation(relation, unit): + # Ignore nodes without GRES devices + if (gres := node.get("gres")) and ( + node_config := node.get("node_parameters") + ): + + node_name = node_config["NodeName"] + # slurmutils expects NodeName in values. 
+ for device in gres: + device["NodeName"] = node_name + gres_info[node_name] = gres + + return gres_info From 1bcfa0fbbfed9bf7cef84dc34fe6fc850e34d51e Mon Sep 17 00:00:00 2001 From: Dominic Sloan-Murphy Date: Mon, 6 Jan 2025 14:43:31 +0000 Subject: [PATCH 04/15] tests(slurmd): add unit tests for GPU util --- charms/slurmd/src/utils/gpu.py | 11 +++-- charms/slurmd/tests/unit/test_charm.py | 56 +++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 5 deletions(-) diff --git a/charms/slurmd/src/utils/gpu.py b/charms/slurmd/src/utils/gpu.py index c5d5ea6..8ba98c2 100644 --- a/charms/slurmd/src/utils/gpu.py +++ b/charms/slurmd/src/utils/gpu.py @@ -39,8 +39,8 @@ def __init__(self): raise GPUInstallError(f"failed to install {pkgs} reason: {e}") # ubuntu-drivers requires apt_pkg for package operations - self._detect = import_module("UbuntuDrivers.detect") - self._apt_pkg = import_module("apt_pkg") + self._detect = _import("UbuntuDrivers.detect") + self._apt_pkg = _import("apt_pkg") self._apt_pkg.init() def _system_gpgpu_driver_packages(self) -> dict: @@ -121,7 +121,7 @@ def get_gpus() -> dict: # Return immediately if pynvml not installed... try: - pynvml = import_module("pynvml") + pynvml = _import("pynvml") except ModuleNotFoundError: return gpu_info @@ -155,3 +155,8 @@ def get_gpus() -> dict: pynvml.nvmlShutdown() return gpu_info + + +def _import(module): + """Wrap programmatic import of packages.""" + return import_module(module) diff --git a/charms/slurmd/tests/unit/test_charm.py b/charms/slurmd/tests/unit/test_charm.py index 1f9902f..5165d33 100644 --- a/charms/slurmd/tests/unit/test_charm.py +++ b/charms/slurmd/tests/unit/test_charm.py @@ -50,17 +50,30 @@ def test_config_changed_success(self, defer) -> None: ) defer.assert_not_called() - @patch("utils.gpu.autoinstall") @patch("utils.nhc.install") @patch("utils.service.override_service") @patch("charms.operator_libs_linux.v0.juju_systemd_notices.SystemdNotices.subscribe") + @patch("utils.gpu._import") + @patch("charms.operator_libs_linux.v0.apt.add_package") @patch("ops.framework.EventBase.defer") - def test_install_success(self, defer, *_) -> None: + def test_install_success(self, defer, apt_mock, import_mock, *_) -> None: """Test install success behavior.""" self.harness.charm._slurmd.install = Mock() self.harness.charm._slurmd.version = Mock(return_value="24.05.2-1") + + # GPU detection test setup + detect_mock = Mock() + metapackage = "headless-no-dkms-535-server" + linux_modules = "linux-modules-535-server" + detect_mock.system_gpgpu_driver_packages.return_value = { + "driver-535-server": {"recommended": True, "metapackage": metapackage} + } + detect_mock.get_linux_modules_metapackage.return_value = linux_modules + import_mock.return_value = detect_mock + self.harness.charm.on.install.emit() + apt_mock.assert_called_with([metapackage, linux_modules]) self.assertTrue(self.harness.charm._stored.slurm_installed) defer.assert_not_called() @@ -89,6 +102,45 @@ def test_service_slurmd_stopped(self) -> None: self.harness.charm.on.service_slurmd_stopped.emit() self.assertEqual(self.harness.charm.unit.status, BlockedStatus("slurmd not running")) + @patch("utils.machine.get_slurmd_info") + @patch("utils.gpu._import") + def test_slurmctld_on_relation_created(self, import_mock, machine_mock) -> None: + """Test slurmctld relation create behavior.""" + # Compute node mock data + node = { + "NodeName": "node1", + "CPUs": "16", + "Boards": "1", + "SocketsPerBoard": "1", + "CoresPerSocket": "8", + "ThreadsPerCore": "2", + "RealMemory": "31848", + 
} + machine_mock.return_value = node + + # GPU mock data + pynvml_mock = Mock() + pynvml_mock.nvmlDeviceGetCount.return_value = 4 + pynvml_mock.nvmlDeviceGetName.side_effect = ["Tesla T4", "Tesla T4", "L40S", "L40S"] + pynvml_mock.nvmlDeviceGetMinorNumber.side_effect = [0, 1, 2, 3] + import_mock.return_value = pynvml_mock + + relation_id = self.harness.add_relation("slurmctld", "slurmd") + + expected = ( + '{"node_parameters": {' + '"NodeName": "node1", "CPUs": "16", "Boards": "1", ' + '"SocketsPerBoard": "1", "CoresPerSocket": "8", ' + '"ThreadsPerCore": "2", "RealMemory": "31848", ' + '"Gres": ["gpu:tesla_t4:2", "gpu:l40s:2"], "MemSpecLimit": "1024"}, ' + '"new_node": true, ' + '"gres": [' + '{"Name": "gpu", "Type": "tesla_t4", "File": "/dev/nvidia[0-1]"}, ' + '{"Name": "gpu", "Type": "l40s", "File": "/dev/nvidia[2-3]"}' + "]}" + ) + self.assertEqual(self.harness.get_relation_data(relation_id, "slurmd/0")["node"], expected) + @patch("interface_slurmctld.Slurmctld.is_joined", new_callable=PropertyMock(return_value=True)) def test_update_status_success(self, *_) -> None: """Test `UpdateStateEvent` hook success.""" From 8fbf7150572607471016ee97636c4452fb5cd1ec Mon Sep 17 00:00:00 2001 From: Dominic Sloan-Murphy Date: Tue, 7 Jan 2025 10:57:07 +0000 Subject: [PATCH 05/15] feat(slurmd): account for Juju auto-upgrading base image Adds reboot check as first step in install hook in case of a pending kernel update. --- charms/slurmd/src/charm.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/charms/slurmd/src/charm.py b/charms/slurmd/src/charm.py index adde014..d5f784e 100755 --- a/charms/slurmd/src/charm.py +++ b/charms/slurmd/src/charm.py @@ -76,6 +76,12 @@ def __init__(self, *args, **kwargs): def _on_install(self, event: InstallEvent) -> None: """Perform installation operations for slurmd.""" + # Account for case where base image has been auto-upgraded by Juju and a reboot is pending + # before charm code runs. Reboot "now", before the current hook completes, and restart the + # hook after reboot. Prevents issues such as drivers/kernel modules being installed for a + # running kernel pending replacement by a newer version on reboot. + self._reboot_if_required(now=True) + self.unit.status = WaitingStatus("installing slurmd") try: @@ -325,11 +331,11 @@ def _check_status(self) -> bool: return True - def _reboot_if_required(self) -> None: + def _reboot_if_required(self, now: bool = False) -> None: """Perform a reboot of the unit if required, e.g. 
following a driver installation.""" if Path("/var/run/reboot-required").exists(): logger.info("unit rebooting") - self.unit.reboot() + self.unit.reboot(now) @staticmethod def _ranges_and_strides(nums) -> str: From 94c1ce3a73d3c0920f0a946fb43de664fa610689 Mon Sep 17 00:00:00 2001 From: Dominic Sloan-Murphy Date: Tue, 7 Jan 2025 17:03:11 +0000 Subject: [PATCH 06/15] refactor(slurmd): improve GPU detection comments and log messages --- charms/slurmd/src/utils/gpu.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/charms/slurmd/src/utils/gpu.py b/charms/slurmd/src/utils/gpu.py index 8ba98c2..7b02759 100644 --- a/charms/slurmd/src/utils/gpu.py +++ b/charms/slurmd/src/utils/gpu.py @@ -30,7 +30,7 @@ class GPUDriverDetector: """Detects GPU driver and kernel packages appropriate for the current hardware.""" def __init__(self): - """Initialise detection attributes and interfaces.""" + """Initialize detection attributes and interfaces.""" # Install ubuntu-drivers tool and Python NVML bindings pkgs = ["ubuntu-drivers-common", "python3-pynvml"] try: @@ -90,12 +90,12 @@ def autoinstall() -> None: Raises: GPUInstallError: Raised if error is encountered during package install. """ - _logger.info("detecting GPUs") + _logger.info("detecting GPUs and installing drivers") detector = GPUDriverDetector() install_packages = detector.system_packages() if len(install_packages) == 0: - _logger.info("no GPUs detected") + _logger.info("no GPU drivers requiring installation") return _logger.info(f"installing GPU driver packages: {install_packages}") @@ -123,12 +123,15 @@ def get_gpus() -> dict: try: pynvml = _import("pynvml") except ModuleNotFoundError: + _logger.info("cannot gather GPU info: pynvml module not installed") return gpu_info # ...or Nvidia drivers not loaded. 
try: pynvml.nvmlInit() - except pynvml.NVMLError_DriverNotLoaded: + except pynvml.NVMLError as e: + _logger.info("no GPU info gathered: drivers cannot be detected") + _logger.debug(f"NVML init failed with reason: {e}") return gpu_info gpu_count = pynvml.nvmlDeviceGetCount() From 55754400c604f47fb0043d1ef9309414bf6db931 Mon Sep 17 00:00:00 2001 From: Dominic Sloan-Murphy Date: Tue, 7 Jan 2025 18:31:13 +0000 Subject: [PATCH 07/15] chore(all): update `slurm_ops` to 0.12 and `slurmutils` requirement to `<1.0.0,>=0.11.0` --- charms/slurmctld/requirements.txt | 2 +- charms/slurmdbd/requirements.txt | 2 +- charms/slurmrestd/requirements.txt | 2 +- external/lib/charms/hpc_libs/v0/slurm_ops.py | 83 ++++++++++++++------ test-requirements.txt | 2 +- 5 files changed, 63 insertions(+), 28 deletions(-) diff --git a/charms/slurmctld/requirements.txt b/charms/slurmctld/requirements.txt index 71fdf6b..d70c278 100644 --- a/charms/slurmctld/requirements.txt +++ b/charms/slurmctld/requirements.txt @@ -1,2 +1,2 @@ ops==2.17.1 -slurmutils~=0.11.0 +slurmutils<1.0.0,>=0.11.0 diff --git a/charms/slurmdbd/requirements.txt b/charms/slurmdbd/requirements.txt index 71fdf6b..d70c278 100644 --- a/charms/slurmdbd/requirements.txt +++ b/charms/slurmdbd/requirements.txt @@ -1,2 +1,2 @@ ops==2.17.1 -slurmutils~=0.11.0 +slurmutils<1.0.0,>=0.11.0 diff --git a/charms/slurmrestd/requirements.txt b/charms/slurmrestd/requirements.txt index 71fdf6b..d70c278 100644 --- a/charms/slurmrestd/requirements.txt +++ b/charms/slurmrestd/requirements.txt @@ -1,2 +1,2 @@ ops==2.17.1 -slurmutils~=0.11.0 +slurmutils<1.0.0,>=0.11.0 diff --git a/external/lib/charms/hpc_libs/v0/slurm_ops.py b/external/lib/charms/hpc_libs/v0/slurm_ops.py index 5652db6..1bc8729 100644 --- a/external/lib/charms/hpc_libs/v0/slurm_ops.py +++ b/external/lib/charms/hpc_libs/v0/slurm_ops.py @@ -77,8 +77,20 @@ def _on_install(self, _) -> None: import yaml from cryptography.hazmat.primitives import serialization from cryptography.hazmat.primitives.asymmetric import rsa -from slurmutils.editors import acctgatherconfig, cgroupconfig, slurmconfig, slurmdbdconfig -from slurmutils.models import AcctGatherConfig, CgroupConfig, SlurmConfig, SlurmdbdConfig +from slurmutils.editors import ( + acctgatherconfig, + cgroupconfig, + gresconfig, + slurmconfig, + slurmdbdconfig, +) +from slurmutils.models import ( + AcctGatherConfig, + CgroupConfig, + GRESConfig, + SlurmConfig, + SlurmdbdConfig, +) try: import charms.operator_libs_linux.v0.apt as apt @@ -97,14 +109,14 @@ def _on_install(self, _) -> None: # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 10 +LIBPATCH = 12 # Charm library dependencies to fetch during `charmcraft pack`. 
PYDEPS = [ "cryptography~=44.0.0", "pyyaml>=6.0.2", "python-dotenv~=1.0.1", - "slurmutils~=0.9.0", + "slurmutils<1.0.0,>=0.11.0", "distro~=1.9.0", ] @@ -245,26 +257,6 @@ def edit(self): """Edit the current configuration file.""" -class _SlurmConfigManager(_ConfigManager): - """Control the `slurm.conf` configuration file.""" - - def load(self) -> SlurmConfig: - """Load the current `slurm.conf` configuration file.""" - return slurmconfig.load(self._config_path) - - def dump(self, config: SlurmConfig) -> None: - """Dump new configuration into `slurm.conf` configuration file.""" - slurmconfig.dump(config, self._config_path, mode=0o644, user=self._user, group=self._group) - - @contextmanager - def edit(self) -> SlurmConfig: - """Edit the current `slurm.conf` configuration file.""" - with slurmconfig.edit( - self._config_path, mode=0o644, user=self._user, group=self._group - ) as config: - yield config - - class _AcctGatherConfigManager(_ConfigManager): """Manage the `acct_gather.conf` configuration file.""" @@ -309,6 +301,46 @@ def edit(self) -> CgroupConfig: yield config +class _GRESConfigManager(_ConfigManager): + """Manage the `gres.conf` configuration file.""" + + def load(self) -> GRESConfig: + """Load the current `gres.conf` configuration files.""" + return gresconfig.load(self._config_path) + + def dump(self, config: GRESConfig) -> None: + """Dump new configuration into `gres.conf` configuration file.""" + gresconfig.dump(config, self._config_path, mode=0o644, user=self._user, group=self._group) + + @contextmanager + def edit(self) -> GRESConfig: + """Edit the current `gres.conf` configuration file.""" + with gresconfig.edit( + self._config_path, mode=0o644, user=self._user, group=self._group + ) as config: + yield config + + +class _SlurmConfigManager(_ConfigManager): + """Control the `slurm.conf` configuration file.""" + + def load(self) -> SlurmConfig: + """Load the current `slurm.conf` configuration file.""" + return slurmconfig.load(self._config_path) + + def dump(self, config: SlurmConfig) -> None: + """Dump new configuration into `slurm.conf` configuration file.""" + slurmconfig.dump(config, self._config_path, mode=0o644, user=self._user, group=self._group) + + @contextmanager + def edit(self) -> SlurmConfig: + """Edit the current `slurm.conf` configuration file.""" + with slurmconfig.edit( + self._config_path, mode=0o644, user=self._user, group=self._group + ) as config: + yield config + + class _SlurmdbdConfigManager(_ConfigManager): """Control the `slurmdbd.conf` configuration file.""" @@ -951,6 +983,9 @@ def __init__(self, *args, **kwargs) -> None: self.cgroup = _CgroupConfigManager( self._ops_manager.etc_path / "cgroup.conf", self.user, self.group ) + self.gres = _GRESConfigManager( + self._ops_manager.etc_path / "gres.conf", self.user, self.group + ) class SlurmdManager(_SlurmManagerBase): diff --git a/test-requirements.txt b/test-requirements.txt index 25a39a2..3d41039 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -3,7 +3,7 @@ cryptography~=43.0.1 distro==1.9.0 python-dotenv~=1.0.1 pycryptodome==3.20.0 -slurmutils~=0.11.0 +slurmutils<1.0.0,>=0.11.0 dbus-fast>=1.90.2 pyfakefs==5.7.1 From 5ae35e1ee101118bf118dbc8fb8f3c923009f57a Mon Sep 17 00:00:00 2001 From: Dominic Sloan-Murphy Date: Wed, 8 Jan 2025 13:45:38 +0000 Subject: [PATCH 08/15] refactor(slurmd): improve comments and function names. Simplify try/catch blocks. 
Add better support for comma-separated values in `get_slurmd_info` --- charms/slurmd/src/charm.py | 23 +++++++---------------- charms/slurmd/src/utils/gpu.py | 30 +++++++----------------------- charms/slurmd/src/utils/machine.py | 21 ++++++++++++--------- 3 files changed, 26 insertions(+), 48 deletions(-) diff --git a/charms/slurmd/src/charm.py b/charms/slurmd/src/charm.py index d5f784e..1e02940 100755 --- a/charms/slurmd/src/charm.py +++ b/charms/slurmd/src/charm.py @@ -334,7 +334,7 @@ def _check_status(self) -> bool: def _reboot_if_required(self, now: bool = False) -> None: """Perform a reboot of the unit if required, e.g. following a driver installation.""" if Path("/var/run/reboot-required").exists(): - logger.info("unit rebooting") + logger.info("rebooting unit %s", self.unit.name) self.unit.reboot(now) @staticmethod @@ -373,34 +373,25 @@ def get_node(self) -> Dict[Any, Any]: """Get the node from stored state.""" slurmd_info = machine.get_slurmd_info() - # Get GPU info and build GRES configuration. gres_info = [] - if gpus := gpu.get_gpus(): + if gpus := gpu.get_all_gpu(): for model, devices in gpus.items(): # Build gres.conf line for this GPU model. if len(devices) == 1: device_suffix = next(iter(devices)) else: - # For multi-gpu setups, "File" uses ranges and strides syntax, - # e.g. File=/dev/nvidia[0-3], File=/dev/nvidia[0,2-3] + # Get numeric range of devices associated with this GRES resource. See: + # https://slurm.schedmd.com/gres.conf.html#OPT_File device_suffix = self._ranges_and_strides(sorted(devices)) gres_line = { - # NodeName included in node_parameters. "Name": "gpu", "Type": model, "File": f"/dev/nvidia{device_suffix}", } - # Append to list of GRES lines for all models gres_info.append(gres_line) - - # Add to node parameters to ensure included in slurm.conf. - slurm_conf_gres = f"gpu:{model}:{len(devices)}" - try: - # Add to existing Gres line. - slurmd_info["Gres"].append(slurm_conf_gres) - except KeyError: - # Create a new Gres entry if none present - slurmd_info["Gres"] = [slurm_conf_gres] + slurmd_info["Gres"] = cast(list[str], slurmd_info.get("Gres", [])) + [ + f"gpu:{model}:{len(devices)}" + ] node = { "node_parameters": { diff --git a/charms/slurmd/src/utils/gpu.py b/charms/slurmd/src/utils/gpu.py index 7b02759..9d6daf5 100644 --- a/charms/slurmd/src/utils/gpu.py +++ b/charms/slurmd/src/utils/gpu.py @@ -22,7 +22,7 @@ _logger = logging.getLogger(__name__) -class GPUInstallError(Exception): +class GPUOpsError(Exception): """Exception raised when a GPU driver installation operation failed.""" @@ -36,7 +36,7 @@ def __init__(self): try: apt.add_package(pkgs, update_cache=True) except (apt.PackageNotFoundError, apt.PackageError) as e: - raise GPUInstallError(f"failed to install {pkgs} reason: {e}") + raise GPUOpsError(f"failed to install {pkgs} reason: {e}") # ubuntu-drivers requires apt_pkg for package operations self._detect = _import("UbuntuDrivers.detect") @@ -54,9 +54,8 @@ def _get_linux_modules_metapackage(self, driver) -> str: """ return self._detect.get_linux_modules_metapackage(self._apt_pkg.Cache(None), driver) - def system_packages(self) -> list: + def system_packages(self) -> list[str]: """Return a list of GPU drivers and kernel module packages for this node.""" - # Detect only GPGPU drivers. Not general purpose graphics drivers. packages = self._system_gpgpu_driver_packages() # Gather list of driver and kernel modules to install. 
@@ -76,11 +75,6 @@ def system_packages(self) -> list: # Add to list of packages to install install_packages += [driver_metapackage, modules_metapackage] - # TODO: do we want to check for nvidia here and add nvidia-fabricmanager-535 libnvidia-nscq-535 in case of nvlink? This is suggested as a manual step at https://documentation.ubuntu.com/server/how-to/graphics/install-nvidia-drivers/#optional-step. If so, how do we get the version number "-535" robustly? - - # TODO: what if drivers install but do not require a reboot? Should we "modprobe nvidia" manually? Just always reboot regardless? - - # Filter out any empty results as returning return [p for p in install_packages if p] @@ -88,7 +82,7 @@ def autoinstall() -> None: """Autodetect available GPUs and install drivers. Raises: - GPUInstallError: Raised if error is encountered during package install. + GPUOpsError: Raised if error is encountered during package install. """ _logger.info("detecting GPUs and installing drivers") detector = GPUDriverDetector() @@ -102,10 +96,10 @@ def autoinstall() -> None: try: apt.add_package(install_packages) except (apt.PackageNotFoundError, apt.PackageError) as e: - raise GPUInstallError(f"failed to install packages {install_packages}. reason: {e}") + raise GPUOpsError(f"failed to install packages {install_packages}. reason: {e}") -def get_gpus() -> dict: +def get_all_gpu() -> dict[str, set[int]]: """Get the GPU devices on this node. Returns: @@ -119,14 +113,12 @@ def get_gpus() -> dict: """ gpu_info = {} - # Return immediately if pynvml not installed... try: pynvml = _import("pynvml") except ModuleNotFoundError: _logger.info("cannot gather GPU info: pynvml module not installed") return gpu_info - # ...or Nvidia drivers not loaded. try: pynvml.nvmlInit() except pynvml.NVMLError as e: @@ -135,7 +127,6 @@ def get_gpus() -> dict: return gpu_info gpu_count = pynvml.nvmlDeviceGetCount() - # Loop over all detected GPUs, gathering info by model. for i in range(gpu_count): handle = pynvml.nvmlDeviceGetHandleByIndex(i) @@ -146,15 +137,8 @@ def get_gpus() -> dict: model = pynvml.nvmlDeviceGetName(handle) model = "_".join(model.split()).lower() - # Number for device path, e.g. if device is /dev/nvidia0, returns 0 minor_number = pynvml.nvmlDeviceGetMinorNumber(handle) - - try: - # Add minor number to set of existing numbers for this model. - gpu_info[model].add(minor_number) - except KeyError: - # This is the first time we've seen this model. Create a new entry. - gpu_info[model] = {minor_number} + gpu_info[model] = gpu_info.get(model, set()) | {minor_number} pynvml.nvmlShutdown() return gpu_info diff --git a/charms/slurmd/src/utils/machine.py b/charms/slurmd/src/utils/machine.py index b448e84..0e9f381 100644 --- a/charms/slurmd/src/utils/machine.py +++ b/charms/slurmd/src/utils/machine.py @@ -16,13 +16,16 @@ import logging import subprocess -from typing import Any, Dict _logger = logging.getLogger(__name__) -def get_slurmd_info() -> Dict[str, Any]: - """Get machine info as reported by `slurmd -C`.""" +def get_slurmd_info() -> dict[str, str | list[str]]: + """Get machine info as reported by `slurmd -C`. + + For details see: + https://slurm.schedmd.com/slurmd.html + """ try: r = subprocess.check_output(["slurmd", "-C"], text=True).strip() except subprocess.CalledProcessError as e: @@ -31,10 +34,10 @@ def get_slurmd_info() -> Dict[str, Any]: info = {} for opt in r.split()[:-1]: - key, value = opt.split("=") - # Split comma-separated lists, e.g. 
Gres=gpu:model_a:1,gpu:model_b:1 - if "," in value: - info[key] = value.split(",") - else: - info[key] = value + k, v = opt.split("=") + if k == "Gres": + info[k] = v.split(",") + continue + + info[k] = v return info From 29e116945a3bbecb7c8279aef11342dd99d9518a Mon Sep 17 00:00:00 2001 From: Dominic Sloan-Murphy Date: Wed, 8 Jan 2025 13:45:56 +0000 Subject: [PATCH 09/15] refactor(slurmctld): improve comments and function names --- charms/slurmctld/src/charm.py | 27 ++++++++++++------------ charms/slurmctld/src/interface_slurmd.py | 5 ++--- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/charms/slurmctld/src/charm.py b/charms/slurmctld/src/charm.py index 1920ea4..337856c 100755 --- a/charms/slurmctld/src/charm.py +++ b/charms/slurmctld/src/charm.py @@ -215,7 +215,7 @@ def _on_resume_nodes_action(self, event: ActionEvent) -> None: event.fail(message=f"Error resuming {nodes}: {e.output}") def _on_slurmd_available(self, event: SlurmdAvailableEvent) -> None: - self._add_to_gres_conf(event) + self._update_gres_conf(event) self._on_write_slurm_conf(event) def _on_slurmd_departed(self, event: SlurmdDepartedEvent) -> None: @@ -224,12 +224,13 @@ def _on_slurmd_departed(self, event: SlurmdDepartedEvent) -> None: self._write_gres_conf(event) self._on_write_slurm_conf(event) - def _add_to_gres_conf(self, event: SlurmdAvailableEvent) -> None: - """Write new nodes to gres.conf configuration file for Generic Resource scheduling.""" - # This function does not perform an "scontrol reconfigure". It is expected - # _on_write_slurm_conf() is called immediately following to do this. + def _update_gres_conf(self, event: SlurmdAvailableEvent) -> None: + """Write new nodes to gres.conf configuration file for Generic Resource scheduling. - # Only the leader should write the config. + Warnings: + * This function does not perform an `scontrol reconfigure`. It is expected + that the function `_on_write_slurm_conf()` is called immediately following to do this. + """ if not self.model.unit.is_leader(): return @@ -238,22 +239,21 @@ def _add_to_gres_conf(self, event: SlurmdAvailableEvent) -> None: return if gres_info := event.gres_info: - # Build list of GRESNodes expected by slurmutils gres_nodes = [] for resource in gres_info: node = GRESNode(NodeName=str(event.node_name), **resource) gres_nodes.append(node) - # Update gres.conf with self._slurmctld.gres.edit() as config: config.nodes[event.node_name] = gres_nodes def _write_gres_conf(self, event: SlurmdDepartedEvent) -> None: - """Write out current gres.conf configuration file for Generic Resource scheduling.""" - # This function does not perform an "scontrol reconfigure". It is expected - # _on_write_slurm_conf() is called immediately following to do this. + """Write out current gres.conf configuration file for Generic Resource scheduling. - # Only the leader should write the config. + Warnings: + * This function does not perform an `scontrol reconfigure`. It is expected + that the function `_on_write_slurm_conf()` is called immediately following to do this. + """ if not self.model.unit.is_leader(): return @@ -261,8 +261,7 @@ def _write_gres_conf(self, event: SlurmdDepartedEvent) -> None: event.defer() return - # Get current GRES state for all available nodes and write to gres.conf. 
- gres_all_nodes = self._slurmd.get_gres() + gres_all_nodes = self._slurmd.get_all_gres_info() gres_conf = GRESConfig(Nodes=gres_all_nodes) self._slurmctld.gres.dump(gres_conf) diff --git a/charms/slurmctld/src/interface_slurmd.py b/charms/slurmctld/src/interface_slurmd.py index e92abc0..cf6c93e 100644 --- a/charms/slurmctld/src/interface_slurmd.py +++ b/charms/slurmctld/src/interface_slurmd.py @@ -266,9 +266,8 @@ def get_new_nodes_and_nodes_and_partitions(self) -> Dict[str, Any]: ) return {"DownNodes": new_node_down_nodes, "Nodes": nodes, "Partitions": partitions} - def get_gres(self) -> Dict[str, Any]: + def get_all_gres_info(self) -> Dict[str, Any]: """Return GRES configuration for all currently related compute nodes.""" - # Loop over all relation units, gathering GRES info. gres_info = {} if relations := self.framework.model.relations.get(self._relation_name): for relation in relations: @@ -281,7 +280,7 @@ def get_gres(self) -> Dict[str, Any]: ): node_name = node_config["NodeName"] - # slurmutils expects NodeName in values. + # Add NodeName to each GRES device to match the format required by slurmutils. for device in gres: device["NodeName"] = node_name gres_info[node_name] = gres From 738081ea88f12626e29650981d518caec26df7da Mon Sep 17 00:00:00 2001 From: Dominic Sloan-Murphy Date: Thu, 9 Jan 2025 10:49:03 +0000 Subject: [PATCH 10/15] refactor(slurmd): use list over set for GPU device numbers --- charms/slurmd/src/charm.py | 2 +- charms/slurmd/src/utils/gpu.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/charms/slurmd/src/charm.py b/charms/slurmd/src/charm.py index 1e02940..6b1b1f2 100755 --- a/charms/slurmd/src/charm.py +++ b/charms/slurmd/src/charm.py @@ -378,7 +378,7 @@ def get_node(self) -> Dict[Any, Any]: for model, devices in gpus.items(): # Build gres.conf line for this GPU model. if len(devices) == 1: - device_suffix = next(iter(devices)) + device_suffix = devices[0] else: # Get numeric range of devices associated with this GRES resource. See: # https://slurm.schedmd.com/gres.conf.html#OPT_File diff --git a/charms/slurmd/src/utils/gpu.py b/charms/slurmd/src/utils/gpu.py index 9d6daf5..43d8cc1 100644 --- a/charms/slurmd/src/utils/gpu.py +++ b/charms/slurmd/src/utils/gpu.py @@ -99,7 +99,7 @@ def autoinstall() -> None: raise GPUOpsError(f"failed to install packages {install_packages}. reason: {e}") -def get_all_gpu() -> dict[str, set[int]]: +def get_all_gpu() -> dict[str, list[int]]: """Get the GPU devices on this node. 
Returns: @@ -138,7 +138,7 @@ def get_all_gpu() -> dict[str, set[int]]: model = "_".join(model.split()).lower() minor_number = pynvml.nvmlDeviceGetMinorNumber(handle) - gpu_info[model] = gpu_info.get(model, set()) | {minor_number} + gpu_info[model] = gpu_info.get(model, []) + [minor_number] pynvml.nvmlShutdown() return gpu_info From f85e6f71267e678a795e0e6e73478ff85cf4f45e Mon Sep 17 00:00:00 2001 From: Dominic Sloan-Murphy Date: Thu, 9 Jan 2025 16:52:47 +0000 Subject: [PATCH 11/15] refactor(slurmd): use `calculate_rs` from `slurmutils` for ranges and strides calculations --- charms/slurmd/requirements.txt | 1 + charms/slurmd/src/charm.py | 36 ++-------------------------------- 2 files changed, 3 insertions(+), 34 deletions(-) diff --git a/charms/slurmd/requirements.txt b/charms/slurmd/requirements.txt index c5cedba..2b46a52 100644 --- a/charms/slurmd/requirements.txt +++ b/charms/slurmd/requirements.txt @@ -1 +1,2 @@ ops==2.17.1 +slurmutils<1.0.0,>=0.12.0 diff --git a/charms/slurmd/src/charm.py b/charms/slurmd/src/charm.py index 6b1b1f2..986f096 100755 --- a/charms/slurmd/src/charm.py +++ b/charms/slurmd/src/charm.py @@ -4,7 +4,6 @@ """Slurmd Operator Charm.""" -import itertools import logging from pathlib import Path from typing import Any, Dict, cast @@ -22,6 +21,7 @@ WaitingStatus, main, ) +from slurmutils import calculate_rs from slurmutils.models.option import NodeOptionSet, PartitionOptionSet from utils import gpu, machine, nhc, service @@ -337,38 +337,6 @@ def _reboot_if_required(self, now: bool = False) -> None: logger.info("rebooting unit %s", self.unit.name) self.unit.reboot(now) - @staticmethod - def _ranges_and_strides(nums) -> str: - """Return ranges and strides for given iterable. - - Requires input elements to be unique and sorted ascending. - - Returns: - A square-bracketed string with comma-separated ranges of consecutive values. - - example_input = [0,1,2,3,4,5,6,8,9,10,12,14,15,16,18] - example_output = '[0-6,8-10,12,14-16,18]' - """ - out = "[" - - # The input is enumerate()-ed to produce a list of tuples of the elements and their indices. - # groupby() uses the lambda key function to group these tuples by the difference between the element and index. - # Consecutive values have equal difference between element and index, so are grouped together. - # Hence, the elements of the first and last members of each group give the range of consecutive values. - # If the group has only a single member, there are no consecutive values either side of it (a "stride"). - for _, group in itertools.groupby(enumerate(nums), lambda elems: elems[1] - elems[0]): - group = list(group) - - if len(group) == 1: - # Single member, this is a stride. - out += f"{group[0][1]}," - else: - # Range of consecutive values is first-last in group. - out += f"{group[0][1]}-{group[-1][1]}," - - out = out.rstrip(",") + "]" - return out - def get_node(self) -> Dict[Any, Any]: """Get the node from stored state.""" slurmd_info = machine.get_slurmd_info() @@ -382,7 +350,7 @@ def get_node(self) -> Dict[Any, Any]: else: # Get numeric range of devices associated with this GRES resource. 
See: # https://slurm.schedmd.com/gres.conf.html#OPT_File - device_suffix = self._ranges_and_strides(sorted(devices)) + device_suffix = calculate_rs(devices) gres_line = { "Name": "gpu", "Type": model, From 20b764f74dbb34d1ed32c6d9e53a99228d576e9b Mon Sep 17 00:00:00 2001 From: Dominic Sloan-Murphy Date: Fri, 10 Jan 2025 13:42:19 +0000 Subject: [PATCH 12/15] feat(slurmd): use packaged `ubuntu-drivers-common` and `pynvml` --- charms/slurmd/charmcraft.yaml | 6 ++++ charms/slurmd/requirements.txt | 2 ++ charms/slurmd/src/utils/gpu.py | 40 +++++++++--------------- charms/slurmd/tests/unit/module_mocks.py | 39 +++++++++++++++++++++++ charms/slurmd/tests/unit/test_charm.py | 29 +++++++++-------- test-requirements.txt | 3 +- 6 files changed, 79 insertions(+), 40 deletions(-) create mode 100644 charms/slurmd/tests/unit/module_mocks.py diff --git a/charms/slurmd/charmcraft.yaml b/charms/slurmd/charmcraft.yaml index 218d5e6..5cc5fc8 100644 --- a/charms/slurmd/charmcraft.yaml +++ b/charms/slurmd/charmcraft.yaml @@ -30,6 +30,12 @@ platforms: parts: charm: + build-packages: + - git + - libdrm-dev + - libkmod-dev + - libpci-dev + - pkgconf charm-binary-python-packages: - cryptography ~= 44.0.0 - jsonschema ~= 4.23.0 diff --git a/charms/slurmd/requirements.txt b/charms/slurmd/requirements.txt index 2b46a52..bc54308 100644 --- a/charms/slurmd/requirements.txt +++ b/charms/slurmd/requirements.txt @@ -1,2 +1,4 @@ ops==2.17.1 slurmutils<1.0.0,>=0.12.0 +nvidia-ml-py==12.560.30 +git+https://github.com/canonical/ubuntu-drivers-common@554b91edfd3699625dbed90f679abb31a897b76e#egg=ubuntu-drivers-common diff --git a/charms/slurmd/src/utils/gpu.py b/charms/slurmd/src/utils/gpu.py index 43d8cc1..5589741 100644 --- a/charms/slurmd/src/utils/gpu.py +++ b/charms/slurmd/src/utils/gpu.py @@ -15,7 +15,11 @@ """Manage GPU driver installation on compute node.""" import logging -from importlib import import_module + +# ubuntu-drivers requires apt_pkg for package operations +import apt_pkg # pyright: ignore [reportMissingImports] +import pynvml +import UbuntuDrivers.detect # pyright: ignore [reportMissingImports] import charms.operator_libs_linux.v0.apt as apt @@ -25,34 +29,29 @@ class GPUOpsError(Exception): """Exception raised when a GPU driver installation operation failed.""" + @property + def message(self) -> str: + """Return message passed as argument to exception.""" + return self.args[0] + class GPUDriverDetector: """Detects GPU driver and kernel packages appropriate for the current hardware.""" def __init__(self): - """Initialize detection attributes and interfaces.""" - # Install ubuntu-drivers tool and Python NVML bindings - pkgs = ["ubuntu-drivers-common", "python3-pynvml"] - try: - apt.add_package(pkgs, update_cache=True) - except (apt.PackageNotFoundError, apt.PackageError) as e: - raise GPUOpsError(f"failed to install {pkgs} reason: {e}") - - # ubuntu-drivers requires apt_pkg for package operations - self._detect = _import("UbuntuDrivers.detect") - self._apt_pkg = _import("apt_pkg") - self._apt_pkg.init() + """Initialize detection interfaces.""" + apt_pkg.init() def _system_gpgpu_driver_packages(self) -> dict: """Detect the available GPGPU drivers for this node.""" - return self._detect.system_gpgpu_driver_packages() + return UbuntuDrivers.detect.system_gpgpu_driver_packages() def _get_linux_modules_metapackage(self, driver) -> str: """Retrieve the modules metapackage for the combination of current kernel and given driver. e.g. 
linux-modules-nvidia-535-server-aws for driver nvidia-driver-535-server """ - return self._detect.get_linux_modules_metapackage(self._apt_pkg.Cache(None), driver) + return UbuntuDrivers.detect.get_linux_modules_metapackage(apt_pkg.Cache(None), driver) def system_packages(self) -> list[str]: """Return a list of GPU drivers and kernel module packages for this node.""" @@ -113,12 +112,6 @@ def get_all_gpu() -> dict[str, list[int]]: """ gpu_info = {} - try: - pynvml = _import("pynvml") - except ModuleNotFoundError: - _logger.info("cannot gather GPU info: pynvml module not installed") - return gpu_info - try: pynvml.nvmlInit() except pynvml.NVMLError as e: @@ -142,8 +135,3 @@ def get_all_gpu() -> dict[str, list[int]]: pynvml.nvmlShutdown() return gpu_info - - -def _import(module): - """Wrap programmatic import of packages.""" - return import_module(module) diff --git a/charms/slurmd/tests/unit/module_mocks.py b/charms/slurmd/tests/unit/module_mocks.py new file mode 100644 index 0000000..b100965 --- /dev/null +++ b/charms/slurmd/tests/unit/module_mocks.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +# Copyright 2025 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mock source modules available only in production environment.""" + +import sys +import types +from unittest.mock import Mock + +module_name = "apt_pkg" +apt_pkg_mock = types.ModuleType(module_name) +sys.modules[module_name] = apt_pkg_mock +apt_pkg_mock.init = Mock(name=module_name + ".init") +apt_pkg_mock.Cache = Mock(name=module_name + ".Cache") + +module_name = "UbuntuDrivers" +drivers_mock = types.ModuleType(module_name) +detect_mock = types.ModuleType(module_name + ".detect") +sys.modules[module_name] = drivers_mock +sys.modules[module_name + ".detect"] = detect_mock +drivers_mock.detect = detect_mock +detect_mock.system_gpgpu_driver_packages = Mock( + name=module_name + ".detect.system_gpgpu_driver_packages" +) +detect_mock.get_linux_modules_metapackage = Mock( + name=module_name + ".detect.get_linux_modules_metapackage" +) diff --git a/charms/slurmd/tests/unit/test_charm.py b/charms/slurmd/tests/unit/test_charm.py index 5165d33..4056a14 100644 --- a/charms/slurmd/tests/unit/test_charm.py +++ b/charms/slurmd/tests/unit/test_charm.py @@ -15,7 +15,10 @@ """Unit tests for the slurmd operator.""" -from unittest.mock import Mock, PropertyMock, patch +from unittest.mock import Mock, PropertyMock, patch # noqa: I001 + +# Must come before SlurmdCharm import +from module_mocks import apt_pkg_mock, detect_mock # noqa: F401 from charm import SlurmdCharm from ops.model import ActiveStatus, BlockedStatus @@ -53,23 +56,20 @@ def test_config_changed_success(self, defer) -> None: @patch("utils.nhc.install") @patch("utils.service.override_service") @patch("charms.operator_libs_linux.v0.juju_systemd_notices.SystemdNotices.subscribe") - @patch("utils.gpu._import") @patch("charms.operator_libs_linux.v0.apt.add_package") @patch("ops.framework.EventBase.defer") - def test_install_success(self, defer, apt_mock, import_mock, *_) -> None: + 
def test_install_success(self, defer, apt_mock, *_) -> None: """Test install success behavior.""" self.harness.charm._slurmd.install = Mock() self.harness.charm._slurmd.version = Mock(return_value="24.05.2-1") # GPU detection test setup - detect_mock = Mock() metapackage = "headless-no-dkms-535-server" linux_modules = "linux-modules-535-server" detect_mock.system_gpgpu_driver_packages.return_value = { "driver-535-server": {"recommended": True, "metapackage": metapackage} } detect_mock.get_linux_modules_metapackage.return_value = linux_modules - import_mock.return_value = detect_mock self.harness.charm.on.install.emit() @@ -102,9 +102,14 @@ def test_service_slurmd_stopped(self) -> None: self.harness.charm.on.service_slurmd_stopped.emit() self.assertEqual(self.harness.charm.unit.status, BlockedStatus("slurmd not running")) + @patch("pynvml.nvmlShutdown") + @patch("pynvml.nvmlInit") + @patch("pynvml.nvmlDeviceGetHandleByIndex") + @patch("pynvml.nvmlDeviceGetMinorNumber") + @patch("pynvml.nvmlDeviceGetName") + @patch("pynvml.nvmlDeviceGetCount") @patch("utils.machine.get_slurmd_info") - @patch("utils.gpu._import") - def test_slurmctld_on_relation_created(self, import_mock, machine_mock) -> None: + def test_slurmctld_on_relation_created(self, machine, count, name, number, *_) -> None: """Test slurmctld relation create behavior.""" # Compute node mock data node = { @@ -116,14 +121,12 @@ def test_slurmctld_on_relation_created(self, import_mock, machine_mock) -> None: "ThreadsPerCore": "2", "RealMemory": "31848", } - machine_mock.return_value = node + machine.return_value = node # GPU mock data - pynvml_mock = Mock() - pynvml_mock.nvmlDeviceGetCount.return_value = 4 - pynvml_mock.nvmlDeviceGetName.side_effect = ["Tesla T4", "Tesla T4", "L40S", "L40S"] - pynvml_mock.nvmlDeviceGetMinorNumber.side_effect = [0, 1, 2, 3] - import_mock.return_value = pynvml_mock + count.return_value = 4 + name.side_effect = ["Tesla T4", "Tesla T4", "L40S", "L40S"] + number.side_effect = [0, 1, 2, 3] relation_id = self.harness.add_relation("slurmctld", "slurmd") diff --git a/test-requirements.txt b/test-requirements.txt index 3d41039..481d72c 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -3,9 +3,10 @@ cryptography~=43.0.1 distro==1.9.0 python-dotenv~=1.0.1 pycryptodome==3.20.0 -slurmutils<1.0.0,>=0.11.0 +slurmutils<1.0.0,>=0.12.0 dbus-fast>=1.90.2 pyfakefs==5.7.1 +nvidia-ml-py==12.560.30 # `cos` dependencies cosl From d2f5a1a7e6353b4fd6c08fc546840f4dbc3a414f Mon Sep 17 00:00:00 2001 From: Dominic Sloan-Murphy Date: Mon, 13 Jan 2025 13:00:19 +0000 Subject: [PATCH 13/15] refactor(slurmctld): improve function names and logging --- charms/slurmctld/src/charm.py | 13 +++++++++---- charms/slurmctld/src/interface_slurmd.py | 6 +++--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/charms/slurmctld/src/charm.py b/charms/slurmctld/src/charm.py index 337856c..dad1a3a 100755 --- a/charms/slurmctld/src/charm.py +++ b/charms/slurmctld/src/charm.py @@ -215,13 +215,18 @@ def _on_resume_nodes_action(self, event: ActionEvent) -> None: event.fail(message=f"Error resuming {nodes}: {e.output}") def _on_slurmd_available(self, event: SlurmdAvailableEvent) -> None: + """Triggers when a slurmd unit joins the relation.""" self._update_gres_conf(event) self._on_write_slurm_conf(event) def _on_slurmd_departed(self, event: SlurmdDepartedEvent) -> None: - # Lack of map between departing unit and NodeName complicates removal of node from gres.conf. - # Instead, rewrite full gres.conf with data from remaining units. 
- self._write_gres_conf(event) + """Triggers when a slurmd unit departs the relation. + + Notes: + Lack of map between departing unit and NodeName complicates removal of node from gres.conf. + Instead, rewrite full gres.conf with data from remaining units. + """ + self._refresh_gres_conf(event) self._on_write_slurm_conf(event) def _update_gres_conf(self, event: SlurmdAvailableEvent) -> None: @@ -247,7 +252,7 @@ def _update_gres_conf(self, event: SlurmdAvailableEvent) -> None: with self._slurmctld.gres.edit() as config: config.nodes[event.node_name] = gres_nodes - def _write_gres_conf(self, event: SlurmdDepartedEvent) -> None: + def _refresh_gres_conf(self, event: SlurmdDepartedEvent) -> None: """Write out current gres.conf configuration file for Generic Resource scheduling. Warnings: diff --git a/charms/slurmctld/src/interface_slurmd.py b/charms/slurmctld/src/interface_slurmd.py index cf6c93e..e774b1d 100644 --- a/charms/slurmctld/src/interface_slurmd.py +++ b/charms/slurmctld/src/interface_slurmd.py @@ -144,9 +144,9 @@ def _on_relation_changed(self, event: RelationChangedEvent) -> None: self.on.slurmd_available.emit( node_name=node_name, gres_info=node.get("gres") ) - logger.debug(f"_on_relation_changed node_config = {node_config}") + logger.debug("_on_relation_changed node_config = %s", node_config) else: - logger.debug(f"`node` data does not exist for unit: {unit}.") + logger.debug("`node` data does not exist for unit: %s.", unit) else: logger.debug("Unit doesn't exist on the relation.") @@ -166,7 +166,7 @@ def set_nhc_params(self, params: str) -> None: """Send NHC parameters to all slurmd.""" # juju does not allow setting empty data/strings on the relation data, # so we set it to something that behaves like empty - logger.debug(f"## set_nhc_params: {params}") + logger.debug("## set_nhc_params: %s", params) if relations := self.framework.model.relations.get(self._relation_name): for relation in relations: From 4cf8083a75bb2ae413103cc2ae36d92c42500cd2 Mon Sep 17 00:00:00 2001 From: Dominic Sloan-Murphy Date: Mon, 13 Jan 2025 13:00:58 +0000 Subject: [PATCH 14/15] refactor(slurmd): improve comments and logging --- charms/slurmd/src/utils/gpu.py | 4 ++-- charms/slurmd/src/utils/machine.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/charms/slurmd/src/utils/gpu.py b/charms/slurmd/src/utils/gpu.py index 5589741..59a1693 100644 --- a/charms/slurmd/src/utils/gpu.py +++ b/charms/slurmd/src/utils/gpu.py @@ -91,7 +91,7 @@ def autoinstall() -> None: _logger.info("no GPU drivers requiring installation") return - _logger.info(f"installing GPU driver packages: {install_packages}") + _logger.info("installing GPU driver packages: %s", install_packages) try: apt.add_package(install_packages) except (apt.PackageNotFoundError, apt.PackageError) as e: @@ -116,7 +116,7 @@ def get_all_gpu() -> dict[str, list[int]]: pynvml.nvmlInit() except pynvml.NVMLError as e: _logger.info("no GPU info gathered: drivers cannot be detected") - _logger.debug(f"NVML init failed with reason: {e}") + _logger.debug("NVML init failed with reason: %s", e) return gpu_info gpu_count = pynvml.nvmlDeviceGetCount() diff --git a/charms/slurmd/src/utils/machine.py b/charms/slurmd/src/utils/machine.py index 0e9f381..a602d44 100644 --- a/charms/slurmd/src/utils/machine.py +++ b/charms/slurmd/src/utils/machine.py @@ -23,8 +23,7 @@ def get_slurmd_info() -> dict[str, str | list[str]]: """Get machine info as reported by `slurmd -C`. 
- For details see: - https://slurm.schedmd.com/slurmd.html + For details see: https://slurm.schedmd.com/slurmd.html """ try: r = subprocess.check_output(["slurmd", "-C"], text=True).strip() @@ -40,4 +39,5 @@ def get_slurmd_info() -> dict[str, str | list[str]]: continue info[k] = v + return info From fffc9d8d40fed2b867638fa93a2741a0d781338d Mon Sep 17 00:00:00 2001 From: Dominic Sloan-Murphy Date: Mon, 13 Jan 2025 15:17:09 +0000 Subject: [PATCH 15/15] refactor(slurmctld): remove unnecessary str conversion --- charms/slurmctld/src/charm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charms/slurmctld/src/charm.py b/charms/slurmctld/src/charm.py index dad1a3a..c64482f 100755 --- a/charms/slurmctld/src/charm.py +++ b/charms/slurmctld/src/charm.py @@ -246,7 +246,7 @@ def _update_gres_conf(self, event: SlurmdAvailableEvent) -> None: if gres_info := event.gres_info: gres_nodes = [] for resource in gres_info: - node = GRESNode(NodeName=str(event.node_name), **resource) + node = GRESNode(NodeName=event.node_name, **resource) gres_nodes.append(node) with self._slurmctld.gres.edit() as config:
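
A note on the end-to-end GPU flow these patches assemble: slurmd enumerates devices through NVML and shares a model-to-minor-number mapping over the slurmctld relation, and slurmctld records each mapping in gres.conf, where the device file carries a bracketed suffix as described by the OPT_File link cited in the slurmd patch. The sketch below is illustrative only and is not charm code. It assumes nvidia-ml-py is installed and an NVIDIA driver is loaded, it restricts itself to the NVML calls already patched in the unit tests, compress_minors is a hypothetical stand-in for the calculate_rs helper the diff switches to (whose import is not shown in this excerpt), and the model-name normalization is simplified.

    #!/usr/bin/env python3
    # Illustrative sketch, not part of the charms.
    import pynvml


    def compress_minors(minors: list[int]) -> str:
        """Collapse device minor numbers into a bracketed suffix, e.g. [0-3,5]."""
        ordered = sorted(set(minors))
        if not ordered:
            return "[]"
        parts = []
        start = prev = ordered[0]
        for value in ordered[1:]:
            if value == prev + 1:
                prev = value
                continue
            parts.append(str(start) if start == prev else f"{start}-{prev}")
            start = prev = value
        parts.append(str(start) if start == prev else f"{start}-{prev}")
        return "[" + ",".join(parts) + "]"


    def gpus_by_model() -> dict[str, list[int]]:
        """Map GPU model name to the /dev/nvidia<minor> numbers present on this node."""
        info: dict[str, list[int]] = {}
        pynvml.nvmlInit()
        try:
            for index in range(pynvml.nvmlDeviceGetCount()):
                handle = pynvml.nvmlDeviceGetHandleByIndex(index)
                name = pynvml.nvmlDeviceGetName(handle)
                if isinstance(name, bytes):  # older NVML bindings return bytes
                    name = name.decode()
                # Simplified normalization; the charm's exact Type formatting is not shown here.
                model = name.lower().replace(" ", "_")
                info.setdefault(model, []).append(pynvml.nvmlDeviceGetMinorNumber(handle))
        finally:
            pynvml.nvmlShutdown()
        return info


    if __name__ == "__main__":
        for model, minors in gpus_by_model().items():
            # e.g. {'Name': 'gpu', 'Type': 'tesla_t4', 'File': '/dev/nvidia[0-1]'}
            print({"Name": "gpu", "Type": model, "File": f"/dev/nvidia{compress_minors(minors)}"})

On a node with two Tesla T4 cards at minor numbers 0 and 1 this prints a single entry with File=/dev/nvidia[0-1], which is the shape of gres_line entry the slurmd patch builds and the shape slurmctld consumes when it creates GRESNode objects in _update_gres_conf.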
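
A note on the module_mocks.py file added for the unit tests: utils/gpu.py now imports apt_pkg and UbuntuDrivers.detect at module import time, and those modules are only available on Ubuntu hosts with the driver tooling installed, so the stubs have to be registered in sys.modules before the charm is imported (hence the "Must come before SlurmdCharm import" comment in test_charm.py). A rough, self-contained illustration of the same pattern, using hypothetical names and the stated assumptions, not the repository's test code:

    # Assumes nvidia-ml-py (listed in test-requirements.txt) and the vendored charm libraries are
    # importable, and that charms/slurmd/src is on sys.path as in the unit-test environment.
    import sys
    import types
    from unittest.mock import Mock

    for missing in ("apt_pkg", "UbuntuDrivers", "UbuntuDrivers.detect"):
        sys.modules[missing] = types.ModuleType(missing)
    sys.modules["apt_pkg"].init = Mock(name="apt_pkg.init")
    sys.modules["UbuntuDrivers"].detect = sys.modules["UbuntuDrivers.detect"]

    # Deliberately placed after the stubbing: `import apt_pkg` inside utils/gpu.py now resolves
    # to the stub instead of failing on machines without the Ubuntu driver tooling.
    from utils import gpu  # noqa: E402

    gpu.GPUDriverDetector()  # __init__ calls the stubbed apt_pkg.init()

The real module_mocks.py goes further and also mocks system_gpgpu_driver_packages and get_linux_modules_metapackage, which the install-path test then programs with return values.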