Enable Nvidia GPU Support #57

Merged · 15 commits · Jan 13, 2025
1 change: 1 addition & 0 deletions charms/sackd/charmcraft.yaml
@@ -34,6 +34,7 @@ parts:
charm:
charm-binary-python-packages:
- cryptography ~= 44.0.0
- jsonschema ~= 4.23.0

provides:
slurmctld:
1 change: 1 addition & 0 deletions charms/slurmctld/charmcraft.yaml
@@ -54,6 +54,7 @@ parts:
charm:
charm-binary-python-packages:
- cryptography ~= 44.0.0
- jsonschema ~= 4.23.0
- pydantic

config:
2 changes: 1 addition & 1 deletion charms/slurmctld/requirements.txt
@@ -1,2 +1,2 @@
ops==2.17.1
slurmutils~=0.9.0
slurmutils<1.0.0,>=0.11.0
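
For reference, the rewritten requirement is the explicit form of a range that admits the newer slurmutils releases carrying the GRES models used below, while still excluding a future 1.0. A quick check with the `packaging` library (not part of this PR) illustrates why the old compatible-release pin had to change:

```python
from packaging.specifiers import SpecifierSet

old = SpecifierSet("~=0.9.0")            # equivalent to >=0.9.0, ==0.9.*
new = SpecifierSet("<1.0.0,>=0.11.0")

print("0.11.0" in old)  # False: the old pin excluded the releases that ship GRESConfig/GRESNode
print("0.12.0" in new)  # True: anything from 0.11.0 up to, but not including, 1.0.0
```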
62 changes: 59 additions & 3 deletions charms/slurmctld/src/charm.py
@@ -37,7 +37,7 @@
WaitingStatus,
main,
)
from slurmutils.models import CgroupConfig, SlurmConfig
from slurmutils.models import CgroupConfig, GRESConfig, GRESNode, SlurmConfig

from charms.grafana_agent.v0.cos_agent import COSAgentProvider
from charms.hpc_libs.v0.is_container import is_container
@@ -87,8 +87,8 @@ def __init__(self, *args):
self._slurmdbd.on.slurmdbd_unavailable: self._on_slurmdbd_unavailable,
self._slurmd.on.partition_available: self._on_write_slurm_conf,
self._slurmd.on.partition_unavailable: self._on_write_slurm_conf,
self._slurmd.on.slurmd_available: self._on_write_slurm_conf,
self._slurmd.on.slurmd_departed: self._on_write_slurm_conf,
self._slurmd.on.slurmd_available: self._on_slurmd_available,
self._slurmd.on.slurmd_departed: self._on_slurmd_departed,
self._slurmrestd.on.slurmrestd_available: self._on_slurmrestd_available,
self.on.show_current_config_action: self._on_show_current_config_action,
self.on.drain_action: self._on_drain_nodes_action,
@@ -214,6 +214,62 @@ def _on_resume_nodes_action(self, event: ActionEvent) -> None:
except subprocess.CalledProcessError as e:
event.fail(message=f"Error resuming {nodes}: {e.output}")

def _on_slurmd_available(self, event: SlurmdAvailableEvent) -> None:
"""Triggers when a slurmd unit joins the relation."""
self._update_gres_conf(event)
self._on_write_slurm_conf(event)

def _on_slurmd_departed(self, event: SlurmdDepartedEvent) -> None:
"""Triggers when a slurmd unit departs the relation.

Notes:
The lack of a mapping between the departing unit and its NodeName complicates removing
a single node from gres.conf. Instead, the full gres.conf is rewritten with data from the
remaining units.
"""
self._refresh_gres_conf(event)
self._on_write_slurm_conf(event)

def _update_gres_conf(self, event: SlurmdAvailableEvent) -> None:
"""Write new nodes to gres.conf configuration file for Generic Resource scheduling.

Warnings:
* This function does not perform an `scontrol reconfigure`. It is expected
that `_on_write_slurm_conf()` is called immediately afterwards to do so.
"""
if not self.model.unit.is_leader():
return

if not self._check_status():
event.defer()
return

if gres_info := event.gres_info:
gres_nodes = []
for resource in gres_info:
node = GRESNode(NodeName=event.node_name, **resource)
gres_nodes.append(node)

with self._slurmctld.gres.edit() as config:
config.nodes[event.node_name] = gres_nodes

def _refresh_gres_conf(self, event: SlurmdDepartedEvent) -> None:
"""Write out current gres.conf configuration file for Generic Resource scheduling.

Warnings:
* This function does not perform an `scontrol reconfigure`. It is expected
that `_on_write_slurm_conf()` is called immediately afterwards to do so.
"""
if not self.model.unit.is_leader():
return

if not self._check_status():
event.defer()
return

gres_all_nodes = self._slurmd.get_all_gres_info()
gres_conf = GRESConfig(Nodes=gres_all_nodes)
self._slurmctld.gres.dump(gres_conf)

def _on_write_slurm_conf(
self,
event: Union[
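
To make the GRES flow above concrete: `_update_gres_conf()` edits gres.conf in place for a newly joined node, while `_refresh_gres_conf()` rebuilds the whole file from the remaining units. Below is a rough sketch of the structures involved, with illustrative node names and device paths; the exact text slurmutils renders into gres.conf is an assumption.

```python
from slurmutils.models import GRESConfig, GRESNode

# Shape of a single GRES device entry carried on the slurmd relation in this PR.
resource = {"Name": "gpu", "Type": "tesla_t4", "File": "/dev/nvidia0"}
node = GRESNode(NodeName="compute-0", **resource)

# Full-rewrite path (_refresh_gres_conf): one entry per remaining node.
config = GRESConfig(Nodes={"compute-0": [node]})
# self._slurmctld.gres.dump(config) is then expected to emit something along the lines of:
#   NodeName=compute-0 Name=gpu Type=tesla_t4 File=/dev/nvidia0
```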
47 changes: 44 additions & 3 deletions charms/slurmctld/src/interface_slurmd.py
@@ -31,6 +31,23 @@ class PartitionUnavailableEvent(EventBase):
class SlurmdAvailableEvent(EventBase):
"""Emitted when the slurmd unit joins the relation."""

def __init__(self, handle, node_name, gres_info=None):
super().__init__(handle)
self.node_name = node_name
self.gres_info = gres_info

def snapshot(self):
"""Snapshot the event data."""
return {
"node_name": self.node_name,
"gres_info": self.gres_info,
}

def restore(self, snapshot):
"""Restore the snapshot of the event data."""
self.node_name = snapshot.get("node_name")
self.gres_info = snapshot.get("gres_info")


class SlurmdDepartedEvent(EventBase):
"""Emitted when one slurmd departs."""
@@ -124,9 +141,12 @@ def _on_relation_changed(self, event: RelationChangedEvent) -> None:
if node_config := node.get("node_parameters"):
if node_name := node_config.get("NodeName"):
self._charm.new_nodes = list(set(self._charm.new_nodes + [node_name]))
self.on.slurmd_available.emit()
self.on.slurmd_available.emit(
node_name=node_name, gres_info=node.get("gres")
)
logger.debug("_on_relation_changed node_config = %s", node_config)
else:
logger.debug(f"`node` data does not exist for unit: {unit}.")
logger.debug("`node` data does not exist for unit: %s.", unit)
else:
logger.debug("Unit doesn't exist on the relation.")

@@ -146,7 +166,7 @@ def set_nhc_params(self, params: str) -> None:
"""Send NHC parameters to all slurmd."""
# juju does not allow setting empty data/strings on the relation data,
# so we set it to something that behaves like empty
logger.debug(f"## set_nhc_params: {params}")
logger.debug("## set_nhc_params: %s", params)

if relations := self.framework.model.relations.get(self._relation_name):
for relation in relations:
@@ -245,3 +265,24 @@ def get_new_nodes_and_nodes_and_partitions(self) -> Dict[str, Any]:
else []
)
return {"DownNodes": new_node_down_nodes, "Nodes": nodes, "Partitions": partitions}

def get_all_gres_info(self) -> Dict[str, Any]:
"""Return GRES configuration for all currently related compute nodes."""
gres_info = {}
if relations := self.framework.model.relations.get(self._relation_name):
for relation in relations:
for unit in relation.units:

if node := self._get_node_from_relation(relation, unit):
# Ignore nodes without GRES devices
if (gres := node.get("gres")) and (
node_config := node.get("node_parameters")
):

node_name = node_config["NodeName"]
# Add NodeName to each GRES device to match the format required by slurmutils.
for device in gres:
device["NodeName"] = node_name
gres_info[node_name] = gres

return gres_info
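
The `snapshot()`/`restore()` pair added to `SlurmdAvailableEvent` above is what lets the node name and GRES details survive event deferral. Here is a self-contained sketch of the general ops pattern for a custom event that carries data (illustrative code, not taken from this PR):

```python
from ops.framework import EventBase, EventSource, Object, ObjectEvents


class PingEvent(EventBase):
    """Example event carrying a single piece of data."""

    def __init__(self, handle, message=None):
        super().__init__(handle)
        self.message = message

    def snapshot(self):
        # Persisted by the framework, e.g. when the event is deferred.
        return {"message": self.message}

    def restore(self, snapshot):
        self.message = snapshot.get("message")


class PingEvents(ObjectEvents):
    ping = EventSource(PingEvent)


class Pinger(Object):
    on = PingEvents()

    def poke(self) -> None:
        # Keyword arguments to emit() are passed straight to the event's __init__,
        # mirroring self.on.slurmd_available.emit(node_name=..., gres_info=...) above.
        self.on.ping.emit(message="hello")
```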
7 changes: 7 additions & 0 deletions charms/slurmd/charmcraft.yaml
@@ -30,8 +30,15 @@ platforms:

parts:
charm:
build-packages:
- git
- libdrm-dev
- libkmod-dev
- libpci-dev
- pkgconf
charm-binary-python-packages:
- cryptography ~= 44.0.0
- jsonschema ~= 4.23.0
nhc:
plugin: nil
build-packages:
3 changes: 3 additions & 0 deletions charms/slurmd/requirements.txt
@@ -1 +1,4 @@
ops==2.17.1
slurmutils<1.0.0,>=0.12.0
nvidia-ml-py==12.560.30
git+https://github.com/canonical/ubuntu-drivers-common@554b91edfd3699625dbed90f679abb31a897b76e#egg=ubuntu-drivers-common
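
The pinned nvidia-ml-py dependency is what gives slurmd a way to enumerate GPUs; `utils/gpu.py` itself is not shown in this excerpt, so the sketch below is only a guess at the shape of `gpu.get_all_gpu()`: a mapping of GPU model to the device minor numbers of that model, which is how `get_node()` in charm.py consumes it.

```python
# Hypothetical sketch of GPU enumeration with nvidia-ml-py (pynvml); the real
# utils/gpu.py module is not part of this diff, so names and structure are assumptions.
from collections import defaultdict

import pynvml


def get_all_gpu() -> dict[str, list[int]]:
    """Map a slugified GPU model name to the minor numbers of its devices."""
    pynvml.nvmlInit()
    try:
        gpus: dict[str, list[int]] = defaultdict(list)
        for i in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = pynvml.nvmlDeviceGetName(handle)
            if isinstance(name, bytes):  # older nvidia-ml-py releases return bytes
                name = name.decode()
            model = name.lower().replace(" ", "_")
            # Minor number N corresponds to the /dev/nvidiaN device file.
            gpus[model].append(pynvml.nvmlDeviceGetMinorNumber(handle))
        return dict(gpus)
    finally:
        pynvml.nvmlShutdown()
```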
46 changes: 43 additions & 3 deletions charms/slurmd/src/charm.py
@@ -5,6 +5,7 @@
"""Slurmd Operator Charm."""

import logging
from pathlib import Path
from typing import Any, Dict, cast

from interface_slurmctld import Slurmctld, SlurmctldAvailableEvent
@@ -20,8 +21,9 @@
WaitingStatus,
main,
)
from slurmutils import calculate_rs
from slurmutils.models.option import NodeOptionSet, PartitionOptionSet
from utils import machine, nhc, service
from utils import gpu, machine, nhc, service

from charms.hpc_libs.v0.slurm_ops import SlurmdManager, SlurmOpsError
from charms.operator_libs_linux.v0.juju_systemd_notices import ( # type: ignore[import-untyped]
@@ -74,11 +76,18 @@ def __init__(self, *args, **kwargs):

def _on_install(self, event: InstallEvent) -> None:
"""Perform installation operations for slurmd."""
# Account for the case where the base image has been auto-upgraded by Juju and a reboot is
# pending before charm code runs. Reboot "now", before the current hook completes, and restart
# the hook after the reboot. This prevents issues such as drivers/kernel modules being installed
# for a running kernel that is pending replacement by a newer version on reboot.
self._reboot_if_required(now=True)

self.unit.status = WaitingStatus("installing slurmd")

try:
self._slurmd.install()
nhc.install()
gpu.autoinstall()
self.unit.set_workload_version(self._slurmd.version())
# TODO: https://github.com/orgs/charmed-hpc/discussions/10 -
# Evaluate if we should continue doing the service override here
@@ -92,6 +101,7 @@ def _on_install(self, event: InstallEvent) -> None:
event.defer()

self._check_status()
self._reboot_if_required()

def _on_config_changed(self, _: ConfigChangedEvent) -> None:
"""Handle charm configuration changes."""
@@ -214,7 +224,7 @@ def _on_show_nhc_config(self, event: ActionEvent) -> None:
event.set_results({"nhc.conf": "/etc/nhc/nhc.conf not found."})

def _on_node_config_action_event(self, event: ActionEvent) -> None:
"""Get or set the user_supplied_node_conifg.
"""Get or set the user_supplied_node_config.

Return the node config if the `node-config` parameter is not specified, otherwise
parse, validate, and store the input of the `node-config` parameter in stored state.
@@ -321,15 +331,45 @@ def _check_status(self) -> bool:

return True

def _reboot_if_required(self, now: bool = False) -> None:
"""Perform a reboot of the unit if required, e.g. following a driver installation."""
if Path("/var/run/reboot-required").exists():
logger.info("rebooting unit %s", self.unit.name)
self.unit.reboot(now)

def get_node(self) -> Dict[Any, Any]:
"""Get the node from stored state."""
slurmd_info = machine.get_slurmd_info()

gres_info = []
if gpus := gpu.get_all_gpu():
for model, devices in gpus.items():
# Build gres.conf line for this GPU model.
if len(devices) == 1:
device_suffix = devices[0]
else:
# Get numeric range of devices associated with this GRES resource. See:
# https://slurm.schedmd.com/gres.conf.html#OPT_File
device_suffix = calculate_rs(devices)
gres_line = {
"Name": "gpu",
"Type": model,
"File": f"/dev/nvidia{device_suffix}",
}
gres_info.append(gres_line)
slurmd_info["Gres"] = cast(list[str], slurmd_info.get("Gres", [])) + [
f"gpu:{model}:{len(devices)}"
]

node = {
"node_parameters": {
**machine.get_slurmd_info(),
**slurmd_info,
"MemSpecLimit": "1024",
**self._user_supplied_node_parameters,
},
"new_node": self._new_node,
# Do not include GRES configuration if no GPUs detected.
**({"gres": gres_info} if len(gres_info) > 0 else {}),
}
logger.debug(f"Node Configuration: {node}")
return node
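
The `calculate_rs()` helper imported above collapses the detected device minor numbers into the bracketed range syntax accepted by the gres.conf `File` option (see the linked Slurm docs). An illustration, with the exact return format of `calculate_rs` treated as an assumption:

```python
from slurmutils import calculate_rs

devices = [0, 1, 2, 3]
device_suffix = calculate_rs(devices)  # assumed to yield a range string such as "[0-3]"

gres_line = {"Name": "gpu", "Type": "tesla_t4", "File": f"/dev/nvidia{device_suffix}"}
# Expected to land in gres.conf as, roughly:
#   NodeName=compute-0 Name=gpu Type=tesla_t4 File=/dev/nvidia[0-3]
# while slurm.conf advertises the same node with Gres=gpu:tesla_t4:4
```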