From 519d9b5759f3fb979e9196fa2b54a5149173f996 Mon Sep 17 00:00:00 2001 From: "Jason C. Nucciarone" Date: Fri, 12 Jul 2024 21:39:38 -0400 Subject: [PATCH 1/6] fix: ensure that nhc tarball is properly primed within charm Needed to add the part `charm: {}` to make the charm pack correctly. jedel1043 and I found that this part declaration must be included in `charmcraft.yaml`, or it will fail to pack the charm correctly. nhc will be there, but the charm won't :'( Signed-off-by: Jason C. Nucciarone --- charmcraft.yaml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/charmcraft.yaml b/charmcraft.yaml index dac7bdb..1af8322 100644 --- a/charmcraft.yaml +++ b/charmcraft.yaml @@ -34,12 +34,16 @@ bases: architectures: [amd64] parts: - charm: + charm: {} + nhc: + plugin: nil build-packages: - wget + override-pull: | + wget https://github.com/mej/nhc/releases/download/1.4.3/lbnl-nhc-1.4.3.tar.gz override-build: | - wget https://github.com/mej/nhc/releases/download/1.4.3/lbnl-nhc-1.4.3.tar.gz - craftctl default + install -m644 -D -t $CRAFT_PART_INSTALL lbnl-nhc-1.4.3.tar.gz + craftctl default provides: slurmctld: From f0102f2d3f4ca252860cdee8a397183fab56d2d5 Mon Sep 17 00:00:00 2001 From: "Jason C. Nucciarone" Date: Fri, 12 Jul 2024 21:44:21 -0400 Subject: [PATCH 2/6] fix: handle if tar fails to extract contents nhc tarball Previously, if tar failed to extract the contents of the nhc tarball to `/tmp/nhc`, _install_nhc_from_tarball would throw an unhandled excepting that would cause the charm to bork. Now we catch if tar fails to extract the contents of the tarball, log the error output for the asministrator to read, and return False so that the slurmd operator can properly handle and install failure. Signed-off-by: Jason C. Nucciarone --- src/slurmd_ops.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/slurmd_ops.py b/src/slurmd_ops.py index dcab81b..7b3a1b8 100644 --- a/src/slurmd_ops.py +++ b/src/slurmd_ops.py @@ -173,7 +173,12 @@ def _install_nhc_from_tarball(self) -> bool: base_path.mkdir() cmd = f"tar --extract --directory {base_path} --file lbnl-nhc-1.4.3.tar.gz".split() - subprocess.run(cmd) + try: + result = subprocess.check_output(cmd, stderr=subprocess.STDOUT, text=True) + logger.debug(result) + except subprocess.CalledProcessError as e: + logger.error("failed to extract NHC using tar. reason:\n%s", e.stdout) + return False full_path = base_path / os.listdir(base_path)[0] From d00481429032f2ba935d1ed0ebad01889f224ff6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Juli=C3=A1n=20Espina?= Date: Mon, 15 Jul 2024 17:07:13 -0600 Subject: [PATCH 3/6] chore: fix CI pipeline This will always pull from `main` when testing against `slurmctld-operator`, which is arguably better when trying to do breaking changes in integrations between the charms. --- .github/workflows/ci.yaml | 11 +++++++++-- tests/integration/conftest.py | 20 -------------------- tests/integration/test_charm.py | 32 +++----------------------------- tox.ini | 2 +- 4 files changed, 13 insertions(+), 52 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index c1014b1..2d60a51 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -67,7 +67,7 @@ jobs: strategy: fail-fast: true matrix: - bases: + bases: - ubuntu@22.04 name: Integration tests (LXD) | ${{ matrix.bases }} runs-on: ubuntu-latest @@ -79,10 +79,17 @@ jobs: steps: - name: Checkout uses: actions/checkout@v4 + with: + path: main + - name: Fetch slurmctld + uses: actions/checkout@v4 + with: + repository: charmed-hpc/slurmctld-operator + path: slurmctld-operator - name: Setup operator environment uses: charmed-kubernetes/actions-operator@main with: provider: lxd juju-channel: 3.1/stable - name: Run tests - run: tox run -e integration -- --charm-base=${{ matrix.bases }} + run: cd main && tox run -e integration -- --charm-base=${{ matrix.bases }} --use-local diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 148c6d2..bb64c01 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -73,23 +73,3 @@ async def slurmctld_charm(request, ops_test: OpsTest) -> Union[str, Path]: ) return "slurmctld" - - -@pytest.fixture(scope="module") -async def slurmdbd_charm(request, ops_test: OpsTest) -> Union[str, Path]: - """Pack slurmdbd charm to use for integration tests when --use-local is specified. - - Returns: - `str` "slurmdbd" if --use-local not specified or if SLURMDBD_DIR does not exist. - """ - if request.config.option.use_local: - logger.info("Using local slurmdbd operator rather than pulling from Charmhub") - if SLURMDBD_DIR.exists(): - return await ops_test.build_charm(SLURMDBD_DIR) - else: - logger.warning( - f"{SLURMDBD_DIR} not found. " - f"Defaulting to latest/edge slurmdbd operator from Charmhub" - ) - - return "slurmdbd" diff --git a/tests/integration/test_charm.py b/tests/integration/test_charm.py index bfa1eef..2204b70 100644 --- a/tests/integration/test_charm.py +++ b/tests/integration/test_charm.py @@ -35,14 +35,12 @@ @pytest.mark.skip_if_deployed @pytest.mark.order(1) async def test_build_and_deploy( - ops_test: OpsTest, charm_base: str, slurmd_charm, slurmctld_charm, slurmdbd_charm + ops_test: OpsTest, charm_base: str, slurmd_charm, slurmctld_charm ) -> None: """Test that the slurmd charm can stabilize against slurmctld, slurmdbd and MySQL.""" - logger.info(f"Deploying {SLURMD} against {SLURMCTLD}, {SLURMDBD}, and {DATABASE}") + logger.info(f"Deploying {SLURMD} against {SLURMCTLD}") # Pack charms and download NHC resource for slurmd operator. - slurmd, slurmctld, slurmdbd = await asyncio.gather( - slurmd_charm, slurmctld_charm, slurmdbd_charm - ) + slurmd, slurmctld = await asyncio.gather(slurmd_charm, slurmctld_charm) # Deploy the test Charmed SLURM cloud. await asyncio.gather( ops_test.model.deploy( @@ -58,33 +56,9 @@ async def test_build_and_deploy( num_units=1, base=charm_base, ), - ops_test.model.deploy( - str(slurmdbd), - application_name=SLURMDBD, - channel="edge" if isinstance(slurmdbd, str) else None, - num_units=1, - base=charm_base, - ), - ops_test.model.deploy( - ROUTER, - application_name=f"{SLURMDBD}-{ROUTER}", - channel="dpe/edge", - num_units=0, - base=charm_base, - ), - ops_test.model.deploy( - DATABASE, - application_name=DATABASE, - channel="8.0/edge", - num_units=1, - base="ubuntu@22.04", - ), ) # Set relations for charmed applications. await ops_test.model.integrate(f"{SLURMD}:{SLURMCTLD}", f"{SLURMCTLD}:{SLURMD}") - await ops_test.model.integrate(f"{SLURMDBD}:{SLURMCTLD}", f"{SLURMCTLD}:{SLURMDBD}") - await ops_test.model.integrate(f"{SLURMDBD}-{ROUTER}:backend-database", f"{DATABASE}:database") - await ops_test.model.integrate(f"{SLURMDBD}:database", f"{SLURMDBD}-{ROUTER}:database") # Reduce the update status frequency to accelerate the triggering of deferred events. async with ops_test.fast_forward(): await ops_test.model.wait_for_idle(apps=[SLURMD], status="active", timeout=1000) diff --git a/tox.ini b/tox.ini index bc09f5c..fd54f39 100644 --- a/tox.ini +++ b/tox.ini @@ -78,5 +78,5 @@ commands = -s \ --tb native \ --log-cli-level=INFO \ - {[vars]tst_path}integration \ + {[vars]tst_path}/integration \ {posargs} From 4595cf3366cafdfea7dbeb7eaeebc86dd23cf8a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Juli=C3=A1n=20Espina?= Date: Wed, 17 Jul 2024 11:34:21 -0600 Subject: [PATCH 4/6] ci: test in CI against Charmhub (edge) --- .github/workflows/ci.yaml | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 2d60a51..237d7d7 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -65,12 +65,16 @@ jobs: integration-test: strategy: - fail-fast: true + fail-fast: false matrix: bases: - ubuntu@22.04 - name: Integration tests (LXD) | ${{ matrix.bases }} + local: [true, false] + name: Integration tests (LXD) ${{ matrix.local && '|' || '| Charmhub (edge) |'}} ${{ matrix.bases }} runs-on: ubuntu-latest + # Testing against Charmhub will probably yield errors when doing breaking changes, so don't + # block CI on that. + continue-on-error: ${{ !matrix.local }} needs: - inclusive-naming-check - lint @@ -83,6 +87,7 @@ jobs: path: main - name: Fetch slurmctld uses: actions/checkout@v4 + if: ${{ matrix.local }} with: repository: charmed-hpc/slurmctld-operator path: slurmctld-operator @@ -92,4 +97,7 @@ jobs: provider: lxd juju-channel: 3.1/stable - name: Run tests - run: cd main && tox run -e integration -- --charm-base=${{ matrix.bases }} --use-local + run: | + cd main && tox run -e integration -- \ + --charm-base=${{ matrix.bases }} \ + ${{ matrix.local && '--use-local' || '' }} From d842d84f3565ac06e6aeab3d263411ad69ddde7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Juli=C3=A1n=20Espina?= Date: Wed, 17 Jul 2024 13:42:21 -0600 Subject: [PATCH 5/6] ci: bump charming-actions to 2.5.0-rc This fixes the CI release job, because 2.5.0 added support for the unified charmcraft.yaml file. --- .github/workflows/release.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index dc0a8a2..f060d34 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -32,10 +32,10 @@ jobs: - name: Checkout uses: actions/checkout@v3 - name: Select charmhub channel - uses: canonical/charming-actions/channel@2.2.0 + uses: canonical/charming-actions/channel@2.5.0-rc id: channel - name: Upload charm to charmhub - uses: canonical/charming-actions/upload-charm@2.2.0 + uses: canonical/charming-actions/upload-charm@2.5.0-rc with: credentials: "${{ secrets.CHARMCRAFT_AUTH }}" github-token: "${{ secrets.GITHUB_TOKEN }}" From fb1222c6c161abd23784a97306c1148cd2244b10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Juli=C3=A1n=20Espina?= Date: Thu, 18 Jul 2024 17:23:11 -0600 Subject: [PATCH 6/6] test: fix unit tests --- lib/charms/hpc_libs/v0/slurm_ops.py | 32 +++++++++++++++++++++++++++-- tests/unit/test_charm.py | 1 + tests/unit/utils/test_slurmd.py | 12 ----------- 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/lib/charms/hpc_libs/v0/slurm_ops.py b/lib/charms/hpc_libs/v0/slurm_ops.py index d49a38a..b48f0a3 100644 --- a/lib/charms/hpc_libs/v0/slurm_ops.py +++ b/lib/charms/hpc_libs/v0/slurm_ops.py @@ -62,11 +62,13 @@ def _on_install(self, _) -> None: "ConfigurationManager", "ServiceType", "SlurmManagerBase", + "SlurmOpsError", ] import json import logging import re +import socket import subprocess from collections.abc import Mapping from enum import Enum @@ -82,7 +84,7 @@ def _on_install(self, _) -> None: # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 3 +LIBPATCH = 5 # Charm library dependencies to fetch during `charmcraft pack`. PYDEPS = ["pyyaml>=6.0.1"] @@ -131,7 +133,8 @@ def install() -> None: def version() -> str: """Get the current version of Slurm installed on the system.""" info = yaml.safe_load(_snap("info", "slurm")) - ver: str = info["installed"] + if (ver := info.get("installed")) is None: + raise SlurmOpsError("unable to retrive snap info. Ensure slurm is correctly installed") return ver.split(maxsplit=1)[0] @@ -177,6 +180,7 @@ class ServiceType(Enum): """Type of Slurm service to manage.""" MUNGED = "munged" + PROMETHEUS_EXPORTER = "slurm-prometheus-exporter" SLURMD = "slurmd" SLURMCTLD = "slurmctld" SLURMDBD = "slurmdbd" @@ -208,6 +212,17 @@ def restart(self) -> None: """Restart service.""" _snap("restart", f"slurm.{self._service.value}") + def active(self) -> bool: + """Return True if the service is active.""" + info = yaml.safe_load(_snap("info", "slurm")) + if (services := info.get("services")) is None: + raise SlurmOpsError("unable to retrive snap info. Ensure slurm is correctly installed") + + # Assume `services` contains the service, since `ServiceManager` is not exposed as a + # public interface for now. + # We don't do `"active" in state` because the word "active" is also part of "inactive" :) + return "inactive" not in services[f"slurm.{self._service.value}"] + class ConfigurationManager: """Control configuration of a Slurm component.""" @@ -271,6 +286,13 @@ def generate_key(self) -> None: _mungectl("key", "generate") +class PrometheusExporterManager(ServiceManager): + """Manage `slurm-prometheus-exporter` service operations.""" + + def __init__(self) -> None: + self._service = ServiceType.PROMETHEUS_EXPORTER + + class SlurmManagerBase(ServiceManager): """Base manager for Slurm services.""" @@ -278,3 +300,9 @@ def __init__(self, service: ServiceType) -> None: self._service = service self.config = ConfigurationManager(service.config_name) self.munge = MungeManager() + self.exporter = PrometheusExporterManager() + + @property + def hostname(self) -> str: + """The hostname where this manager is running.""" + return socket.gethostname().split(".")[0] diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 783c38a..48aeff4 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -56,6 +56,7 @@ def test_install_fail(self, _, defer) -> None: defer.assert_called() @patch("slurmd_ops.SlurmdManager.install") + @patch("slurmd_ops.SlurmdManager.version", return_value="23.11.7") @patch("pathlib.Path.read_text", return_value="v1.0.0") @patch("ops.model.Unit.set_workload_version") @patch("ops.model.Resources.fetch") diff --git a/tests/unit/utils/test_slurmd.py b/tests/unit/utils/test_slurmd.py index 548086c..9c70eb2 100644 --- a/tests/unit/utils/test_slurmd.py +++ b/tests/unit/utils/test_slurmd.py @@ -26,18 +26,6 @@ class TestSlurmd(unittest.TestCase): """Unit tests for methods in slurmd utility module.""" - @patch("charms.operator_libs_linux.v1.systemd._systemctl") - def test_start(self, _) -> None: - slurmd.start() - - @patch("charms.operator_libs_linux.v1.systemd._systemctl") - def test_stop(self, _) -> None: - slurmd.stop() - - @patch("charms.operator_libs_linux.v1.systemd._systemctl") - def test_restart(self, _) -> None: - slurmd.restart() - @patch("pathlib.Path.write_text") def test_overwrite_default(self, _) -> None: slurmd.override_default("127.0.0.1")