From 4a07c3b0064b6deec2f4ab642553678ad8349185 Mon Sep 17 00:00:00 2001 From: Riyaz Haque Date: Sat, 14 Dec 2024 09:25:39 -0800 Subject: [PATCH 01/17] Add check for single node experiment --- experiments/amg2023/experiment.py | 1 - experiments/ior/experiment.py | 1 - experiments/kripke/experiment.py | 1 - experiments/laghos/experiment.py | 12 ------------ experiments/remhos/experiment.py | 13 ------------- lib/benchpark/experiment.py | 5 +++++ modifiers/allocation/modifier.py | 6 ++++++ 7 files changed, 11 insertions(+), 28 deletions(-) diff --git a/experiments/amg2023/experiment.py b/experiments/amg2023/experiment.py index d4ef26666..399673cf6 100644 --- a/experiments/amg2023/experiment.py +++ b/experiments/amg2023/experiment.py @@ -58,7 +58,6 @@ def compute_applications_section(self): "strong": self.spec.satisfies("+strong"), "weak": self.spec.satisfies("+weak"), "throughput": self.spec.satisfies("+throughput"), - "single_node": self.spec.satisfies("+single_node"), } scaling_mode_enabled = [key for key, value in scaling_modes.items() if value] diff --git a/experiments/ior/experiment.py b/experiments/ior/experiment.py index 871e16a4f..25ddf1343 100644 --- a/experiments/ior/experiment.py +++ b/experiments/ior/experiment.py @@ -32,7 +32,6 @@ def compute_applications_section(self): scaling_modes = { "strong": self.spec.satisfies("+strong"), "weak": self.spec.satisfies("+weak"), - "single_node": self.spec.satisfies("+single_node"), } scaling_mode_enabled = [key for key, value in scaling_modes.items() if value] diff --git a/experiments/kripke/experiment.py b/experiments/kripke/experiment.py index 94d59efa2..fbe91a6a5 100644 --- a/experiments/kripke/experiment.py +++ b/experiments/kripke/experiment.py @@ -41,7 +41,6 @@ def compute_applications_section(self): "strong": self.spec.satisfies("+strong"), "weak": self.spec.satisfies("+weak"), "throughput": self.spec.satisfies("+throughput"), - "single_node": self.spec.satisfies("+single_node"), } scaling_mode_enabled = [key for key, value in scaling_modes.items() if value] diff --git a/experiments/laghos/experiment.py b/experiments/laghos/experiment.py index 68f390793..fa9249dcc 100644 --- a/experiments/laghos/experiment.py +++ b/experiments/laghos/experiment.py @@ -29,18 +29,6 @@ class Laghos( ) def compute_applications_section(self): - # TODO: Replace with conflicts clause - scaling_modes = { - "strong": self.spec.satisfies("+strong"), - "single_node": self.spec.satisfies("+single_node"), - } - - scaling_mode_enabled = [key for key, value in scaling_modes.items() if value] - if len(scaling_mode_enabled) != 1: - raise BenchparkError( - f"Only one type of scaling per experiment is allowed for application package {self.name}" - ) - # Number of initial nodes num_nodes = {"n_nodes": 1} diff --git a/experiments/remhos/experiment.py b/experiments/remhos/experiment.py index 079ecac56..8d83d0cc4 100644 --- a/experiments/remhos/experiment.py +++ b/experiments/remhos/experiment.py @@ -31,19 +31,6 @@ class Remhos( ) def compute_applications_section(self): - # TODO: Replace with conflicts clause - scaling_modes = { - "strong": self.spec.satisfies("+strong"), - "single_node": self.spec.satisfies("+single_node"), - } - - scaling_mode_enabled = [key for key, value in scaling_modes.items() if value] - if len(scaling_mode_enabled) != 1: - print(scaling_mode_enabled) - raise BenchparkError( - f"Only one type of scaling per experiment is allowed for application package {self.name}" - ) - # Number of initial nodes num_nodes = {"n_nodes": 1} diff --git a/lib/benchpark/experiment.py b/lib/benchpark/experiment.py index a37013b0c..bc66b8609 100644 --- a/lib/benchpark/experiment.py +++ b/lib/benchpark/experiment.py @@ -55,6 +55,9 @@ class SingleNode: description="Single node execution mode", ) + def run_single_node_expr(self): + return 1 if self.spec.satisfies("+single_node") else 0 + class Helper(ExperimentHelper): def get_helper_name_prefix(self): return "single_node" if self.spec.satisfies("+single_node") else "" @@ -172,6 +175,8 @@ def compute_applications_section_wrapper(self): self.compute_applications_section() + self.add_experiment_variable("run_single_node_expr", self.run_single_node_expr()) + expr_name_list = [self.name, self.workload] for cls in self.helpers: helper_prefix = cls.get_helper_name_prefix() diff --git a/modifiers/allocation/modifier.py b/modifiers/allocation/modifier.py index 373cf0d12..00fcfaf60 100644 --- a/modifiers/allocation/modifier.py +++ b/modifiers/allocation/modifier.py @@ -37,6 +37,8 @@ class AllocOpt(Enum): POST_EXEC_CMDS = 302 PRE_EXEC_CMDS = 303 + RUN_SINGLE_NODE_EXPR = 666 + @staticmethod def as_type(enumval, input): if enumval in [ @@ -291,6 +293,10 @@ def determine_allocation(self, v): ) v.n_nodes = max(cores_node_request or 0, gpus_node_request or 0) + if v.run_single_node_expr and v.n_nodes > 1: + raise ValueError(f"Experiment must run on 1 node. Requested {v.n_nodes} nodes") + + if not v.n_threads_per_proc: v.n_threads_per_proc = 1 From 040bdfb6306ee7df7988ef58d57dadd336a8498c Mon Sep 17 00:00:00 2001 From: Riyaz Haque Date: Sat, 14 Dec 2024 09:40:22 -0800 Subject: [PATCH 02/17] lint --- lib/benchpark/experiment.py | 4 +++- modifiers/allocation/modifier.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/benchpark/experiment.py b/lib/benchpark/experiment.py index 91cd6af8e..fa590bc84 100644 --- a/lib/benchpark/experiment.py +++ b/lib/benchpark/experiment.py @@ -175,7 +175,9 @@ def compute_applications_section_wrapper(self): self.compute_applications_section() - self.add_experiment_variable("run_single_node_expr", self.run_single_node_expr()) + self.add_experiment_variable( + "run_single_node_expr", self.run_single_node_expr() + ) expr_helper_list = [] for cls in self.helpers: diff --git a/modifiers/allocation/modifier.py b/modifiers/allocation/modifier.py index 00fcfaf60..cbdaf1496 100644 --- a/modifiers/allocation/modifier.py +++ b/modifiers/allocation/modifier.py @@ -294,7 +294,9 @@ def determine_allocation(self, v): v.n_nodes = max(cores_node_request or 0, gpus_node_request or 0) if v.run_single_node_expr and v.n_nodes > 1: - raise ValueError(f"Experiment must run on 1 node. Requested {v.n_nodes} nodes") + raise ValueError( + f"Experiment must run on 1 node. Requested {v.n_nodes} nodes" + ) if not v.n_threads_per_proc: From b0129bf7a73c5a11138e357b5d971b8e4170312b Mon Sep 17 00:00:00 2001 From: Riyaz Haque Date: Sat, 14 Dec 2024 09:43:32 -0800 Subject: [PATCH 03/17] lint --- modifiers/allocation/modifier.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modifiers/allocation/modifier.py b/modifiers/allocation/modifier.py index cbdaf1496..393ea60c9 100644 --- a/modifiers/allocation/modifier.py +++ b/modifiers/allocation/modifier.py @@ -298,7 +298,6 @@ def determine_allocation(self, v): f"Experiment must run on 1 node. Requested {v.n_nodes} nodes" ) - if not v.n_threads_per_proc: v.n_threads_per_proc = 1 From 4f61c947a281eb944aa95053b5fa1d7135913310 Mon Sep 17 00:00:00 2001 From: Riyaz Haque Date: Sat, 14 Dec 2024 09:54:39 -0800 Subject: [PATCH 04/17] Remove scaling mode checks in experiments --- experiments/amg2023/experiment.py | 14 -------------- experiments/ior/experiment.py | 13 ------------- experiments/kripke/experiment.py | 14 -------------- experiments/laghos/experiment.py | 1 - experiments/osu-micro-benchmarks/experiment.py | 11 ----------- experiments/remhos/experiment.py | 1 - 6 files changed, 54 deletions(-) diff --git a/experiments/amg2023/experiment.py b/experiments/amg2023/experiment.py index 399673cf6..f908f9b6c 100644 --- a/experiments/amg2023/experiment.py +++ b/experiments/amg2023/experiment.py @@ -3,7 +3,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from benchpark.error import BenchparkError from benchpark.directives import variant from benchpark.experiment import Experiment from benchpark.openmp import OpenMPExperiment @@ -53,19 +52,6 @@ class Amg2023( # ) def compute_applications_section(self): - # TODO: Replace with conflicts clause - scaling_modes = { - "strong": self.spec.satisfies("+strong"), - "weak": self.spec.satisfies("+weak"), - "throughput": self.spec.satisfies("+throughput"), - } - - scaling_mode_enabled = [key for key, value in scaling_modes.items() if value] - if len(scaling_mode_enabled) != 1: - raise BenchparkError( - f"Only one type of scaling per experiment is allowed for application package {self.name}" - ) - # Number of processes in each dimension num_procs = {"px": 2, "py": 2, "pz": 2} diff --git a/experiments/ior/experiment.py b/experiments/ior/experiment.py index 25ddf1343..37f39eb3f 100644 --- a/experiments/ior/experiment.py +++ b/experiments/ior/experiment.py @@ -3,7 +3,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from benchpark.error import BenchparkError from benchpark.directives import variant from benchpark.experiment import Experiment from benchpark.scaling import StrongScaling @@ -28,18 +27,6 @@ class Ior( ) def compute_applications_section(self): - # TODO: Replace with conflicts clause - scaling_modes = { - "strong": self.spec.satisfies("+strong"), - "weak": self.spec.satisfies("+weak"), - } - - scaling_mode_enabled = [key for key, value in scaling_modes.items() if value] - if len(scaling_mode_enabled) != 1: - raise BenchparkError( - f"Only one type of scaling per experiment is allowed for application package {self.name}" - ) - num_nodes = {"n_nodes": 1} t = "{b}/256" self.add_experiment_variable("t", t, True) diff --git a/experiments/kripke/experiment.py b/experiments/kripke/experiment.py index fbe91a6a5..57f2a937e 100644 --- a/experiments/kripke/experiment.py +++ b/experiments/kripke/experiment.py @@ -3,7 +3,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from benchpark.error import BenchparkError from benchpark.directives import variant from benchpark.experiment import Experiment from benchpark.openmp import OpenMPExperiment @@ -36,19 +35,6 @@ class Kripke( ) def compute_applications_section(self): - # TODO: Replace with conflicts clause - scaling_modes = { - "strong": self.spec.satisfies("+strong"), - "weak": self.spec.satisfies("+weak"), - "throughput": self.spec.satisfies("+throughput"), - } - - scaling_mode_enabled = [key for key, value in scaling_modes.items() if value] - if len(scaling_mode_enabled) != 1: - raise BenchparkError( - f"Only one type of scaling per experiment is allowed for application package {self.name}" - ) - input_variables = { "ngroups": 64, "gs": 1, diff --git a/experiments/laghos/experiment.py b/experiments/laghos/experiment.py index fa9249dcc..4cbf7fb6e 100644 --- a/experiments/laghos/experiment.py +++ b/experiments/laghos/experiment.py @@ -3,7 +3,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from benchpark.error import BenchparkError from benchpark.directives import variant from benchpark.experiment import Experiment from benchpark.scaling import StrongScaling diff --git a/experiments/osu-micro-benchmarks/experiment.py b/experiments/osu-micro-benchmarks/experiment.py index 0ef52f33f..73061c378 100644 --- a/experiments/osu-micro-benchmarks/experiment.py +++ b/experiments/osu-micro-benchmarks/experiment.py @@ -4,7 +4,6 @@ # SPDX-License-Identifier: Apache-2.0 from benchpark.directives import variant -from benchpark.error import BenchparkError from benchpark.experiment import Experiment @@ -90,16 +89,6 @@ class OsuMicroBenchmarks(Experiment): ) def compute_applications_section(self): - scaling_modes = { - "single_node": self.spec.satisfies("+single_node"), - } - - scaling_mode_enabled = [key for key, value in scaling_modes.items() if value] - if len(scaling_mode_enabled) != 1: - raise BenchparkError( - f"Only one type of scaling per experiment is allowed for application package {self.name}" - ) - num_nodes = {"n_nodes": 2} if self.spec.satisfies("+single_node"): diff --git a/experiments/remhos/experiment.py b/experiments/remhos/experiment.py index 8d83d0cc4..74a080cff 100644 --- a/experiments/remhos/experiment.py +++ b/experiments/remhos/experiment.py @@ -3,7 +3,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from benchpark.error import BenchparkError from benchpark.directives import variant from benchpark.experiment import Experiment from benchpark.scaling import StrongScaling From 2a22bbd588330fa0339edd33f1efc1997c8099eb Mon Sep 17 00:00:00 2001 From: Riyaz Haque Date: Sat, 14 Dec 2024 10:16:34 -0800 Subject: [PATCH 05/17] workflows --- .github/workflows/run.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml index e9089dae7..93868f8d7 100644 --- a/.github/workflows/run.yml +++ b/.github/workflows/run.yml @@ -143,7 +143,7 @@ jobs: - name: Dry run dynamic kripke-openmp on nosite-x86_64 with allocation modifier run: | - ./bin/benchpark experiment init --dest=kripke-openmp kripke+openmp + ./bin/benchpark experiment init --dest=kripke-openmp kripke+openmp~single_node ./bin/benchpark setup ./kripke-openmp nosite-x86_64 workspace/ . workspace/setup.sh ramble \ @@ -154,7 +154,7 @@ jobs: - name: Dry run dynamic kripke-rocm on LLNL-Tioga-HPECray-zen3-MI250X-Slingshot with allocation modifier run: | - ./bin/benchpark experiment init --dest=kripke-rocm kripke+rocm + ./bin/benchpark experiment init --dest=kripke-rocm kripke+rocm~single_node ./bin/benchpark setup ./kripke-openmp LLNL-Tioga-HPECray-zen3-MI250X-Slingshot workspace/ . workspace/setup.sh ramble \ From f167afcac1e71e5eadc529dcaa2c70a425b6a185 Mon Sep 17 00:00:00 2001 From: Riyaz Haque <5333387+rfhaque@users.noreply.github.com> Date: Sat, 14 Dec 2024 10:24:17 -0800 Subject: [PATCH 06/17] Update run.yml --- .github/workflows/run.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml index 93868f8d7..813855eb2 100644 --- a/.github/workflows/run.yml +++ b/.github/workflows/run.yml @@ -143,7 +143,7 @@ jobs: - name: Dry run dynamic kripke-openmp on nosite-x86_64 with allocation modifier run: | - ./bin/benchpark experiment init --dest=kripke-openmp kripke+openmp~single_node + ./bin/benchpark experiment init --dest=kripke-openmp kripke+openmp ./bin/benchpark setup ./kripke-openmp nosite-x86_64 workspace/ . workspace/setup.sh ramble \ From 3be9c783061c781e7f22ae37d4d1cabf56e8c411 Mon Sep 17 00:00:00 2001 From: Riyaz Haque Date: Sat, 14 Dec 2024 10:49:28 -0800 Subject: [PATCH 07/17] Fix kripke single node case --- .github/workflows/run.yml | 4 ++-- experiments/kripke/experiment.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml index 813855eb2..32aeb803c 100644 --- a/.github/workflows/run.yml +++ b/.github/workflows/run.yml @@ -154,7 +154,7 @@ jobs: - name: Dry run dynamic kripke-rocm on LLNL-Tioga-HPECray-zen3-MI250X-Slingshot with allocation modifier run: | - ./bin/benchpark experiment init --dest=kripke-rocm kripke+rocm~single_node + ./bin/benchpark experiment init --dest=kripke-rocm kripke+rocm ./bin/benchpark setup ./kripke-openmp LLNL-Tioga-HPECray-zen3-MI250X-Slingshot workspace/ . workspace/setup.sh ramble \ @@ -309,7 +309,7 @@ jobs: - name: Dry run dynamic quicksilver-openmp on nosite-x86_64 with allocation modifier run: | - ./bin/benchpark experiment init --dest=quicksilver-openmp quicksilver+openmp +weak~single_node + ./bin/benchpark experiment init --dest=quicksilver-openmp quicksilver+openmp +weak ./bin/benchpark setup ./quicksilver-openmp nosite-x86_64 workspace/ . workspace/setup.sh ramble \ diff --git a/experiments/kripke/experiment.py b/experiments/kripke/experiment.py index 57f2a937e..0a5b1de1a 100644 --- a/experiments/kripke/experiment.py +++ b/experiments/kripke/experiment.py @@ -52,15 +52,7 @@ def compute_applications_section(self): for k, v in input_variables.items(): self.add_experiment_variable(k, v, True) - if self.spec.satisfies("+single_node"): - n_resources = 1 - # TODO: Check if n_ranks / n_resources_per_node <= 1 - for pk, pv in num_procs.items(): - self.add_experiment_variable(pk, pv, True) - n_resources *= pv - for nk, nv in problem_sizes.items(): - self.add_experiment_variable(nk, nv, True) - elif self.spec.satisfies("+throughput"): + if self.spec.satisfies("+throughput"): n_resources = 1 for pk, pv in num_procs.items(): self.add_experiment_variable(pk, pv, True) @@ -103,6 +95,14 @@ def compute_applications_section(self): ] for k, v in scaled_variables.items(): self.add_experiment_variable(k, v, True) + else: + n_resources = 1 + # TODO: Check if n_ranks / n_resources_per_node <= 1 + for pk, pv in num_procs.items(): + self.add_experiment_variable(pk, pv, True) + n_resources *= pv + for nk, nv in problem_sizes.items(): + self.add_experiment_variable(nk, nv, True) if self.spec.satisfies("+openmp"): self.add_experiment_variable("n_ranks", n_resources, True) From bbecd3bf710645f659b03aa946f1e2ea31ec97e0 Mon Sep 17 00:00:00 2001 From: Riyaz Haque Date: Sat, 14 Dec 2024 11:02:24 -0800 Subject: [PATCH 08/17] single node mode --- .github/workflows/run.yml | 4 ++-- experiments/amg2023/experiment.py | 18 +++++++++--------- experiments/ior/experiment.py | 10 +++++----- experiments/laghos/experiment.py | 8 ++++---- experiments/osu-micro-benchmarks/experiment.py | 5 ++--- experiments/remhos/experiment.py | 8 ++++---- 6 files changed, 26 insertions(+), 27 deletions(-) diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml index 32aeb803c..d22e9bd1c 100644 --- a/.github/workflows/run.yml +++ b/.github/workflows/run.yml @@ -143,7 +143,7 @@ jobs: - name: Dry run dynamic kripke-openmp on nosite-x86_64 with allocation modifier run: | - ./bin/benchpark experiment init --dest=kripke-openmp kripke+openmp + ./bin/benchpark experiment init --dest=kripke-openmp kripke+openmp~single_node ./bin/benchpark setup ./kripke-openmp nosite-x86_64 workspace/ . workspace/setup.sh ramble \ @@ -152,7 +152,7 @@ jobs: --disable-logger \ workspace setup --dry-run - - name: Dry run dynamic kripke-rocm on LLNL-Tioga-HPECray-zen3-MI250X-Slingshot with allocation modifier + - name: Dry run dynamic kripke-rocm on LLNL-Tioga-HPECray-zen3-MI250X-Slingshot with allocation modifier~single_node run: | ./bin/benchpark experiment init --dest=kripke-rocm kripke+rocm ./bin/benchpark setup ./kripke-openmp LLNL-Tioga-HPECray-zen3-MI250X-Slingshot workspace/ diff --git a/experiments/amg2023/experiment.py b/experiments/amg2023/experiment.py index f908f9b6c..64d5b0254 100644 --- a/experiments/amg2023/experiment.py +++ b/experiments/amg2023/experiment.py @@ -58,15 +58,7 @@ def compute_applications_section(self): # Per-process size (in zones) in each dimension problem_sizes = {"nx": 80, "ny": 80, "nz": 80} - if self.spec.satisfies("+single_node"): - n_resources = 1 - # TODO: Check if n_ranks / n_resources_per_node <= 1 - for pk, pv in num_procs.items(): - self.add_experiment_variable(pk, pv, True) - n_resources *= pv - for nk, nv in problem_sizes.items(): - self.add_experiment_variable(nk, nv, True) - elif self.spec.satisfies("+throughput"): + if self.spec.satisfies("+throughput"): n_resources = 1 for pk, pv in num_procs.items(): self.add_experiment_variable(pk, pv, True) @@ -109,6 +101,14 @@ def compute_applications_section(self): ] for k, v in scaled_variables.items(): self.add_experiment_variable(k, v, True) + else: + n_resources = 1 + # TODO: Check if n_ranks / n_resources_per_node <= 1 + for pk, pv in num_procs.items(): + self.add_experiment_variable(pk, pv, True) + n_resources *= pv + for nk, nv in problem_sizes.items(): + self.add_experiment_variable(nk, nv, True) if self.spec.satisfies("+openmp"): self.add_experiment_variable("n_ranks", n_resources, True) diff --git a/experiments/ior/experiment.py b/experiments/ior/experiment.py index 37f39eb3f..ab095f162 100644 --- a/experiments/ior/experiment.py +++ b/experiments/ior/experiment.py @@ -31,11 +31,7 @@ def compute_applications_section(self): t = "{b}/256" self.add_experiment_variable("t", t, True) - if self.spec.satisfies("+single_node"): - for pk, pv in num_nodes.items(): - self.add_experiment_variable(pk, pv, True) - self.add_experiment_variable("b", "268435456", True) - elif self.spec.satisfies("+strong"): + if self.spec.satisfies("+strong"): scaled_variables = self.generate_strong_scaling_params( {tuple(num_nodes.keys()): list(num_nodes.values())}, int(self.spec.variants["scaling-factor"][0]), @@ -56,6 +52,10 @@ def compute_applications_section(self): self.add_experiment_variable(k, v, True) self.add_experiment_variable("b", "268435456", True) + else: + for pk, pv in num_nodes.items(): + self.add_experiment_variable(pk, pv, True) + self.add_experiment_variable("b", "268435456", True) self.add_experiment_variable("t", t, True) self.add_experiment_variable( diff --git a/experiments/laghos/experiment.py b/experiments/laghos/experiment.py index 4cbf7fb6e..a558ccac9 100644 --- a/experiments/laghos/experiment.py +++ b/experiments/laghos/experiment.py @@ -31,10 +31,7 @@ def compute_applications_section(self): # Number of initial nodes num_nodes = {"n_nodes": 1} - if self.spec.satisfies("+single_node"): - for pk, pv in num_nodes.items(): - self.add_experiment_variable(pk, pv, True) - elif self.spec.satisfies("+strong"): + if self.spec.satisfies("+strong"): scaled_variables = self.generate_strong_scaling_params( {tuple(num_nodes.keys()): list(num_nodes.values())}, int(self.spec.variants["scaling-factor"][0]), @@ -42,6 +39,9 @@ def compute_applications_section(self): ) for pk, pv in scaled_variables.items(): self.add_experiment_variable(pk, pv, True) + else: + for pk, pv in num_nodes.items(): + self.add_experiment_variable(pk, pv, True) self.add_experiment_variable( "n_ranks", "{sys_cores_per_node} * {n_nodes}", True diff --git a/experiments/osu-micro-benchmarks/experiment.py b/experiments/osu-micro-benchmarks/experiment.py index 73061c378..db13dcdf8 100644 --- a/experiments/osu-micro-benchmarks/experiment.py +++ b/experiments/osu-micro-benchmarks/experiment.py @@ -91,9 +91,8 @@ class OsuMicroBenchmarks(Experiment): def compute_applications_section(self): num_nodes = {"n_nodes": 2} - if self.spec.satisfies("+single_node"): - for pk, pv in num_nodes.items(): - self.add_experiment_variable(pk, pv, True) + for pk, pv in num_nodes.items(): + self.add_experiment_variable(pk, pv, True) def compute_spack_section(self): system_specs = {} diff --git a/experiments/remhos/experiment.py b/experiments/remhos/experiment.py index 74a080cff..0a71fbecf 100644 --- a/experiments/remhos/experiment.py +++ b/experiments/remhos/experiment.py @@ -33,10 +33,7 @@ def compute_applications_section(self): # Number of initial nodes num_nodes = {"n_nodes": 1} - if self.spec.satisfies("+single_node"): - for pk, pv in num_nodes.items(): - self.add_experiment_variable(pk, pv, True) - elif self.spec.satisfies("+strong"): + if self.spec.satisfies("+strong"): scaled_variables = self.generate_strong_scaling_params( {tuple(num_nodes.keys()): list(num_nodes.values())}, int(self.spec.variants["scaling-factor"][0]), @@ -44,6 +41,9 @@ def compute_applications_section(self): ) for pk, pv in scaled_variables.items(): self.add_experiment_variable(pk, pv, True) + else: + for pk, pv in num_nodes.items(): + self.add_experiment_variable(pk, pv, True) self.add_experiment_variable( "n_ranks", "{sys_cores_per_node} * {n_nodes}", True From b59bba8360245067ce867bbe0357b0c241f3ab66 Mon Sep 17 00:00:00 2001 From: Riyaz Haque Date: Sat, 14 Dec 2024 12:26:01 -0800 Subject: [PATCH 09/17] workflows --- .github/workflows/run.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml index d22e9bd1c..2affe5fc4 100644 --- a/.github/workflows/run.yml +++ b/.github/workflows/run.yml @@ -265,7 +265,7 @@ jobs: - name: Dry run dynamic lammps/openmp on static Ruby run: | - ./bin/benchpark experiment init --dest=lammps-openmp lammps+openmp + ./bin/benchpark experiment init --dest=lammps-openmp lammps+openmp~single_node ./bin/benchpark setup ./lammps-openmp LLNL-Ruby-icelake-OmniPath workspace/ . workspace/setup.sh ramble \ @@ -276,7 +276,7 @@ jobs: - name: Dry run dynamic lammps/rocm on static Tioga run: | - ./bin/benchpark experiment init --dest=lammps-rocm lammps+rocm + ./bin/benchpark experiment init --dest=lammps-rocm lammps+rocm~single_node ./bin/benchpark setup ./lammps-rocm LLNL-Tioga-HPECray-zen3-MI250X-Slingshot workspace/ . workspace/setup.sh ramble \ @@ -287,7 +287,7 @@ jobs: - name: Dry run dynamic lammps/rocm with dynamic Tioga run: | - ./bin/benchpark experiment init --dest=lammps-rocm-tioga lammps+rocm + ./bin/benchpark experiment init --dest=lammps-rocm-tioga lammps+rocm~single_node ./bin/benchpark setup lammps-rocm-tioga ./tioga-system workspace/ system_id=$(./bin/benchpark system id ./tioga-system) . workspace/setup.sh From a4d84049d754129a14b26ac6d5ebfba3d8753b5b Mon Sep 17 00:00:00 2001 From: Riyaz Haque Date: Sat, 14 Dec 2024 12:33:16 -0800 Subject: [PATCH 10/17] workflows --- .github/workflows/run.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml index 2affe5fc4..47e4eef9f 100644 --- a/.github/workflows/run.yml +++ b/.github/workflows/run.yml @@ -309,7 +309,7 @@ jobs: - name: Dry run dynamic quicksilver-openmp on nosite-x86_64 with allocation modifier run: | - ./bin/benchpark experiment init --dest=quicksilver-openmp quicksilver+openmp +weak + ./bin/benchpark experiment init --dest=quicksilver-openmp quicksilver+openmp +weak~single_node ./bin/benchpark setup ./quicksilver-openmp nosite-x86_64 workspace/ . workspace/setup.sh ramble \ From eb68570f6662e5f137bdfe4a7d82ecffbe9ec9fd Mon Sep 17 00:00:00 2001 From: Riyaz Haque Date: Sat, 14 Dec 2024 12:41:58 -0800 Subject: [PATCH 11/17] workflows --- .github/workflows/run.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml index 47e4eef9f..ccb92b8f4 100644 --- a/.github/workflows/run.yml +++ b/.github/workflows/run.yml @@ -382,7 +382,7 @@ jobs: run: | ./bin/benchpark system init --dest=ruby-system llnl-cluster cluster=ruby system_id=$(./bin/benchpark system id ./ruby-system) - ./bin/benchpark experiment init --dest=saxpy-openmp saxpy+openmp + ./bin/benchpark experiment init --dest=saxpy-openmp saxpy+openmp~single_node ./bin/benchpark setup ./saxpy-openmp ./ruby-system workspace/ . workspace/setup.sh ramble \ @@ -395,7 +395,7 @@ jobs: run: | ./bin/benchpark system init --dest=dane-system llnl-cluster cluster=dane system_id=$(./bin/benchpark system id ./dane-system) - ./bin/benchpark experiment init --dest=saxpy-openmp2 saxpy+openmp + ./bin/benchpark experiment init --dest=saxpy-openmp2 saxpy+openmp~single_node ./bin/benchpark setup ./saxpy-openmp2 ./dane-system workspace/ . workspace/setup.sh ramble \ @@ -407,7 +407,7 @@ jobs: - name: Dry run dynamic saxpy/openmp with dynamic llnl-cluster magma run: | ./bin/benchpark system init --dest=magma-system llnl-cluster cluster=magma - ./bin/benchpark experiment init --dest=saxpy-openmp3 saxpy+openmp + ./bin/benchpark experiment init --dest=saxpy-openmp3 saxpy+openmp~single_node ./bin/benchpark setup ./saxpy-openmp3 ./magma-system workspace/ . workspace/setup.sh ramble \ @@ -419,7 +419,7 @@ jobs: - name: Dry run dynamic saxpy/openmp with dynamic generic x86 run: | ./bin/benchpark system init --dest=x86-system genericx86 - ./bin/benchpark experiment init --dest=saxpy-omp-generic saxpy+openmp + ./bin/benchpark experiment init --dest=saxpy-omp-generic saxpy+openmp~single_node ./bin/benchpark setup ./saxpy-omp-generic ./x86-system workspace/ . workspace/setup.sh ramble \ @@ -441,7 +441,7 @@ jobs: - name: Dry run dynamic remhos/mpi with dynamic llnl-cluster ruby run: | - ./bin/benchpark experiment init --dest=remhos-ruby remhos + ./bin/benchpark experiment init --dest=remhos-ruby remhos~single_node ./bin/benchpark setup ./remhos-ruby ./ruby-system workspace/ system_id=$(./bin/benchpark system id ./ruby-system) . workspace/setup.sh @@ -451,7 +451,7 @@ jobs: --disable-logger \ workspace setup --dry-run - - name: Dry run dynamic remhos/mpi with dynamic Tioga + - name: Dry run dynamic remhos/mpi with dynamic Tioga~single_node run: | ./bin/benchpark experiment init --dest=remhos-tioga remhos ./bin/benchpark setup remhos-tioga ./tioga-system workspace/ @@ -491,7 +491,7 @@ jobs: - name: Dry run dynamic saxpy/openmp with dynamic fugaku run: | ./bin/benchpark system init --dest=fugaku-system fugaku - ./bin/benchpark experiment init --dest=saxpy-omp-fugaku saxpy+openmp + ./bin/benchpark experiment init --dest=saxpy-omp-fugaku saxpy+openmp~single_node ./bin/benchpark setup ./saxpy-omp-fugaku ./fugaku-system workspace/ . workspace/setup.sh ramble \ From 28351d7f0717ff832f088bd9c0dc280f5848022e Mon Sep 17 00:00:00 2001 From: Riyaz Haque Date: Sat, 14 Dec 2024 12:59:42 -0800 Subject: [PATCH 12/17] workflows --- .github/workflows/run.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml index ccb92b8f4..55d2ec9d1 100644 --- a/.github/workflows/run.yml +++ b/.github/workflows/run.yml @@ -502,7 +502,7 @@ jobs: - name: Dry run dynamic osu-micro-benchmarks/mpi-only with dynamic CTS ruby run: | - ./bin/benchpark experiment init --dest=osu-micro-benchmarks-mpi-only osu-micro-benchmarks workload=all + ./bin/benchpark experiment init --dest=osu-micro-benchmarks-mpi-only osu-micro-benchmarks workload=all ~single_node ./bin/benchpark setup ./osu-micro-benchmarks-mpi-only ./ruby-system workspace/ system_id=$(./bin/benchpark system id ./ruby-system) . workspace/setup.sh From 4af1c3a4862d0c2bcdad95f38b934a6b49b10fa0 Mon Sep 17 00:00:00 2001 From: Riyaz Haque Date: Mon, 16 Dec 2024 20:58:33 -0800 Subject: [PATCH 13/17] Add max_node_limit to system specs --- .github/workflows/run.yml | 38 +++++++++---------- .../variables.yaml | 1 + .../variables.yaml | 1 + .../variables.yaml | 1 + .../variables.yaml | 1 + .../variables.yaml | 1 + .../variables.yaml | 1 + .../variables.yaml | 1 + .../LLNL-Ruby-icelake-OmniPath/variables.yaml | 1 + .../variables.yaml | 1 + .../variables.yaml | 1 + .../variables.yaml | 1 + .../variables.yaml | 1 + .../variables.yaml | 1 + .../variables.yaml | 1 + .../variables.yaml | 1 + legacy/systems/nosite-x86_64/variables.yaml | 1 + .../test-extra-batch-opts/variables.yaml | 1 + lib/benchpark/experiment.py | 7 +--- lib/benchpark/system.py | 10 +++++ modifiers/allocation/modifier.py | 7 ---- 21 files changed, 48 insertions(+), 31 deletions(-) diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml index b784624a3..b2c1f511e 100644 --- a/.github/workflows/run.yml +++ b/.github/workflows/run.yml @@ -143,7 +143,7 @@ jobs: - name: Dry run dynamic kripke-openmp on nosite-x86_64 with allocation modifier run: | - ./bin/benchpark experiment init --dest=kripke-openmp kripke+openmp~single_node + ./bin/benchpark experiment init --dest=kripke-openmp kripke+openmp ./bin/benchpark setup ./kripke-openmp nosite-x86_64 workspace/ . workspace/setup.sh ramble \ @@ -152,7 +152,7 @@ jobs: --disable-logger \ workspace setup --dry-run - - name: Dry run dynamic kripke-rocm on LLNL-Tioga-HPECray-zen3-MI250X-Slingshot with allocation modifier~single_node + - name: Dry run dynamic kripke-rocm on LLNL-Tioga-HPECray-zen3-MI250X-Slingshot with allocation modifier run: | ./bin/benchpark experiment init --dest=kripke-rocm kripke+rocm ./bin/benchpark setup ./kripke-openmp LLNL-Tioga-HPECray-zen3-MI250X-Slingshot workspace/ @@ -304,7 +304,7 @@ jobs: - name: Dry run dynamic lammps/openmp on static Ruby run: | - ./bin/benchpark experiment init --dest=lammps-openmp lammps+openmp~single_node + ./bin/benchpark experiment init --dest=lammps-openmp lammps+openmp ./bin/benchpark setup ./lammps-openmp LLNL-Ruby-icelake-OmniPath workspace/ . workspace/setup.sh ramble \ @@ -315,7 +315,7 @@ jobs: - name: Dry run dynamic lammps/rocm on static Tioga run: | - ./bin/benchpark experiment init --dest=lammps-rocm lammps+rocm~single_node + ./bin/benchpark experiment init --dest=lammps-rocm lammps+rocm ./bin/benchpark setup ./lammps-rocm LLNL-Tioga-HPECray-zen3-MI250X-Slingshot workspace/ . workspace/setup.sh ramble \ @@ -326,7 +326,7 @@ jobs: - name: Dry run dynamic lammps/rocm with dynamic Tioga run: | - ./bin/benchpark experiment init --dest=lammps-rocm-tioga lammps+rocm~single_node + ./bin/benchpark experiment init --dest=lammps-rocm-tioga lammps+rocm ./bin/benchpark setup lammps-rocm-tioga ./tioga-system workspace/ system_id=$(./bin/benchpark system id ./tioga-system) . workspace/setup.sh @@ -348,7 +348,7 @@ jobs: - name: Dry run dynamic quicksilver-openmp on nosite-x86_64 with allocation modifier run: | - ./bin/benchpark experiment init --dest=quicksilver-openmp quicksilver+openmp +weak~single_node + ./bin/benchpark experiment init --dest=quicksilver-openmp quicksilver+openmp +weak ./bin/benchpark setup ./quicksilver-openmp nosite-x86_64 workspace/ . workspace/setup.sh ramble \ @@ -419,9 +419,9 @@ jobs: - name: Dry run dynamic saxpy/openmp with dynamic llnl-cluster ruby run: | - ./bin/benchpark system init --dest=ruby-system llnl-cluster cluster=ruby + ./bin/benchpark system init --dest=ruby-system llnl-cluster cluster=ruby max_node_limit=0 system_id=$(./bin/benchpark system id ./ruby-system) - ./bin/benchpark experiment init --dest=saxpy-openmp saxpy+openmp~single_node + ./bin/benchpark experiment init --dest=saxpy-openmp saxpy+openmp ./bin/benchpark setup ./saxpy-openmp ./ruby-system workspace/ . workspace/setup.sh ramble \ @@ -432,9 +432,9 @@ jobs: - name: Dry run dynamic saxpy/openmp with dynamic llnl-cluster dane run: | - ./bin/benchpark system init --dest=dane-system llnl-cluster cluster=dane + ./bin/benchpark system init --dest=dane-system llnl-cluster cluster=dane max_node_limit=0 system_id=$(./bin/benchpark system id ./dane-system) - ./bin/benchpark experiment init --dest=saxpy-openmp2 saxpy+openmp~single_node + ./bin/benchpark experiment init --dest=saxpy-openmp2 saxpy+openmp ./bin/benchpark setup ./saxpy-openmp2 ./dane-system workspace/ . workspace/setup.sh ramble \ @@ -445,8 +445,8 @@ jobs: - name: Dry run dynamic saxpy/openmp with dynamic llnl-cluster magma run: | - ./bin/benchpark system init --dest=magma-system llnl-cluster cluster=magma - ./bin/benchpark experiment init --dest=saxpy-openmp3 saxpy+openmp~single_node + ./bin/benchpark system init --dest=magma-system llnl-cluster cluster=magma max_node_limit=0 + ./bin/benchpark experiment init --dest=saxpy-openmp3 saxpy+openmp ./bin/benchpark setup ./saxpy-openmp3 ./magma-system workspace/ . workspace/setup.sh ramble \ @@ -457,8 +457,8 @@ jobs: - name: Dry run dynamic saxpy/openmp with dynamic generic x86 run: | - ./bin/benchpark system init --dest=x86-system genericx86 - ./bin/benchpark experiment init --dest=saxpy-omp-generic saxpy+openmp~single_node + ./bin/benchpark system init --dest=x86-system genericx86 max_node_limit=0 + ./bin/benchpark experiment init --dest=saxpy-omp-generic saxpy+openmp ./bin/benchpark setup ./saxpy-omp-generic ./x86-system workspace/ . workspace/setup.sh ramble \ @@ -480,7 +480,7 @@ jobs: - name: Dry run dynamic remhos/mpi with dynamic llnl-cluster ruby run: | - ./bin/benchpark experiment init --dest=remhos-ruby remhos~single_node + ./bin/benchpark experiment init --dest=remhos-ruby remhos ./bin/benchpark setup ./remhos-ruby ./ruby-system workspace/ system_id=$(./bin/benchpark system id ./ruby-system) . workspace/setup.sh @@ -490,7 +490,7 @@ jobs: --disable-logger \ workspace setup --dry-run - - name: Dry run dynamic remhos/mpi with dynamic Tioga~single_node + - name: Dry run dynamic remhos/mpi with dynamic Tioga run: | ./bin/benchpark experiment init --dest=remhos-tioga remhos ./bin/benchpark setup remhos-tioga ./tioga-system workspace/ @@ -529,8 +529,8 @@ jobs: - name: Dry run dynamic saxpy/openmp with dynamic fugaku run: | - ./bin/benchpark system init --dest=fugaku-system fugaku - ./bin/benchpark experiment init --dest=saxpy-omp-fugaku saxpy+openmp~single_node + ./bin/benchpark system init --dest=fugaku-system fugaku max_node_limit=0 + ./bin/benchpark experiment init --dest=saxpy-omp-fugaku saxpy+openmp ./bin/benchpark setup ./saxpy-omp-fugaku ./fugaku-system workspace/ . workspace/setup.sh ramble \ @@ -541,7 +541,7 @@ jobs: - name: Dry run dynamic osu-micro-benchmarks/mpi-only with dynamic CTS ruby run: | - ./bin/benchpark experiment init --dest=osu-micro-benchmarks-mpi-only osu-micro-benchmarks workload=all ~single_node + ./bin/benchpark experiment init --dest=osu-micro-benchmarks-mpi-only osu-micro-benchmarks workload=all ./bin/benchpark setup ./osu-micro-benchmarks-mpi-only ./ruby-system workspace/ system_id=$(./bin/benchpark system id ./ruby-system) . workspace/setup.sh diff --git a/legacy/systems/CSC-LUMI-HPECray-zen3-MI250X-Slingshot/variables.yaml b/legacy/systems/CSC-LUMI-HPECray-zen3-MI250X-Slingshot/variables.yaml index da7333c04..166463ede 100644 --- a/legacy/systems/CSC-LUMI-HPECray-zen3-MI250X-Slingshot/variables.yaml +++ b/legacy/systems/CSC-LUMI-HPECray-zen3-MI250X-Slingshot/variables.yaml @@ -13,6 +13,7 @@ variables: sys_gpus_per_node: "8" sys_mem_per_node: "512" max_request: "1000" # n_ranks/n_nodes cannot exceed this + max_node_limit: "0" n_ranks: '1000001' # placeholder value n_nodes: '1000001' # placeholder value batch_submit: "placeholder" diff --git a/legacy/systems/CSCS-Daint-HPECray-haswell-P100-Infiniband/variables.yaml b/legacy/systems/CSCS-Daint-HPECray-haswell-P100-Infiniband/variables.yaml index 5ce00dcbe..4d0d5e03f 100644 --- a/legacy/systems/CSCS-Daint-HPECray-haswell-P100-Infiniband/variables.yaml +++ b/legacy/systems/CSCS-Daint-HPECray-haswell-P100-Infiniband/variables.yaml @@ -14,6 +14,7 @@ variables: sys_gpus_per_node: "1" sys_mem_per_node: "64" max_request: "1000" # n_ranks/n_nodes cannot exceed this + max_node_limit: "0" n_ranks: '1000001' # placeholder value n_nodes: '1000001' # placeholder value batch_submit: "placeholder" diff --git a/legacy/systems/CSCS-Eiger-HPECray-zen2-Slingshot/variables.yaml b/legacy/systems/CSCS-Eiger-HPECray-zen2-Slingshot/variables.yaml index acee05641..e6bde18a4 100644 --- a/legacy/systems/CSCS-Eiger-HPECray-zen2-Slingshot/variables.yaml +++ b/legacy/systems/CSCS-Eiger-HPECray-zen2-Slingshot/variables.yaml @@ -10,6 +10,7 @@ variables: # sys_gpus_per_node unset # sys_mem_per_node unset max_request: "1000" # n_ranks/n_nodes cannot exceed this + max_node_limit: "0" n_ranks: '1000001' # placeholder value n_nodes: '1000001' # placeholder value batch_submit: "placeholder" diff --git a/legacy/systems/JSC-JUWELS-Booster-rome-A100-Infiniband/variables.yaml b/legacy/systems/JSC-JUWELS-Booster-rome-A100-Infiniband/variables.yaml index a07feeba3..b8a68c84f 100644 --- a/legacy/systems/JSC-JUWELS-Booster-rome-A100-Infiniband/variables.yaml +++ b/legacy/systems/JSC-JUWELS-Booster-rome-A100-Infiniband/variables.yaml @@ -12,6 +12,7 @@ variables: sys_cores_per_node: "48" sys_gpus_per_node: "4" max_request: "1000" # n_ranks/n_nodes cannot exceed this + max_node_limit: "0" n_ranks: '1000001' # placeholder value n_nodes: '1000001' # placeholder value batch_submit: "placeholder" diff --git a/legacy/systems/LLNL-Dane-DELL-sapphirerapids-OmniPath/variables.yaml b/legacy/systems/LLNL-Dane-DELL-sapphirerapids-OmniPath/variables.yaml index a106ac8da..061d59492 100644 --- a/legacy/systems/LLNL-Dane-DELL-sapphirerapids-OmniPath/variables.yaml +++ b/legacy/systems/LLNL-Dane-DELL-sapphirerapids-OmniPath/variables.yaml @@ -8,6 +8,7 @@ variables: scheduler: "slurm" sys_cores_per_node: "112" max_request: "1000" # n_ranks/n_nodes cannot exceed this + max_node_limit: "0" n_ranks: '1000001' # placeholder value n_nodes: '1000001' # placeholder value batch_submit: "placeholder" diff --git a/legacy/systems/LLNL-Magma-Penguin-icelake-OmniPath/variables.yaml b/legacy/systems/LLNL-Magma-Penguin-icelake-OmniPath/variables.yaml index 46ca2504b..7cce678c3 100644 --- a/legacy/systems/LLNL-Magma-Penguin-icelake-OmniPath/variables.yaml +++ b/legacy/systems/LLNL-Magma-Penguin-icelake-OmniPath/variables.yaml @@ -8,6 +8,7 @@ variables: scheduler: "slurm" sys_cores_per_node: "96" max_request: "1000" # n_ranks/n_nodes cannot exceed this + max_node_limit: "0" n_ranks: '1000001' # placeholder value n_nodes: '1000001' # placeholder value batch_submit: "placeholder" diff --git a/legacy/systems/LLNL-Pascal-Penguin-broadwell-P100-OmniPath/variables.yaml b/legacy/systems/LLNL-Pascal-Penguin-broadwell-P100-OmniPath/variables.yaml index fa6dccf02..e62264f52 100644 --- a/legacy/systems/LLNL-Pascal-Penguin-broadwell-P100-OmniPath/variables.yaml +++ b/legacy/systems/LLNL-Pascal-Penguin-broadwell-P100-OmniPath/variables.yaml @@ -12,6 +12,7 @@ variables: sys_cores_per_node: "36" sys_gpus_per_node: "2" max_request: "1000" # n_ranks/n_nodes cannot exceed this + max_node_limit: "0" n_ranks: '1000001' # placeholder value n_nodes: '1000001' # placeholder value batch_submit: "placeholder" diff --git a/legacy/systems/LLNL-Ruby-icelake-OmniPath/variables.yaml b/legacy/systems/LLNL-Ruby-icelake-OmniPath/variables.yaml index 5c6d5ed68..ac6e28778 100644 --- a/legacy/systems/LLNL-Ruby-icelake-OmniPath/variables.yaml +++ b/legacy/systems/LLNL-Ruby-icelake-OmniPath/variables.yaml @@ -9,6 +9,7 @@ variables: sys_cores_per_node: "56" sys_gpus_per_node: 0 max_request: "1000" # n_ranks/n_nodes cannot exceed this + max_node_limit: "0" n_ranks: '1000001' # placeholder value n_nodes: '1000001' # placeholder value batch_submit: "placeholder" diff --git a/legacy/systems/LLNL-Sierra-IBM-power9-V100-Infiniband/variables.yaml b/legacy/systems/LLNL-Sierra-IBM-power9-V100-Infiniband/variables.yaml index c4c802503..fe6a5a345 100644 --- a/legacy/systems/LLNL-Sierra-IBM-power9-V100-Infiniband/variables.yaml +++ b/legacy/systems/LLNL-Sierra-IBM-power9-V100-Infiniband/variables.yaml @@ -13,6 +13,7 @@ variables: sys_cores_per_node: "44" sys_gpus_per_node: "4" max_request: "1000" # n_ranks/n_nodes cannot exceed this + max_node_limit: "0" n_ranks: '1000001' # placeholder value n_nodes: '1000001' # placeholder value batch_submit: "placeholder" diff --git a/legacy/systems/LLNL-Tioga-HPECray-zen3-MI250X-Slingshot/variables.yaml b/legacy/systems/LLNL-Tioga-HPECray-zen3-MI250X-Slingshot/variables.yaml index 508744c83..594e437ae 100644 --- a/legacy/systems/LLNL-Tioga-HPECray-zen3-MI250X-Slingshot/variables.yaml +++ b/legacy/systems/LLNL-Tioga-HPECray-zen3-MI250X-Slingshot/variables.yaml @@ -11,6 +11,7 @@ variables: sys_cores_per_node: "64" sys_gpus_per_node: "8" max_request: "1000" # n_ranks/n_nodes cannot exceed this + max_node_limit: "0" n_ranks: '1000001' # placeholder value n_nodes: '1000001' # placeholder value batch_submit: "placeholder" diff --git a/legacy/systems/RCCS-Fugaku-Fujitsu-A64FX-TofuD/variables.yaml b/legacy/systems/RCCS-Fugaku-Fujitsu-A64FX-TofuD/variables.yaml index 5e29684d6..7db9b954c 100644 --- a/legacy/systems/RCCS-Fugaku-Fujitsu-A64FX-TofuD/variables.yaml +++ b/legacy/systems/RCCS-Fugaku-Fujitsu-A64FX-TofuD/variables.yaml @@ -18,6 +18,7 @@ variables: post_exec_cmds: | for F in $(ls -1v fjmpioutdir/bmexe.*); do cat $F >> {log_file}; done max_request: "1000" # n_ranks/n_nodes cannot exceed this + max_node_limit: "0" n_ranks: '1000001' # placeholder value n_nodes: '1000001' # placeholder value batch_submit: "placeholder" diff --git a/legacy/systems/TAMU-Grace-Dell-cascadelake-Infiniband/variables.yaml b/legacy/systems/TAMU-Grace-Dell-cascadelake-Infiniband/variables.yaml index 1d8d9d518..925e437e3 100644 --- a/legacy/systems/TAMU-Grace-Dell-cascadelake-Infiniband/variables.yaml +++ b/legacy/systems/TAMU-Grace-Dell-cascadelake-Infiniband/variables.yaml @@ -8,6 +8,7 @@ variables: scheduler: "slurm" sys_cores_per_node: "24" max_request: "1000" # n_ranks/n_nodes cannot exceed this + max_node_limit: "0" n_ranks: '1000001' # placeholder value n_nodes: '1000001' # placeholder value batch_submit: "placeholder" diff --git a/legacy/systems/nosite-AWS_PCluster_Hpc6a-zen3-EFA/variables.yaml b/legacy/systems/nosite-AWS_PCluster_Hpc6a-zen3-EFA/variables.yaml index f5db177a1..9b02fcdd5 100644 --- a/legacy/systems/nosite-AWS_PCluster_Hpc6a-zen3-EFA/variables.yaml +++ b/legacy/systems/nosite-AWS_PCluster_Hpc6a-zen3-EFA/variables.yaml @@ -10,5 +10,6 @@ variables: batch_nodes: '#SBATCH -N {n_nodes}' batch_ranks: '#SBATCH -n {n_ranks}' batch_timeout: '#SBATCH -t {batch_time}:00' + max_node_limit: "1" sys_cpus_per_node: 96 sys_gpus_per_node: 0 diff --git a/legacy/systems/nosite-AWS_PCluster_Hpc7a-zen4-EFA/variables.yaml b/legacy/systems/nosite-AWS_PCluster_Hpc7a-zen4-EFA/variables.yaml index d92d39c6e..cdb361081 100644 --- a/legacy/systems/nosite-AWS_PCluster_Hpc7a-zen4-EFA/variables.yaml +++ b/legacy/systems/nosite-AWS_PCluster_Hpc7a-zen4-EFA/variables.yaml @@ -9,6 +9,7 @@ variables: sys_cores_per_node: "1" # sys_gpus_per_node unset max_request: "1000" # n_ranks/n_nodes cannot exceed this + max_node_limit: "1" n_ranks: '1000001' # placeholder value n_nodes: '1000001' # placeholder value batch_submit: "placeholder" diff --git a/legacy/systems/nosite-HPECray-zen3-MI250X-Slingshot/variables.yaml b/legacy/systems/nosite-HPECray-zen3-MI250X-Slingshot/variables.yaml index 59954bdbd..58c0af995 100644 --- a/legacy/systems/nosite-HPECray-zen3-MI250X-Slingshot/variables.yaml +++ b/legacy/systems/nosite-HPECray-zen3-MI250X-Slingshot/variables.yaml @@ -11,6 +11,7 @@ variables: sys_cores_per_node: "1" # sys_gpus_per_node unset max_request: "1000" # n_ranks/n_nodes cannot exceed this + max_node_limit: "1" n_ranks: '1000001' # placeholder value n_nodes: '1000001' # placeholder value batch_submit: "placeholder" diff --git a/legacy/systems/nosite-x86_64/variables.yaml b/legacy/systems/nosite-x86_64/variables.yaml index 2c5b01c5f..ba8c5466d 100644 --- a/legacy/systems/nosite-x86_64/variables.yaml +++ b/legacy/systems/nosite-x86_64/variables.yaml @@ -10,6 +10,7 @@ variables: extra_cmd_opts: | --oversubscribe max_request: "1000" # n_ranks/n_nodes cannot exceed this + max_node_limit: "0" n_ranks: '1000001' # placeholder value n_nodes: '1000001' # placeholder value batch_submit: "placeholder" diff --git a/legacy/systems/test-extra-batch-opts/variables.yaml b/legacy/systems/test-extra-batch-opts/variables.yaml index dee749bd6..3bbbfaab3 100644 --- a/legacy/systems/test-extra-batch-opts/variables.yaml +++ b/legacy/systems/test-extra-batch-opts/variables.yaml @@ -16,6 +16,7 @@ variables: for F in $(ls -1v fjmpioutdir/bmexe.*); do cat $F >> {log_file}; done echo "done" max_request: "1000" # n_ranks/n_nodes cannot exceed this + max_node_limit: "0" n_ranks: '1000001' # placeholder value n_nodes: '1000001' # placeholder value batch_submit: "placeholder" diff --git a/lib/benchpark/experiment.py b/lib/benchpark/experiment.py index 3399d5073..b72dd86d2 100644 --- a/lib/benchpark/experiment.py +++ b/lib/benchpark/experiment.py @@ -55,9 +55,6 @@ class SingleNode: description="Single node execution mode", ) - def run_single_node_expr(self): - return 1 if self.spec.satisfies("+single_node") else 0 - class Helper(ExperimentHelper): def get_helper_name_prefix(self): return "single_node" if self.spec.satisfies("+single_node") else "" @@ -179,8 +176,8 @@ def compute_applications_section_wrapper(self): self.compute_applications_section() - self.add_experiment_variable( - "run_single_node_expr", self.run_single_node_expr() + self.add_experiment_exclude( + f"{{n_nodes}} > 0 and {{n_nodes}} <= {{max_node_limit}}" ) expr_helper_list = [] diff --git a/lib/benchpark/system.py b/lib/benchpark/system.py index 550790727..f874aa251 100644 --- a/lib/benchpark/system.py +++ b/lib/benchpark/system.py @@ -13,6 +13,7 @@ import benchpark.paths from benchpark.directives import ExperimentSystemBase +from benchpark.directives import variant import benchpark.repo from benchpark.runtime import RuntimeResources @@ -74,6 +75,13 @@ class System(ExperimentSystemBase): Tuple["benchpark.variant.Variant", "benchpark.spec.ConcreteSystemSpec"], ] + variant( + "max_node_limit", + default="1", + values=int, + description="Max number of allocatable nodes for experiments, 0 (no limits), default 1", + ) + def __init__(self, spec): self.spec: "benchpark.spec.ConcreteSystemSpec" = spec super().__init__() @@ -87,6 +95,7 @@ def initialize(self): self.scheduler = None self.timeout = "120" self.queue = None + self.max_node_limit = self.spec.variants["max_node_limit"][0] self.required = ["sys_cores_per_node", "scheduler", "timeout"] @@ -185,6 +194,7 @@ def variables_yaml(self): sys_cores_per_node: "{self.sys_cores_per_node}" {extras_as_cfg} max_request: "1000" # n_ranks/n_nodes cannot exceed this + max_node_limit: "{self.max_node_limit}" # 0: no limits, default: 1 n_ranks: '1000001' # placeholder value n_nodes: '1000001' # placeholder value batch_submit: "placeholder" diff --git a/modifiers/allocation/modifier.py b/modifiers/allocation/modifier.py index 393ea60c9..373cf0d12 100644 --- a/modifiers/allocation/modifier.py +++ b/modifiers/allocation/modifier.py @@ -37,8 +37,6 @@ class AllocOpt(Enum): POST_EXEC_CMDS = 302 PRE_EXEC_CMDS = 303 - RUN_SINGLE_NODE_EXPR = 666 - @staticmethod def as_type(enumval, input): if enumval in [ @@ -293,11 +291,6 @@ def determine_allocation(self, v): ) v.n_nodes = max(cores_node_request or 0, gpus_node_request or 0) - if v.run_single_node_expr and v.n_nodes > 1: - raise ValueError( - f"Experiment must run on 1 node. Requested {v.n_nodes} nodes" - ) - if not v.n_threads_per_proc: v.n_threads_per_proc = 1 From fe01f03c4f68c70943bf6f2db2bcbe40b7169163 Mon Sep 17 00:00:00 2001 From: Riyaz Haque Date: Mon, 16 Dec 2024 21:58:43 -0800 Subject: [PATCH 14/17] Fix fugaku system id in workflow --- .github/workflows/run.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml index b2c1f511e..92b99d1ac 100644 --- a/.github/workflows/run.yml +++ b/.github/workflows/run.yml @@ -530,11 +530,12 @@ jobs: - name: Dry run dynamic saxpy/openmp with dynamic fugaku run: | ./bin/benchpark system init --dest=fugaku-system fugaku max_node_limit=0 + system_id=$(./bin/benchpark system id ./fugaku-system) ./bin/benchpark experiment init --dest=saxpy-omp-fugaku saxpy+openmp ./bin/benchpark setup ./saxpy-omp-fugaku ./fugaku-system workspace/ . workspace/setup.sh ramble \ - --workspace-dir workspace/saxpy-omp-fugaku/Fugaku-cf3cb1d/workspace \ + --workspace-dir workspace/saxpy-omp-fugaku/$system_id/workspace \ --disable-progress-bar \ --disable-logger \ workspace setup --dry-run From d34513a0ae68b31f61818d7bc5f6feff3cbd899e Mon Sep 17 00:00:00 2001 From: Riyaz Haque Date: Mon, 16 Dec 2024 22:07:37 -0800 Subject: [PATCH 15/17] lint --- lib/benchpark/experiment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/benchpark/experiment.py b/lib/benchpark/experiment.py index b72dd86d2..d52f53033 100644 --- a/lib/benchpark/experiment.py +++ b/lib/benchpark/experiment.py @@ -177,7 +177,7 @@ def compute_applications_section_wrapper(self): self.compute_applications_section() self.add_experiment_exclude( - f"{{n_nodes}} > 0 and {{n_nodes}} <= {{max_node_limit}}" + "{n_nodes} > 0 and {n_nodes} <= {max_node_limit}" ) expr_helper_list = [] From a1ddec4b36e7d042598d13be3774b2b50b963299 Mon Sep 17 00:00:00 2001 From: Riyaz Haque Date: Mon, 16 Dec 2024 22:11:53 -0800 Subject: [PATCH 16/17] lint --- lib/benchpark/experiment.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/benchpark/experiment.py b/lib/benchpark/experiment.py index d52f53033..61940c0e8 100644 --- a/lib/benchpark/experiment.py +++ b/lib/benchpark/experiment.py @@ -176,9 +176,7 @@ def compute_applications_section_wrapper(self): self.compute_applications_section() - self.add_experiment_exclude( - "{n_nodes} > 0 and {n_nodes} <= {max_node_limit}" - ) + self.add_experiment_exclude("{n_nodes} > 0 and {n_nodes} <= {max_node_limit}") expr_helper_list = [] for cls in self.helpers: From 9828525323e20b9f1bf046ea9d253cd397a784a9 Mon Sep 17 00:00:00 2001 From: Riyaz Haque Date: Tue, 17 Dec 2024 10:43:36 -0800 Subject: [PATCH 17/17] Remove SingleNode class --- lib/benchpark/experiment.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/lib/benchpark/experiment.py b/lib/benchpark/experiment.py index 61940c0e8..772799915 100644 --- a/lib/benchpark/experiment.py +++ b/lib/benchpark/experiment.py @@ -48,19 +48,7 @@ def get_spack_variants(self): return None -class SingleNode: - variant( - "single_node", - default=True, - description="Single node execution mode", - ) - - class Helper(ExperimentHelper): - def get_helper_name_prefix(self): - return "single_node" if self.spec.satisfies("+single_node") else "" - - -class Experiment(ExperimentSystemBase, SingleNode): +class Experiment(ExperimentSystemBase): """This is the superclass for all benchpark experiments. ***The Experiment class***