From 38e4683fbc774de8ae6402be710642e2b24489f8 Mon Sep 17 00:00:00 2001 From: "pierre.delaunay" Date: Thu, 6 Jul 2023 09:14:50 -0400 Subject: [PATCH 1/6] Docker tweaks --- docs/docker.rst | 6 ++++-- milabench/executors.py | 25 ++++++++++++++++++++----- milabench/multi.py | 2 +- 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/docs/docker.rst b/docs/docker.rst index 09aa3e738..3824cb7d5 100644 --- a/docs/docker.rst +++ b/docs/docker.rst @@ -112,7 +112,8 @@ There are currently two multi-node benchmarks, ``opt-1_3b-multinode`` (data-para .. code-block:: yaml system: - docker-image: ghcr.io/mila-iqia/milabench:${system.arch}-nightly + arch: cuda + docker_image: ghcr.io/mila-iqia/milabench:${system.arch}-nightly nodes: - name: node1 @@ -161,7 +162,8 @@ For example, for 4 nodes: .. code-block:: yaml system: - docker-image: ghcr.io/mila-iqia/milabench:${system.arch}-nightly + arch: cuda + docker_image: ghcr.io/mila-iqia/milabench:${system.arch}-nightly nodes: - name: node1 diff --git a/milabench/executors.py b/milabench/executors.py index 241ce0010..63aac7f23 100644 --- a/milabench/executors.py +++ b/milabench/executors.py @@ -83,6 +83,12 @@ def _set_pack(self, pack): return False + def packs(self): + if self.pack: + yield self.pack + else: + yield from self.exec.packs() + def copy(self, pack): """Copy the execution plan but use a different pack""" copy = deepcopy(self) @@ -108,24 +114,27 @@ def commands(self) -> Generator[Tuple[pack.BasePackage, List, Dict], None, None] """ yield self.pack, [], self.kwargs() - async def execute(self, timeout=False, timeout_delay=600, **kwargs): + async def execute(self, phase, timeout=False, timeout_delay=600, **kwargs): """Execute all the commands and return the aggregated results""" coro = [] + + for pack in self.packs(): + pack.phase = phase for pack, argv, _kwargs in self.commands(): await pack.send(event="config", data=pack.config) await pack.send(event="meta", data=machine_metadata()) - pack.phase = "run" fut = 
pack.execute(*argv, **{**_kwargs, **kwargs}) - coro.append(fut) if timeout: delay = pack.config.get("max_duration", timeout_delay) - asyncio.create_task(force_terminate(pack, delay)) + task = asyncio.create_task(force_terminate(pack, delay)) - return await asyncio.gather(*coro) + results = await asyncio.gather(*coro) + task.cancel() + return results class SingleCmdExecutor(Executor): @@ -168,6 +177,10 @@ def pack(self): def commands(self) -> Generator[Tuple[pack.BasePackage, List, Dict], None, None]: for executor in self.executors: yield from executor.commands() + + def packs(self): + for exec in self.executors: + yield from exec.packs() class CmdExecutor(SingleCmdExecutor): @@ -307,6 +320,8 @@ def _argv(self, **kwargs) -> List: argv = super()._argv(**kwargs) env = self.pack.make_env() + print(self.pack.phase) + print(env) for var in ("MILABENCH_CONFIG", "XDG_CACHE_HOME", "OMP_NUM_THREADS"): argv.append("--env") argv.append(f"{var}='{env[var]}'") diff --git a/milabench/multi.py b/milabench/multi.py index 9c67a0ba7..96e717af0 100644 --- a/milabench/multi.py +++ b/milabench/multi.py @@ -176,7 +176,7 @@ async def do_run(self, repeat=1): continue exec_plan = make_execution_plan(pack, index, repeat) - await exec_plan.execute(timeout=True, timeout_delay=600) + await exec_plan.execute("run", timeout=True, timeout_delay=600) except Exception as exc: import traceback From 42db21a22b3502b3f4f8d0f3093e27a6201e4b3b Mon Sep 17 00:00:00 2001 From: "pierre.delaunay" Date: Thu, 6 Jul 2023 09:42:45 -0400 Subject: [PATCH 2/6] - --- milabench/cli.py | 4 ++-- milabench/executors.py | 49 +++++++++++++++++++++++++++++++++--------- milabench/utils.py | 6 +++--- 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/milabench/cli.py b/milabench/cli.py index 4859db33c..d163114ff 100644 --- a/milabench/cli.py +++ b/milabench/cli.py @@ -354,7 +354,7 @@ def run(): # Which type of dashboard to show (short, long, or no) dash: Option & str = os.environ.get("MILABENCH_DASH", "long") - + 
noterm: Option & bool = os.getenv("MILABENCH_NOTERM", "0") == "1" validations: Option & str = None @@ -376,7 +376,7 @@ def run(): success = run_with_loggers( mp.do_run(repeat=repeat), loggers=[ - # Terminal Formatter slows down the dashboard, + # Terminal Formatter slows down the dashboard, # if lots of info needs to be printed # in particular rwkv TerminalFormatter() if not noterm else None, diff --git a/milabench/executors.py b/milabench/executors.py index 63aac7f23..d3d734408 100644 --- a/milabench/executors.py +++ b/milabench/executors.py @@ -114,10 +114,10 @@ def commands(self) -> Generator[Tuple[pack.BasePackage, List, Dict], None, None] """ yield self.pack, [], self.kwargs() - async def execute(self, phase, timeout=False, timeout_delay=600, **kwargs): + async def execute(self, phase="run", timeout=False, timeout_delay=600, **kwargs): """Execute all the commands and return the aggregated results""" coro = [] - + for pack in self.packs(): pack.phase = phase @@ -177,7 +177,7 @@ def pack(self): def commands(self) -> Generator[Tuple[pack.BasePackage, List, Dict], None, None]: for executor in self.executors: yield from executor.commands() - + def packs(self): for exec in self.executors: yield from exec.packs() @@ -311,6 +311,40 @@ def __init__( ) self.image = image + def as_container_path(self, path): + # replace local output path with docker path + base = self.pack.dirs.base + path = path.replace(str(base), "/milabench/envs") + + # Replace local installation path with docker path + install_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) + path = path.replace(str(install_path), "/milabench/milabench") + + return path + + def argv(self, **kwargs) -> List: + """Return the list of command line's arguments for this `Executor` + followed by its embedded `Executor`'s list of command line's arguments + + Arguments: + **kwargs: some `Executor` might need an argument to dynamically + generate the list of command line's arguments + """ + script_args = 
self.exec.argv(**kwargs) + docker_args = self._argv(**kwargs) + + # we are already inside docker, so the paths are already correct + if len(docker_args) == 0: + return script_args + + # we are outside docker + rewritten = [] + for arg in script_args: + # rewrite path to be inside docker + rewritten.append(self.as_container_path(arg)) + + return docker_args + rewritten + def _argv(self, **kwargs) -> List: if self.image is None or os.environ.get("MILABENCH_DOCKER", None): # No-op when there's no docker image to run or inside a docker @@ -320,16 +354,11 @@ def _argv(self, **kwargs) -> List: argv = super()._argv(**kwargs) env = self.pack.make_env() - print(self.pack.phase) - print(env) - for var in ("MILABENCH_CONFIG", "XDG_CACHE_HOME", "OMP_NUM_THREADS"): + for var in ("XDG_CACHE_HOME", "OMP_NUM_THREADS"): argv.append("--env") - argv.append(f"{var}='{env[var]}'") + argv.append(f"{var}='{self.as_container_path(env[var])}'") argv.append(self.image) - argv.append(f"{self.pack.dirs.code / 'activator'}") - argv.append(f"{self.pack.dirs.venv}") - return argv diff --git a/milabench/utils.py b/milabench/utils.py index 00f44c623..d7914eb5f 100644 --- a/milabench/utils.py +++ b/milabench/utils.py @@ -207,14 +207,14 @@ def multilogger(*logs, **kwargs): def select_nodes(nodes, n): """Select n nodes, main node is always first""" ranked = [] - + for node in nodes: if node["main"]: ranked.insert(0, node) else: ranked.append(node) - - return ranked[:max(1, min(n, len(ranked)))] + + return ranked[: max(1, min(n, len(ranked)))] def enumerate_rank(nodes): From 05d4c8163af877d88b233e03b2fc6629fc0dc034 Mon Sep 17 00:00:00 2001 From: "pierre.delaunay" Date: Thu, 6 Jul 2023 09:46:33 -0400 Subject: [PATCH 3/6] - --- milabench/executors.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/milabench/executors.py b/milabench/executors.py index d3d734408..96998e627 100644 --- a/milabench/executors.py +++ b/milabench/executors.py @@ -130,10 +130,13 @@ async def execute(self, phase="run", 
timeout=False, timeout_delay=600, **kwargs) if timeout: delay = pack.config.get("max_duration", timeout_delay) - task = asyncio.create_task(force_terminate(pack, delay)) + timeout_task = asyncio.create_task(force_terminate(pack, delay)) results = await asyncio.gather(*coro) - task.cancel() + + if timeout: + timeout_task.cancel() + return results From e389a86c56027a38e7d5b6de2e061f335d4d58cb Mon Sep 17 00:00:00 2001 From: "pierre.delaunay" Date: Thu, 6 Jul 2023 10:14:36 -0400 Subject: [PATCH 4/6] - --- benchmarks/accelerate_opt/benchfile.py | 16 ++++++++++++++-- docs/docker.rst | 2 +- milabench/executors.py | 4 ++-- milabench/remote.py | 2 ++ 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/benchmarks/accelerate_opt/benchfile.py b/benchmarks/accelerate_opt/benchfile.py index 675930df6..1a16cc2c4 100644 --- a/benchmarks/accelerate_opt/benchfile.py +++ b/benchmarks/accelerate_opt/benchfile.py @@ -22,13 +22,23 @@ def make_env(self): def build_docker_prepare_remote_plan(self): executors = [] + key = self.config["system"].get("sshkey") docker_pull_exec = CmdExecutor( self, "docker", "pull", self.config["system"].get("docker_image", None) ) for node in self.config["system"]["nodes"]: if node["main"]: continue - executors.append(SSHExecutor(docker_pull_exec, node["ip"])) + + host = node["ip"] + user = node["user"] + + executors.append(SSHExecutor(docker_pull_exec, + host=host, + user=user, + key=key, + ) + ) return ListExecutor(*executors) def build_prepare_plan(self): @@ -61,7 +71,8 @@ def build_run_plan(self): max_num = self.config["num_machines"] nodes = select_nodes(self.config["system"]["nodes"], max_num) - + key = self.config["system"].get("sshkey") + for rank, node in enumerate(nodes): host = node["ip"] user = node["user"] @@ -83,6 +94,7 @@ def build_run_plan(self): worker = SSHExecutor( host=host, user=user, + key=key, executor=DockerRunExecutor( AccelerateLaunchExecutor(pack, rank=rank), self.config["system"].get("docker_image"), diff --git 
a/docs/docker.rst b/docs/docker.rst index 3824cb7d5..b57055faa 100644 --- a/docs/docker.rst +++ b/docs/docker.rst @@ -136,7 +136,7 @@ Then, the command should look like this: # Change if needed export SSH_KEY_FILE=$HOME/.ssh/id_rsa - + export MILABENCH_IMAGE=ghcr.io/mila-iqia/milabench:cuda-nightly docker run -it --rm --gpus all --network host --ipc=host --privileged \ -v $SSH_KEY_FILE:/milabench/id_milabench \ -v $(pwd)/results:/milabench/envs/runs \ diff --git a/milabench/executors.py b/milabench/executors.py index 96998e627..d5325d7f0 100644 --- a/milabench/executors.py +++ b/milabench/executors.py @@ -133,10 +133,10 @@ async def execute(self, phase="run", timeout=False, timeout_delay=600, **kwargs) timeout_task = asyncio.create_task(force_terminate(pack, delay)) results = await asyncio.gather(*coro) - + if timeout: timeout_task.cancel() - + return results diff --git a/milabench/remote.py b/milabench/remote.py index e26eba2bc..e2d03fe43 100644 --- a/milabench/remote.py +++ b/milabench/remote.py @@ -122,6 +122,7 @@ def worker_pack(pack, worker): def milabench_remote_command(pack, *command, run_for="worker") -> ListExecutor: nodes = pack.config["system"]["nodes"] + key = pack.config["system"].get("sshkey") cmds = [] for worker in nodes: @@ -134,6 +135,7 @@ def milabench_remote_command(pack, *command, run_for="worker") -> ListExecutor: CmdExecutor(worker_pack(pack, worker), f"milabench", *command), host=host, user=user, + key=key, ) ) From 89f717771e269bfa6825c9fe18edd65e81c11d50 Mon Sep 17 00:00:00 2001 From: "pierre.delaunay" Date: Thu, 6 Jul 2023 10:53:51 -0400 Subject: [PATCH 5/6] - --- docs/docker.rst | 5 ++-- milabench/cli.py | 37 ++++++++++------------- milabench/slurm.py | 26 ++++++++++++++++ milabench/utils.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 120 insertions(+), 23 deletions(-) diff --git a/docs/docker.rst b/docs/docker.rst index b57055faa..0eafd87e2 100644 --- a/docs/docker.rst +++ b/docs/docker.rst @@ -100,8 +100,8 @@ 
There are currently two multi-node benchmarks, ``opt-1_3b-multinode`` (data-para - ``ssh-keygen`` 1. Pull the milabench docker image you would like to run on all machines - ``docker pull`` -2. Create a list of nodes that will participate in the benchmark inside a ``system.yaml`` file (see example below) - - ``vi system.yaml`` +2. Create a list of nodes that will participate in the benchmark inside a ``results/system.yaml`` file (see example below) + - ``vi results/system.yaml`` 3. Call milabench with by specifying the node list we created. - ``docker ...-v :/milabench/id_milabench milabench run ... --system system.yaml`` @@ -113,6 +113,7 @@ There are currently two multi-node benchmarks, ``opt-1_3b-multinode`` (data-para system: arch: cuda + sshkey: /milabench/id_milabench docker_image: ghcr.io/mila-iqia/milabench:${system.arch}-nightly nodes: diff --git a/milabench/cli.py b/milabench/cli.py index d163114ff..84a670c1f 100644 --- a/milabench/cli.py +++ b/milabench/cli.py @@ -8,11 +8,11 @@ import tempfile import traceback from datetime import datetime -import getpass from coleo import Option, config as configuration, default, run_cli, tooled from omegaconf import OmegaConf from voir.instruments.gpu import deduce_backend, select_backend +import yaml from milabench.alt_async import proceed from milabench.utils import blabla, validation_layers, multilogger, available_layers @@ -31,7 +31,7 @@ from .merge import merge from .multi import MultiPackage from .report import make_report -from .slurm import expand_node_list +from .slurm import build_system_config from .summary import aggregate, make_summary @@ -312,7 +312,7 @@ def run_with_loggers(coro, loggers, mp=None): def run_sync(coro, terminal=True): - return run_with_loggers(coro, [TerminalFormatter()] if terminal else []) + return run_with_loggers(coro, [TerminalFormatter()] if terminal else []) def validation_names(layers): @@ -467,6 +467,17 @@ def install(): ], mp=mp, ) + + def helpme(): + """""" + # 
"192.168.0.[25-30,123,126]" + node_range: Option & str = "node[0,1]" + arch: Option & str = "cuda" + version: Option & str = "nightly" + + from .utils import command_generator + + command_generator(node_range, arch, version) def pin(): """Pin the benchmarks' dependencies.""" @@ -682,24 +693,8 @@ def pip(): def slurm_system(): """Generate a system file based of slurm environment variables""" - - node_list = expand_node_list(os.getenv("SLURM_JOB_NODELIST", "")) - - def make_node(i, ip): - node = {"name": ip, "ip": ip, "user": getpass.getuser(), "main": i == 0} - - if i == 0: - node["port"] = 8123 - - return node - - system = dict( - arch="cuda", nodes=[make_node(i, ip) for i, ip in enumerate(node_list)] - ) - - import yaml - - print(yaml.dump({"system": system})) + system = build_system_config(os.getenv("SLURM_JOB_NODELIST", "")) + print(yaml.dump(system)) def machine(): """Display machine metadata. diff --git a/milabench/slurm.py b/milabench/slurm.py index cadf0f73f..c1e3d8fe7 100644 --- a/milabench/slurm.py +++ b/milabench/slurm.py @@ -1,3 +1,6 @@ +import getpass + + def expand_range(s): numbers = [] count = 0 @@ -51,3 +54,26 @@ def expand_node_list(node_list): s = next + 1 return nodes + + +def build_system_config(node_range): + node_list = expand_node_list(node_range) + + def make_node(i, ip): + node = { + "name": ip, + "ip": ip, + "user": getpass.getuser(), + "main": i == 0, + } + + if i == 0: + node["port"] = 8123 + + return node + + system = dict( + arch="cuda", + nodes=[make_node(i, ip) for i, ip in enumerate(node_list)], + ) + return {"system": system} diff --git a/milabench/utils.py b/milabench/utils.py index d7914eb5f..5001af41a 100644 --- a/milabench/utils.py +++ b/milabench/utils.py @@ -10,11 +10,13 @@ from functools import wraps from typing import Any +import yaml from ovld import ovld from milabench.fs import XPath import milabench.validation from milabench.validation.validation import Summary +from .slurm import build_system_config class Named: @@ 
-225,3 +227,76 @@ def enumerate_rank(nodes): else: yield rank, node rank += 1 + + + +def find_private_ssh_key(): + homessh = os.path.expanduser("~/.ssh/") + key_path = None + + for key in os.listdir(homessh): + if key.endswith('.pub'): + continue + + if key.startswith("id_"): + key_path = os.path.join(homessh, key) + break + + assert key_path is not None, "No ssh key found" + return key_path + + +def command_generator(node_range, arch, version): + output = os.path.join(os.getcwd(), "results") + system = os.path.join(output, "system.yaml") + + # Make output folder + os.makedirs('results', exist_ok=True) + + # Find a ssh private key + key_path = find_private_ssh_key() + + # make system file + if os.path.exists(system): + print("System file already exists, skipping") + else: + with open(system, 'w') as file: + system_conf = build_system_config(node_range) + system_conf['system']['sshkey'] = '/milabench/id_milabench' + + formatted = yaml.dump(system_conf, indent=2) + print(formatted) + file.write(formatted) + + # At that point we might as well run it for them ? 
+ print() + print("Running milabench") + print() + print(f" output: {output}") + print(f" system: {system}") + print() + print("Command:") + print() + + cmd = [ + (f" docker run -it --rm --gpus all --network host --ipc=host --privileged"), + (f" -v {key_path}:/milabench/id_milabench "), + (f" -v {output}:/milabench/envs/runs "), + (f" ghcr.io/mila-iqia/milabench:{arch}-{version} "), + (f" milabench run --system /milabench/envs/runs/system.yaml") + ] + + length = 0 + for line in cmd: + line = line.strip() + length = max(len(line) + 4, length) + + formatted = [] + for i, line in enumerate(cmd): + line = line.strip() + idt_size = ((i > 0) + 1) * 2 + idt = ' ' * idt_size + + formatted.append(f"{idt}{line:<{length - idt_size}}") + + print("\\\n".join(formatted)) \ No newline at end of file From 9c3db5e87789c12c2a4810213c9290eb9f2b2655 Mon Sep 17 00:00:00 2001 From: "pierre.delaunay" Date: Thu, 6 Jul 2023 16:51:43 -0400 Subject: [PATCH 6/6] format --- benchmarks/accelerate_opt/benchfile.py | 2 +- benchmarks/timm/benchfile.py | 18 ++++++++---- benchmarks/timm/voirfile.py | 9 +++--- milabench/cli.py | 6 ++-- milabench/slurm.py | 6 ++-- milabench/utils.py | 39 +++++++++++++------------- 6 files changed, 43 insertions(+), 37 deletions(-) diff --git a/benchmarks/accelerate_opt/benchfile.py b/benchmarks/accelerate_opt/benchfile.py index 5cfb7ade8..c886e8f5f 100644 --- a/benchmarks/accelerate_opt/benchfile.py +++ b/benchmarks/accelerate_opt/benchfile.py @@ -40,7 +40,7 @@ def build_run_plan(self): max_num = self.config["num_machines"] nodes = select_nodes(self.config["system"]["nodes"], max_num) key = self.config["system"].get("sshkey") - + for rank, node in enumerate(nodes): host = node["ip"] user = node["user"] diff --git a/benchmarks/timm/benchfile.py b/benchmarks/timm/benchfile.py index 50f7e69dc..f8d0652e0 100644 --- a/benchmarks/timm/benchfile.py +++ b/benchmarks/timm/benchfile.py @@ -14,17 +14,21 @@ class TimmBenchmarkPack(Package): def make_env(self): return { 
**super().make_env(), - "OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8)) + "OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8)), } @property def argv(self): return [ *super().argv, - "--data-dir", self.dirs.data, - "--dataset", "FakeImageNet", - "--output", self.dirs.extra / self.logdir.name / self.tag, - "--checkpoint-hist", 1, + "--data-dir", + self.dirs.data, + "--dataset", + "FakeImageNet", + "--output", + self.dirs.extra / self.logdir.name / self.tag, + "--checkpoint-hist", + 1, ] async def install(self): @@ -32,7 +36,9 @@ async def install(self): timm = self.dirs.code / "pytorch-image-models" if not timm.exists(): - timm.clone_subtree("https://github.com/huggingface/pytorch-image-models", BRANCH) + timm.clone_subtree( + "https://github.com/huggingface/pytorch-image-models", BRANCH + ) def build_run_plan(self): # self.config is not the right config for this diff --git a/benchmarks/timm/voirfile.py b/benchmarks/timm/voirfile.py index 19ac71fa5..5f17d8408 100644 --- a/benchmarks/timm/voirfile.py +++ b/benchmarks/timm/voirfile.py @@ -33,12 +33,14 @@ def setup(args): ov.require(dash) ov.require( - log("value", "progress", "rate", "units", "loss", "gpudata", context="task"), + log( + "value", "progress", "rate", "units", "loss", "gpudata", context="task" + ), rate( interval=options.interval, skip=options.skip, sync=torch.cuda.synchronize if torch.cuda.is_available() else None, - batch_size_calc=lambda b: len(b) * args.world_size + batch_size_calc=lambda b: len(b) * args.world_size, ), early_stop(n=options.stop, key="rate", task="train", signal="stop"), gpu_monitor(poll_interval=options.gpu_poll), @@ -46,8 +48,7 @@ def setup(args): # Loss ( - loss_probe - .throttle(1)["loss"] + loss_probe.throttle(1)["loss"] .map(lambda loss: {"task": "train", "loss": float(loss)}) .give() ) diff --git a/milabench/cli.py b/milabench/cli.py index c6d35ea8c..4c9833da0 100644 --- a/milabench/cli.py +++ b/milabench/cli.py @@ -467,16 +467,16 @@ def install(): ], mp=mp, ) 
- + def helpme(): """""" # "192.168.0.[25-30,123,126]" node_range: Option & str = "node[0,1]" arch: Option & str = "cuda" version: Option & str = "nightly" - + from .utils import command_generator - + command_generator(node_range, arch, version) def pin(): diff --git a/milabench/slurm.py b/milabench/slurm.py index c1e3d8fe7..255251d5a 100644 --- a/milabench/slurm.py +++ b/milabench/slurm.py @@ -61,8 +61,8 @@ def build_system_config(node_range): def make_node(i, ip): node = { - "name": ip, - "ip": ip, + "name": ip, + "ip": ip, "user": getpass.getuser(), "main": i == 0, } @@ -73,7 +73,7 @@ def make_node(i, ip): return node system = dict( - arch="cuda", + arch="cuda", nodes=[make_node(i, ip) for i, ip in enumerate(node_list)], ) return {"system": system} diff --git a/milabench/utils.py b/milabench/utils.py index 5001af41a..705dc5a48 100644 --- a/milabench/utils.py +++ b/milabench/utils.py @@ -229,19 +229,18 @@ def enumerate_rank(nodes): rank += 1 - def find_private_ssh_key(): homessh = os.path.expanduser("~/.ssh/") key_path = None - + for key in os.listdir(homessh): - if key.endswith('.pub'): + if key.endswith(".pub"): continue - + if key.startswith("id_"): key_path = os.path.join(homessh, key) break - + assert key_path is not None, "No ssh key found" return key_path @@ -249,25 +248,25 @@ def find_private_ssh_key(): def command_generator(node_range, arch, version): output = os.path.join(os.getcwd(), "results") system = os.path.join(output, "system.yaml") - + # Make output folder - os.makedirs('results', exist_ok=True) - + os.makedirs("results", exist_ok=True) + # Find a ssh private key key_path = find_private_ssh_key() - + # make system file if os.path.exists(system): print("System file already exists, skipping") else: - with open(system, 'w') as file: + with open(system, "w") as file: system_conf = build_system_config(node_range) - system_conf['system']['sshkey'] = '/milabench/id_milabench' - + system_conf["system"]["sshkey"] = "/milabench/id_milabench" + formatted = 
yaml.dump(system_conf, indent=2) print(formatted) file.write(formatted) - + # At that point we might as well run it for them ? print() print("Running milabench") @@ -277,26 +276,26 @@ def command_generator(node_range, arch, version): print() print("Command:") print() - + cmd = [ (f" docker run -it --rm --gpus all --network host --ipc=host --privileged"), (f" -v {key_path}:/milabench/id_milabench "), (f" -v {output}:/milabench/envs/runs "), (f" ghcr.io/mila-iqia/milabench:{arch}-{version} "), - (f" milabench run --system /milabench/envs/runs/system.yaml") + (f" milabench run --system /milabench/envs/runs/system.yaml"), ] length = 0 for line in cmd: line = line.strip() length = max(len(line) + 4, length) - + formatted = [] for i, line in enumerate(cmd): line = line.strip() idt_size = ((i > 0) + 1) * 2 - idt = ' ' * idt_size - + idt = " " * idt_size + formatted.append(f"{idt}{line:<{length - idt_size}}") - - print("\\\n".join(formatted)) \ No newline at end of file + + print("\\\n".join(formatted))