diff --git a/benchmarks/timm/benchfile.py b/benchmarks/timm/benchfile.py
index 50f7e69dc..f8d0652e0 100644
--- a/benchmarks/timm/benchfile.py
+++ b/benchmarks/timm/benchfile.py
@@ -14,17 +14,21 @@ class TimmBenchmarkPack(Package):
     def make_env(self):
         return {
             **super().make_env(),
-            "OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8))
+            "OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8)),
         }
 
     @property
     def argv(self):
         return [
             *super().argv,
-            "--data-dir", self.dirs.data,
-            "--dataset", "FakeImageNet",
-            "--output", self.dirs.extra / self.logdir.name / self.tag,
-            "--checkpoint-hist", 1,
+            "--data-dir",
+            self.dirs.data,
+            "--dataset",
+            "FakeImageNet",
+            "--output",
+            self.dirs.extra / self.logdir.name / self.tag,
+            "--checkpoint-hist",
+            1,
         ]
 
     async def install(self):
@@ -32,7 +36,9 @@ async def install(self):
         timm = self.dirs.code / "pytorch-image-models"
 
         if not timm.exists():
-            timm.clone_subtree("https://github.com/huggingface/pytorch-image-models", BRANCH)
+            timm.clone_subtree(
+                "https://github.com/huggingface/pytorch-image-models", BRANCH
+            )
 
     def build_run_plan(self):
         # self.config is not the right config for this
diff --git a/benchmarks/timm/voirfile.py b/benchmarks/timm/voirfile.py
index 19ac71fa5..5f17d8408 100644
--- a/benchmarks/timm/voirfile.py
+++ b/benchmarks/timm/voirfile.py
@@ -33,12 +33,14 @@ def setup(args):
     ov.require(dash)
 
     ov.require(
-        log("value", "progress", "rate", "units", "loss", "gpudata", context="task"),
+        log(
+            "value", "progress", "rate", "units", "loss", "gpudata", context="task"
+        ),
         rate(
             interval=options.interval,
             skip=options.skip,
             sync=torch.cuda.synchronize if torch.cuda.is_available() else None,
-            batch_size_calc=lambda b: len(b) * args.world_size
+            batch_size_calc=lambda b: len(b) * args.world_size,
         ),
         early_stop(n=options.stop, key="rate", task="train", signal="stop"),
         gpu_monitor(poll_interval=options.gpu_poll),
@@ -46,8 +48,7 @@ def setup(args):
 
     # Loss
     (
-        loss_probe
-        .throttle(1)["loss"]
+        loss_probe.throttle(1)["loss"]
         .map(lambda loss: {"task": "train", "loss": float(loss)})
         .give()
     )
diff --git a/docs/docker.rst b/docs/docker.rst
index 582ca95a6..c9c6e2d98 100644
--- a/docs/docker.rst
+++ b/docs/docker.rst
@@ -113,8 +113,8 @@ There are currently two multi-node benchmarks, ``opt-1_3b-multinode`` (data-para
 .. code-block:: yaml
 
    system:
-     sshkey:
      arch: cuda
+     sshkey: /milabench/id_milabench
      docker_image: ghcr.io/mila-iqia/milabench:${system.arch}-nightly
 
      nodes:
diff --git a/milabench/cli.py b/milabench/cli.py
index d163114ff..4c9833da0 100644
--- a/milabench/cli.py
+++ b/milabench/cli.py
@@ -8,11 +8,11 @@
 import tempfile
 import traceback
 from datetime import datetime
-import getpass
 
 from coleo import Option, config as configuration, default, run_cli, tooled
 from omegaconf import OmegaConf
 from voir.instruments.gpu import deduce_backend, select_backend
+import yaml
 
 from milabench.alt_async import proceed
 from milabench.utils import blabla, validation_layers, multilogger, available_layers
@@ -31,7 +31,7 @@
 from .merge import merge
 from .multi import MultiPackage
 from .report import make_report
-from .slurm import expand_node_list
+from .slurm import build_system_config
 from .summary import aggregate, make_summary
 
 
@@ -468,6 +468,17 @@ def install():
             mp=mp,
         )
 
+    def helpme():
+        """"""
+        # "192.168.0.[25-30,123,126]"
+        node_range: Option & str = "node[0,1]"
+        arch: Option & str = "cuda"
+        version: Option & str = "nightly"
+
+        from .utils import command_generator
+
+        command_generator(node_range, arch, version)
+
     def pin():
         """Pin the benchmarks' dependencies."""
 
@@ -682,24 +693,8 @@ def pip():
 
     def slurm_system():
        """Generate a system file based of slurm environment variables"""
-
-        node_list = expand_node_list(os.getenv("SLURM_JOB_NODELIST", ""))
-
-        def make_node(i, ip):
-            node = {"name": ip, "ip": ip, "user": getpass.getuser(), "main": i == 0}
-
-            if i == 0:
-                node["port"] = 8123
-
-            return node
-
-        system = dict(
-            arch="cuda", nodes=[make_node(i, ip) for i, ip in enumerate(node_list)]
-        )
-
-        import yaml
-
-        print(yaml.dump({"system": system}))
+        system = build_system_config(os.getenv("SLURM_JOB_NODELIST", ""))
+        print(yaml.dump(system))
 
     def machine():
         """Display machine metadata.
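Note for reviewers: after this refactor the `slurm_system` command only dumps whatever `build_system_config` returns, which already carries the top-level `system` key. A minimal sketch of the expected shape, assuming the patch is applied and that `expand_node_list` expands the same bracket ranges used by `helpme`'s default:

    # Sketch only: exercises the new build_system_config helper directly.
    import yaml
    from milabench.slurm import build_system_config

    conf = build_system_config("node[0,1]")
    assert set(conf) == {"system"}
    assert set(conf["system"]) == {"arch", "nodes"}
    assert conf["system"]["nodes"][0]["main"] is True  # first node is the main node
    print(yaml.dump(conf))  # same output slurm_system now prints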
diff --git a/milabench/slurm.py b/milabench/slurm.py
index cadf0f73f..255251d5a 100644
--- a/milabench/slurm.py
+++ b/milabench/slurm.py
@@ -1,3 +1,6 @@
+import getpass
+
+
 def expand_range(s):
     numbers = []
     count = 0
@@ -51,3 +54,26 @@ def expand_node_list(node_list):
         s = next + 1
 
     return nodes
+
+
+def build_system_config(node_range):
+    node_list = expand_node_list(node_range)
+
+    def make_node(i, ip):
+        node = {
+            "name": ip,
+            "ip": ip,
+            "user": getpass.getuser(),
+            "main": i == 0,
+        }
+
+        if i == 0:
+            node["port"] = 8123
+
+        return node
+
+    system = dict(
+        arch="cuda",
+        nodes=[make_node(i, ip) for i, ip in enumerate(node_list)],
+    )
+    return {"system": system}
diff --git a/milabench/utils.py b/milabench/utils.py
index d7914eb5f..705dc5a48 100644
--- a/milabench/utils.py
+++ b/milabench/utils.py
@@ -10,11 +10,13 @@
 from functools import wraps
 from typing import Any
 
+import yaml
 from ovld import ovld
 
 from milabench.fs import XPath
 import milabench.validation
 from milabench.validation.validation import Summary
+from .slurm import build_system_config
 
 
 class Named:
@@ -225,3 +227,75 @@ def enumerate_rank(nodes):
         else:
             yield rank, node
             rank += 1
+
+
+def find_private_ssh_key():
+    homessh = os.path.expanduser("~/.ssh/")
+    key_path = None
+
+    for key in os.listdir(homessh):
+        if key.endswith(".pub"):
+            continue
+
+        if key.startswith("id_"):
+            key_path = os.path.join(homessh, key)
+            break
+
+    assert key_path is not None, "No ssh key found"
+    return key_path
+
+
+def command_generator(node_range, arch, version):
+    output = os.path.join(os.getcwd(), "results")
+    system = os.path.join(output, "system.yaml")
+
+    # Make output folder
+    os.makedirs("results", exist_ok=True)
+
+    # Find a ssh private key
+    key_path = find_private_ssh_key()
+
+    # make system file
+    if os.path.exists(system):
+        print("System file already exists, skipping")
+    else:
+        with open(system, "w") as file:
+            system_conf = build_system_config(node_range)
+            system_conf["system"]["sshkey"] = "/milabench/id_milabench"
+
+            formatted = yaml.dump(system_conf, indent=2)
+            print(formatted)
+            file.write(formatted)
+
+    # At that point we might as well run it for them ?
+    print()
+    print("Running milabench")
+    print()
+    print(f"    output: {output}")
+    print(f"    system: {system}")
+    print()
+    print("Command:")
+    print()
+
+    cmd = [
+        (f"  docker run -it --rm --gpus all --network host --ipc=host --privileged"),
+        (f"    -v {key_path}:/milabench/id_milabench "),
+        (f"    -v {output}:/milabench/envs/runs "),
+        (f"    ghcr.io/mila-iqia/milabench:{arch}-{version} "),
+        (f"    milabench run --system /milabench/envs/runs/system.yaml"),
+    ]
+
+    length = 0
+    for line in cmd:
+        line = line.strip()
+        length = max(len(line) + 4, length)
+
+    formatted = []
+    for i, line in enumerate(cmd):
+        line = line.strip()
+        idt_size = ((i > 0) + 1) * 2
+        idt = " " * idt_size
+
+        formatted.append(f"{idt}{line:<{length - idt_size}}")
+
+    print("\\\n".join(formatted))
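To poke at the new `helpme` path without going through the CLI, here is a rough smoke test. It assumes the patch is applied and that an `id_*` private key exists under `~/.ssh` (otherwise `find_private_ssh_key` fails its assert); the scratch directory is illustrative.

    # Runs command_generator in a scratch directory: it should write
    # results/system.yaml and print the suggested docker run command.
    import os
    import tempfile

    from milabench.utils import command_generator

    cwd = os.getcwd()
    with tempfile.TemporaryDirectory() as scratch:
        try:
            os.chdir(scratch)
            command_generator("node[0,1]", "cuda", "nightly")
            assert os.path.exists(os.path.join(scratch, "results", "system.yaml"))
        finally:
            os.chdir(cwd)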