Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Docker cmd generator #161

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions benchmarks/timm/benchfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,31 @@ class TimmBenchmarkPack(Package):
def make_env(self):
return {
**super().make_env(),
"OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8))
"OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8)),
}

@property
def argv(self):
return [
*super().argv,
"--data-dir", self.dirs.data,
"--dataset", "FakeImageNet",
"--output", self.dirs.extra / self.logdir.name / self.tag,
"--checkpoint-hist", 1,
"--data-dir",
self.dirs.data,
"--dataset",
"FakeImageNet",
"--output",
self.dirs.extra / self.logdir.name / self.tag,
"--checkpoint-hist",
1,
]

async def install(self):
await super().install()

timm = self.dirs.code / "pytorch-image-models"
if not timm.exists():
timm.clone_subtree("https://github.com/huggingface/pytorch-image-models", BRANCH)
timm.clone_subtree(
"https://github.com/huggingface/pytorch-image-models", BRANCH
)

def build_run_plan(self):
# self.config is not the right config for this
Expand Down
9 changes: 5 additions & 4 deletions benchmarks/timm/voirfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,21 +33,22 @@ def setup(args):
ov.require(dash)

ov.require(
log("value", "progress", "rate", "units", "loss", "gpudata", context="task"),
log(
"value", "progress", "rate", "units", "loss", "gpudata", context="task"
),
rate(
interval=options.interval,
skip=options.skip,
sync=torch.cuda.synchronize if torch.cuda.is_available() else None,
batch_size_calc=lambda b: len(b) * args.world_size
batch_size_calc=lambda b: len(b) * args.world_size,
),
early_stop(n=options.stop, key="rate", task="train", signal="stop"),
gpu_monitor(poll_interval=options.gpu_poll),
)

# Loss
(
loss_probe
.throttle(1)["loss"]
loss_probe.throttle(1)["loss"]
.map(lambda loss: {"task": "train", "loss": float(loss)})
.give()
)
Expand Down
2 changes: 1 addition & 1 deletion docs/docker.rst
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,8 @@ There are currently two multi-node benchmarks, ``opt-1_3b-multinode`` (data-para
.. code-block:: yaml

system:
sshkey: <privatekey>
arch: cuda
sshkey: /milabench/id_milabench
docker_image: ghcr.io/mila-iqia/milabench:${system.arch}-nightly

nodes:
Expand Down
35 changes: 15 additions & 20 deletions milabench/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
import tempfile
import traceback
from datetime import datetime
import getpass

from coleo import Option, config as configuration, default, run_cli, tooled
from omegaconf import OmegaConf
from voir.instruments.gpu import deduce_backend, select_backend
import yaml

from milabench.alt_async import proceed
from milabench.utils import blabla, validation_layers, multilogger, available_layers
Expand All @@ -31,7 +31,7 @@
from .merge import merge
from .multi import MultiPackage
from .report import make_report
from .slurm import expand_node_list
from .slurm import build_system_config
from .summary import aggregate, make_summary


Expand Down Expand Up @@ -468,6 +468,17 @@ def install():
mp=mp,
)

def helpme():
    """Generate a ready-to-use docker command to run milabench."""
    # Slurm-style node range, e.g. "192.168.0.[25-30,123,126]"
    node_range: Option & str = "node[0,1]"
    # Architecture tag of the docker image (presumably cuda/rocm — TODO confirm)
    arch: Option & str = "cuda"
    # Version tag of the docker image
    version: Option & str = "nightly"

    # Imported lazily so the CLI module does not pay for it on every command.
    from .utils import command_generator

    command_generator(node_range, arch, version)

def pin():
"""Pin the benchmarks' dependencies."""

Expand Down Expand Up @@ -682,24 +693,8 @@ def pip():

def slurm_system():
"""Generate a system file based of slurm environment variables"""

node_list = expand_node_list(os.getenv("SLURM_JOB_NODELIST", ""))

def make_node(i, ip):
node = {"name": ip, "ip": ip, "user": getpass.getuser(), "main": i == 0}

if i == 0:
node["port"] = 8123

return node

system = dict(
arch="cuda", nodes=[make_node(i, ip) for i, ip in enumerate(node_list)]
)

import yaml

print(yaml.dump({"system": system}))
system = build_system_config(os.getenv("SLURM_JOB_NODELIST", ""))
print(yaml.dump(system))

def machine():
"""Display machine metadata.
Expand Down
26 changes: 26 additions & 0 deletions milabench/slurm.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import getpass


def expand_range(s):
numbers = []
count = 0
Expand Down Expand Up @@ -51,3 +54,26 @@ def expand_node_list(node_list):
s = next + 1

return nodes


def build_system_config(node_range):
    """Build a milabench system configuration dict from a node range string.

    Expands ``node_range`` (slurm node-list syntax) into individual
    hostnames; the first node is flagged as the main node and gets the
    default port 8123.
    """
    user = getpass.getuser()
    nodes = []

    for rank, hostname in enumerate(expand_node_list(node_range)):
        entry = {
            "name": hostname,
            "ip": hostname,
            "user": user,
            "main": rank == 0,
        }
        # Only the main node carries a port entry.
        if rank == 0:
            entry["port"] = 8123
        nodes.append(entry)

    return {"system": {"arch": "cuda", "nodes": nodes}}
74 changes: 74 additions & 0 deletions milabench/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,13 @@
from functools import wraps
from typing import Any

import yaml
from ovld import ovld

from milabench.fs import XPath
import milabench.validation
from milabench.validation.validation import Summary
from .slurm import build_system_config


class Named:
Expand Down Expand Up @@ -225,3 +227,75 @@ def enumerate_rank(nodes):
else:
yield rank, node
rank += 1


def find_private_ssh_key():
    """Return the path of a private ssh key found in the user's ~/.ssh.

    A private key is any file named ``id_*`` without the ``.pub`` suffix
    (e.g. ``id_rsa``, ``id_ed25519``).

    Returns:
        str: absolute path to the first matching key (in sorted order,
        so the result is deterministic when several keys exist).

    Raises:
        FileNotFoundError: if no private key is found.
    """
    homessh = os.path.expanduser("~/.ssh/")

    # Sort the listing: os.listdir order is arbitrary, which would make
    # the selected key depend on the filesystem.
    for key in sorted(os.listdir(homessh)):
        # Skip public halves of key pairs.
        if key.endswith(".pub"):
            continue

        if key.startswith("id_"):
            return os.path.join(homessh, key)

    # The original used `assert`, which is stripped under `python -O`;
    # raise explicitly so the failure is always reported.
    raise FileNotFoundError(f"No ssh private key found in {homessh}")


def command_generator(node_range, arch, version):
    """Print a ready-to-run ``docker run`` command for milabench.

    Creates a ``results`` folder under the current directory, writes a
    ``system.yaml`` for the nodes in ``node_range`` there (skipped if one
    already exists), then prints a formatted docker command using the
    ``ghcr.io/mila-iqia/milabench:{arch}-{version}`` image.

    Arguments:
        node_range: slurm-style node list, e.g. ``"node[0,1]"``.
        arch: architecture tag of the docker image (e.g. ``cuda``).
        version: version tag of the docker image (e.g. ``nightly``).
    """
    output = os.path.join(os.getcwd(), "results")
    system = os.path.join(output, "system.yaml")

    # Make output folder (use the computed absolute path, not a relative
    # "results" literal, so both always refer to the same directory)
    os.makedirs(output, exist_ok=True)

    # Find a ssh private key to mount inside the container
    key_path = find_private_ssh_key()

    # make system file
    if os.path.exists(system):
        print("System file already exists, skipping")
    else:
        with open(system, "w") as file:
            system_conf = build_system_config(node_range)
            # The key is mounted at this fixed path inside the container.
            system_conf["system"]["sshkey"] = "/milabench/id_milabench"

            formatted = yaml.dump(system_conf, indent=2)
            print(formatted)
            file.write(formatted)

    # At that point we might as well run it for them ?
    print()
    print("Running milabench")
    print()
    print(f"  output: {output}")
    print(f"  system: {system}")
    print()
    print("Command:")
    print()

    # Fragments of the final command, one per printed line. (f-prefixes
    # removed from constant strings.)
    cmd = [
        "  docker run -it --rm --gpus all --network host --ipc=host --privileged",
        f"  -v {key_path}:/milabench/id_milabench ",
        f"  -v {output}:/milabench/envs/runs ",
        f"  ghcr.io/mila-iqia/milabench:{arch}-{version} ",
        "  milabench run --system /milabench/envs/runs/system.yaml",
    ]

    # Pad every line to the widest fragment (+4 for the trailing
    # continuation backslash) so the backslashes line up.
    length = max(len(line.strip()) + 4 for line in cmd)

    formatted = []
    for i, line in enumerate(cmd):
        line = line.strip()
        # First line is indented by 2 spaces, continuation lines by 4.
        idt_size = ((i > 0) + 1) * 2
        idt = " " * idt_size

        formatted.append(f"{idt}{line:<{length - idt_size}}")

    # Backslash-newline join: the output can be pasted directly in a shell.
    print("\\\n".join(formatted))