Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Docker cmd generator #161

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions benchmarks/timm/benchfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,31 @@ class TimmBenchmarkPack(Package):
def make_env(self):
return {
**super().make_env(),
"OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8))
"OMP_NUM_THREADS": str(self.config.get("cpus_per_gpu", 8)),
}

@property
def argv(self):
return [
*super().argv,
"--data-dir", self.dirs.data,
"--dataset", "FakeImageNet",
"--output", self.dirs.extra / self.logdir.name / self.tag,
"--checkpoint-hist", 1,
"--data-dir",
self.dirs.data,
"--dataset",
"FakeImageNet",
"--output",
self.dirs.extra / self.logdir.name / self.tag,
"--checkpoint-hist",
1,
]

async def install(self):
await super().install()

timm = self.dirs.code / "pytorch-image-models"
if not timm.exists():
timm.clone_subtree("https://github.com/huggingface/pytorch-image-models", BRANCH)
timm.clone_subtree(
"https://github.com/huggingface/pytorch-image-models", BRANCH
)

def build_run_plan(self):
# self.config is not the right config for this
Expand Down
9 changes: 5 additions & 4 deletions benchmarks/timm/voirfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,21 +33,22 @@ def setup(args):
ov.require(dash)

ov.require(
log("value", "progress", "rate", "units", "loss", "gpudata", context="task"),
log(
"value", "progress", "rate", "units", "loss", "gpudata", context="task"
),
rate(
interval=options.interval,
skip=options.skip,
sync=torch.cuda.synchronize if torch.cuda.is_available() else None,
batch_size_calc=lambda b: len(b) * args.world_size
batch_size_calc=lambda b: len(b) * args.world_size,
),
early_stop(n=options.stop, key="rate", task="train", signal="stop"),
gpu_monitor(poll_interval=options.gpu_poll),
)

# Loss
(
loss_probe
.throttle(1)["loss"]
loss_probe.throttle(1)["loss"]
.map(lambda loss: {"task": "train", "loss": float(loss)})
.give()
)
Expand Down
2 changes: 1 addition & 1 deletion docs/docker.rst
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,8 @@ There are currently two multi-node benchmarks, ``opt-1_3b-multinode`` (data-para
.. code-block:: yaml

system:
sshkey: <privatekey>
arch: cuda
sshkey: /milabench/id_milabench
docker_image: ghcr.io/mila-iqia/milabench:${system.arch}-nightly

nodes:
Expand Down
35 changes: 15 additions & 20 deletions milabench/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
import tempfile
import traceback
from datetime import datetime
import getpass

from coleo import Option, config as configuration, default, run_cli, tooled
from omegaconf import OmegaConf
from voir.instruments.gpu import deduce_backend, select_backend
import yaml

from milabench.alt_async import proceed
from milabench.utils import blabla, validation_layers, multilogger, available_layers
Expand All @@ -31,7 +31,7 @@
from .merge import merge
from .multi import MultiPackage
from .report import make_report
from .slurm import expand_node_list
from .slurm import build_system_config
from .summary import aggregate, make_summary


Expand Down Expand Up @@ -468,6 +468,17 @@ def install():
mp=mp,
)

def helpme():
    """Generate a ready-to-use docker command to run milabench."""
    # Slurm-style node range, e.g. "192.168.0.[25-30,123,126]"
    node_range: Option & str = "node[0,1]"
    # Architecture tag of the docker image (presumably cuda/rocm — TODO confirm)
    arch: Option & str = "cuda"
    # Version tag of the docker image
    version: Option & str = "nightly"

    # Imported lazily so the CLI module does not pay for it on every command.
    from .utils import command_generator

    command_generator(node_range, arch, version)

def pin():
"""Pin the benchmarks' dependencies."""

Expand Down Expand Up @@ -682,24 +693,8 @@ def pip():

def slurm_system():
"""Generate a system file based of slurm environment variables"""

node_list = expand_node_list(os.getenv("SLURM_JOB_NODELIST", ""))

def make_node(i, ip):
node = {"name": ip, "ip": ip, "user": getpass.getuser(), "main": i == 0}

if i == 0:
node["port"] = 8123

return node

system = dict(
arch="cuda", nodes=[make_node(i, ip) for i, ip in enumerate(node_list)]
)

import yaml

print(yaml.dump({"system": system}))
system = build_system_config(os.getenv("SLURM_JOB_NODELIST", ""))
print(yaml.dump(system))

def machine():
"""Display machine metadata.
Expand Down
26 changes: 26 additions & 0 deletions milabench/slurm.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import getpass


def expand_range(s):
numbers = []
count = 0
Expand Down Expand Up @@ -51,3 +54,26 @@ def expand_node_list(node_list):
s = next + 1

return nodes


def build_system_config(node_range):
    """Build a milabench system configuration dict from a node range string.

    Expands ``node_range`` (slurm node-list syntax) into individual
    hostnames; the first node is flagged as the main node and gets the
    default port 8123.
    """
    user = getpass.getuser()
    nodes = []

    for rank, hostname in enumerate(expand_node_list(node_range)):
        entry = {
            "name": hostname,
            "ip": hostname,
            "user": user,
            "main": rank == 0,
        }
        # Only the main node carries a port entry.
        if rank == 0:
            entry["port"] = 8123
        nodes.append(entry)

    return {"system": {"arch": "cuda", "nodes": nodes}}
74 changes: 74 additions & 0 deletions milabench/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,13 @@
from functools import wraps
from typing import Any

import yaml
from ovld import ovld

from milabench.fs import XPath
import milabench.validation
from milabench.validation.validation import Summary
from .slurm import build_system_config


class Named:
Expand Down Expand Up @@ -225,3 +227,75 @@ def enumerate_rank(nodes):
else:
yield rank, node
rank += 1


def find_private_ssh_key():
    """Return the path of a private ssh key found in the user's ~/.ssh.

    A private key is any file named ``id_*`` without the ``.pub`` suffix
    (e.g. ``id_rsa``, ``id_ed25519``).

    Returns:
        str: absolute path to the first matching key (in sorted order,
        so the result is deterministic when several keys exist).

    Raises:
        FileNotFoundError: if no private key is found.
    """
    homessh = os.path.expanduser("~/.ssh/")

    # Sort the listing: os.listdir order is arbitrary, which would make
    # the selected key depend on the filesystem.
    for key in sorted(os.listdir(homessh)):
        # Skip public halves of key pairs.
        if key.endswith(".pub"):
            continue

        if key.startswith("id_"):
            return os.path.join(homessh, key)

    # The original used `assert`, which is stripped under `python -O`;
    # raise explicitly so the failure is always reported.
    raise FileNotFoundError(f"No ssh private key found in {homessh}")


def command_generator(node_range, arch, version):
    """Print a ready-to-run ``docker run`` command for milabench.

    Creates a ``results`` folder under the current directory, writes a
    ``system.yaml`` for the nodes in ``node_range`` there (skipped if one
    already exists), then prints a formatted docker command using the
    ``ghcr.io/mila-iqia/milabench:{arch}-{version}`` image.

    Arguments:
        node_range: slurm-style node list, e.g. ``"node[0,1]"``.
        arch: architecture tag of the docker image (e.g. ``cuda``).
        version: version tag of the docker image (e.g. ``nightly``).
    """
    output = os.path.join(os.getcwd(), "results")
    system = os.path.join(output, "system.yaml")

    # Make output folder (use the computed absolute path, not a relative
    # "results" literal, so both always refer to the same directory)
    os.makedirs(output, exist_ok=True)

    # Find a ssh private key to mount inside the container
    key_path = find_private_ssh_key()

    # make system file
    if os.path.exists(system):
        print("System file already exists, skipping")
    else:
        with open(system, "w") as file:
            system_conf = build_system_config(node_range)
            # The key is mounted at this fixed path inside the container.
            system_conf["system"]["sshkey"] = "/milabench/id_milabench"

            formatted = yaml.dump(system_conf, indent=2)
            print(formatted)
            file.write(formatted)

    # At that point we might as well run it for them ?
    print()
    print("Running milabench")
    print()
    print(f"  output: {output}")
    print(f"  system: {system}")
    print()
    print("Command:")
    print()

    # Fragments of the final command, one per printed line. (f-prefixes
    # removed from constant strings.)
    cmd = [
        "  docker run -it --rm --gpus all --network host --ipc=host --privileged",
        f"  -v {key_path}:/milabench/id_milabench ",
        f"  -v {output}:/milabench/envs/runs ",
        f"  ghcr.io/mila-iqia/milabench:{arch}-{version} ",
        "  milabench run --system /milabench/envs/runs/system.yaml",
    ]

    # Pad every line to the widest fragment (+4 for the trailing
    # continuation backslash) so the backslashes line up.
    length = max(len(line.strip()) + 4 for line in cmd)

    formatted = []
    for i, line in enumerate(cmd):
        line = line.strip()
        # First line is indented by 2 spaces, continuation lines by 4.
        idt_size = ((i > 0) + 1) * 2
        idt = " " * idt_size

        formatted.append(f"{idt}{line:<{length - idt_size}}")

    # Backslash-newline join: the output can be pasted directly in a shell.
    print("\\\n".join(formatted))