Skip to content

Commit

Permalink
Merge branch 'master' of github.com:mila-iqia/milabench into docker_c…
Browse files Browse the repository at this point in the history
…md_generator
  • Loading branch information
pierre.delaunay committed Jul 6, 2023
2 parents 89f7177 + 7209c33 commit 83c0aca
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 61 deletions.
68 changes: 18 additions & 50 deletions benchmarks/accelerate_opt/benchfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,7 @@
CmdExecutor,
DockerRunExecutor,
ListExecutor,
SCPExecutor,
SSHExecutor,
SequenceExecutor,
VoidExecutor,
)
from milabench.pack import Package
from milabench.utils import select_nodes
Expand All @@ -20,59 +17,30 @@ def make_env(self):
env["OMP_NUM_THREADS"] = str(self.config["argv"]["--cpus_per_gpu"])
return env

    def build_docker_prepare_remote_plan(self):
        """Build a plan that pre-pulls the configured docker image on every
        worker node of the system, over SSH.

        Returns a ``ListExecutor`` containing one ``SSHExecutor`` per
        non-main node; each one runs ``docker pull <docker_image>`` on
        its node.
        """
        executors = []
        # Optional private key used for the SSH connections to the workers.
        key = self.config["system"].get("sshkey")
        # NOTE(review): if "docker_image" is absent this builds
        # `docker pull None` — presumably callers only invoke this when an
        # image is configured; confirm at the call site.
        docker_pull_exec = CmdExecutor(
            self, "docker", "pull", self.config["system"].get("docker_image", None)
        )
        for node in self.config["system"]["nodes"]:
            if node["main"]:
                # The main node runs milabench itself; no remote pull needed.
                continue

            host = node["ip"]
            user = node["user"]

            executors.append(SSHExecutor(docker_pull_exec,
                host=host,
                user=user,
                key=key,
                )
            )
        return ListExecutor(*executors)

def build_prepare_plan(self):
prepare = [
CmdExecutor(
self,
"accelerate",
"launch",
"--mixed_precision=fp16",
"--num_machines=1",
"--dynamo_backend=no",
"--num_processes=1",
"--num_cpu_threads_per_process=8",
str(self.dirs.code / "main.py"),
*self.argv,
"--prepare_only",
"--cache",
str(self.dirs.cache)
)
]

docker_image = self.config["system"].get("docker_image", None)
if docker_image:
prepare.append(self.build_docker_prepare_remote_plan())

return SequenceExecutor(*prepare)
return CmdExecutor(
self,
"accelerate",
"launch",
"--mixed_precision=fp16",
"--num_machines=1",
"--dynamo_backend=no",
"--num_processes=1",
"--num_cpu_threads_per_process=8",
str(self.dirs.code / "main.py"),
*self.argv,
"--prepare_only",
"--cache",
str(self.dirs.cache)
)

def build_run_plan(self):
plans = []

max_num = self.config["num_machines"]
nodes = select_nodes(self.config["system"]["nodes"], max_num)
key = self.config["system"].get("sshkey")

for rank, node in enumerate(nodes):
host = node["ip"]
user = node["user"]
Expand All @@ -83,7 +51,7 @@ def build_run_plan(self):
setsid=True,
use_stdout=True,
)

tags = [*self.config["tag"], node["name"]]
if rank != 0:
# Workers do not send training data
Expand Down
6 changes: 3 additions & 3 deletions docker/Dockerfile-cuda
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ ENV MILABENCH_ARGS=""
WORKDIR /milabench
COPY . /milabench/milabench/


# Install Dependencies
# --------------------

Expand All @@ -40,14 +39,15 @@ COPY . /milabench/milabench/
# Use ofed_info -s to get your local version
ARG MOFED_VERSION=5.4-3.4.0.0

ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update -y &&\
apt-get install -y git build-essential curl python3 python-is-python3 python3-pip &&\
curl -o /etc/apt/trusted.gpg.d/mellanox.asc https://content.mellanox.com/ofed/RPM-GPG-KEY-Mellanox &&\
curl -o /etc/apt/sources.list.d/mellanox.list https://linux.mellanox.com/public/repo/mlnx_ofed/${MOFED_VERSION}/ubuntu22.04/mellanox_mlnx_ofed.list &&\
curl -o cuda-keyring_1.1-1_all.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb &&\
dpkg -i cuda-keyring_1.1-1_all.deb &&\
apt-get update -y &&\
apt-get install -y libibverbs1 cuda-compat-11-8 &&\
apt-get install -y libibverbs1 nvidia-compute-utils-535 nvidia-utils-535 cuda-11-8 &&\
apt-get clean &&\
rm -rf /var/lib/apt/lists/* &&\
rm cuda-keyring_1.1-1_all.deb
Expand All @@ -56,7 +56,7 @@ RUN apt-get update -y &&\

RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"

ENV CUDA_HOME=/usr/local/cuda-11.8

# Install Milabench
# -----------------
Expand Down
11 changes: 6 additions & 5 deletions docs/docker.rst
Original file line number Diff line number Diff line change
Expand Up @@ -97,13 +97,14 @@ There are currently two multi-node benchmarks, ``opt-1_3b-multinode`` (data-para
``opt-6_7b-multinode`` (model-parallel, that model is too large to fit on a single GPU). Here is how to run them:

0. Make sure the machines can ssh to each other without passwords
- ``ssh-keygen``
1. Pull the milabench docker image you would like to run on all machines
- ``docker pull``
1. Create the output directory
- ``mkdir -p results``
2. Create a list of nodes that will participate in the benchmark inside a ``results/system.yaml`` file (see example below)
- ``vi results/system.yaml``
3. Call milabench by specifying the node list we created.
- ``docker ...-v <privatekey>:/milabench/id_milabench milabench run ... --system system.yaml``
- ``docker ... -v $(pwd)/results:/milabench/envs/runs -v <privatekey>:/milabench/id_milabench milabench run ... --system /milabench/envs/runs/system.yaml``

.. notes::

Expand Down Expand Up @@ -142,7 +143,7 @@ Then, the command should look like this:
-v $SSH_KEY_FILE:/milabench/id_milabench \
-v $(pwd)/results:/milabench/envs/runs \
$MILABENCH_IMAGE \
milabench run --system system.yaml \
milabench run --system /milabench/envs/runs/system.yaml \
--select multinode
The last line (``--select multinode``) specifically selects the multi-node benchmarks. Omit that line to run all benchmarks.
Expand Down Expand Up @@ -171,7 +172,7 @@ For example, for 4 nodes:
ip: 192.168.0.25
main: true
port: 8123
user: delaunap
user: <username>
- name: node2
ip: 192.168.0.26
Expand All @@ -193,7 +194,7 @@ The command would look like

.. code-block:: bash
docker ... milabench run ... --system system.yaml --overrides overrides.yaml
docker ... milabench run ... --system /milabench/envs/runs/system.yaml --overrides /milabench/envs/runs/overrides.yaml
.. note::
Expand Down
10 changes: 9 additions & 1 deletion milabench/executors.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ class Executor:
def __init__(self, pack_or_exec: Executor | pack.BasePackage, **kwargs) -> None:
self._pack = None
self.exec = None
# used to know if the command is executed through SSH or locally
self.remote = False

if isinstance(pack_or_exec, Executor):
self.exec = pack_or_exec
Expand Down Expand Up @@ -348,8 +350,13 @@ def argv(self, **kwargs) -> List:

return docker_args + rewritten

    def is_inside_docker(self):
        # Truthy when the MILABENCH_DOCKER environment variable is set,
        # i.e. this process is already running inside the milabench
        # docker container (returns the raw env value or None, not a bool).
        return os.environ.get("MILABENCH_DOCKER", None)

def _argv(self, **kwargs) -> List:
if self.image is None or os.environ.get("MILABENCH_DOCKER", None):
# if the command is executed remotely it does not matter
# if we are inside docker or not
if (self.image is None) or (self.is_inside_docker() and not self.remote):
# No-op when there's no docker image to run or inside a docker
# container
return []
Expand Down Expand Up @@ -405,6 +412,7 @@ def __init__(
self.user = user
self.key = key
self.port = port
executor.remote = not self.is_local()

def _find_node_config(self) -> Dict:
for n in self.pack.config["system"]["nodes"]:
Expand Down
2 changes: 0 additions & 2 deletions milabench/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@

here = XPath(__file__).parent

gpus = get_gpu_info()["gpus"].values()

planning_methods = {}


Expand Down

0 comments on commit 83c0aca

Please sign in to comment.