diff --git a/benchmarks/accelerate_opt/benchfile.py b/benchmarks/accelerate_opt/benchfile.py
index 1a16cc2c4..5cfb7ade8 100644
--- a/benchmarks/accelerate_opt/benchfile.py
+++ b/benchmarks/accelerate_opt/benchfile.py
@@ -3,10 +3,7 @@
     CmdExecutor,
     DockerRunExecutor,
     ListExecutor,
-    SCPExecutor,
     SSHExecutor,
-    SequenceExecutor,
-    VoidExecutor,
 )
 from milabench.pack import Package
 from milabench.utils import select_nodes
@@ -20,59 +17,30 @@ def make_env(self):
         env["OMP_NUM_THREADS"] = str(self.config["argv"]["--cpus_per_gpu"])
         return env
 
-    def build_docker_prepare_remote_plan(self):
-        executors = []
-        key = self.config["system"].get("sshkey")
-        docker_pull_exec = CmdExecutor(
-            self, "docker", "pull", self.config["system"].get("docker_image", None)
-        )
-        for node in self.config["system"]["nodes"]:
-            if node["main"]:
-                continue
-
-            host = node["ip"]
-            user = node["user"]
-
-            executors.append(SSHExecutor(docker_pull_exec,
-                host=host,
-                user=user,
-                key=key,
-                )
-            )
-        return ListExecutor(*executors)
-
     def build_prepare_plan(self):
-        prepare = [
-            CmdExecutor(
-                self,
-                "accelerate",
-                "launch",
-                "--mixed_precision=fp16",
-                "--num_machines=1",
-                "--dynamo_backend=no",
-                "--num_processes=1",
-                "--num_cpu_threads_per_process=8",
-                str(self.dirs.code / "main.py"),
-                *self.argv,
-                "--prepare_only",
-                "--cache",
-                str(self.dirs.cache)
-            )
-        ]
-
-        docker_image = self.config["system"].get("docker_image", None)
-        if docker_image:
-            prepare.append(self.build_docker_prepare_remote_plan())
-
-        return SequenceExecutor(*prepare)
+        return CmdExecutor(
+            self,
+            "accelerate",
+            "launch",
+            "--mixed_precision=fp16",
+            "--num_machines=1",
+            "--dynamo_backend=no",
+            "--num_processes=1",
+            "--num_cpu_threads_per_process=8",
+            str(self.dirs.code / "main.py"),
+            *self.argv,
+            "--prepare_only",
+            "--cache",
+            str(self.dirs.cache)
+        )
 
     def build_run_plan(self):
         plans = []
-        
+
         max_num = self.config["num_machines"]
         nodes = select_nodes(self.config["system"]["nodes"], max_num)
         key = self.config["system"].get("sshkey")
-        
+
         for rank, node in enumerate(nodes):
             host = node["ip"]
             user = node["user"]
@@ -83,7 +51,7 @@ def build_run_plan(self):
                 setsid=True,
                 use_stdout=True,
             )
-            
+
             tags = [*self.config["tag"], node["name"]]
             if rank != 0:
                 # Workers do not send training data
diff --git a/docker/Dockerfile-cuda b/docker/Dockerfile-cuda
index eaad2dae5..ebbc4a460 100644
--- a/docker/Dockerfile-cuda
+++ b/docker/Dockerfile-cuda
@@ -27,7 +27,6 @@ ENV MILABENCH_ARGS=""
 
 WORKDIR /milabench
 COPY . /milabench/milabench/
 
-
 # Install Dependencies
 # --------------------
@@ -40,6 +39,7 @@ COPY . /milabench/milabench/
 # Use ofed_info -s to get your local version
 ARG MOFED_VERSION=5.4-3.4.0.0
 
+ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update -y &&\
     apt-get install -y git build-essential curl python3 python-is-python3 python3-pip &&\
     curl -o /etc/apt/trusted.gpg.d/mellanox.asc https://content.mellanox.com/ofed/RPM-GPG-KEY-Mellanox &&\
@@ -47,7 +47,7 @@ RUN apt-get update -y &&\
     curl -o cuda-keyring_1.1-1_all.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb &&\
     dpkg -i cuda-keyring_1.1-1_all.deb &&\
     apt-get update -y &&\
-    apt-get install -y libibverbs1 cuda-compat-11-8 &&\
+    apt-get install -y libibverbs1 nvidia-compute-utils-535 nvidia-utils-535 cuda-11-8 &&\
     apt-get clean &&\
     rm -rf /var/lib/apt/lists/* &&\
     rm cuda-keyring_1.1-1_all.deb
@@ -56,7 +56,7 @@ RUN apt-get update -y &&\
 RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
 ENV PATH="/root/.cargo/bin:${PATH}"
 
-
+ENV CUDA_HOME=/usr/local/cuda-11.8
 
 # Install Milabench
 # -----------------
diff --git a/docs/docker.rst b/docs/docker.rst
index 0eafd87e2..c9c6e2d98 100644
--- a/docs/docker.rst
+++ b/docs/docker.rst
@@ -97,13 +97,14 @@ There are currently two multi-node benchmarks, ``opt-1_3b-multinode`` (data-para
 ``opt-6_7b-multinode`` (model-parallel, that model is too large to fit on a single GPU). Here is how to run them:
 
 0. Make sure the machine can ssh between each other without passwords
-
    - ``ssh-keygen``
 1. Pull the milabench docker image you would like to run on all machines
    - ``docker pull``
+1. Create the output directory
+   - ``mkdir -p results``
 2. Create a list of nodes that will participate in the benchmark inside a ``results/system.yaml`` file (see example below)
    - ``vi results/system.yaml``
 3. Call milabench with by specifying the node list we created.
-   - ``docker ...-v :/milabench/id_milabench milabench run ... --system system.yaml``
+   - ``docker ... -v $(pwd)/results:/milabench/envs/runs -v :/milabench/id_milabench milabench run ... --system /milabench/envs/runs/system.yaml``
 
 .. notes::
@@ -142,7 +143,7 @@ Then, the command should look like this:
       -v $SSH_KEY_FILE:/milabench/id_milabench \
       -v $(pwd)/results:/milabench/envs/runs \
       $MILABENCH_IMAGE \
-      milabench run --system system.yaml \
+      milabench run --system /milabench/envs/runs/system.yaml \
       --select multinode
 
 The last line (``--select multinode``) specifically selects the multi-node benchmarks. Omit that line to run all benchmarks.
@@ -171,7 +172,7 @@ For example, for 4 nodes:
       ip: 192.168.0.25
       main: true
       port: 8123
-      user: delaunap
+      user:
 
     - name: node2
       ip: 192.168.0.26
@@ -193,7 +194,7 @@ The command would look like
 
 .. code-block:: bash
 
-   docker ... milabench run ... --system system.yaml --overrides overrides.yaml
+   docker ... milabench run ... --system /milabench/envs/runs/system.yaml --overrides /milabench/envs/runs/overrides.yaml
 
 .. note::
diff --git a/milabench/executors.py b/milabench/executors.py
index d5325d7f0..e7334de04 100644
--- a/milabench/executors.py
+++ b/milabench/executors.py
@@ -55,6 +55,8 @@ class Executor:
     def __init__(self, pack_or_exec: Executor | pack.BasePackage, **kwargs) -> None:
         self._pack = None
         self.exec = None
+        # used to know if the command is executed through SSH or locally
+        self.remote = False
 
         if isinstance(pack_or_exec, Executor):
             self.exec = pack_or_exec
@@ -348,8 +350,13 @@ def argv(self, **kwargs) -> List:
 
         return docker_args + rewritten
 
+    def is_inside_docker(self):
+        return os.environ.get("MILABENCH_DOCKER", None)
+
     def _argv(self, **kwargs) -> List:
-        if self.image is None or os.environ.get("MILABENCH_DOCKER", None):
+        # if the command is executed remotely it does not matter
+        # if we are inside docker or not
+        if (self.image is None) or (self.is_inside_docker() and not self.remote):
             # No-op when there's no docker image to run or inside a docker
             # container
             return []
@@ -405,6 +412,7 @@ def __init__(
         self.user = user
         self.key = key
         self.port = port
+        executor.remote = not self.is_local()
 
     def _find_node_config(self) -> Dict:
         for n in self.pack.config["system"]["nodes"]:
diff --git a/milabench/multi.py b/milabench/multi.py
index 96e717af0..5ff20036d 100644
--- a/milabench/multi.py
+++ b/milabench/multi.py
@@ -21,8 +21,6 @@
 
 here = XPath(__file__).parent
 
-gpus = get_gpu_info()["gpus"].values()
-
 planning_methods = {}