From 2e73ab6ccd8ead6d505929c05ce881a5423a42be Mon Sep 17 00:00:00 2001 From: Vanessasaurus <814322+vsoch@users.noreply.github.com> Date: Sun, 22 Jan 2023 20:25:12 -0800 Subject: [PATCH] WIP to add consistent run (#21) * WIP to add consistent run the server is currently too flakey with the port forward to be reliable for communication. I need to rethink how to do this because I am not happy with it. Signed-off-by: vsoch --- CHANGELOG.md | 1 + docs/getting_started/commands.md | 68 +- docs/getting_started/examples.md | 3 +- docs/getting_started/minikube.md | 35 +- docs/index.rst | 18 +- examples/up-submit-down/README.md | 62 ++ .../.scripts/broker-id.sh | 12 + .../.scripts/cluster-create.sh | 204 +++++ .../.scripts/cluster-destroy.sh | 161 ++++ .../.scripts/flux-operator.yaml | 848 ++++++++++++++++++ .../.scripts/minicluster-size-2.yaml | 29 + .../.scripts/minicluster-submit-size-2.sh | 219 +++++ .../hello-world-1-minicluster-size-2/log.out | 1 + .../hello-world-2-minicluster-size-2/log.out | 1 + .../hello-world-3-minicluster-size-2/log.out | 1 + .../hello-world-4-minicluster-size-2/log.out | 1 + .../hello-world-5-minicluster-size-2/log.out | 1 + .../data/k8s-size-4-n1-standard-1/meta.json | 696 ++++++++++++++ .../reaxc-hns-1-minicluster-size-2/log.out | 80 ++ .../reaxc-hns-2-minicluster-size-2/log.out | 80 ++ .../reaxc-hns-3-minicluster-size-2/log.out | 80 ++ .../reaxc-hns-4-minicluster-size-2/log.out | 80 ++ .../reaxc-hns-5-minicluster-size-2/log.out | 80 ++ examples/up-submit-down/experiments.yaml | 31 + examples/up-submit-down/plot_results.py | 145 +++ fluxcloud/client/__init__.py | 27 +- fluxcloud/client/apply.py | 34 +- fluxcloud/client/down.py | 18 +- fluxcloud/client/helpers.py | 29 +- fluxcloud/client/run.py | 32 +- fluxcloud/client/up.py | 22 +- fluxcloud/logger.py | 4 +- fluxcloud/main/api.py | 170 ++++ fluxcloud/main/client.py | 194 ++-- .../main/clouds/shared/scripts/broker-id | 12 + .../clouds/shared/scripts/minicluster-run | 57 +- .../clouds/shared/scripts/minicluster-submit | 33 + .../clouds/shared/scripts/wait_for_all.sh | 20 + .../clouds/shared/scripts/wait_for_broker.sh | 40 + .../clouds/shared/scripts/wait_for_cleanup.sh | 15 + .../shared/scripts/wait_for_flux_restful.sh | 29 + fluxcloud/main/decorator.py | 2 +- fluxcloud/main/experiment.py | 89 +- fluxcloud/minicluster-template.yaml | 22 +- fluxcloud/settings.yml | 2 +- fluxcloud/version.py | 5 +- .../.scripts/cluster-create-minikube.sh | 2 +- .../.scripts/flux-operator.yaml | 8 + ...uster-run-lmp-size-2-minicluster-size-2.sh | 6 +- ...uster-run-lmp-size-4-minicluster-size-4.sh | 6 +- .../.scripts/minicluster-size-2.yaml | 22 + ...nicluster.yaml => minicluster-size-4.yaml} | 0 tests/lammps/data/k8s-size-4-local/meta.json | 2 +- 53 files changed, 3592 insertions(+), 247 deletions(-) create mode 100644 examples/up-submit-down/README.md create mode 100755 examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/broker-id.sh create mode 100755 examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-create.sh create mode 100755 examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-destroy.sh create mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/flux-operator.yaml create mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-size-2.yaml create mode 100755 examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-submit-size-2.sh create mode 100644 
examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-1-minicluster-size-2/log.out
 create mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-2-minicluster-size-2/log.out
 create mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-3-minicluster-size-2/log.out
 create mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-4-minicluster-size-2/log.out
 create mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-5-minicluster-size-2/log.out
 create mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/meta.json
 create mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-1-minicluster-size-2/log.out
 create mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-2-minicluster-size-2/log.out
 create mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-3-minicluster-size-2/log.out
 create mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-4-minicluster-size-2/log.out
 create mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-5-minicluster-size-2/log.out
 create mode 100644 examples/up-submit-down/experiments.yaml
 create mode 100644 examples/up-submit-down/plot_results.py
 create mode 100644 fluxcloud/main/api.py
 create mode 100755 fluxcloud/main/clouds/shared/scripts/broker-id
 create mode 100755 fluxcloud/main/clouds/shared/scripts/minicluster-submit
 create mode 100644 fluxcloud/main/clouds/shared/scripts/wait_for_all.sh
 create mode 100644 fluxcloud/main/clouds/shared/scripts/wait_for_broker.sh
 create mode 100644 fluxcloud/main/clouds/shared/scripts/wait_for_cleanup.sh
 create mode 100644 fluxcloud/main/clouds/shared/scripts/wait_for_flux_restful.sh
 create mode 100644 tests/lammps/data/k8s-size-4-local/.scripts/minicluster-size-2.yaml
 rename tests/lammps/data/k8s-size-4-local/.scripts/{minicluster.yaml => minicluster-size-4.yaml} (100%)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a6489a4..371280d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ and **Merged pull requests**. Critical items to know are:
 The versions coincide with releases on pip. Only major versions will be released as tags on Github.
 
 ## [0.0.x](https://github.com/converged-computing/flux-cloud/tree/main) (0.0.x)
+- support for submit and batch, to run jobs on the same MiniCluster (0.1.15)
 - minikube docker pull needs message, update tests and typo (0.1.14)
 - wait until pods terminated and removed between applies (0.1.13)
 - add support for custom placement group name (0.1.12)
diff --git a/docs/getting_started/commands.md b/docs/getting_started/commands.md
index 3e0b6e1..3002b81 100644
--- a/docs/getting_started/commands.md
+++ b/docs/getting_started/commands.md
@@ -1,6 +1,11 @@
 # Commands
 
-The following commands are provided by Flux Cloud.
+The following commands are provided by Flux Cloud. For running jobs, you can either do:
+
+- **apply**/**run**: a single or multi-job submission that re-creates the pods for each job, intended for jobs with different containers.
+- **batch**/**submit**: a single or multi-job submission that reuses the same set of pods, intended for jobs with a common container base.
+
+Both are described in the following sections.
 
 ## list
 
@@ -43,6 +48,8 @@ $ flux-cloud apply -e k8s-size-8-m5.large --size 2
 
 ## run
 
+> Up, apply, down in one command, ideal for completely headless runs and jobs with different containers.
+
 The main command is a "run" that is going to, for each cluster:
 
 1. Create the cluster
@@ -131,7 +138,9 @@ $ flux-cloud up -e n1-standard-1-2 --force-cluster
 
 ## apply
 
-And then run experiments (as you feel) with "apply."
+> Ideal for running multiple jobs with different containers.
+
+After "up" you can choose to run experiments (as you wish) with "apply."
 
 ```bash
 $ flux-cloud apply
@@ -150,9 +159,61 @@ To force overwrite of existing results (by default they are skipped)
 $ flux-cloud apply -e n1-standard-1-2 --force
 ```
 
-Note that by default, we always wait for a previous run to be cleaned up
+Apply is going to be creating one CRD per job, so that's a lot of
+pod creation and deletion. This is in comparison to "submit", which
+brings up a MiniCluster once and then executes commands against it, allowing
+Flux to serve as the scheduler. Note that by default, we always wait for a previous run to be cleaned up
 before continuing.
 
+## submit
+
+> Ideal for one or more commands across the same container(s) and MiniCluster size.
+
+```bash
+$ flux-cloud up --cloud minikube
+$ flux-cloud submit --cloud minikube
+$ flux-cloud down --cloud minikube
+```
+
+The submit will always check if the MiniCluster is already created, and if not, create it
+before submitting jobs. For submit (and batch, the equivalent that also brings the cluster
+up and down), your commands aren't provided in the CRD,
+but rather to the Flux Restful API. Submit / batch will also generate one CRD
+per MiniCluster size, but use the same MiniCluster across jobs. This is different
+from apply, which generates one CRD per job to run.
+
+## batch
+
+> Up, submit, down in one command, ideal for jobs with the same container(s).
+
+The "batch" command is comparable to "run" except we are running commands
+across the same set of containers. We don't need to bring pods up/down each time,
+and we are using Flux in our cluster to handle scheduling.
+This command is going to:
+
+1. Create the cluster
+2. Run each of the experiments, saving output and timing, on the same pods
+3. Bring down the cluster
+
+The output is organized in the same way, and as before, you can choose to run a single
+step on its own with "submit."
+
+```bash
+$ flux-cloud batch --cloud aws
+```
+
+Note that since we are communicating with the Flux Restful API, you are required to
+provide a `FLUX_USER` and `FLUX_TOKEN` for the API. If you are running this programmatically,
+the Flux Restful Client will handle this. However, if you, for example, press Ctrl+C to
+cancel a run, you'll need to copy and paste the username and token that were previously shown
+before running submit again to continue where you left off. Batch is equivalent to:
+
+```bash
+$ flux-cloud up
+$ flux-cloud submit
+$ flux-cloud down
+```
+
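+As a concrete sketch (assuming `FLUX_USER` and `FLUX_TOKEN` are read from your
+environment; the values below are made up), you could export the credentials before resuming:
+
+```bash
+# Hypothetical values: use the username and token printed during your own run
+$ export FLUX_USER=fluxuser
+$ export FLUX_TOKEN=123456-abcdef
+
+# Now submit (or batch) can authenticate to the same MiniCluster again
+$ flux-cloud submit --cloud aws
+```
+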
 ## down
 
 And then bring down your first (or named) cluster:
@@ -174,6 +235,7 @@ You can also use `--force-cluster` here:
 $ flux-cloud down --force-cluster
 ```
 
+
 ## debug
 
 For any command, you can add `--debug` as a main client argument to see additional information. E.g.,
diff --git a/docs/getting_started/examples.md b/docs/getting_started/examples.md
index 9bed344..8bb4bd4 100644
--- a/docs/getting_started/examples.md
+++ b/docs/getting_started/examples.md
@@ -3,8 +3,9 @@
 The easiest thing to do is arguably to start with an example, and then customize it.
 Here we will add examples as we create them.
 
-- [up-apply-down](https://github.com/converged-computing/flux-cloud/tree/main/examples/up-apply-down)
+- [up-apply-down](https://github.com/converged-computing/flux-cloud/tree/main/examples/up-apply-down): shows using `flux-cloud apply` for individual CRD submission.
 - [osu-benchmarks](https://github.com/converged-computing/flux-cloud/tree/main/examples/osu-benchmarks)
+- [up-submit-down](https://github.com/converged-computing/flux-cloud/tree/main/examples/up-submit-down): shows using `flux-cloud submit` for batch submission.
 
 The above example runs a single command in a single Kubernetes cluster and MiniCluster, and it's lammps!
diff --git a/docs/getting_started/minikube.md b/docs/getting_started/minikube.md
index 1aa4c1f..4a40444 100644
--- a/docs/getting_started/minikube.md
+++ b/docs/getting_started/minikube.md
@@ -2,10 +2,17 @@
 
 > Running on a local MiniKube cluster
 
-Flux Cloud (as of version 0.1.0) can run on MiniKube! The main steps of running experiments are:
+Flux Cloud (as of version 0.1.0) can run on MiniKube! The main steps of running experiments with
+different container bases are:
 
  - **up** to bring up a cluster
- - **apply** to apply one or more experiments defined by an experiments.yaml
+ - **apply** to apply one or more CRDs from experiments defined by an experiments.yaml
+ - **down** to destroy a cluster
+
+or one or more commands with the same container base(s):
+
+ - **up** to bring up a cluster
+ - **submit** to submit one or more experiments to the same set of pods defined by an experiments.yaml
 - **down** to destroy a cluster
 
 Each of these commands can be run in isolation, and we provide a single command **run** to
@@ -19,7 +26,6 @@ want to remove the abstraction at any point and run the commands on your own, yo
 You should first [install minikube](https://minikube.sigs.k8s.io/docs/start/) and kubectl.
 
-
 ## Run Experiments
 
 Each experiment is defined by the matrix and variables in an `experiment.yaml` that is used to
@@ -29,7 +35,11 @@ provide this library for you to easily edit and use! Take a look at the [example
 directory for a few that we provide. We will walk through a generic one here to launch
 an experiment on a MiniKube Kubernetes cluster. Note that before doing this step
 you should have installed flux-cloud, along with kubectl and minikube. Note that if it's not the default,
-you'll need to specify using MiniKube:
+you'll need to specify using MiniKube.
+
+### Apply / Run
+
+> Ideal if you need to run multiple jobs on different containers.
 
 ```bash
 $ flux-cloud run --cloud minikube experiments.yaml
@@ -108,3 +118,20 @@ spec:
   workingDir: /home/flux/examples/reaxff/HNS
   command: {{ job.command }}
 ```
+
+### Submit
+
+> Ideal for one or more commands across the same container(s) and MiniCluster size.
+
+```bash
+$ flux-cloud up --cloud minikube
+$ flux-cloud submit --cloud minikube
+$ flux-cloud down --cloud minikube
+```
+
+The submit will always check if the MiniCluster is already created, and if not, create it
+before submitting jobs. For submit (and batch, the equivalent that also brings the cluster
+up and down), your commands aren't provided in the CRD,
+but rather to the Flux Restful API. Submit / batch will also generate one CRD
+per MiniCluster size, but use the same MiniCluster across jobs. This is different
+from apply, which generates one CRD per job to run.
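+
+As a sanity check between submits, you can confirm that the same pods stay up
+(a sketch, assuming the default `flux-operator` namespace and a job named
+`lammps-job` in your experiments.yaml):
+
+```bash
+# Pods persist across "flux-cloud submit" invocations
+$ kubectl get pods --namespace flux-operator
+
+# The broker is the pod whose name starts with <job-name>-0
+$ kubectl get pods --namespace flux-operator --output=name | grep lammps-job-0
+```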
diff --git a/docs/index.rst b/docs/index.rst
index 42ccd01..efdb1a1 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -18,7 +18,9 @@ and save the output, and bring it down. This is what flux cloud does! With Flux
 4. Run the experiments (each a MiniCluster) and save output and timings.
 5. Bring down the cluster as soon as you are done.
 
-For all of the above, you can either run with one command `flux-cloud run` or break into three:
+For all of the above, there are two modes of execution. If you have different containers you want to run for jobs,
+then you would want to use **run** or **apply** to create separate sets of pods, each time bringing them up and down.
+That can be done either with the one command `flux-cloud run` or broken into three:
 
 .. code-block:: console
 
@@ -26,8 +28,20 @@ For all of the above, you can either run with one command `flux-cloud run` or br
 $ flux-cloud apply
 $ flux-cloud down
 
+If you instead want to run one or more commands *across the same set of pods*, meaning that your container
+base(s) do not need to change, you can use **submit**:
 
-And given any failure of a command, you are given the option to try again or exit and cancel. E.g.,
+.. code-block:: console
+
+   $ flux-cloud up
+   $ flux-cloud submit
+   $ flux-cloud down
+
+For the single-command equivalent, use `flux-cloud batch`. The difference in the latter is that we
+actually use Flux as a scheduler, and runs are much more efficient because we don't need to bring
+pods down and back up each time.
+
+For either approach, given any failure of a command, you are given the option to try again or exit and cancel. E.g.,
 when you are developing, you can run "apply" and then easily debug until you are done and ready to bring the cluster
 down.
diff --git a/examples/up-submit-down/README.md b/examples/up-submit-down/README.md
new file mode 100644
index 0000000..3196db2
--- /dev/null
+++ b/examples/up-submit-down/README.md
@@ -0,0 +1,62 @@
+# Up, Submit, Down
+
+This is an example of using Flux Cloud to bring up a cluster, install the Flux Operator
+(which you can then use as you please), run jobs with submit (on the same
+MiniCluster), and then bring it down.
+You should have kubectl and gcloud OR minikube installed for this demo. Note that
+we use the [experiments.yaml](experiments.yaml) file as a default,
+and we only provide the basic metadata needed for a single experiment.
+
+## Up
+
+```bash
+$ flux-cloud up
+```
+
+This will bring up your cluster, per the size and machine type defined
+in your experiments file, and install the operator.
+
+## Submit
+
+A "submit" means running the single (or multiple) experiments defined in your
+experiments.yaml on the same MiniCluster, without bringing it down between jobs.
+This means we are using Flux as the scheduler proper, and we don't need to bring pods
+up and down unnecessarily (or submit a gazillion YAML files). Only as many YAML CRDs
+are generated as there are MiniCluster sizes you run across.
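+
+For example, the run captured under [data](data) in this example generated exactly one
+CRD for the size-2 MiniCluster, alongside the cluster and submit scripts:
+
+```bash
+$ ls data/k8s-size-4-n1-standard-1/.scripts/
+broker-id.sh  cluster-create.sh  cluster-destroy.sh  flux-operator.yaml
+minicluster-size-2.yaml  minicluster-submit-size-2.sh
+```
+
+To run the submit on your cloud of choice: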
+ +```bash +$ flux-cloud submit --cloud minikube +$ flux-cloud submit --cloud google +``` + +## Down + +To bring it down: + +```bash +$ flux-cloud down +``` + +## Batch + +Run all three with one command: + +```bash +$ flux-cloud batch --cloud minikube +$ flux-cloud batch --cloud google +``` + + +## Plot + +I threw together a script to compare running times with info and output times, +where: + +running time < info < output + +```bash +$ pip install pandas matplotlib seaborn +``` +```bash +$ python plot_results.py data/k8s-size-4-n1-standard-1/meta.json +``` diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/broker-id.sh b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/broker-id.sh new file mode 100755 index 0000000..c6fb8e0 --- /dev/null +++ b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/broker-id.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +NAMESPACE="flux-operator" +JOB="lammps-job" +brokerPrefix="${JOB}-0" + +for pod in $(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=jsonpath='{.items[*].metadata.name}'); do + if [[ "${pod}" == ${brokerPrefix}* ]]; then + echo ${pod} + break + fi +done diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-create.sh b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-create.sh new file mode 100755 index 0000000..bdace99 --- /dev/null +++ b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-create.sh @@ -0,0 +1,204 @@ +#!/bin/bash + +# Source shared helper scripts +# Colors +red='\033[0;31m' +green='\033[0;32m' +yellow='\033[0;33m' +blue='\033[0;34m' +magenta='\033[0;35m' +cyan='\033[0;36m' +clear='\033[0m' + +function print_red() { + echo -e "${red}$@${clear}" +} +function print_yellow() { + echo -e "${yellow}$@${clear}" +} +function print_green() { + echo -e "${green}$@${clear}" +} +function print_blue() { + echo -e "${blue}$@${clear}" +} +function print_magenta() { + echo -e "${magenta}$@${clear}" +} +function print_cyan() { + echo -e "${cyan}$@${clear}" +} + +function is_installed () { + # Determine if a command is available for use! + cmd="${1}" + if command -v $cmd >/dev/null; then + echo "$cmd is installed" + else + echo "$cmd could not be found" + exit 1 + fi +} + +function install_operator() { + # Shared function to install the operator from a specific repository branch and cleanup + script_dir=${1} + repository=${2} + branch=${3} + tmpfile="${script_dir}/flux-operator.yaml" + run_echo wget -O $tmpfile https://raw.githubusercontent.com/${repository}/${branch}/examples/dist/flux-operator.yaml + kubectl apply -f $tmpfile +} + + +function run_echo() { + # Show the user the command then run it + echo + print_green "$@" + retry $@ +} + +function run_echo_allow_fail() { + echo + print_green "$@" + $@ || true +} + +function retry() { + # Retry an unsuccessful user command, per request + while true + do + $@ + retval=$? + if [[ "${retval}" == "0" ]]; then + return + fi + print_blue "That command was not successful. Do you want to try again? šŸ¤”ļø" + read -p " (yes/no) " answer + # Exit with non-zero response so we know to stop in script. 
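+        # ("yes" re-runs the same command; "no" or any other answer aborts the script)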
+ case ${answer} in + yes ) continue;; + no ) echo exiting...; + exit 1;; + * ) echo invalid response; + exit 1;; + esac + done +} + + +function prompt() { + # Prompt the user with a yes/no command to continue or exit + print_blue "$@ šŸ¤”ļø" + read -p " (yes/no) " answer + case ${answer} in + yes ) echo ok, we will proceed;; + no ) echo exiting...; + exit 1;; + * ) echo invalid response; + exit 1;; + esac +} + + +function with_exponential_backoff { + # Run with exponential backoff - assume containers take a while to pull + local max_attempts=100 + local timeout=1 + local attempt=0 + local exitcode=0 + + while [[ $attempt < $max_attempts ]]; do + "$@" + exitcode=$? + + if [[ $exitcode == 0 ]]; then + break + fi + + echo "Failure! Retrying in $timeout.." 1>&2 + sleep $timeout + attempt=$(( attempt + 1 )) + timeout=$(( timeout * 2 )) + done + + if [[ $exitCode != 0 ]]; then + echo "You've failed me for the last time! ($@)" 1>&2 + fi + return $exitcode +} + +# Defaults - these are in the config but left here for information +CLUSTER_NAME="flux-cluster" +ZONE="us-central1-a" +CLUSTER_VERSION="1.23" +MACHINE_TYPE="n1-standard-1" +FORCE_CLUSTER="false" +SIZE=4 +TAGS="flux-cluster" +REPOSITORY="flux-framework/flux-operator" +BRANCH="main" +GOOGLE_PROJECT="dinodev" +SCRIPT_DIR="/home/vanessa/Desktop/Code/flux/flux-cloud/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts" + +# Required arguments +if [ -z ${GOOGLE_PROJECT+x} ]; then + echo "Missing Google Project template variable as GOOGLE_PROJECT"; + exit 1 +fi + +if [ -z ${ZONE+x} ]; then + echo "Missing Google Cloud zone template variable as ZONE"; + exit 1 +fi + +if [ -z ${MACHINE_TYPE+x} ]; then + echo "Missing Google Cloud machine type template variable as MACHINE_TYPE"; + exit 1 +fi + +print_magenta " cluster : ${CLUSTER_NAME}" +print_magenta " version : ${CLUSTER_VERSION}" +print_magenta " project : ${GOOGLE_PROJECT}" +print_magenta " machine : ${MACHINE_TYPE}" +print_magenta " zone : ${ZONE}" +print_magenta " tags : ${TAGS}" +print_magenta " size : ${SIZE}" +print_magenta "repository : ${REPOSITORY}" +print_magenta " branch : ${BRANCH}" + +is_installed kubectl +is_installed gcloud +is_installed wget + +# Check if it already exists +gcloud container clusters list --zone ${ZONE} | grep ${CLUSTER_NAME} +retval=$? +if [[ "${retval}" == "0" ]]; then + print_blue "${CLUSTER_NAME} in ${ZONE} already exists." + echo + exit 0 +fi + +if [[ "${FORCE_CLUSTER}" != "true" ]]; then + prompt "Do you want to create this cluster?" 
+fi + +# Create the cluster +run_echo gcloud container clusters create ${CLUSTER_NAME} --project $GOOGLE_PROJECT \ + --zone ${ZONE} --cluster-version ${CLUSTER_VERSION} --machine-type ${MACHINE_TYPE} \ + --num-nodes=${SIZE} --enable-network-policy --tags=${TAGS} --enable-intra-node-visibility + +# Get credentials so kubectl will work +run_echo gcloud container clusters get-credentials ${CLUSTER_NAME} --zone ${ZONE} --project $GOOGLE_PROJECT +run_echo kubectl create clusterrolebinding cluster-admin-binding --clusterrole cluster-admin --user $(gcloud config get-value core/account) + +# Show nodes +run_echo kubectl get nodes + +# Deploy the operator +mkdir -p ${SCRIPT_DIR} +install_operator ${SCRIPT_DIR} ${REPOSITORY} ${BRANCH} + +run_echo kubectl get namespace +run_echo kubectl describe namespace operator-system diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-destroy.sh b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-destroy.sh new file mode 100755 index 0000000..de8988e --- /dev/null +++ b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-destroy.sh @@ -0,0 +1,161 @@ +#!/bin/bash + +# Source shared helper scripts +# Colors +red='\033[0;31m' +green='\033[0;32m' +yellow='\033[0;33m' +blue='\033[0;34m' +magenta='\033[0;35m' +cyan='\033[0;36m' +clear='\033[0m' + +function print_red() { + echo -e "${red}$@${clear}" +} +function print_yellow() { + echo -e "${yellow}$@${clear}" +} +function print_green() { + echo -e "${green}$@${clear}" +} +function print_blue() { + echo -e "${blue}$@${clear}" +} +function print_magenta() { + echo -e "${magenta}$@${clear}" +} +function print_cyan() { + echo -e "${cyan}$@${clear}" +} + +function is_installed () { + # Determine if a command is available for use! + cmd="${1}" + if command -v $cmd >/dev/null; then + echo "$cmd is installed" + else + echo "$cmd could not be found" + exit 1 + fi +} + +function install_operator() { + # Shared function to install the operator from a specific repository branch and cleanup + script_dir=${1} + repository=${2} + branch=${3} + tmpfile="${script_dir}/flux-operator.yaml" + run_echo wget -O $tmpfile https://raw.githubusercontent.com/${repository}/${branch}/examples/dist/flux-operator.yaml + kubectl apply -f $tmpfile +} + + +function run_echo() { + # Show the user the command then run it + echo + print_green "$@" + retry $@ +} + +function run_echo_allow_fail() { + echo + print_green "$@" + $@ || true +} + +function retry() { + # Retry an unsuccessful user command, per request + while true + do + $@ + retval=$? + if [[ "${retval}" == "0" ]]; then + return + fi + print_blue "That command was not successful. Do you want to try again? šŸ¤”ļø" + read -p " (yes/no) " answer + # Exit with non-zero response so we know to stop in script. + case ${answer} in + yes ) continue;; + no ) echo exiting...; + exit 1;; + * ) echo invalid response; + exit 1;; + esac + done +} + + +function prompt() { + # Prompt the user with a yes/no command to continue or exit + print_blue "$@ šŸ¤”ļø" + read -p " (yes/no) " answer + case ${answer} in + yes ) echo ok, we will proceed;; + no ) echo exiting...; + exit 1;; + * ) echo invalid response; + exit 1;; + esac +} + + +function with_exponential_backoff { + # Run with exponential backoff - assume containers take a while to pull + local max_attempts=100 + local timeout=1 + local attempt=0 + local exitcode=0 + + while [[ $attempt < $max_attempts ]]; do + "$@" + exitcode=$? 
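+        # (a zero exit status means the wrapped command succeeded and ends the retries)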
+ + if [[ $exitcode == 0 ]]; then + break + fi + + echo "Failure! Retrying in $timeout.." 1>&2 + sleep $timeout + attempt=$(( attempt + 1 )) + timeout=$(( timeout * 2 )) + done + + if [[ $exitCode != 0 ]]; then + echo "You've failed me for the last time! ($@)" 1>&2 + fi + return $exitcode +} + +# Defaults - these are in the config but left here for information +CLUSTER_NAME="flux-cluster" +FORCE_CLUSTER="false" +ZONE="us-central1-a" + +if [ -z ${ZONE+x} ]; then + echo "Google Cloud zone template missing as ZONE"; + exit 1 +fi + +echo " cluster : ${CLUSTER_NAME}" +echo " zone : ${ZONE}" + +is_installed gcloud +is_installed yes || FORCE_CLUSTER="false" + +# Check if it already exists +gcloud container clusters list --zone ${ZONE} | grep ${CLUSTER_NAME} +retval=$? +if [[ "${retval}" != "0" ]]; then + print_blue "${CLUSTER_NAME} in ${ZONE} does not exist." + echo + exit 0 +fi + +# This command has a confirmation already +if [[ "${FORCE_CLUSTER}" == "true" ]]; then + yes | gcloud container clusters delete --zone ${ZONE} ${CLUSTER_NAME} +else + run_echo gcloud container clusters delete --zone ${ZONE} ${CLUSTER_NAME} +fi diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/flux-operator.yaml b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/flux-operator.yaml new file mode 100644 index 0000000..b4bc03e --- /dev/null +++ b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/flux-operator.yaml @@ -0,0 +1,848 @@ +apiVersion: v1 +kind: Namespace +metadata: + labels: + control-plane: controller-manager + name: operator-system +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.9.0 + creationTimestamp: null + name: miniclusters.flux-framework.org +spec: + group: flux-framework.org + names: + kind: MiniCluster + listKind: MiniClusterList + plural: miniclusters + singular: minicluster + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: MiniCluster is the Schema for a Flux job launcher on K8s + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: MiniCluster defines the desired state of a Flux MiniCluster + "I am a Flux user and I want to launch a MiniCluster for my job!" A + MiniCluster corresponds to a Batch Job -> StatefulSet + ConfigMaps A + "task" within that cluster is flux running something. + properties: + containers: + description: Containers is one or more containers to be created in + a pod. There should only be one container to run flux with runFlux + items: + properties: + command: + description: 'Single user executable to provide to flux start + IMPORTANT: This is left here, but not used in favor of exposing + Flux via a Restful API. We Can remove this when that is finalized.' 
+ type: string + cores: + description: Cores the container should use + format: int32 + type: integer + diagnostics: + description: Run flux diagnostics on start instead of command + type: boolean + environment: + additionalProperties: + type: string + description: Key/value pairs for the environment + type: object + fluxLogLevel: + default: 6 + description: Log level to use for flux logging (only in non + TestMode) + format: int32 + type: integer + fluxOptionFlags: + description: Flux option flags, usually provided with -o optional + - if needed, default option flags for the server These can + also be set in the user interface to override here. This is + only valid for a FluxRunner + type: string + image: + default: fluxrm/flux-sched:focal + description: Container image must contain flux and flux-sched + install + type: string + imagePullSecret: + description: Allow the user to pull authenticated images By + default no secret is selected. Setting this with the name + of an already existing imagePullSecret will specify that secret + in the pod spec. + type: string + name: + description: Container name is only required for non flux runners + type: string + ports: + description: Ports to be exposed to other containers in the + cluster We take a single list of integers and map to the same + items: + format: int32 + type: integer + type: array + postStartExec: + description: Lifecycle can handle post start commands, etc. + type: string + preCommand: + description: Special command to run at beginning of script, + directly after asFlux is defined as sudo -u flux -E (so you + can change that if desired.) This is only valid if FluxRunner + is set (that writes a wait.sh script) + type: string + pullAlways: + default: false + description: Allow the user to dictate pulling By default we + pull if not present. Setting this to true will indicate to + pull always + type: boolean + resources: + description: Resources include limits and requests + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + x-kubernetes-int-or-string: true + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + x-kubernetes-int-or-string: true + type: object + type: object + runFlux: + description: Main container to run flux (only should be one) + type: boolean + volumes: + additionalProperties: + description: A Container volume must reference one defined + for the MiniCluster The path here is in the container + properties: + path: + type: string + readOnly: + default: true + type: boolean + required: + - path + type: object + description: Volumes that can be mounted (must be defined in + volumes) + type: object + workingDir: + description: Working directory to run command from + type: string + required: + - image + type: object + type: array + deadlineSeconds: + default: 31500000 + description: Should the job be limited to a particular number of seconds? + Approximately one year. 
This cannot be zero or job won't start + format: int64 + type: integer + fluxRestful: + description: Customization to Flux Restful API There should only be + one container to run flux with runFlux + properties: + branch: + default: main + description: Branch to clone Flux Restful API from + type: string + port: + default: 5000 + description: Port to run Flux Restful Server On + format: int32 + type: integer + token: + description: Token to use for RestFul API + type: string + username: + description: These two should not actually be set by a user, but + rather generated by tools and provided Username to use for RestFul + API + type: string + type: object + jobLabels: + additionalProperties: + type: string + description: Labels for the job + type: object + localDeploy: + default: false + description: localDeploy should be true for development, or deploying + in the case that there isn't an actual kubernetes cluster (e.g., + you are not using make deploy. It uses a persistent volume instead + of a claim + type: boolean + logging: + description: Logging modes determine the output you see in the job + log + properties: + quiet: + default: false + description: Quiet mode silences all output so the job only shows + the test running + type: boolean + timed: + default: false + description: Timed mode adds timing to Flux commands + type: boolean + type: object + pod: + description: Pod spec details + properties: + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + x-kubernetes-int-or-string: true + description: Resources include limits and requests + type: object + type: object + podLabels: + additionalProperties: + type: string + description: Labels for each pod + type: object + size: + default: 1 + description: Size (number of job pods to run, size of minicluster + in pods) + format: int32 + type: integer + tasks: + default: 1 + description: Total number of CPUs being run across entire cluster + format: int32 + type: integer + volumes: + additionalProperties: + description: Mini Cluster local volumes available to mount (these + are on the host) + properties: + path: + type: string + required: + - path + type: object + description: Volumes on the host (named) accessible to containers + type: object + required: + - containers + type: object + status: + description: MiniClusterStatus defines the observed state of Flux + properties: + conditions: + description: conditions hold the latest Flux Job and MiniCluster states + items: + description: "Condition contains details for one aspect of the current + state of this API Resource. --- This struct is intended for direct + use as an array at the field path .status.conditions. For example, + \n type FooStatus struct{ // Represents the observations of a + foo's current state. // Known .status.conditions.type are: \"Available\", + \"Progressing\", and \"Degraded\" // +patchMergeKey=type // +patchStrategy=merge + // +listType=map // +listMapKey=type Conditions []metav1.Condition + `json:\"conditions,omitempty\" patchStrategy:\"merge\" patchMergeKey:\"type\" + protobuf:\"bytes,1,rep,name=conditions\"` \n // other fields }" + properties: + lastTransitionTime: + description: lastTransitionTime is the last time the condition + transitioned from one status to another. This should be when + the underlying condition changed. If that is not known, then + using the time when the API field changed is acceptable. 
+ format: date-time + type: string + message: + description: message is a human readable message indicating + details about the transition. This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: observedGeneration represents the .metadata.generation + that the condition was set based upon. For instance, if .metadata.generation + is currently 12, but the .status.conditions[x].observedGeneration + is 9, the condition is out of date with respect to the current + state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: reason contains a programmatic identifier indicating + the reason for the condition's last transition. Producers + of specific condition types may define expected values and + meanings for this field, and whether the values are considered + a guaranteed API. The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + --- Many .condition.type values are consistent across resources + like Available, but because arbitrary conditions can be useful + (see .node.status.conditions), the ability to deconflict is + important. The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + jobid: + description: The JobUid is set internally to associate to a miniCluster + type: string + required: + - jobid + type: object + type: object + served: true + storage: true + subresources: + status: {} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: operator-controller-manager + namespace: operator-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: operator-leader-election-role + namespace: operator-system +rules: +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + creationTimestamp: null + name: operator-manager-role +rules: +- apiGroups: + - "" + resources: + - events + verbs: + - create + - update + - watch +- apiGroups: + - "" + resources: + - events + - nodes + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - exec + - get + - list + - patch + - update + - watch +- apiGroups: + - batch + resources: + - jobs/status + verbs: + - create + - delete + - exec + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - "" + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - batch + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - configmaps + verbs: + - 
create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch +- apiGroups: + - "" + resources: + - jobs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - networks + verbs: + - create + - patch +- apiGroups: + - "" + resources: + - persistentvolumeclaims + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - persistentvolumes + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - pods + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - pods/exec + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - pods/log + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - secrets + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - statefulsets + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - flux-framework.org + resources: + - clusters + - clusters/status + verbs: + - get + - list + - watch +- apiGroups: + - flux-framework.org + resources: + - machineclasses + - machinedeployments + - machinedeployments/status + - machines + - machines/status + - machinesets + - machinesets/status + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - flux-framework.org + resources: + - miniclusters + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - flux-framework.org + resources: + - miniclusters/finalizers + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - flux-framework.org + resources: + - miniclusters/status + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: operator-metrics-reader +rules: +- nonResourceURLs: + - /metrics + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: operator-proxy-role +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: operator-leader-election-rolebinding + namespace: operator-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: operator-leader-election-role +subjects: +- kind: ServiceAccount + name: operator-controller-manager + namespace: operator-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: operator-manager-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: operator-manager-role +subjects: +- kind: ServiceAccount + name: operator-controller-manager + namespace: operator-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: 
ClusterRoleBinding +metadata: + name: operator-proxy-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: operator-proxy-role +subjects: +- kind: ServiceAccount + name: operator-controller-manager + namespace: operator-system +--- +apiVersion: v1 +data: + controller_manager_config.yaml: | + apiVersion: controller-runtime.sigs.k8s.io/v1alpha1 + kind: ControllerManagerConfig + health: + healthProbeBindAddress: :8081 + metrics: + bindAddress: 127.0.0.1:8080 + webhook: + port: 9443 + leaderElection: + leaderElect: true + resourceName: 14dde902.flux-framework.org +kind: ConfigMap +metadata: + name: operator-manager-config + namespace: operator-system +--- +apiVersion: v1 +kind: Service +metadata: + labels: + control-plane: controller-manager + name: operator-controller-manager-metrics-service + namespace: operator-system +spec: + ports: + - name: https + port: 8443 + protocol: TCP + targetPort: https + selector: + control-plane: controller-manager +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + control-plane: controller-manager + name: operator-controller-manager + namespace: operator-system +spec: + replicas: 1 + selector: + matchLabels: + control-plane: controller-manager + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + labels: + control-plane: controller-manager + spec: + containers: + - args: + - --secure-listen-address=0.0.0.0:8443 + - --upstream=http://127.0.0.1:8080/ + - --logtostderr=true + - --v=0 + image: gcr.io/kubebuilder/kube-rbac-proxy:v0.11.0 + name: kube-rbac-proxy + ports: + - containerPort: 8443 + name: https + protocol: TCP + resources: + limits: + cpu: 500m + memory: 128Mi + requests: + cpu: 5m + memory: 64Mi + securityContext: + allowPrivilegeEscalation: false + - args: + - --health-probe-bind-address=:8081 + - --metrics-bind-address=127.0.0.1:8080 + - --leader-elect + command: + - /manager + image: ghcr.io/flux-framework/flux-operator:latest + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: manager + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + cpu: 500m + memory: 128Mi + requests: + cpu: 10m + memory: 64Mi + securityContext: + allowPrivilegeEscalation: false + securityContext: + runAsNonRoot: true + serviceAccountName: operator-controller-manager + terminationGracePeriodSeconds: 10 diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-size-2.yaml b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-size-2.yaml new file mode 100644 index 0000000..5d5b94d --- /dev/null +++ b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-size-2.yaml @@ -0,0 +1,29 @@ +apiVersion: flux-framework.org/v1alpha1 +kind: MiniCluster + +metadata: + name: lammps-job + namespace: flux-operator +spec: + # localDeploy needs to be false + localDeploy: false + + # Number of pods to create for MiniCluster + size: 2 + tasks: 1 + + # Disable verbose output + + + # Optional credentials if running the flux restful api + fluxRestful: + token: "b6223555-a19a-4035-b40f-68a6ce4dd5a5" + username: "fluxuser" + + # TODO add pod resources, if needed + containers: + - image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + + + + cores: 1 diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-submit-size-2.sh 
b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-submit-size-2.sh new file mode 100755 index 0000000..0bd72c3 --- /dev/null +++ b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-submit-size-2.sh @@ -0,0 +1,219 @@ +#!/bin/bash + +# This is a template that will be populated with variables by Flux-Cloud +# We only run it to check if a MiniCluster is running. An apply is only +# needed if the MiniCluster is not created yet. + +# Include shared helper scripts +# Colors +red='\033[0;31m' +green='\033[0;32m' +yellow='\033[0;33m' +blue='\033[0;34m' +magenta='\033[0;35m' +cyan='\033[0;36m' +clear='\033[0m' + +function print_red() { + echo -e "${red}$@${clear}" +} +function print_yellow() { + echo -e "${yellow}$@${clear}" +} +function print_green() { + echo -e "${green}$@${clear}" +} +function print_blue() { + echo -e "${blue}$@${clear}" +} +function print_magenta() { + echo -e "${magenta}$@${clear}" +} +function print_cyan() { + echo -e "${cyan}$@${clear}" +} + +function is_installed () { + # Determine if a command is available for use! + cmd="${1}" + if command -v $cmd >/dev/null; then + echo "$cmd is installed" + else + echo "$cmd could not be found" + exit 1 + fi +} + +function install_operator() { + # Shared function to install the operator from a specific repository branch and cleanup + script_dir=${1} + repository=${2} + branch=${3} + tmpfile="${script_dir}/flux-operator.yaml" + run_echo wget -O $tmpfile https://raw.githubusercontent.com/${repository}/${branch}/examples/dist/flux-operator.yaml + kubectl apply -f $tmpfile +} + + +function run_echo() { + # Show the user the command then run it + echo + print_green "$@" + retry $@ +} + +function run_echo_allow_fail() { + echo + print_green "$@" + $@ || true +} + +function retry() { + # Retry an unsuccessful user command, per request + while true + do + $@ + retval=$? + if [[ "${retval}" == "0" ]]; then + return + fi + print_blue "That command was not successful. Do you want to try again? šŸ¤”ļø" + read -p " (yes/no) " answer + # Exit with non-zero response so we know to stop in script. + case ${answer} in + yes ) continue;; + no ) echo exiting...; + exit 1;; + * ) echo invalid response; + exit 1;; + esac + done +} + + +function prompt() { + # Prompt the user with a yes/no command to continue or exit + print_blue "$@ šŸ¤”ļø" + read -p " (yes/no) " answer + case ${answer} in + yes ) echo ok, we will proceed;; + no ) echo exiting...; + exit 1;; + * ) echo invalid response; + exit 1;; + esac +} + + +function with_exponential_backoff { + # Run with exponential backoff - assume containers take a while to pull + local max_attempts=100 + local timeout=1 + local attempt=0 + local exitcode=0 + + while [[ $attempt < $max_attempts ]]; do + "$@" + exitcode=$? + + if [[ $exitcode == 0 ]]; then + break + fi + + echo "Failure! Retrying in $timeout.." 1>&2 + sleep $timeout + attempt=$(( attempt + 1 )) + timeout=$(( timeout * 2 )) + done + + if [[ $exitCode != 0 ]]; then + echo "You've failed me for the last time! 
($@)" 1>&2 + fi + return $exitcode +} + +NAMESPACE="flux-operator" +CRD="/home/vanessa/Desktop/Code/flux/flux-cloud/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-size-2.yaml" +JOB="lammps-job" + +# Size -1 to account for certificate generator +SIZE=2 + +print_magenta " apply : ${CRD}" +print_magenta " job : ${JOB}" + +is_installed kubectl + +# Create the namespace (ok if already exists) +run_echo_allow_fail kubectl create namespace ${NAMESPACE} + +# Always cleanup a previous one so tokens don't get stale +run_echo_allow_fail kubectl delete -f ${CRD} +echo +podsCleaned="false" +print_blue "Waiting for previous MiniCluster to be cleaned up..." +while [[ "${podsCleaned}" == "false" ]]; do + echo -n "." + sleep 2 + state=$(kubectl get pods --namespace ${NAMESPACE} 2>&1) + lines=$(echo $state | wc -l) + if [[ "${lines}" == "1" ]] && [[ "${state}" == *"No resources found in"* ]]; then + echo + print_green "šŸŒ€ļø Previous pods are cleaned up." + podsCleaned="true" + break + fi +done + +# Ensure we have a MiniCluster of the right namespace running +echo +print_green "šŸŒ€ļø Creating MiniCluster in ${NAMESPACE}" +# Apply the job, get pods +run_echo kubectl apply -f ${CRD} +run_echo kubectl get -n ${NAMESPACE} pods + +# continue until we find the index-0 pod +podsReady="false" + +echo +print_blue "Waiting for MiniCluster of size ${SIZE} to be ready..." +while [[ "${podsReady}" == "false" ]]; do + echo -n "." + sleep 2 + pods=$(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=name | wc -l) + if [[ "${pods}" == "${SIZE}" ]]; then + echo + print_green "šŸŒ€ļø All pods are running." + podsReady="true" + break + fi +done + +echo +brokerPod="" +brokerPrefix="${JOB}-0" +while [[ "${brokerPod}" == "" ]]; do + for pod in $(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=jsonpath='{.items[*].metadata.name}'); do + if [[ "${pod}" == ${brokerPrefix}* ]]; then + echo + brokerPod=${pod} + break + fi + done +done + +echo +serverReady="false" +print_blue "Waiting for Flux Restful API Server to be ready..." +while [[ "${serverReady}" == "false" ]]; do + echo -n "." + sleep 2 + logs=$(kubectl logs --namespace ${NAMESPACE} ${brokerPod} | grep "Uvicorn running") + retval=$? + if [[ "${retval}" == "0" ]]; then + echo + serverReady="true" + print_green "šŸŒ€ļø Flux RestFul API Server is Ready." 
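+        # ("Uvicorn running" in the broker log means the server is accepting requests)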
+ break + fi +done diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-1-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-1-minicluster-size-2/log.out new file mode 100644 index 0000000..3b18e51 --- /dev/null +++ b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-1-minicluster-size-2/log.out @@ -0,0 +1 @@ +hello world diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-2-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-2-minicluster-size-2/log.out new file mode 100644 index 0000000..3b18e51 --- /dev/null +++ b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-2-minicluster-size-2/log.out @@ -0,0 +1 @@ +hello world diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-3-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-3-minicluster-size-2/log.out new file mode 100644 index 0000000..3b18e51 --- /dev/null +++ b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-3-minicluster-size-2/log.out @@ -0,0 +1 @@ +hello world diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-4-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-4-minicluster-size-2/log.out new file mode 100644 index 0000000..3b18e51 --- /dev/null +++ b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-4-minicluster-size-2/log.out @@ -0,0 +1 @@ +hello world diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-5-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-5-minicluster-size-2/log.out new file mode 100644 index 0000000..3b18e51 --- /dev/null +++ b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-5-minicluster-size-2/log.out @@ -0,0 +1 @@ +hello world diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/meta.json b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/meta.json new file mode 100644 index 0000000..778fee9 --- /dev/null +++ b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/meta.json @@ -0,0 +1,696 @@ +{ + "times": { + "destroy-cluster": 324.709, + "create-cluster": 491.577, + "minicluster-submit-size-2": 183.626, + "reaxc-hns-1-minicluster-size-2": 32.1847505569458, + "reaxc-hns-2-minicluster-size-2": 33.41048860549927, + "reaxc-hns-3-minicluster-size-2": 30.96457529067993, + "reaxc-hns-4-minicluster-size-2": 30.777089595794678, + "reaxc-hns-5-minicluster-size-2": 31.048890829086304, + "sleep-1-minicluster-size-2": 5.0783607959747314, + "sleep-2-minicluster-size-2": 5.040483474731445, + "sleep-3-minicluster-size-2": 5.04453706741333, + "sleep-4-minicluster-size-2": 5.048432111740112, + "sleep-5-minicluster-size-2": 5.058692455291748, + "hello-world-1-minicluster-size-2": 0.07241106033325195, + "hello-world-2-minicluster-size-2": 0.052734375, + "hello-world-3-minicluster-size-2": 0.04248523712158203, + "hello-world-4-minicluster-size-2": 0.045003652572631836, + "hello-world-5-minicluster-size-2": 0.05110311508178711, + "minicluster-destroy-size-2": 0.277 + }, + "size": 4, + "machine": "n1-standard-1", + "minicluster": { + "name": "lammps-job", + "namespace": "flux-operator", + "size": [ + 2 + ] + }, + "jobs": { + "reaxc-hns-1": { + "command": "lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite", + "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", + "repeats": 
5, + "workdir": "/home/flux/examples/reaxff/HNS" + }, + "reaxc-hns-2": { + "command": "lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite", + "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", + "repeats": 5, + "workdir": "/home/flux/examples/reaxff/HNS" + }, + "reaxc-hns-3": { + "command": "lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite", + "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", + "repeats": 5, + "workdir": "/home/flux/examples/reaxff/HNS" + }, + "reaxc-hns-4": { + "command": "lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite", + "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", + "repeats": 5, + "workdir": "/home/flux/examples/reaxff/HNS" + }, + "reaxc-hns-5": { + "command": "lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite", + "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", + "repeats": 5, + "workdir": "/home/flux/examples/reaxff/HNS" + }, + "sleep-1": { + "command": "sleep 5", + "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", + "repeats": 5, + "workdir": "/home/flux/examples/reaxff/HNS" + }, + "sleep-2": { + "command": "sleep 5", + "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", + "repeats": 5, + "workdir": "/home/flux/examples/reaxff/HNS" + }, + "sleep-3": { + "command": "sleep 5", + "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", + "repeats": 5, + "workdir": "/home/flux/examples/reaxff/HNS" + }, + "sleep-4": { + "command": "sleep 5", + "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", + "repeats": 5, + "workdir": "/home/flux/examples/reaxff/HNS" + }, + "sleep-5": { + "command": "sleep 5", + "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", + "repeats": 5, + "workdir": "/home/flux/examples/reaxff/HNS" + }, + "hello-world-1": { + "command": "echo hello world", + "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", + "repeats": 5, + "workdir": "/home/flux/examples/reaxff/HNS" + }, + "hello-world-2": { + "command": "echo hello world", + "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", + "repeats": 5, + "workdir": "/home/flux/examples/reaxff/HNS" + }, + "hello-world-3": { + "command": "echo hello world", + "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", + "repeats": 5, + "workdir": "/home/flux/examples/reaxff/HNS" + }, + "hello-world-4": { + "command": "echo hello world", + "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", + "repeats": 5, + "workdir": "/home/flux/examples/reaxff/HNS" + }, + "hello-world-5": { + "command": "echo hello world", + "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", + "repeats": 5, + "workdir": "/home/flux/examples/reaxff/HNS" + } + }, + "info": { + "reaxc-hns-1-minicluster-size-2": { + "id": 130073755648, + "userid": 1234, + "urgency": 16, + "priority": 16, + "t_submit": 1674444768.0517902, + "t_depend": 1674444768.0517902, + "t_run": 1674444768.100832, + "t_cleanup": 1674444800.2855825, + "t_inactive": 1674444800.290403, + "state": "INACTIVE", + "name": "lmp", + "ntasks": 1, + "nnodes": 1, + "ranks": "1", + "nodelist": "lammps-job-1", + "success": true, + "exception_occurred": false, + "result": "COMPLETED", + "expiration": 1675049568.0, + "annotations": { + "sched": { + "queue": "default" + } + }, + "waitstatus": 0, + "returncode": 0, + "runtime": 32.1847505569458, + "exception": { + "occurred": false, + "severity": "", + "type": "", + "note": "" + }, + "duration": "", + "start_to_info_seconds": 40.13091278076172, + "start_to_output_seconds": 43.215059757232666 + }, + "reaxc-hns-2-minicluster-size-2": 
{ + "id": 816932978688, + "userid": 1234, + "urgency": 16, + "priority": 16, + "t_submit": 1674444808.9904723, + "t_depend": 1674444808.9904723, + "t_run": 1674444809.0098114, + "t_cleanup": 1674444842.4203, + "t_inactive": 1674444842.4249685, + "state": "INACTIVE", + "name": "lmp", + "ntasks": 1, + "nnodes": 1, + "ranks": "0", + "nodelist": "lammps-job-0", + "success": true, + "exception_occurred": false, + "result": "COMPLETED", + "expiration": 1675049609.0, + "annotations": { + "sched": { + "queue": "default" + } + }, + "waitstatus": 0, + "returncode": 0, + "runtime": 33.41048860549927, + "exception": { + "occurred": false, + "severity": "", + "type": "", + "note": "" + }, + "duration": "", + "start_to_info_seconds": 97.17731666564941, + "start_to_output_seconds": 97.31685972213745 + }, + "reaxc-hns-3-minicluster-size-2": { + "id": 2450245287936, + "userid": 1234, + "urgency": 16, + "priority": 16, + "t_submit": 1674444906.3438601, + "t_depend": 1674444906.3438601, + "t_run": 1674444906.3633585, + "t_cleanup": 1674444937.3279338, + "t_inactive": 1674444937.33689, + "state": "INACTIVE", + "name": "lmp", + "ntasks": 1, + "nnodes": 1, + "ranks": "1", + "nodelist": "lammps-job-1", + "success": true, + "exception_occurred": false, + "result": "COMPLETED", + "expiration": 1675049706.0, + "annotations": { + "sched": { + "queue": "default" + } + }, + "waitstatus": 0, + "returncode": 0, + "runtime": 30.96457529067993, + "exception": { + "occurred": false, + "severity": "", + "type": "", + "note": "" + }, + "duration": "", + "start_to_info_seconds": 67.29511857032776, + "start_to_output_seconds": 67.40737009048462 + }, + "reaxc-hns-4-minicluster-size-2": { + "id": 3581969170432, + "userid": 1234, + "urgency": 16, + "priority": 16, + "t_submit": 1674444973.8004916, + "t_depend": 1674444973.8004916, + "t_run": 1674444973.8231413, + "t_cleanup": 1674445004.600231, + "t_inactive": 1674445004.6049078, + "state": "INACTIVE", + "name": "lmp", + "ntasks": 1, + "nnodes": 1, + "ranks": "1", + "nodelist": "lammps-job-1", + "success": true, + "exception_occurred": false, + "result": "COMPLETED", + "expiration": 1675049773.0, + "annotations": { + "sched": { + "queue": "default" + } + }, + "waitstatus": 0, + "returncode": 0, + "runtime": 30.777089595794678, + "exception": { + "occurred": false, + "severity": "", + "type": "", + "note": "" + }, + "duration": "", + "start_to_info_seconds": 62.43251633644104, + "start_to_output_seconds": 62.51574635505676 + }, + "reaxc-hns-5-minicluster-size-2": { + "id": 4631065264128, + "userid": 1234, + "urgency": 16, + "priority": 16, + "t_submit": 1674445036.3308098, + "t_depend": 1674445036.3308098, + "t_run": 1674445036.3509514, + "t_cleanup": 1674445067.3998423, + "t_inactive": 1674445067.4045572, + "state": "INACTIVE", + "name": "lmp", + "ntasks": 1, + "nnodes": 1, + "ranks": "1", + "nodelist": "lammps-job-1", + "success": true, + "exception_occurred": false, + "result": "COMPLETED", + "expiration": 1675049836.0, + "annotations": { + "sched": { + "queue": "default" + } + }, + "waitstatus": 0, + "returncode": 0, + "runtime": 31.048890829086304, + "exception": { + "occurred": false, + "severity": "", + "type": "", + "note": "" + }, + "duration": "", + "start_to_info_seconds": 92.83428883552551, + "start_to_output_seconds": 92.92412114143372 + }, + "sleep-1-minicluster-size-2": { + "id": 6190792704000, + "userid": 1234, + "urgency": 16, + "priority": 16, + "t_submit": 1674445129.2985795, + "t_depend": 1674445129.2985795, + "t_run": 1674445129.3183057, + "t_cleanup": 
1674445134.3966665, + "t_inactive": 1674445134.400485, + "state": "INACTIVE", + "name": "sleep", + "ntasks": 1, + "nnodes": 1, + "ranks": "1", + "nodelist": "lammps-job-1", + "success": true, + "exception_occurred": false, + "result": "COMPLETED", + "expiration": 1675049929.0, + "annotations": { + "sched": { + "queue": "default" + } + }, + "waitstatus": 0, + "returncode": 0, + "runtime": 5.0783607959747314, + "exception": { + "occurred": false, + "severity": "", + "type": "", + "note": "" + }, + "duration": "", + "start_to_info_seconds": 62.536019802093506, + "start_to_output_seconds": 62.61232876777649 + }, + "sleep-2-minicluster-size-2": { + "id": 7241700737024, + "userid": 1234, + "urgency": 16, + "priority": 16, + "t_submit": 1674445191.9373004, + "t_depend": 1674445191.9373004, + "t_run": 1674445191.9571338, + "t_cleanup": 1674445196.9976172, + "t_inactive": 1674445197.001546, + "state": "INACTIVE", + "name": "sleep", + "ntasks": 1, + "nnodes": 1, + "ranks": "1", + "nodelist": "lammps-job-1", + "success": true, + "exception_occurred": false, + "result": "COMPLETED", + "expiration": 1675049991.0, + "annotations": { + "sched": { + "queue": "default" + } + }, + "waitstatus": 0, + "returncode": 0, + "runtime": 5.040483474731445, + "exception": { + "occurred": false, + "severity": "", + "type": "", + "note": "" + }, + "duration": "", + "start_to_info_seconds": 13.608879566192627, + "start_to_output_seconds": 16.584359407424927 + }, + "sleep-3-minicluster-size-2": { + "id": 7520689061888, + "userid": 1234, + "urgency": 16, + "priority": 16, + "t_submit": 1674445208.5661342, + "t_depend": 1674445208.5661342, + "t_run": 1674445208.588134, + "t_cleanup": 1674445213.632671, + "t_inactive": 1674445213.63706, + "state": "INACTIVE", + "name": "sleep", + "ntasks": 1, + "nnodes": 1, + "ranks": "1", + "nodelist": "lammps-job-1", + "success": true, + "exception_occurred": false, + "result": "COMPLETED", + "expiration": 1675050008.0, + "annotations": { + "sched": { + "queue": "default" + } + }, + "waitstatus": 0, + "returncode": 0, + "runtime": 5.04453706741333, + "exception": { + "occurred": false, + "severity": "", + "type": "", + "note": "" + }, + "duration": "", + "start_to_info_seconds": 13.602198600769043, + "start_to_output_seconds": 16.66467046737671 + }, + "sleep-4-minicluster-size-2": { + "id": 7800382029824, + "userid": 1234, + "urgency": 16, + "priority": 16, + "t_submit": 1674445225.2364438, + "t_depend": 1674445225.2364438, + "t_run": 1674445225.2567546, + "t_cleanup": 1674445230.3051867, + "t_inactive": 1674445230.309494, + "state": "INACTIVE", + "name": "sleep", + "ntasks": 1, + "nnodes": 1, + "ranks": "1", + "nodelist": "lammps-job-1", + "success": true, + "exception_occurred": false, + "result": "COMPLETED", + "expiration": 1675050025.0, + "annotations": { + "sched": { + "queue": "default" + } + }, + "waitstatus": 0, + "returncode": 0, + "runtime": 5.048432111740112, + "exception": { + "occurred": false, + "severity": "", + "type": "", + "note": "" + }, + "duration": "", + "start_to_info_seconds": 13.53053617477417, + "start_to_output_seconds": 16.569458484649658 + }, + "sleep-5-minicluster-size-2": { + "id": 8079202582528, + "userid": 1234, + "urgency": 16, + "priority": 16, + "t_submit": 1674445241.855504, + "t_depend": 1674445241.855504, + "t_run": 1674445241.8752344, + "t_cleanup": 1674445246.9339268, + "t_inactive": 1674445246.9379504, + "state": "INACTIVE", + "name": "sleep", + "ntasks": 1, + "nnodes": 1, + "ranks": "1", + "nodelist": "lammps-job-1", + "success": true, + 
"exception_occurred": false, + "result": "COMPLETED", + "expiration": 1675050041.0, + "annotations": { + "sched": { + "queue": "default" + } + }, + "waitstatus": 0, + "returncode": 0, + "runtime": 5.058692455291748, + "exception": { + "occurred": false, + "severity": "", + "type": "", + "note": "" + }, + "duration": "", + "start_to_info_seconds": 13.458645582199097, + "start_to_output_seconds": 16.474400281906128 + }, + "hello-world-1-minicluster-size-2": { + "id": 8356177641472, + "userid": 1234, + "urgency": 16, + "priority": 16, + "t_submit": 1674445258.3653252, + "t_depend": 1674445258.3653252, + "t_run": 1674445258.3868065, + "t_cleanup": 1674445258.4592175, + "t_inactive": 1674445258.46398, + "state": "INACTIVE", + "name": "echo", + "ntasks": 1, + "nnodes": 1, + "ranks": "1", + "nodelist": "lammps-job-1", + "success": true, + "exception_occurred": false, + "result": "COMPLETED", + "expiration": 1675050058.0, + "annotations": { + "sched": { + "queue": "default" + } + }, + "waitstatus": 0, + "returncode": 0, + "runtime": 0.07241106033325195, + "exception": { + "occurred": false, + "severity": "", + "type": "", + "note": "" + }, + "duration": "", + "start_to_info_seconds": 13.482953310012817, + "start_to_output_seconds": 16.53845739364624 + }, + "hello-world-2-minicluster-size-2": { + "id": 8635753168896, + "userid": 1234, + "urgency": 16, + "priority": 16, + "t_submit": 1674445275.028449, + "t_depend": 1674445275.028449, + "t_run": 1674445275.0489655, + "t_cleanup": 1674445275.1016998, + "t_inactive": 1674445275.1059186, + "state": "INACTIVE", + "name": "echo", + "ntasks": 1, + "nnodes": 1, + "ranks": "1", + "nodelist": "lammps-job-1", + "success": true, + "exception_occurred": false, + "result": "COMPLETED", + "expiration": 1675050075.0, + "annotations": { + "sched": { + "queue": "default" + } + }, + "waitstatus": 0, + "returncode": 0, + "runtime": 0.052734375, + "exception": { + "occurred": false, + "severity": "", + "type": "", + "note": "" + }, + "duration": "", + "start_to_info_seconds": 0.5918288230895996, + "start_to_output_seconds": 0.6222965717315674 + }, + "hello-world-3-minicluster-size-2": { + "id": 8641507753984, + "userid": 1234, + "urgency": 16, + "priority": 16, + "t_submit": 1674445275.3710968, + "t_depend": 1674445275.3710968, + "t_run": 1674445275.3893383, + "t_cleanup": 1674445275.4318235, + "t_inactive": 1674445275.4359808, + "state": "INACTIVE", + "name": "echo", + "ntasks": 1, + "nnodes": 1, + "ranks": "1", + "nodelist": "lammps-job-1", + "success": true, + "exception_occurred": false, + "result": "COMPLETED", + "expiration": 1675050075.0, + "annotations": { + "sched": { + "queue": "default" + } + }, + "waitstatus": 0, + "returncode": 0, + "runtime": 0.04248523712158203, + "exception": { + "occurred": false, + "severity": "", + "type": "", + "note": "" + }, + "duration": "", + "start_to_info_seconds": 0.17513155937194824, + "start_to_output_seconds": 0.21306657791137695 + }, + "hello-world-4-minicluster-size-2": { + "id": 8646121488384, + "userid": 1234, + "urgency": 16, + "priority": 16, + "t_submit": 1674445275.6465385, + "t_depend": 1674445275.6465385, + "t_run": 1674445275.6643715, + "t_cleanup": 1674445275.7093751, + "t_inactive": 1674445275.7134967, + "state": "INACTIVE", + "name": "echo", + "ntasks": 1, + "nnodes": 1, + "ranks": "1", + "nodelist": "lammps-job-1", + "success": true, + "exception_occurred": false, + "result": "COMPLETED", + "expiration": 1675050075.0, + "annotations": { + "sched": { + "queue": "default" + } + }, + "waitstatus": 0, + 
"returncode": 0, + "runtime": 0.045003652572631836, + "exception": { + "occurred": false, + "severity": "", + "type": "", + "note": "" + }, + "duration": "", + "start_to_info_seconds": 0.19276666641235352, + "start_to_output_seconds": 0.2307295799255371 + }, + "hello-world-5-minicluster-size-2": { + "id": 8649946693632, + "userid": 1234, + "urgency": 16, + "priority": 16, + "t_submit": 1674445275.8740122, + "t_depend": 1674445275.8740122, + "t_run": 1674445275.8942568, + "t_cleanup": 1674445275.94536, + "t_inactive": 1674445275.95746, + "state": "INACTIVE", + "name": "echo", + "ntasks": 1, + "nnodes": 1, + "ranks": "1", + "nodelist": "lammps-job-1", + "success": true, + "exception_occurred": false, + "result": "COMPLETED", + "expiration": 1675050075.0, + "annotations": { + "sched": { + "queue": "default" + } + }, + "waitstatus": 0, + "returncode": 0, + "runtime": 0.05110311508178711, + "exception": { + "occurred": false, + "severity": "", + "type": "", + "note": "" + }, + "duration": "", + "start_to_info_seconds": 0.17215561866760254, + "start_to_output_seconds": 0.19998478889465332 + } + } +} diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-1-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-1-minicluster-size-2/log.out new file mode 100644 index 0000000..647c484 --- /dev/null +++ b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-1-minicluster-size-2/log.out @@ -0,0 +1,80 @@ +LAMMPS (29 Sep 2021 - Update 2) +OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98) + using 1 OpenMP thread(s) per MPI task +Reading data file ... + triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000) + 1 by 1 by 1 MPI processor grid + reading atoms ... + 304 atoms + reading velocities ... + 304 velocities + read_data CPU = 0.005 seconds +Replicating atoms ... + triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000) + 1 by 1 by 1 MPI processor grid + bounding box image = (0 -1 -1) to (0 1 1) + bounding box extra memory = 0.03 MB + average # of replicas added to proc = 8.00 out of 8 (100.00%) + 2432 atoms + replicate CPU = 0.001 seconds +Neighbor list info ... + update every 20 steps, delay 0 steps, check no + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 11 + ghost atom cutoff = 11 + binsize = 5.5, bins = 10 5 6 + 2 neighbor lists, perpetual/occasional/extra = 2 0 0 + (1) pair reax/c, perpetual + attributes: half, newton off, ghost + pair build: half/bin/newtoff/ghost + stencil: full/ghost/bin/3d + bin: standard + (2) fix qeq/reax, perpetual, copy from (1) + attributes: half, newton off, ghost + pair build: copy + stencil: none + bin: none +Setting up Verlet run ... 
+ Unit style : real + Current step : 0 + Time step : 0.1 +Per MPI rank memory allocation (min/avg/max) = 215.0 | 215.0 | 215.0 Mbytes +Step Temp PotEng Press E_vdwl E_coul Volume + 0 300 -113.27833 437.52122 -111.57687 -1.7014647 27418.867 + 10 299.38517 -113.27631 1439.2857 -111.57492 -1.7013813 27418.867 + 20 300.27107 -113.27884 3764.3739 -111.57762 -1.7012246 27418.867 + 30 302.21063 -113.28428 7007.6914 -111.58335 -1.7009363 27418.867 + 40 303.52265 -113.28799 9844.84 -111.58747 -1.7005186 27418.867 + 50 301.87059 -113.28324 9663.0443 -111.58318 -1.7000524 27418.867 + 60 296.67807 -113.26777 7273.7928 -111.56815 -1.6996137 27418.867 + 70 292.19999 -113.25435 5533.6428 -111.55514 -1.6992157 27418.867 + 80 293.58677 -113.25831 5993.4151 -111.55946 -1.6988533 27418.867 + 90 300.62636 -113.27925 7202.8651 -111.58069 -1.6985591 27418.867 + 100 305.38276 -113.29357 10085.748 -111.59518 -1.6983875 27418.867 +Loop time of 29.8322 on 1 procs for 100 steps with 2432 atoms + +Performance: 0.029 ns/day, 828.671 hours/ns, 3.352 timesteps/s +94.2% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 22.21 | 22.21 | 22.21 | 0.0 | 74.45 +Neigh | 0.61723 | 0.61723 | 0.61723 | 0.0 | 2.07 +Comm | 0.010007 | 0.010007 | 0.010007 | 0.0 | 0.03 +Output | 0.0004328 | 0.0004328 | 0.0004328 | 0.0 | 0.00 +Modify | 6.9933 | 6.9933 | 6.9933 | 0.0 | 23.44 +Other | | 0.00162 | | | 0.01 + +Nlocal: 2432.00 ave 2432 max 2432 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 10685.0 ave 10685 max 10685 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 823958.0 ave 823958 max 823958 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 823958 +Ave neighs/atom = 338.79852 +Neighbor list builds = 5 +Dangerous builds not checked +Total wall time: 0:00:30 diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-2-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-2-minicluster-size-2/log.out new file mode 100644 index 0000000..0b9df79 --- /dev/null +++ b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-2-minicluster-size-2/log.out @@ -0,0 +1,80 @@ +LAMMPS (29 Sep 2021 - Update 2) +OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98) + using 1 OpenMP thread(s) per MPI task +Reading data file ... + triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000) + 1 by 1 by 1 MPI processor grid + reading atoms ... + 304 atoms + reading velocities ... + 304 velocities + read_data CPU = 0.010 seconds +Replicating atoms ... + triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000) + 1 by 1 by 1 MPI processor grid + bounding box image = (0 -1 -1) to (0 1 1) + bounding box extra memory = 0.03 MB + average # of replicas added to proc = 8.00 out of 8 (100.00%) + 2432 atoms + replicate CPU = 0.001 seconds +Neighbor list info ... 
+ update every 20 steps, delay 0 steps, check no + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 11 + ghost atom cutoff = 11 + binsize = 5.5, bins = 10 5 6 + 2 neighbor lists, perpetual/occasional/extra = 2 0 0 + (1) pair reax/c, perpetual + attributes: half, newton off, ghost + pair build: half/bin/newtoff/ghost + stencil: full/ghost/bin/3d + bin: standard + (2) fix qeq/reax, perpetual, copy from (1) + attributes: half, newton off, ghost + pair build: copy + stencil: none + bin: none +Setting up Verlet run ... + Unit style : real + Current step : 0 + Time step : 0.1 +Per MPI rank memory allocation (min/avg/max) = 215.0 | 215.0 | 215.0 Mbytes +Step Temp PotEng Press E_vdwl E_coul Volume + 0 300 -113.27833 437.52122 -111.57687 -1.7014647 27418.867 + 10 299.38517 -113.27631 1439.2857 -111.57492 -1.7013813 27418.867 + 20 300.27107 -113.27884 3764.3739 -111.57762 -1.7012246 27418.867 + 30 302.21063 -113.28428 7007.6914 -111.58335 -1.7009363 27418.867 + 40 303.52265 -113.28799 9844.84 -111.58747 -1.7005186 27418.867 + 50 301.87059 -113.28324 9663.0443 -111.58318 -1.7000524 27418.867 + 60 296.67807 -113.26777 7273.7928 -111.56815 -1.6996137 27418.867 + 70 292.19999 -113.25435 5533.6428 -111.55514 -1.6992157 27418.867 + 80 293.58677 -113.25831 5993.4151 -111.55946 -1.6988533 27418.867 + 90 300.62636 -113.27925 7202.8651 -111.58069 -1.6985591 27418.867 + 100 305.38276 -113.29357 10085.748 -111.59518 -1.6983875 27418.867 +Loop time of 31.2338 on 1 procs for 100 steps with 2432 atoms + +Performance: 0.028 ns/day, 867.606 hours/ns, 3.202 timesteps/s +91.3% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 23.353 | 23.353 | 23.353 | 0.0 | 74.77 +Neigh | 0.62616 | 0.62616 | 0.62616 | 0.0 | 2.00 +Comm | 0.0096617 | 0.0096617 | 0.0096617 | 0.0 | 0.03 +Output | 0.00044694 | 0.00044694 | 0.00044694 | 0.0 | 0.00 +Modify | 7.2429 | 7.2429 | 7.2429 | 0.0 | 23.19 +Other | | 0.001518 | | | 0.00 + +Nlocal: 2432.00 ave 2432 max 2432 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 10685.0 ave 10685 max 10685 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 823958.0 ave 823958 max 823958 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 823958 +Ave neighs/atom = 338.79852 +Neighbor list builds = 5 +Dangerous builds not checked +Total wall time: 0:00:32 diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-3-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-3-minicluster-size-2/log.out new file mode 100644 index 0000000..b6380b6 --- /dev/null +++ b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-3-minicluster-size-2/log.out @@ -0,0 +1,80 @@ +LAMMPS (29 Sep 2021 - Update 2) +OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98) + using 1 OpenMP thread(s) per MPI task +Reading data file ... + triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000) + 1 by 1 by 1 MPI processor grid + reading atoms ... + 304 atoms + reading velocities ... + 304 velocities + read_data CPU = 0.002 seconds +Replicating atoms ... 
+ triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000) + 1 by 1 by 1 MPI processor grid + bounding box image = (0 -1 -1) to (0 1 1) + bounding box extra memory = 0.03 MB + average # of replicas added to proc = 8.00 out of 8 (100.00%) + 2432 atoms + replicate CPU = 0.001 seconds +Neighbor list info ... + update every 20 steps, delay 0 steps, check no + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 11 + ghost atom cutoff = 11 + binsize = 5.5, bins = 10 5 6 + 2 neighbor lists, perpetual/occasional/extra = 2 0 0 + (1) pair reax/c, perpetual + attributes: half, newton off, ghost + pair build: half/bin/newtoff/ghost + stencil: full/ghost/bin/3d + bin: standard + (2) fix qeq/reax, perpetual, copy from (1) + attributes: half, newton off, ghost + pair build: copy + stencil: none + bin: none +Setting up Verlet run ... + Unit style : real + Current step : 0 + Time step : 0.1 +Per MPI rank memory allocation (min/avg/max) = 215.0 | 215.0 | 215.0 Mbytes +Step Temp PotEng Press E_vdwl E_coul Volume + 0 300 -113.27833 437.52122 -111.57687 -1.7014647 27418.867 + 10 299.38517 -113.27631 1439.2857 -111.57492 -1.7013813 27418.867 + 20 300.27107 -113.27884 3764.3739 -111.57762 -1.7012246 27418.867 + 30 302.21063 -113.28428 7007.6914 -111.58335 -1.7009363 27418.867 + 40 303.52265 -113.28799 9844.84 -111.58747 -1.7005186 27418.867 + 50 301.87059 -113.28324 9663.0443 -111.58318 -1.7000524 27418.867 + 60 296.67807 -113.26777 7273.7928 -111.56815 -1.6996137 27418.867 + 70 292.19999 -113.25435 5533.6428 -111.55514 -1.6992157 27418.867 + 80 293.58677 -113.25831 5993.4151 -111.55946 -1.6988533 27418.867 + 90 300.62636 -113.27925 7202.8651 -111.58069 -1.6985591 27418.867 + 100 305.38276 -113.29357 10085.748 -111.59518 -1.6983875 27418.867 +Loop time of 29.6229 on 1 procs for 100 steps with 2432 atoms + +Performance: 0.029 ns/day, 822.859 hours/ns, 3.376 timesteps/s +94.4% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 22.175 | 22.175 | 22.175 | 0.0 | 74.86 +Neigh | 0.63724 | 0.63724 | 0.63724 | 0.0 | 2.15 +Comm | 0.0097153 | 0.0097153 | 0.0097153 | 0.0 | 0.03 +Output | 0.00041342 | 0.00041342 | 0.00041342 | 0.0 | 0.00 +Modify | 6.799 | 6.799 | 6.799 | 0.0 | 22.95 +Other | | 0.001424 | | | 0.00 + +Nlocal: 2432.00 ave 2432 max 2432 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 10685.0 ave 10685 max 10685 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 823958.0 ave 823958 max 823958 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 823958 +Ave neighs/atom = 338.79852 +Neighbor list builds = 5 +Dangerous builds not checked +Total wall time: 0:00:30 diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-4-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-4-minicluster-size-2/log.out new file mode 100644 index 0000000..6c889f5 --- /dev/null +++ b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-4-minicluster-size-2/log.out @@ -0,0 +1,80 @@ +LAMMPS (29 Sep 2021 - Update 2) +OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98) + using 1 OpenMP thread(s) per MPI task +Reading data file ... 
+ triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000) + 1 by 1 by 1 MPI processor grid + reading atoms ... + 304 atoms + reading velocities ... + 304 velocities + read_data CPU = 0.002 seconds +Replicating atoms ... + triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000) + 1 by 1 by 1 MPI processor grid + bounding box image = (0 -1 -1) to (0 1 1) + bounding box extra memory = 0.03 MB + average # of replicas added to proc = 8.00 out of 8 (100.00%) + 2432 atoms + replicate CPU = 0.001 seconds +Neighbor list info ... + update every 20 steps, delay 0 steps, check no + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 11 + ghost atom cutoff = 11 + binsize = 5.5, bins = 10 5 6 + 2 neighbor lists, perpetual/occasional/extra = 2 0 0 + (1) pair reax/c, perpetual + attributes: half, newton off, ghost + pair build: half/bin/newtoff/ghost + stencil: full/ghost/bin/3d + bin: standard + (2) fix qeq/reax, perpetual, copy from (1) + attributes: half, newton off, ghost + pair build: copy + stencil: none + bin: none +Setting up Verlet run ... + Unit style : real + Current step : 0 + Time step : 0.1 +Per MPI rank memory allocation (min/avg/max) = 215.0 | 215.0 | 215.0 Mbytes +Step Temp PotEng Press E_vdwl E_coul Volume + 0 300 -113.27833 437.52122 -111.57687 -1.7014647 27418.867 + 10 299.38517 -113.27631 1439.2857 -111.57492 -1.7013813 27418.867 + 20 300.27107 -113.27884 3764.3739 -111.57762 -1.7012246 27418.867 + 30 302.21063 -113.28428 7007.6914 -111.58335 -1.7009363 27418.867 + 40 303.52265 -113.28799 9844.84 -111.58747 -1.7005186 27418.867 + 50 301.87059 -113.28324 9663.0443 -111.58318 -1.7000524 27418.867 + 60 296.67807 -113.26777 7273.7928 -111.56815 -1.6996137 27418.867 + 70 292.19999 -113.25435 5533.6428 -111.55514 -1.6992157 27418.867 + 80 293.58677 -113.25831 5993.4151 -111.55946 -1.6988533 27418.867 + 90 300.62636 -113.27925 7202.8651 -111.58069 -1.6985591 27418.867 + 100 305.38276 -113.29357 10085.748 -111.59518 -1.6983875 27418.867 +Loop time of 29.7805 on 1 procs for 100 steps with 2432 atoms + +Performance: 0.029 ns/day, 827.235 hours/ns, 3.358 timesteps/s +94.2% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 22.214 | 22.214 | 22.214 | 0.0 | 74.59 +Neigh | 0.62414 | 0.62414 | 0.62414 | 0.0 | 2.10 +Comm | 0.01756 | 0.01756 | 0.01756 | 0.0 | 0.06 +Output | 0.00041921 | 0.00041921 | 0.00041921 | 0.0 | 0.00 +Modify | 6.9226 | 6.9226 | 6.9226 | 0.0 | 23.25 +Other | | 0.00152 | | | 0.01 + +Nlocal: 2432.00 ave 2432 max 2432 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 10685.0 ave 10685 max 10685 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 823958.0 ave 823958 max 823958 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 823958 +Ave neighs/atom = 338.79852 +Neighbor list builds = 5 +Dangerous builds not checked +Total wall time: 0:00:30 diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-5-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-5-minicluster-size-2/log.out new file mode 100644 index 0000000..9c9d4df --- /dev/null +++ b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-5-minicluster-size-2/log.out @@ -0,0 +1,80 @@ +LAMMPS (29 Sep 2021 - Update 2) +OMP_NUM_THREADS environment 
is not set. Defaulting to 1 thread. (src/comm.cpp:98) + using 1 OpenMP thread(s) per MPI task +Reading data file ... + triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000) + 1 by 1 by 1 MPI processor grid + reading atoms ... + 304 atoms + reading velocities ... + 304 velocities + read_data CPU = 0.002 seconds +Replicating atoms ... + triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000) + 1 by 1 by 1 MPI processor grid + bounding box image = (0 -1 -1) to (0 1 1) + bounding box extra memory = 0.03 MB + average # of replicas added to proc = 8.00 out of 8 (100.00%) + 2432 atoms + replicate CPU = 0.001 seconds +Neighbor list info ... + update every 20 steps, delay 0 steps, check no + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 11 + ghost atom cutoff = 11 + binsize = 5.5, bins = 10 5 6 + 2 neighbor lists, perpetual/occasional/extra = 2 0 0 + (1) pair reax/c, perpetual + attributes: half, newton off, ghost + pair build: half/bin/newtoff/ghost + stencil: full/ghost/bin/3d + bin: standard + (2) fix qeq/reax, perpetual, copy from (1) + attributes: half, newton off, ghost + pair build: copy + stencil: none + bin: none +Setting up Verlet run ... + Unit style : real + Current step : 0 + Time step : 0.1 +Per MPI rank memory allocation (min/avg/max) = 215.0 | 215.0 | 215.0 Mbytes +Step Temp PotEng Press E_vdwl E_coul Volume + 0 300 -113.27833 437.52122 -111.57687 -1.7014647 27418.867 + 10 299.38517 -113.27631 1439.2857 -111.57492 -1.7013813 27418.867 + 20 300.27107 -113.27884 3764.3739 -111.57762 -1.7012246 27418.867 + 30 302.21063 -113.28428 7007.6914 -111.58335 -1.7009363 27418.867 + 40 303.52265 -113.28799 9844.84 -111.58747 -1.7005186 27418.867 + 50 301.87059 -113.28324 9663.0443 -111.58318 -1.7000524 27418.867 + 60 296.67807 -113.26777 7273.7928 -111.56815 -1.6996137 27418.867 + 70 292.19999 -113.25435 5533.6428 -111.55514 -1.6992157 27418.867 + 80 293.58677 -113.25831 5993.4151 -111.55946 -1.6988533 27418.867 + 90 300.62636 -113.27925 7202.8651 -111.58069 -1.6985591 27418.867 + 100 305.38276 -113.29357 10085.748 -111.59518 -1.6983875 27418.867 +Loop time of 30.0677 on 1 procs for 100 steps with 2432 atoms + +Performance: 0.029 ns/day, 835.214 hours/ns, 3.326 timesteps/s +93.3% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 22.337 | 22.337 | 22.337 | 0.0 | 74.29 +Neigh | 0.73472 | 0.73472 | 0.73472 | 0.0 | 2.44 +Comm | 0.009731 | 0.009731 | 0.009731 | 0.0 | 0.03 +Output | 0.00041722 | 0.00041722 | 0.00041722 | 0.0 | 0.00 +Modify | 6.9844 | 6.9844 | 6.9844 | 0.0 | 23.23 +Other | | 0.001495 | | | 0.00 + +Nlocal: 2432.00 ave 2432 max 2432 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 10685.0 ave 10685 max 10685 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 823958.0 ave 823958 max 823958 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 823958 +Ave neighs/atom = 338.79852 +Neighbor list builds = 5 +Dangerous builds not checked +Total wall time: 0:00:30 diff --git a/examples/up-submit-down/experiments.yaml b/examples/up-submit-down/experiments.yaml new file mode 100644 index 0000000..880e652 --- /dev/null +++ b/examples/up-submit-down/experiments.yaml @@ -0,0 +1,31 @@ +# This is intended for MiniKube, so no machine needed +matrix: + size: [4] + machine: 
[n1-standard-1] + +# Flux Mini Cluster experiment attributes +minicluster: + name: lammps-job + namespace: flux-operator + # Each of these sizes will be brought up and have commands run across it + size: [2] + +# Since we are creating a minicluster here to submit commands across +# on the same container, the container is required here. If you specify +# a size here, the image must be the same across sizes +jobs: + reaxc-hns: + command: 'lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + workdir: /home/flux/examples/reaxff/HNS + sleep: + command: 'sleep 5' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + workdir: /home/flux/examples/reaxff/HNS + hello-world: + command: 'echo hello world' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + workdir: /home/flux/examples/reaxff/HNS diff --git a/examples/up-submit-down/plot_results.py b/examples/up-submit-down/plot_results.py new file mode 100644 index 0000000..6395f83 --- /dev/null +++ b/examples/up-submit-down/plot_results.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 + +import argparse +import json +import os +import sys + +import matplotlib.pyplot as plt +import pandas +import seaborn as sns + + +def read_json(filename): + """ + Read and parse a JSON file. + """ + with open(filename, "r") as fd: + content = json.loads(fd.read()) + return content + + +def plot_outputs(raw, plotname, ext="pdf"): + """ + Parse results.json into a dataframe and plots to save. + """ + # Let's save the following, with runid as index + columns = ["minicluster_size", "job_type", "time_seconds", "time_type"] + + # Let's first organize distributions of times + data = [] + index = [] + for jobname, item in raw["info"].items(): + index += [jobname, jobname, jobname] + jobtype = jobname.split("-minicluster-size")[0].rsplit("-", 1)[0] + + # This is how flux-cloud organized the output + minicluster_size = int(jobname.rsplit("size-", 1)[-1]) + + # Manual melt :) + data.append([minicluster_size, jobtype, item["runtime"], "runtime"]) + data.append( + [ + minicluster_size, + jobtype, + item["start_to_output_seconds"], + "output_seconds", + ] + ) + data.append( + [minicluster_size, jobtype, item["start_to_info_seconds"], "info_seconds"] + ) + + # Assemble the data frame, index is the runids + df = pandas.DataFrame(data, columns=columns) + df.index = index + + # Save raw data + df.to_csv("results-df.csv") + + # We need colors! + colors = sns.color_palette("hls", 8) + hexcolors = colors.as_hex() + + palette = {} + for time_type in df.time_type.unique(): + palette[time_type] = hexcolors.pop(0) + + # Sort by time type for a stable legend order + palette = dict(sorted(palette.items())) + + # Let's make a plot that shows distributions of the times by the cluster size, across all + make_plot( + df, + title="Flux MiniCluster Time Variation", + tag="minicluster_times", + ydimension="time_seconds", + palette=palette, + ext=ext, + plotname=plotname, + ) + + +def make_plot(df, title, tag, ydimension, palette, ext="pdf", plotname="lammps"): + """ + Helper function to make common plots.
+ """ + ext = ext.strip(".") + plt.figure(figsize=(12, 12)) + sns.set_style("dark") + ax = sns.boxplot( + x="job_type", + y=ydimension, + hue="time_type", + data=df, + whis=[5, 95], + palette=palette, + ) + plt.title(title) + plt.legend([], [], frameon=False) + ax.set_xlabel("Job Type", fontsize=16) + ax.set_ylabel("Time (seconds)", fontsize=16) + ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize=14) + ax.set_yticklabels(ax.get_yticks(), fontsize=14) + handles, _ = ax.get_legend_handles_labels() + ax.legend(handles, list(palette)) + plt.savefig(f"{tag}_{plotname}.{ext}") + plt.clf() + + +def get_parser(): + """ + Process results file into plots. + """ + parser = argparse.ArgumentParser(description="Plot LAMMPS outputs") + parser.add_argument("results_json", help="results json file", nargs="?") + parser.add_argument( + "-p", + "--plotname", + default="lammps", + help="base name for plot output files", + ) + parser.add_argument( + "-e", + "--extension", + dest="extension", + default="pdf", + help="image extension to use (defaults to pdf)", + ) + return parser + + +def main(): + """ + Read in results json, and make plots. + """ + parser = get_parser() + args = parser.parse_args() + if not os.path.exists(args.results_json): + sys.exit(f"{args.results_json} does not exist.") + data = read_json(args.results_json) + plot_outputs(data, args.plotname, ext=args.extension) + + +if __name__ == "__main__": + main() diff --git a/fluxcloud/client/__init__.py b/fluxcloud/client/__init__.py index c4bbf6d..7e690ae 100644 --- a/fluxcloud/client/__init__.py +++ b/fluxcloud/client/__init__.py @@ -124,18 +124,27 @@ def get_parser(): type=str, ) - # Experiment runner is "run" + # These are multi-commands, e.g., up down run = subparsers.add_parser( "run", - description="Main run command to run experiments", + description="Bring the cluster up, run experiments via applying CRDs, and bring it down.", + formatter_class=argparse.RawTextHelpFormatter, + ) + batch = subparsers.add_parser( + "batch", + description="Bring the cluster up, run experiments via a Flux Restful API submit, and bring it down.", + formatter_class=argparse.RawTextHelpFormatter, + ) + submit = subparsers.add_parser( + "submit", + description="Submit experiments via the Flux Restful API (one set of pods, shared)", formatter_class=argparse.RawTextHelpFormatter, ) apply = subparsers.add_parser( "apply", - description="Apply experiments (CRDs) to the cluster.", + description="Run experiments via the applying experiments (CRDs) to the cluster (each a set of pods)", formatter_class=argparse.RawTextHelpFormatter, ) - up = subparsers.add_parser( "up", description="Bring up a cluster and install the operator", @@ -159,7 +168,7 @@ def get_parser(): description="List experiment ids available.", formatter_class=argparse.RawTextHelpFormatter, ) - for command in run, up, down, apply, listing: + for command in run, up, down, apply, listing, batch, submit: command.add_argument( "experiments", default="experiments.yaml", @@ -174,7 +183,7 @@ def get_parser(): choices=clouds.cloud_names, ) - for command in apply, up, down, run: + for command in apply, up, down, run, batch, submit: command.add_argument( "--force-cluster", dest="force_cluster", @@ -202,8 +211,6 @@ def get_parser(): type=int, help="experiment size under ID to apply to", ) - - for command in run, apply, up, down: command.add_argument( "-o", "--output-dir", @@ -275,10 +282,14 @@ def help(return_code=0): # Does the user want a shell? 
if args.command == "apply": from .apply import main + elif args.command == "submit": + from .apply import submit as main elif args.command == "list": from .listing import main elif args.command == "run": from .run import main + elif args.command == "batch": + from .run import batch as main elif args.command == "config": from .config import main elif args.command == "up": diff --git a/fluxcloud/client/apply.py b/fluxcloud/client/apply.py index af252ab..13d1369 100644 --- a/fluxcloud/client/apply.py +++ b/fluxcloud/client/apply.py @@ -3,30 +3,22 @@ # # SPDX-License-Identifier: Apache-2.0 -import fluxcloud.utils as utils -from fluxcloud.main import get_experiment_client -from fluxcloud.main.experiment import ExperimentSetup - -from .helpers import select_experiment +from .helpers import prepare_client def main(args, parser, extra, subparser): - utils.ensure_no_extra(extra) + """ + apply parser submits via separate CRDs. + """ + cli, setup, experiment = prepare_client(args, extra) + cli.apply(setup, experiment=experiment) + setup.cleanup(setup.matrices) - cli = get_experiment_client(args.cloud) - setup = ExperimentSetup( - args.experiments, - force_cluster=args.force_cluster, - template=args.template, - cleanup=args.cleanup, - outdir=args.output_dir, - test=args.test, - quiet=True, - ) - # Update config settings on the fly - cli.settings.update_params(args.config_params) - setup.settings.update_params(args.config_params) - experiment = select_experiment(setup, args.experiment_id, args.size) - cli.apply(setup, experiment=experiment) +def submit(args, parser, extra, subparser): + """ + submit parser submits via the Flux Restful API to one cluster + """ + cli, setup, experiment = prepare_client(args, extra) + cli.submit(setup, experiment=experiment) setup.cleanup(setup.matrices) diff --git a/fluxcloud/client/down.py b/fluxcloud/client/down.py index 67a00d7..c797595 100644 --- a/fluxcloud/client/down.py +++ b/fluxcloud/client/down.py @@ -4,27 +4,13 @@ # SPDX-License-Identifier: Apache-2.0 import fluxcloud.utils as utils -from fluxcloud.main import get_experiment_client -from fluxcloud.main.experiment import ExperimentSetup -from .helpers import select_experiment +from .helpers import prepare_client, select_experiment def main(args, parser, extra, subparser): utils.ensure_no_extra(extra) - - cli = get_experiment_client(args.cloud) - setup = ExperimentSetup( - args.experiments, - quiet=True, - cleanup=args.cleanup, - force_cluster=args.force_cluster, - outdir=args.output_dir, - ) - - # Update config settings on the fly - cli.settings.update_params(args.config_params) - setup.settings.update_params(args.config_params) + cli, setup, experiment = prepare_client(args, extra) if args.down_all: experiments = setup.matrices diff --git a/fluxcloud/client/helpers.py b/fluxcloud/client/helpers.py index 59d477b..5cefd9a 100644 --- a/fluxcloud/client/helpers.py +++ b/fluxcloud/client/helpers.py @@ -1,9 +1,36 @@ -# Copyright 2022 Lawrence Livermore National Security, LLC and other +# Copyright 2022-2023 Lawrence Livermore National Security, LLC and other # This is part of Flux Framework. See the COPYRIGHT file for details. # # SPDX-License-Identifier: Apache-2.0 +import fluxcloud.utils as utils from fluxcloud.logger import logger +from fluxcloud.main import get_experiment_client +from fluxcloud.main.experiment import ExperimentSetup + + +def prepare_client(args, extra): + """ + apply parser submits via separate CRDs. 
+ """ + utils.ensure_no_extra(extra) + + cli = get_experiment_client(args.cloud) + setup = ExperimentSetup( + args.experiments, + force_cluster=args.force_cluster, + template=args.template, + cleanup=args.cleanup, + outdir=args.output_dir, + test=args.test, + quiet=True, + ) + + # Update config settings on the fly + cli.settings.update_params(args.config_params) + setup.settings.update_params(args.config_params) + experiment = select_experiment(setup, args.experiment_id, args.size) + return cli, setup, experiment def select_experiment(setup, experiment_id, size=None): diff --git a/fluxcloud/client/run.py b/fluxcloud/client/run.py index b3c69ec..82a8e3a 100644 --- a/fluxcloud/client/run.py +++ b/fluxcloud/client/run.py @@ -3,28 +3,11 @@ # # SPDX-License-Identifier: Apache-2.0 -import fluxcloud.utils as utils -from fluxcloud.main import get_experiment_client -from fluxcloud.main.experiment import ExperimentSetup +from .helpers import prepare_client def main(args, parser, extra, subparser): - utils.ensure_no_extra(extra) - - cli = get_experiment_client(args.cloud) - setup = ExperimentSetup( - args.experiments, - template=args.template, - outdir=args.output_dir, - test=args.test, - cleanup=args.cleanup, - force_cluster=args.force_cluster, - force=args.force, - ) - - # Update config settings on the fly - cli.settings.update_params(args.config_params) - setup.settings.update_params(args.config_params) + cli, setup, _ = prepare_client(args, extra) # Set the Minicluster size across experiments if args.size: @@ -32,3 +15,14 @@ def main(args, parser, extra, subparser): cli.run(setup) setup.cleanup(setup.matrices) + + +def batch(args, parser, extra, subparser): + cli, setup, _ = prepare_client(args, extra) + + # Set the Minicluster size across experiments + if args.size: + setup.set_minicluster_size(args.size) + + cli.batch(setup) + setup.cleanup(setup.matrices) diff --git a/fluxcloud/client/up.py b/fluxcloud/client/up.py index 4c785a9..4fd6a67 100644 --- a/fluxcloud/client/up.py +++ b/fluxcloud/client/up.py @@ -3,28 +3,10 @@ # # SPDX-License-Identifier: Apache-2.0 -import fluxcloud.utils as utils -from fluxcloud.main import get_experiment_client -from fluxcloud.main.experiment import ExperimentSetup - -from .helpers import select_experiment +from .helpers import prepare_client def main(args, parser, extra, subparser): - utils.ensure_no_extra(extra) - - cli = get_experiment_client(args.cloud) - setup = ExperimentSetup( - args.experiments, - quiet=True, - cleanup=args.cleanup, - force_cluster=args.force_cluster, - outdir=args.output_dir, - ) - - # Update config settings on the fly - cli.settings.update_params(args.config_params) - setup.settings.update_params(args.config_params) - experiment = select_experiment(setup, args.experiment_id, args.size) + cli, setup, experiment = prepare_client(args, extra) cli.up(setup, experiment=experiment) setup.cleanup(setup.matrices) diff --git a/fluxcloud/logger.py b/fluxcloud/logger.py index 527cb16..6f7f16c 100644 --- a/fluxcloud/logger.py +++ b/fluxcloud/logger.py @@ -192,7 +192,6 @@ def setup_logger( stdout=False, debug=False, use_threads=False, - wms_monitor=None, ): # console output only if no custom logger was specified stream_handler = ColorizingStreamHandler( @@ -201,6 +200,7 @@ def setup_logger( use_threads=use_threads, ) logger.set_stream_handler(stream_handler) - logger.set_level(_logging.DEBUG if debug else _logging.INFO) + logger.level = _logging.DEBUG if debug else _logging.INFO + logger.set_level(logger.level) logger.quiet = quiet 
logger.printshellcmds = printshellcmds diff --git a/fluxcloud/main/api.py b/fluxcloud/main/api.py new file mode 100644 index 0000000..69ddbeb --- /dev/null +++ b/fluxcloud/main/api.py @@ -0,0 +1,170 @@ +# Copyright 2022-2023 Lawrence Livermore National Security, LLC and other +# This is part of Flux Framework. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: Apache-2.0 + +import atexit +import logging +import os +import shutil +import subprocess +import threading +import time +import uuid + +from flux_restful_client.main import get_client + +import fluxcloud.utils as utils +from fluxcloud.logger import logger + +here = os.path.dirname(os.path.abspath(__file__)) + +exit_event = threading.Event() + + +class APIClient: + def __init__(self, token=None, user=None): + """ + API client wrapper. + """ + self.user = user or os.environ.get("FLUX_USER") or "fluxuser" + self.token = token or os.environ.get("FLUX_TOKEN") or str(uuid.uuid4()) + self.cli = get_client(user=self.user, token=self.token) + self.proc = None + self.broker_pod = None + + def check(self, experiment): + """ + Find the running broker pod and open a port forward to it. + """ + minicluster = experiment.minicluster + get_broker_pod = experiment.get_shared_script( + "broker-id", {"minicluster": minicluster} + ) + + logger.info("Waiting for id of running broker pod...") + + # We've already waited for them to be running + broker_pod = None + while not broker_pod: + result = utils.run_capture(["/bin/bash", get_broker_pod], stream=True) + + # Save the broker pod once the script returns one + if result["message"]: + broker_pod = result["message"].strip() + + self.broker_pod = broker_pod + self.port_forward(minicluster["namespace"], self.broker_pod) + + def port_forward(self, namespace, broker_pod): + """ + Open a detached kubectl port-forward to the broker pod. + """ + command = ["kubectl", "port-forward", "-n", namespace, broker_pod, "5000:5000"] + + # This is detached - we can kill but not interact + logger.info(" ".join(command)) + self.proc = proc = subprocess.Popen( + command, + # Hide port-forward output unless debug logging is enabled + stdout=subprocess.DEVNULL if logger.level > logging.DEBUG else None, + ) + + def cleanup(): + proc.kill() + + # Ensure we cleanup if anything goes wrong + atexit.register(cleanup) + + def submit(self, setup, experiment, size): + """ + Use the client to submit the jobs programmatically. + """ + # Submit jobs! + + # Poll interval starts at 5 seconds, then tracks the previous job's runtime, assuming jobs are similar + sleep_time = 5 + for jobname, job in experiment.jobs.items(): + + # Do we want to run this job for this size and machine? + if not experiment.check_job_run(job, size): + logger.debug( + f"Skipping job {jobname} as it does not match inclusion criteria." + ) + continue + + if "command" not in job: + logger.debug(f"Skipping job {jobname} as it does not have a command.") + continue + + # The experiment is defined by the machine type and size + experiment_dir = experiment.root_dir + + # Add the size + jobname = f"{jobname}-minicluster-size-{size}" + job_output = os.path.join(experiment_dir, jobname) + logfile = os.path.join(job_output, "log.out") + + # Do we have output? + if os.path.exists(logfile) and not setup.force: + relpath = os.path.relpath(logfile, experiment_dir) + logger.warning( + f"{relpath} already exists and force is False, skipping."
+ ) + continue + + elif os.path.exists(logfile) and setup.force: + logger.warning(f"Cleaning up previous run in {job_output}.") + shutil.rmtree(job_output) + + # Create job directory anew + utils.mkdir_p(job_output) + + kwargs = dict(job) + del kwargs["command"] + + # Assume the task gets all nodes, unless specified in job + # Also assume the flux restful server is using one node + if "nodes" not in kwargs: + kwargs["nodes"] = size - 1 + if "tasks" not in kwargs: + kwargs["tasks"] = size - 1 + + # Ensure we convert - map between job params and the flux restful api + for convert in ( + ["num_tasks", "tasks"], + ["cores_per_task", "cores"], + ["gpus_per_task", "gpus"], + ["num_nodes", "nodes"], + ): + if convert[1] in kwargs: + kwargs[convert[0]] = kwargs[convert[1]] + + # Let's also keep track of actual time to get logs, info, etc. + start = time.time() + + # Run and block output until job is done + res = self.cli.submit(command=job["command"], **kwargs) + + logger.info(f"Submitting {jobname}: {job['command']}") + info = self.cli.jobs(res["id"]) + + while info["returncode"] == "": + info = self.cli.jobs(res["id"]) + time.sleep(sleep_time) + + end1 = time.time() + output = self.cli.output(res["id"]).get("Output") + if output: + utils.write_file("".join(output), logfile) + end2 = time.time() + + # Get the full job info, and add some wrapper times + info = self.cli.jobs(res["id"]) + info["start_to_info_seconds"] = end1 - start + info["start_to_output_seconds"] = end2 - start + + yield jobname, info + sleep_time = info["runtime"] + + # Kill the connection to the service + self.proc.kill() diff --git a/fluxcloud/main/client.py b/fluxcloud/main/client.py index 83c2b08..052bab5 100644 --- a/fluxcloud/main/client.py +++ b/fluxcloud/main/client.py @@ -8,6 +8,7 @@ import fluxcloud.utils as utils from fluxcloud.logger import logger +from fluxcloud.main.api import APIClient from fluxcloud.main.decorator import save_meta, timed here = os.path.dirname(os.path.abspath(__file__)) @@ -22,6 +23,7 @@ def __init__(self, *args, **kwargs): import fluxcloud.main.settings as settings self.settings = settings.Settings + self.info = {} self.times = {} # Job prefix is used for organizing time entries @@ -40,17 +42,12 @@ def run_timed(self, name, cmd): if res.returncode != 0: raise ValueError("nonzero exit code, exiting.") - def run_command(self, cmd, cleanup_func=None): + def run_command(self, cmd): """ Run a timed command, and handle nonzero exit codes. """ logger.debug("\n> Running Command: " + " ".join(cmd)) res = utils.run_command(cmd) - - # An optional cleanup function (also can run if not successful) - if cleanup_func is not None: - cleanup_func() - if res.returncode != 0: raise ValueError("nonzero exit code, exiting.") @@ -67,17 +64,24 @@ def run(self, setup): 3. bring down the cluster """ # Each experiment has its own cluster size and machine type - for experiment in setup.matrices: + for experiment in setup.iter_experiments(): + self.up(setup, experiment=experiment) + self.apply(setup, experiment=experiment) + self.down(setup, experiment=experiment) - # Don't bring up a cluster if experiments already run! - if not setup.force and experiment.is_run(): - logger.info( - f"Experiment on machine {experiment.expid} was already run and force is False, skipping." - ) - continue + @save_meta + def batch(self, setup): + """ + Run Flux Operator experiments via batch submit + 1. create the cluster + 2. run each command via submit to same MiniCluster + 3. 
bring down the cluster + """ + # Each experiment has its own cluster size and machine type + for experiment in setup.iter_experiments(): self.up(setup, experiment=experiment) - self.apply(setup, experiment=experiment) + self.submit(setup, experiment=experiment) self.down(setup, experiment=experiment) @save_meta @@ -88,11 +92,9 @@ def down(self, *args, **kwargs): raise NotImplementedError @save_meta - def apply(self, setup, experiment): + def submit(self, setup, experiment): """ - Apply a CRD to run the experiment and wait for output. - - This is really just running the setup! + Submit jobs to a shared MiniCluster via the Flux Restful API. """ # The MiniCluster can vary on size minicluster = experiment.minicluster @@ -102,8 +104,14 @@ def apply(self, setup, experiment): ) return - # The experiment is defined by the machine type and size - experiment_dir = experiment.root_dir + # Create a Flux Restful API client to submit to + # This will get creds from the environment or create new ones + api = APIClient() + logger.info( + "Save these if you want to log into the Flux RESTful interface; they are specific to the MiniCluster" + ) + logger.info(f"export FLUX_USER={api.user}") + logger.info(f"export FLUX_TOKEN={api.token}") # Iterate through all the cluster sizes for size in minicluster["size"]: @@ -115,53 +123,109 @@ def apply(self, setup, experiment): ) continue - # Jobname is used for output - for jobname, job in experiment.jobs.items(): - - # Do we want to run this job for this size and machine? - if not experiment.check_job_run(job, size): - logger.debug( - f"Skipping job {jobname} as does not match inclusion criteria." - ) - continue - - # Add the size - jobname = f"{jobname}-minicluster-size-{size}" - job_output = os.path.join(experiment_dir, jobname) - logfile = os.path.join(job_output, "log.out") - - # Any custom commands to run first? - if hasattr(self, "pre_apply"): - self.pre_apply(experiment, jobname, job) - - # Do we have output? - if os.path.exists(logfile) and not setup.force: - logger.warning( - f"{logfile} already exists and force is False, skipping." - ) - continue - - elif os.path.exists(logfile) and setup.force: - logger.warning(f"Cleaning up previous run in {job_output}.") - shutil.rmtree(job_output) - - # Create job directory anew - utils.mkdir_p(job_output) - - # Generate the populated crd from the template - crd = experiment.generate_crd(job, size) - - # Prepare specific .crd for template - # Note the output directory is already specific to the job index - kwargs = {"minicluster": minicluster, "logfile": logfile, "crd": crd} - apply_script = experiment.get_shared_script( - "minicluster-run", kwargs, suffix=f"-{jobname}" - ) + logger.info(f"\nšŸŒ€ Bringing up MiniCluster of size {size}") + + # Get the global "job" for the size (and validate only one image) + # This will raise an error if there is more than one image, or no image. + image = experiment.get_persistent_image(size) + job = {"image": image, "token": api.token, "user": api.user} + + # Pre-pull containers, etc. + if hasattr(self, "pre_apply"): + self.pre_apply(experiment, "global-job", job=job) + + # Create the minicluster via a CRD without a command + crd = experiment.generate_crd(job, size) + + # Create one MiniCluster CRD (without a command) to run the Flux Restful API + kwargs = { + "minicluster": minicluster, + "crd": crd, + "token": api.token, + "user": api.user, + "size": size, + } + submit_script = experiment.get_shared_script( + "minicluster-submit", kwargs, suffix=f"-size-{size}" + ) + # Start the MiniCluster! This should probably be done better...
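+ # The rendered submit script deletes any stale MiniCluster, re-applies + # the CRD, and waits for the pods and the Flux Restful server to be + # ready (see minicluster-submit and the wait_for_* includes).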
+ self.run_timed( + f"minicluster-submit-size-{size}", ["/bin/bash", submit_script] + ) + + # Ensure our credentials still work, and open port forward + api.check(experiment) + + # Save times (and logs in submit) as we go + for jobid, info in api.submit(setup, experiment, size): + logger.info(f"{jobid} took {info['runtime']} seconds.") + self.times[jobid] = info["runtime"] + self.info[jobid] = info + + logger.info(f"\nšŸŒ€ MiniCluster of size {size} is finished") + self.run_timed( + f"minicluster-destroy-size-{size}", ["kubectl", "delete", "-f", crd] + ) - # Apply the job, and save to output directory - self.run_timed( - f"{self.job_prefix}-{jobname}", ["/bin/bash", apply_script] + @save_meta + def apply(self, setup, experiment): + """ + Apply a CRD to run the experiment and wait for output. + + This is really just running the setup! + """ + # The MiniCluster can vary on size + if not experiment.jobs: + logger.warning( + f"Experiment {experiment.expid} has no jobs, nothing to run." + ) + return + + # Save output here + experiment_dir = experiment.root_dir + + for size, jobname, job in experiment.iter_jobs(): + + # Add the size + jobname = f"{jobname}-minicluster-size-{size}" + job_output = os.path.join(experiment_dir, jobname) + logfile = os.path.join(job_output, "log.out") + + # Any custom commands to run first? + if hasattr(self, "pre_apply"): + self.pre_apply(experiment, jobname, job) + + # Do we have output? + if os.path.exists(logfile) and not setup.force: + relpath = os.path.relpath(logfile, experiment_dir) + logger.warning( + f"{relpath} already exists and force is False, skipping." ) + continue + + elif os.path.exists(logfile) and setup.force: + logger.warning(f"Cleaning up previous run in {job_output}.") + shutil.rmtree(job_output) + + # Create job directory anew + utils.mkdir_p(job_output) + + # Generate the populated crd from the template + crd = experiment.generate_crd(job, size) + + # Prepare specific .crd for template + # Note the output directory is already specific to the job index + kwargs = { + "minicluster": experiment.minicluster, + "logfile": logfile, + "crd": crd, + } + apply_script = experiment.get_shared_script( + "minicluster-run", kwargs, suffix=f"-{jobname}" + ) + + # Apply the job, and save to output directory + self.run_timed(f"{self.job_prefix}-{jobname}", ["/bin/bash", apply_script]) def clear_minicluster_times(self): """ diff --git a/fluxcloud/main/clouds/shared/scripts/broker-id b/fluxcloud/main/clouds/shared/scripts/broker-id new file mode 100755 index 0000000..a45ba8c --- /dev/null +++ b/fluxcloud/main/clouds/shared/scripts/broker-id @@ -0,0 +1,12 @@ +#!/bin/bash + +NAMESPACE="{% if minicluster.namespace %}{{ minicluster.namespace }}{% else %}flux-operator{% endif %}" +JOB="{{ minicluster.name }}" +brokerPrefix="${JOB}-0" + +for pod in $(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=jsonpath='{.items[*].metadata.name}'); do + if [[ "${pod}" == ${brokerPrefix}* ]]; then + echo ${pod} + break + fi +done diff --git a/fluxcloud/main/clouds/shared/scripts/minicluster-run b/fluxcloud/main/clouds/shared/scripts/minicluster-run index 31a081b..aa6e11a 100755 --- a/fluxcloud/main/clouds/shared/scripts/minicluster-run +++ b/fluxcloud/main/clouds/shared/scripts/minicluster-run @@ -19,65 +19,12 @@ print_magenta "logfile : ${LOGFILE}" is_installed kubectl # Ensure we wait for the space to be cleaned up -echo -podsCleaned="false" -print_blue "Waiting for previous pods to be cleaned up..." 
-while [[ "${podsCleaned}" == "false" ]]; do
-    echo -n "."
-    sleep 2
-    state=$(kubectl get pods --namespace ${NAMESPACE} 2>&1)
-    lines=$(echo $state | wc -l)
-    if [[ "${lines}" == "1" ]] && [[ "${state}" == *"No resources found in"* ]]; then
-        echo
-        print_green "šŸŒ€ļø Previous pods are cleaned up."
-        podsCleaned="true"
-        break
-    fi
-done
+{% include "wait_for_cleanup.sh" %}
 
 # Create the namespace (ok if already exists)
 run_echo_allow_fail kubectl create namespace ${NAMESPACE}
 
-# Apply the job, get pods
-run_echo kubectl apply -f ${CRD}
-run_echo kubectl get -n ${NAMESPACE} pods
-
-# continue until we find the index-0 pod
-brokerPrefix="${JOB}-0"
-brokerReady="false"
-
-echo
-print_blue "Waiting for broker pod with prefix ${brokerPrefix} to be created..."
-while [[ "${brokerReady}" == "false" ]]; do
-    echo -n "."
-    sleep 2
-    for pod in $(kubectl get pods --selector=job-name=${JOB} --namespace ${NAMESPACE} --output=jsonpath='{.items[*].metadata.name}'); do
-        if [[ "${pod}" == ${brokerPrefix}* ]]; then
-            echo
-            print_green "šŸŒ€ļø Broker pod is created."
-            brokerReady="true"
-            break
-        fi
-    done
-done
-
-# Now broker pod needs to be running
-echo
-print_blue "Waiting for broker pod with prefix ${brokerPrefix} to be running..."
-brokerReady="false"
-while [[ "${brokerReady}" == "false" ]]; do
-    echo -n "."
-
-    # TODO - we likely want to check for running OR completed, it's rare but sometimes they can complete too fast.
-    for pod in $(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=jsonpath='{.items[*].metadata.name}'); do
-        if [[ "${pod}" == ${brokerPrefix}* ]]; then
-            echo
-            print_green "šŸŒ€ļø Broker pod is running."
-            brokerReady="true"
-            break
-        fi
-    done
-done
+{% include "wait_for_broker.sh" %}
 
 # Get the name of the pods
 pods=($(kubectl get pods --selector=job-name=${JOB} --namespace ${NAMESPACE} --output=jsonpath='{.items[*].metadata.name}'))
diff --git a/fluxcloud/main/clouds/shared/scripts/minicluster-submit b/fluxcloud/main/clouds/shared/scripts/minicluster-submit
new file mode 100755
index 0000000..3b2db0a
--- /dev/null
+++ b/fluxcloud/main/clouds/shared/scripts/minicluster-submit
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# This is a template that will be populated with variables by Flux-Cloud.
+# It brings up a MiniCluster running the Flux RESTful API, first deleting
+# any previous MiniCluster so that credentials do not go stale.
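+#
+# A sketch with hypothetical values: given minicluster.name "lammps" and no
+# namespace in the experiment file, the rendered script gets JOB="lammps"
+# and NAMESPACE="flux-operator" (the default below).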
+
+# Include shared helper scripts
+{% include "helpers.sh" %}
+
+NAMESPACE="{% if minicluster.namespace %}{{ minicluster.namespace }}{% else %}flux-operator{% endif %}"
+CRD="{{ crd }}"
+JOB="{{ minicluster.name }}"
+
+# Size -1 to account for certificate generator
+SIZE={{ size }}
+
+print_magenta "    apply : ${CRD}"
+print_magenta "      job : ${JOB}"
+
+is_installed kubectl
+
+# Create the namespace (ok if already exists)
+run_echo_allow_fail kubectl create namespace ${NAMESPACE}
+
+# Always cleanup a previous one so tokens don't get stale
+run_echo_allow_fail kubectl delete -f ${CRD}
+{% include "wait_for_cleanup.sh" %}
+
+# Ensure we have a MiniCluster of the right namespace running
+echo
+print_green "šŸŒ€ļø Creating MiniCluster in ${NAMESPACE}"
+{% include "wait_for_all.sh" %}
+{% include "wait_for_flux_restful.sh" %}
diff --git a/fluxcloud/main/clouds/shared/scripts/wait_for_all.sh b/fluxcloud/main/clouds/shared/scripts/wait_for_all.sh
new file mode 100644
index 0000000..3916e9e
--- /dev/null
+++ b/fluxcloud/main/clouds/shared/scripts/wait_for_all.sh
@@ -0,0 +1,20 @@
+# Apply the job, get pods
+run_echo kubectl apply -f ${CRD}
+run_echo kubectl get -n ${NAMESPACE} pods
+
+# continue until all pods are running
+podsReady="false"
+
+echo
+print_blue "Waiting for MiniCluster of size ${SIZE} to be ready..."
+while [[ "${podsReady}" == "false" ]]; do
+    echo -n "."
+    sleep 2
+    pods=$(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=name | wc -l)
+    if [[ "${pods}" == "${SIZE}" ]]; then
+        echo
+        print_green "šŸŒ€ļø All pods are running."
+        podsReady="true"
+        break
+    fi
+done
diff --git a/fluxcloud/main/clouds/shared/scripts/wait_for_broker.sh b/fluxcloud/main/clouds/shared/scripts/wait_for_broker.sh
new file mode 100644
index 0000000..9335313
--- /dev/null
+++ b/fluxcloud/main/clouds/shared/scripts/wait_for_broker.sh
@@ -0,0 +1,41 @@
+# Apply the job, get pods
+run_echo kubectl apply -f ${CRD}
+run_echo kubectl get -n ${NAMESPACE} pods
+
+# continue until we find the index-0 pod
+brokerPrefix="${JOB}-0"
+brokerReady="false"
+
+echo
+print_blue "Waiting for broker pod with prefix ${brokerPrefix} to be created..."
+while [[ "${brokerReady}" == "false" ]]; do
+    echo -n "."
+    sleep 2
+    for pod in $(kubectl get pods --selector=job-name=${JOB} --namespace ${NAMESPACE} --output=jsonpath='{.items[*].metadata.name}'); do
+        if [[ "${pod}" == ${brokerPrefix}* ]]; then
+            echo
+            print_green "šŸŒ€ļø Broker pod is created."
+            brokerReady="true"
+            break
+        fi
+    done
+done
+
+# Now broker pod needs to be running
+echo
+print_blue "Waiting for broker pod with prefix ${brokerPrefix} to be running..."
+brokerReady="false"
+while [[ "${brokerReady}" == "false" ]]; do
+    echo -n "."
+    sleep 2
+
+    # TODO - we likely want to check for running OR completed, it's rare but sometimes they can complete too fast.
+    for pod in $(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=jsonpath='{.items[*].metadata.name}'); do
+        if [[ "${pod}" == ${brokerPrefix}* ]]; then
+            echo
+            print_green "šŸŒ€ļø Broker pod is running."
+            brokerReady="true"
+            break
+        fi
+    done
+done
diff --git a/fluxcloud/main/clouds/shared/scripts/wait_for_cleanup.sh b/fluxcloud/main/clouds/shared/scripts/wait_for_cleanup.sh
new file mode 100644
index 0000000..ce232f1
--- /dev/null
+++ b/fluxcloud/main/clouds/shared/scripts/wait_for_cleanup.sh
@@ -0,0 +1,18 @@
+echo
+podsCleaned="false"
+print_blue "Waiting for previous MiniCluster to be cleaned up..."
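+# Poll kubectl every two seconds until the namespace reports no resources:
+# a single line of output containing "No resources found in" means the
+# previous MiniCluster is gone.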
+while [[ "${podsCleaned}" == "false" ]]; do
+    echo -n "."
+    sleep 2
+    state=$(kubectl get pods --namespace ${NAMESPACE} 2>&1)
+    lines=$(echo $state | wc -l)
+    if [[ "${lines}" == "1" ]] && [[ "${state}" == *"No resources found in"* ]]; then
+        echo
+        print_green "šŸŒ€ļø Previous pods are cleaned up."
+        podsCleaned="true"
+        break
+    fi
+done
diff --git a/fluxcloud/main/clouds/shared/scripts/wait_for_flux_restful.sh b/fluxcloud/main/clouds/shared/scripts/wait_for_flux_restful.sh
new file mode 100644
index 0000000..280e0d3
--- /dev/null
+++ b/fluxcloud/main/clouds/shared/scripts/wait_for_flux_restful.sh
@@ -0,0 +1,31 @@
+
+echo
+brokerPod=""
+brokerPrefix="${JOB}-0"
+while [[ "${brokerPod}" == "" ]]; do
+    # Avoid a busy wait while we look for the broker pod
+    sleep 2
+    for pod in $(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=jsonpath='{.items[*].metadata.name}'); do
+        if [[ "${pod}" == ${brokerPrefix}* ]]; then
+            echo
+            brokerPod=${pod}
+            break
+        fi
+    done
+done
+
+echo
+serverReady="false"
+print_blue "Waiting for Flux RESTful API server to be ready..."
+while [[ "${serverReady}" == "false" ]]; do
+    echo -n "."
+    sleep 2
+    logs=$(kubectl logs --namespace ${NAMESPACE} ${brokerPod} | grep "Uvicorn running")
+    retval=$?
+    if [[ "${retval}" == "0" ]]; then
+        echo
+        serverReady="true"
+        print_green "šŸŒ€ļø Flux RESTful API server is ready."
+        break
+    fi
+done
diff --git a/fluxcloud/main/decorator.py b/fluxcloud/main/decorator.py
index 8bc47cb..23f3fd4 100644
--- a/fluxcloud/main/decorator.py
+++ b/fluxcloud/main/decorator.py
@@ -41,7 +41,7 @@ def __call__(self, cls, *args, **kwargs):
         res = self.func(cls, *args, **kwargs)
 
         experiment = experiment or setup.get_single_experiment()
-        experiment.save_metadata(cls.times)
+        experiment.save_metadata(cls.times, cls.info)
         cls.clear_minicluster_times()
         return res
diff --git a/fluxcloud/main/experiment.py b/fluxcloud/main/experiment.py
index ec2eefd..1326271 100644
--- a/fluxcloud/main/experiment.py
+++ b/fluxcloud/main/experiment.py
@@ -59,6 +59,19 @@ def __init__(
         # Prepare the matrices for the setup
         self.prepare_matrices()
 
+    def iter_experiments(self):
+        """
+        Yield experiments that have not been run yet.
+        """
+        for experiment in self.matrices:
+            # Don't bring up a cluster if experiments already run!
+            if not self.force and experiment.is_run():
+                logger.info(
+                    f"Experiment on machine {experiment.expid} was already run and force is False, skipping."
+                )
+                continue
+            yield experiment
+
     def cleanup(self, experiments):
         """
         Cleanup the experiment script directory, if cleanup is true
@@ -152,6 +165,60 @@ def root_dir(self):
         """
         return os.path.join(self.outdir, self.expid)
 
+    def iter_jobs(self):
+        """
+        Iterate through experiment jobs
+        """
+        minicluster = self.minicluster
+
+        # Iterate through all the cluster sizes
+        for size in minicluster["size"]:
+
+            # We can't run if the minicluster > the experiment size
+            if size > self.size:
+                logger.warning(
+                    f"Cluster of size {self.size} cannot handle a MiniCluster of size {size}, skipping."
+                )
+                continue
+
+            # Jobname is used for output
+            for jobname, job in self.jobs.items():
+
+                # Do we want to run this job for this size and machine?
+                if not self.check_job_run(job, size):
+                    logger.debug(
+                        f"Skipping job {jobname} as it does not match inclusion criteria."
+                    )
+                    continue
+
+                yield size, jobname, job
+
+    def get_persistent_image(self, size):
+        """
+        A persistent image is the job image shared by all jobs for a MiniCluster size
+        """
+        image = None
+        for _, job in self.jobs.items():
+
+            # Skip jobs targeted for a different size
+            if "size" in job and job["size"] != size:
+                continue
+
+            if "image" in job and not image:
+                image = job["image"]
+                continue
+            if "image" in job and image != job["image"]:
+                raise ValueError(
+                    f"Submit uses a consistent container image, but found two images under size {size}: {image} and {job['image']}"
+                )
+
+        # If we get here and we don't have an image
+        if not image:
+            raise ValueError(
+                'Submit requires a container "image" under at least one job spec to create the MiniCluster.'
+            )
+        return image
+
     @property
     def script_dir(self):
         """
@@ -207,11 +274,13 @@ def generate_crd(self, job, minicluster_size):
         if "jobs" in experiment:
             del experiment["jobs"]
         experiment["job"] = job
-        result = template.render(**experiment)
+        result = template.render(**experiment).strip(" ")
         logger.debug(result)
 
         # Write to output directory
-        outfile = os.path.join(self.script_dir, "minicluster.yaml")
+        outfile = os.path.join(
+            self.script_dir, f"minicluster-size-{minicluster_size}.yaml"
+        )
         outdir = os.path.dirname(outfile)
         if not os.path.exists(outdir):
             logger.info(f"Creating output directory for scripts {outdir}")
@@ -274,11 +343,12 @@ def check_job_run(self, job, size):
                 return False
         return True
 
-    def save_metadata(self, times):
+    def save_metadata(self, times, info=None):
         """
         Save experiment metadata, loading an existing meta.json, if present.
         """
         experiment_dir = self.root_dir
+        info = info or {}
 
         # The experiment is defined by the machine type and size
         if not os.path.exists(experiment_dir):
@@ -286,7 +356,7 @@ def save_metadata(self, times):
         meta_file = os.path.join(experiment_dir, "meta.json")
 
         # Load existing metadata, if we have it
-        meta = {"times": times}
+        meta = {"times": times, "info": info}
         if os.path.exists(meta_file):
             meta = utils.read_json(meta_file)
 
@@ -297,9 +367,20 @@ def save_metadata(self, times):
                 continue
             meta["times"][timekey] = timevalue
 
+        # Update info
+        if "info" not in meta and info:
+            meta["info"] = {}
+        for key, value in info.items():
+            meta["info"][key] = value
+
         # TODO we could add cost estimation here - data from cloud select
         for key, value in self.experiment.items():
             meta[key] = value
+
+        # Do not add empty info (it is only populated in batch mode)
+        if "info" in meta and not meta["info"]:
+            del meta["info"]
+
         utils.write_json(meta, meta_file)
         return meta
diff --git a/fluxcloud/minicluster-template.yaml b/fluxcloud/minicluster-template.yaml
index d7e2a92..ede959d 100644
--- a/fluxcloud/minicluster-template.yaml
+++ b/fluxcloud/minicluster-template.yaml
@@ -13,34 +13,32 @@ spec:
   tasks: {% if job.tasks %}{{ job.tasks }}{% else %}1{% endif %}
 
   # Disable verbose output
-  test: {% if minicluster.verbose %}false{% else %}true{% endif %}
+  {% if job.quiet or job.timed %}logging:
+    {% if job.quiet %}quiet: true{% endif %}
+    {% if job.timed %}timed: true{% endif %}{% endif %}
+
+  # Optional credentials if running the flux restful api
+  {% if job.token or job.user %}fluxRestful:
+    {% if job.token %}token: "{{ job.token }}"{% endif %}
+    {% if job.user %}username: "{{ job.user }}"{% endif %}{% endif %}
 
   # TODO add pod resources, if needed
-
   # This is a list because a pod can support multiple containers
   containers:
     - image: {{ job.image }}
       {% if job.workdir %}workingDir: {{ job.workdir }}{% endif %}
-      command: {{ job.command }}
-
-      # Option Flags for this flux runner wait.sh entrypoint
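+      # For example (a sketch): with a command set on the job, this renders as
+      #   command: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite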
+      {% if job.command %}command: {{ job.command }}{% endif %}
       {% if job.flux_option_flags %}fluxOptionFlags: "-ompi=openmpi@5"{% endif %}
-
-      # Leave 2 cores for kubernetes, we have 96,
-      # this is just for eksctl
       cores: {% if job.cores %}{{ job.cores }}{% else %}1{% endif %}
-
-      # Resource limits to enable efa
       {% if job.limits or job.resources %}resources:{% endif %}
         {% if job.limits %}limits:
           {% for limit in job.limits %}
           {{ limit[0] }}: {{ limit[1] }}
           {% endfor %}{% endif %}
-
         {% if job.requests %}requests:
           {% for limit in job.requests %}
           {{ limit[0] }}: {{ limit[1] }}
           {% endfor %}{% endif %}
-
-      # custom preCommand logic (run at start of script)
       {% if job.pre_command %}preCommand: |
         {{ job.pre_command }}{% endif %}
diff --git a/fluxcloud/settings.yml b/fluxcloud/settings.yml
index 0d01b01..cf833ed 100644
--- a/fluxcloud/settings.yml
+++ b/fluxcloud/settings.yml
@@ -24,7 +24,7 @@ kubernetes:
 
 google:
   zone: us-central1-a
-  machine: n2-standard-1
+  machine: n1-standard-1
   project: null
 
 aws:
diff --git a/fluxcloud/version.py b/fluxcloud/version.py
index c51d2d1..b60ddbb 100644
--- a/fluxcloud/version.py
+++ b/fluxcloud/version.py
@@ -1,7 +1,7 @@
 # Copyright 2022-2023 Lawrence Livermore National Security, LLC
 # SPDX-License-Identifier: Apache-2.0
 
-__version__ = "0.1.14"
+__version__ = "0.1.15"
 AUTHOR = "Vanessa Sochat"
 EMAIL = "vsoch@users.noreply.github.com"
 NAME = "flux-cloud"
@@ -13,13 +13,12 @@
 ################################################################################
 
 # Global requirements
-# Since we assume wanting Singularity and lmod, we require spython and Jinja2
-
 INSTALL_REQUIRES = (
     ("ruamel.yaml", {"min_version": None}),
     ("jsonschema", {"min_version": None}),
    ("requests", {"min_version": None}),
     ("jinja2", {"min_version": None}),
+    ("flux-restful-client", {"min_version": None}),
 )
 
 TESTS_REQUIRES = (("pytest", {"min_version": "4.6.2"}),)
diff --git a/tests/lammps/data/k8s-size-4-local/.scripts/cluster-create-minikube.sh b/tests/lammps/data/k8s-size-4-local/.scripts/cluster-create-minikube.sh
index e3316b2..77fe414 100755
--- a/tests/lammps/data/k8s-size-4-local/.scripts/cluster-create-minikube.sh
+++ b/tests/lammps/data/k8s-size-4-local/.scripts/cluster-create-minikube.sh
@@ -135,7 +135,7 @@
 FORCE_CLUSTER="true"
 SIZE=4
 REPOSITORY="flux-framework/flux-operator"
 BRANCH="main"
-SCRIPT_DIR="/tmp/lammps-data-WHqAsc/k8s-size-4-local/.scripts"
+SCRIPT_DIR="/tmp/lammps-data-PeHJF2/k8s-size-4-local/.scripts"
 print_magenta "  cluster : ${CLUSTER_NAME}"
 print_magenta "  version : ${CLUSTER_VERSION}"
diff --git a/tests/lammps/data/k8s-size-4-local/.scripts/flux-operator.yaml b/tests/lammps/data/k8s-size-4-local/.scripts/flux-operator.yaml
index 242be7d..b4bc03e 100644
--- a/tests/lammps/data/k8s-size-4-local/.scripts/flux-operator.yaml
+++ b/tests/lammps/data/k8s-size-4-local/.scripts/flux-operator.yaml
@@ -177,6 +177,13 @@ spec:
               description: Port to run Flux Restful Server On
               format: int32
               type: integer
+            token:
+              description: Token to use for the RESTful API
+              type: string
+            username:
+              description: Username to use for the RESTful API. These two should
+                not actually be set by a user, but rather generated by tools.
+              type: string
           type: object
         jobLabels:
           additionalProperties:
diff --git a/tests/lammps/data/k8s-size-4-local/.scripts/minicluster-run-lmp-size-2-minicluster-size-2.sh b/tests/lammps/data/k8s-size-4-local/.scripts/minicluster-run-lmp-size-2-minicluster-size-2.sh
index 21ce39d..9622222 100755
--- a/tests/lammps/data/k8s-size-4-local/.scripts/minicluster-run-lmp-size-2-minicluster-size-2.sh
+++ b/tests/lammps/data/k8s-size-4-local/.scripts/minicluster-run-lmp-size-2-minicluster-size-2.sh
@@ -133,9 +133,9 @@ function with_exponential_backoff {
 }
 
 NAMESPACE="flux-operator"
-CRD="/tmp/lammps-data-WHqAsc/k8s-size-4-local/.scripts/minicluster.yaml"
+CRD="/tmp/lammps-data-PeHJF2/k8s-size-4-local/.scripts/minicluster-size-2.yaml"
 JOB="lammps"
-LOGFILE="/tmp/lammps-data-WHqAsc/k8s-size-4-local/lmp-size-2-minicluster-size-2/log.out"
+LOGFILE="/tmp/lammps-data-PeHJF2/k8s-size-4-local/lmp-size-2-minicluster-size-2/log.out"
 
 print_magenta "  apply : ${CRD}"
 print_magenta "    job : ${JOB}"
@@ -146,7 +146,7 @@ is_installed kubectl
 # Ensure we wait for the space to be cleaned up
 echo
 podsCleaned="false"
-print_blue "Waiting for previous pods to be cleaned up..."
+print_blue "Waiting for previous MiniCluster to be cleaned up..."
 while [[ "${podsCleaned}" == "false" ]]; do
     echo -n "."
     sleep 2
diff --git a/tests/lammps/data/k8s-size-4-local/.scripts/minicluster-run-lmp-size-4-minicluster-size-4.sh b/tests/lammps/data/k8s-size-4-local/.scripts/minicluster-run-lmp-size-4-minicluster-size-4.sh
index 17fc079..51d7299 100755
--- a/tests/lammps/data/k8s-size-4-local/.scripts/minicluster-run-lmp-size-4-minicluster-size-4.sh
+++ b/tests/lammps/data/k8s-size-4-local/.scripts/minicluster-run-lmp-size-4-minicluster-size-4.sh
@@ -133,9 +133,9 @@ function with_exponential_backoff {
 }
 
 NAMESPACE="flux-operator"
-CRD="/tmp/lammps-data-WHqAsc/k8s-size-4-local/.scripts/minicluster.yaml"
+CRD="/tmp/lammps-data-PeHJF2/k8s-size-4-local/.scripts/minicluster-size-4.yaml"
 JOB="lammps"
-LOGFILE="/tmp/lammps-data-WHqAsc/k8s-size-4-local/lmp-size-4-minicluster-size-4/log.out"
+LOGFILE="/tmp/lammps-data-PeHJF2/k8s-size-4-local/lmp-size-4-minicluster-size-4/log.out"
 
 print_magenta "  apply : ${CRD}"
 print_magenta "    job : ${JOB}"
@@ -146,7 +146,7 @@ is_installed kubectl
 # Ensure we wait for the space to be cleaned up
 echo
 podsCleaned="false"
-print_blue "Waiting for previous pods to be cleaned up..."
+print_blue "Waiting for previous MiniCluster to be cleaned up..."
 while [[ "${podsCleaned}" == "false" ]]; do
     echo -n "."
     sleep 2
diff --git a/tests/lammps/data/k8s-size-4-local/.scripts/minicluster-size-2.yaml b/tests/lammps/data/k8s-size-4-local/.scripts/minicluster-size-2.yaml
new file mode 100644
index 0000000..0a7c0fc
--- /dev/null
+++ b/tests/lammps/data/k8s-size-4-local/.scripts/minicluster-size-2.yaml
@@ -0,0 +1,22 @@
+apiVersion: flux-framework.org/v1alpha1
+kind: MiniCluster
+
+metadata:
+  name: lammps
+  namespace: flux-operator
+spec:
+  # localDeploy needs to be false
+  localDeploy: false
+
+  # Number of pods to create for MiniCluster
+  size: 2
+
+  # Disable verbose output
+  logging:
+    quiet: true
+
+  # This is a list because a pod can support multiple containers
+  containers:
+    - image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
+      workingDir: /home/flux/examples/reaxff/HNS
+      command: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite
\ No newline at end of file
diff --git a/tests/lammps/data/k8s-size-4-local/.scripts/minicluster.yaml b/tests/lammps/data/k8s-size-4-local/.scripts/minicluster-size-4.yaml
similarity index 100%
rename from tests/lammps/data/k8s-size-4-local/.scripts/minicluster.yaml
rename to tests/lammps/data/k8s-size-4-local/.scripts/minicluster-size-4.yaml
diff --git a/tests/lammps/data/k8s-size-4-local/meta.json b/tests/lammps/data/k8s-size-4-local/meta.json
index a105595..0efdda2 100644
--- a/tests/lammps/data/k8s-size-4-local/meta.json
+++ b/tests/lammps/data/k8s-size-4-local/meta.json
@@ -1,6 +1,6 @@
 {
     "times": {
-        "create-cluster": 91.648,
+        "create-cluster": 101.871,
         "minicluster-run-lmp-size-2-minicluster-size-2": 29.376,
         "minicluster-run-lmp-size-4-minicluster-size-4": 156.728,
         "destroy-cluster": 13.656