From 97f2777b34503b875f463f343d8a7149adf89fdf Mon Sep 17 00:00:00 2001 From: Vanessasaurus <814322+vsoch@users.noreply.github.com> Date: Sun, 5 Mar 2023 21:35:48 -0700 Subject: [PATCH] start of work to refactor submit (#28) * start of work to refactor submit submit now uses the flux operator python sdk (non released version) to create / submit / delete the minicluster, no need for external scripts except for starting minikube. This is likely an improvement and I am going to also test using the sdk for apply. * refactor of apply I still need to save the nodes/pods and other cluster info to the .scripts directory, but the basic apply is now working JUST using the Python SDK. When this is all done, we should be able to remove the custom scripts that are running jobs, and update and test all the cloud examples. * finish up refactor of submit/apply everything should now use the python sdk * set min version of fluxoperator sdk * test only apply * add back helpers script Signed-off-by: vsoch --- .github/workflows/main.yml | 85 +- .gitignore | 2 + CHANGELOG.md | 4 + README.md | 5 - docs/getting_started/commands.md | 333 ++++--- docs/getting_started/debugging.md | 163 ++++ docs/getting_started/examples.md | 34 +- docs/getting_started/experiments.md | 143 +-- docs/getting_started/google.md | 74 -- docs/getting_started/index.md | 4 +- docs/getting_started/minikube.md | 134 --- docs/index.rst | 6 +- docs/{getting_started => tutorials}/aws.md | 99 +- docs/tutorials/google.md | 91 ++ docs/tutorials/index.md | 11 + docs/tutorials/minikube.md | 313 +++++++ examples/aws-lammps/README.md | 71 -- examples/aws-lammps/experiments.yaml | 18 - examples/aws-lammps/minicluster-template.yaml | 15 - examples/google/osu-benchmarks/README.md | 5 + .../google/osu-benchmarks/experiments.yaml | 59 ++ examples/minikube/basic/README.md | 3 + .../basic}/experiments.yaml | 10 +- examples/minikube/logging/README.md | 3 + examples/minikube/logging/experiments.yaml | 24 + 
examples/minikube/osu-benchmarks/README.md | 5 + .../minikube/osu-benchmarks/experiments.yaml | 65 ++ examples/minikube/persistent/README.md | 49 + examples/minikube/persistent/experiments.yaml | 16 + examples/minikube/resources/README.md | 3 + examples/minikube/resources/experiments.yaml | 25 + examples/minikube/volumes/README.md | 3 + examples/minikube/volumes/experiments.yaml | 31 + examples/osu-benchmarks/README.md | 60 -- .../data/n1-standard-1-2/meta.json | 50 -- .../n1-standard-1-2/osu_acc_latency/log.out | 27 - .../n1-standard-1-2/osu_cas_latency/log.out | 5 - .../n1-standard-1-2/osu_fop_latency/log.out | 5 - .../osu_get_acc_latency/log.out | 27 - .../data/n1-standard-1-2/osu_get_bw/log.out | 27 - .../n1-standard-1-2/osu_get_latency/log.out | 27 - .../data/n1-standard-1-2/osu_put_bibw/log.out | 27 - .../data/n1-standard-1-2/osu_put_bw/log.out | 27 - .../n1-standard-1-2/osu_put_latency/log.out | 27 - .../data/n1-standard-2-2/meta.json | 51 -- .../n1-standard-2-2/osu_acc_latency/log.out | 27 - .../n1-standard-2-2/osu_cas_latency/log.out | 5 - .../n1-standard-2-2/osu_fop_latency/log.out | 5 - .../osu_get_acc_latency/log.out | 27 - .../data/n1-standard-2-2/osu_get_bw/log.out | 27 - .../n1-standard-2-2/osu_get_latency/log.out | 27 - .../data/n1-standard-2-2/osu_put_bibw/log.out | 27 - .../data/n1-standard-2-2/osu_put_bw/log.out | 27 - .../n1-standard-2-2/osu_put_latency/log.out | 27 - .../data/n1-standard-4-2/meta.json | 49 - .../n1-standard-4-2/osu_acc_latency/log.out | 27 - .../n1-standard-4-2/osu_cas_latency/log.out | 5 - .../n1-standard-4-2/osu_fop_latency/log.out | 5 - .../osu_get_acc_latency/log.out | 27 - .../data/n1-standard-4-2/osu_get_bw/log.out | 27 - .../n1-standard-4-2/osu_get_latency/log.out | 27 - .../data/n1-standard-4-2/osu_put_bibw/log.out | 27 - .../data/n1-standard-4-2/osu_put_bw/log.out | 27 - .../n1-standard-4-2/osu_put_latency/log.out | 27 - examples/osu-benchmarks/experiments.yaml | 32 - .../osu-benchmarks/minicluster-template.yaml | 31 - 
examples/up-apply-down/README.md | 37 - examples/up-apply-down/data/meta.json | 19 - .../data/n1-standard-1-2/reaxc-hns/log.out | 80 -- examples/up-apply-down/experiments.yaml | 14 - .../up-apply-down/minicluster-template.yaml | 24 - examples/up-submit-down/README.md | 72 -- .../.scripts/broker-id.sh | 12 - .../.scripts/cluster-create.sh | 204 ----- .../.scripts/cluster-destroy.sh | 161 ---- .../.scripts/flux-operator.yaml | 848 ------------------ .../.scripts/minicluster-size-2.yaml | 29 - .../.scripts/minicluster-submit-size-2.sh | 219 ----- .../hello-world-1-minicluster-size-2/log.out | 1 - .../hello-world-2-minicluster-size-2/log.out | 1 - .../hello-world-3-minicluster-size-2/log.out | 1 - .../hello-world-4-minicluster-size-2/log.out | 1 - .../hello-world-5-minicluster-size-2/log.out | 1 - .../data/k8s-size-4-n1-standard-1/meta.json | 698 -------------- .../reaxc-hns-1-minicluster-size-2/log.out | 80 -- .../reaxc-hns-2-minicluster-size-2/log.out | 80 -- .../reaxc-hns-3-minicluster-size-2/log.out | 80 -- .../reaxc-hns-4-minicluster-size-2/log.out | 80 -- .../reaxc-hns-5-minicluster-size-2/log.out | 80 -- examples/up-submit-down/plot_results.py | 145 --- fluxcloud/client/__init__.py | 59 +- fluxcloud/client/apply.py | 4 +- fluxcloud/client/experiment.py | 27 + fluxcloud/client/helpers.py | 3 +- fluxcloud/client/ui.py | 24 - fluxcloud/defaults.py | 3 - fluxcloud/main/__init__.py | 6 +- fluxcloud/main/api.py | 392 +++++--- fluxcloud/main/client.py | 174 ++-- .../main/clouds/aws/scripts/cluster-create | 4 + .../main/clouds/google/scripts/cluster-create | 4 +- .../local/scripts/cluster-create-minikube | 4 +- .../main/clouds/shared/scripts/broker-id | 12 - .../scripts/minicluster-create-persistent | 33 - .../clouds/shared/scripts/minicluster-run | 44 - .../clouds/shared/scripts/wait_for_all.sh | 20 - .../clouds/shared/scripts/wait_for_broker.sh | 40 - .../clouds/shared/scripts/wait_for_cleanup.sh | 15 - .../shared/scripts/wait_for_flux_restful.sh | 29 - 
fluxcloud/main/experiment.py | 164 ++-- fluxcloud/main/schemas.py | 7 +- fluxcloud/main/template.py | 91 ++ fluxcloud/minicluster-template.yaml | 42 - fluxcloud/tests/__init__.py | 0 fluxcloud/tests/helpers.py | 47 + fluxcloud/tests/test_examples.py | 181 ++++ fluxcloud/tests/test_settings.py | 50 ++ fluxcloud/tests/test_utils.py | 133 +++ fluxcloud/tests/testdata/hashtest.txt | 2 + fluxcloud/utils/__init__.py | 2 +- fluxcloud/utils/misc.py | 15 + fluxcloud/version.py | 4 +- tests/test.sh | 31 +- 123 files changed, 2355 insertions(+), 5054 deletions(-) create mode 100644 docs/getting_started/debugging.md delete mode 100644 docs/getting_started/google.md delete mode 100644 docs/getting_started/minikube.md rename docs/{getting_started => tutorials}/aws.md (54%) create mode 100644 docs/tutorials/google.md create mode 100644 docs/tutorials/index.md create mode 100644 docs/tutorials/minikube.md delete mode 100644 examples/aws-lammps/README.md delete mode 100644 examples/aws-lammps/experiments.yaml delete mode 100644 examples/aws-lammps/minicluster-template.yaml create mode 100644 examples/google/osu-benchmarks/README.md create mode 100644 examples/google/osu-benchmarks/experiments.yaml create mode 100644 examples/minikube/basic/README.md rename examples/{up-submit-down => minikube/basic}/experiments.yaml (64%) create mode 100644 examples/minikube/logging/README.md create mode 100644 examples/minikube/logging/experiments.yaml create mode 100644 examples/minikube/osu-benchmarks/README.md create mode 100644 examples/minikube/osu-benchmarks/experiments.yaml create mode 100644 examples/minikube/persistent/README.md create mode 100644 examples/minikube/persistent/experiments.yaml create mode 100644 examples/minikube/resources/README.md create mode 100644 examples/minikube/resources/experiments.yaml create mode 100644 examples/minikube/volumes/README.md create mode 100644 examples/minikube/volumes/experiments.yaml delete mode 100644 examples/osu-benchmarks/README.md delete 
mode 100644 examples/osu-benchmarks/data/n1-standard-1-2/meta.json delete mode 100644 examples/osu-benchmarks/data/n1-standard-1-2/osu_acc_latency/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-1-2/osu_cas_latency/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-1-2/osu_fop_latency/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-1-2/osu_get_acc_latency/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-1-2/osu_get_bw/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-1-2/osu_get_latency/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-1-2/osu_put_bibw/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-1-2/osu_put_bw/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-1-2/osu_put_latency/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-2-2/meta.json delete mode 100644 examples/osu-benchmarks/data/n1-standard-2-2/osu_acc_latency/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-2-2/osu_cas_latency/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-2-2/osu_fop_latency/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-2-2/osu_get_acc_latency/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-2-2/osu_get_bw/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-2-2/osu_get_latency/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-2-2/osu_put_bibw/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-2-2/osu_put_bw/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-2-2/osu_put_latency/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-4-2/meta.json delete mode 100644 examples/osu-benchmarks/data/n1-standard-4-2/osu_acc_latency/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-4-2/osu_cas_latency/log.out delete mode 100644 
examples/osu-benchmarks/data/n1-standard-4-2/osu_fop_latency/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-4-2/osu_get_acc_latency/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-4-2/osu_get_bw/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-4-2/osu_get_latency/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-4-2/osu_put_bibw/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-4-2/osu_put_bw/log.out delete mode 100644 examples/osu-benchmarks/data/n1-standard-4-2/osu_put_latency/log.out delete mode 100644 examples/osu-benchmarks/experiments.yaml delete mode 100644 examples/osu-benchmarks/minicluster-template.yaml delete mode 100644 examples/up-apply-down/README.md delete mode 100644 examples/up-apply-down/data/meta.json delete mode 100644 examples/up-apply-down/data/n1-standard-1-2/reaxc-hns/log.out delete mode 100644 examples/up-apply-down/experiments.yaml delete mode 100644 examples/up-apply-down/minicluster-template.yaml delete mode 100644 examples/up-submit-down/README.md delete mode 100755 examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/broker-id.sh delete mode 100755 examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-create.sh delete mode 100755 examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-destroy.sh delete mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/flux-operator.yaml delete mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-size-2.yaml delete mode 100755 examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-submit-size-2.sh delete mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-1-minicluster-size-2/log.out delete mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-2-minicluster-size-2/log.out delete mode 100644 
examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-3-minicluster-size-2/log.out delete mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-4-minicluster-size-2/log.out delete mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-5-minicluster-size-2/log.out delete mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/meta.json delete mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-1-minicluster-size-2/log.out delete mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-2-minicluster-size-2/log.out delete mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-3-minicluster-size-2/log.out delete mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-4-minicluster-size-2/log.out delete mode 100644 examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-5-minicluster-size-2/log.out delete mode 100644 examples/up-submit-down/plot_results.py create mode 100644 fluxcloud/client/experiment.py delete mode 100644 fluxcloud/client/ui.py delete mode 100755 fluxcloud/main/clouds/shared/scripts/broker-id delete mode 100755 fluxcloud/main/clouds/shared/scripts/minicluster-create-persistent delete mode 100755 fluxcloud/main/clouds/shared/scripts/minicluster-run delete mode 100644 fluxcloud/main/clouds/shared/scripts/wait_for_all.sh delete mode 100644 fluxcloud/main/clouds/shared/scripts/wait_for_broker.sh delete mode 100644 fluxcloud/main/clouds/shared/scripts/wait_for_cleanup.sh delete mode 100644 fluxcloud/main/clouds/shared/scripts/wait_for_flux_restful.sh create mode 100644 fluxcloud/main/template.py delete mode 100644 fluxcloud/minicluster-template.yaml create mode 100644 fluxcloud/tests/__init__.py create mode 100644 fluxcloud/tests/helpers.py create mode 100644 fluxcloud/tests/test_examples.py create mode 100644 fluxcloud/tests/test_settings.py create mode 100644 fluxcloud/tests/test_utils.py create 
mode 100644 fluxcloud/tests/testdata/hashtest.txt diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3739a40..04aa71f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -24,13 +24,74 @@ jobs: pip install -r .github/dev-requirements.txt pre-commit run --all-files - test-runs: + test-python: + runs-on: ubuntu-latest + steps: + - name: Clone the code + uses: actions/checkout@v3 + + - name: Install flux-cloud + run: | + conda create --quiet --name fc jinja2 + export PATH="/usr/share/miniconda/bin:$PATH" + source activate fc + pip install .[all] + + - name: Test Python + run: | + export PATH="/usr/share/miniconda/bin:$PATH" + source activate fc + export SHELL=/bin/bash + # This will bring MiniKube up/down + pytest -xs ./fluxcloud/tests/test_settings.py + pytest -xs ./fluxcloud/tests/test_utils.py + + test-examples: runs-on: ubuntu-latest strategy: fail-fast: false matrix: - test: ["lammps"] + test: ["test_minicluster_logging", "test_minicluster_volumes", + "test_minicluster_resources"] + steps: + - name: Clone the code + uses: actions/checkout@v3 + + - name: Setup Go + uses: actions/setup-go@v3 + with: + go-version: ^1.18 + + - name: Install flux-cloud + run: | + conda create --quiet --name fc jinja2 + export PATH="/usr/share/miniconda/bin:$PATH" + source activate fc + pip install .[all] + pip install kubernetes + - name: Start minikube + uses: medyagh/setup-minikube@697f2b7aaed5f70bf2a94ee21a4ec3dde7b12f92 # v0.0.9 + + - name: Test Example + env: + test: ${{ matrix.test }} + run: | + export PATH="/usr/share/miniconda/bin:$PATH" + source activate fc + export SHELL=/bin/bash + eval $(minikube -p minikube docker-env) + # We need to delete the minikube cluster to bring it up again + minikube delete + # This will bring MiniKube up/down + pytest -xs ./fluxcloud/tests/test_examples.py::${test} + + test-runs: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + command: [["apply", "lammps"], ["submit", 
"./examples/minikube/basic"]] steps: - name: Clone the code uses: actions/checkout@v3 @@ -50,9 +111,10 @@ jobs: - name: Start minikube uses: medyagh/setup-minikube@697f2b7aaed5f70bf2a94ee21a4ec3dde7b12f92 # v0.0.9 - - name: Test ${{ matrix.test }} + - name: Test ${{ matrix.command }} env: - name: ${{ matrix.test }} + name: ${{ matrix.command[1] }} + if: (matrix.command[0] == 'apply') run: | export PATH="/usr/share/miniconda/bin:$PATH" source activate fc @@ -61,3 +123,18 @@ jobs: # We need to delete the minikube cluster to bring it up again minikube delete /bin/bash ./tests/test.sh ${name} + + - name: Test ${{ matrix.command }} + env: + workdir: ${{ matrix.command[1] }} + if: (matrix.command[0] == 'submit') + run: | + export PATH="/usr/share/miniconda/bin:$PATH" + source activate fc + export SHELL=/bin/bash + eval $(minikube -p minikube docker-env) + minikube delete + cd ${workdir} + flux-cloud up --cloud minikube --force-cluster + flux-cloud --debug submit --non-interactive + flux-cloud down --cloud minikube diff --git a/.gitignore b/.gitignore index 50fc771..50592df 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ env .env dist __pycache__ +examples/**/data +examples/**/_data diff --git a/CHANGELOG.md b/CHANGELOG.md index aa1b003..b231b2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,10 @@ and **Merged pull requests**. Critical items to know are: The versions coincide with releases on pip. Only major versions will be released as tags on Github. ## [0.0.x](https://github.com/converged-computing/flux-cloud/tree/main) (0.0.x) + - refactor flux submit and apply to use fluxoperator Python SDK (0.2.0) + - This reduces scripts in output folder, but is a good tradeoff for fewer errors + - remove "ui" command, flux-cloud is intended mostly for automation + - command and image will always be required. - fix bash script bugs (0.1.19) - support for node group level aws avail. 
zones, save times on each experiment apply (0.1.18) - data should be namespaced by cloud type (so multiple experiments can be run alongside) (0.1.17) diff --git a/README.md b/README.md index 4e9ae0c..8cd28ea 100644 --- a/README.md +++ b/README.md @@ -23,11 +23,6 @@ It will be expanded as needed. Code is under development and likely to change! In the meantime, for early documentation, see our ā­ļø [Documentation](https://converged-computing.github.io/flux-cloud/) ā­ļø -## TODO - - - test for list of experiments - - cloud-select could estimate the cost? - - run and add more cluster examples ## šŸ˜ļø Contributors šŸ˜ļø diff --git a/docs/getting_started/commands.md b/docs/getting_started/commands.md index 863e9ae..0c7f846 100644 --- a/docs/getting_started/commands.md +++ b/docs/getting_started/commands.md @@ -1,11 +1,104 @@ # Commands -The following commands are provided by Flux Cloud. For running jobs, you can either do: +Welcome to the commands section! You can learn the details of each command below, or +check out an [example](examples.md) or [cloud tutorial](../tutorials/index.md). +The general steps you want to take are: -- **apply**/**run**: A single/multi job submission intended for different containers to re-create pods each time. -- **batch**/**submit**: A single/multi job submission intended for a common container base where we use the same set of pods. +1. Generate or find an `experiments.yaml` configuration. +2. Decide if you want to use `submit` or `apply` +3. Create the cluster, run experiments, and clean up. -Both are described in the following sections. +If you don't want to use an existing example, see [experiment init](#init) for how to create an `experiments.yaml` from scratch. + +> What's the difference between submit and apply? + +For `apply`, we are running one job per Minicluster (the Flux Operator custom resource definition). 
This means +we bring up an entire set of pods for each container (each entry under "jobs" in your experiment.yaml), +run the single job directly with `flux start -> flux submit` to provide the command to the broker, and then +when it finishes the container will exit and the job cleans up. This approach is likely suited to fewer jobs +that are longer running, and if you want to see output appear as it's available (we stream the log from the broker pod). +For `apply` we also skip creating the [Flux RESTFul API](https://github.com/flux-framework/flux-restful-api) server, +so it's one less dependency to worry about, and you also don't need to think about exposing an API or users. + +For `submit`, we take advantage of Flux as a scheduler, bringing up the fewest number of MiniClusters we can +derive based on the unique containers and sizes in your `experiments.yaml`. This means that, for each unique +set, we bring up one MiniCluster, and then submit all your jobs at once, allowing Flux to act as a scheduler. +We poll the server every 30 seconds to get an update on running jobs, and when they are all complete, jobs +output and results are saved. This approach is more ideal for many smaller jobs, as the MiniClusters are +only brought up once (and you don't need to wait for pods to go up and down for each job). The cons of this +approach are getting logs at the end, unless you decide to interact with the Flux RESTFul API on your own +earlier. + +Next, read about how to use these commands in detail. + +## experiment + +### init + +When you want to create a new experiment, do: + +```bash +$ mkdir -p my-experiment +$ cd my-experiment + +# Create a new experiment for minikube +$ flux-cloud experiment init --cloud minikube +$ flux-cloud experiment init --cloud aws +$ flux-cloud experiment init --cloud google +``` + +This will create an `experiments.yaml` template with custom variables for your +cloud of choice, and robustly commented. + +
+ +View Example Output of flux-cloud experiment init + +```bash +$ flux-cloud experiment init --cloud google > experiments.yaml +``` +```yaml +matrix: + size: [4] + + # This is a Google Cloud machine + machine: [n1-standard-1] + +variables: + # Customize zone just for this experiment + # otherwise defaults to your settings.yml + zone: us-central1-a + +# Flux MiniCluster experiment attributes +minicluster: + name: my-job + namespace: flux-operator + # Each of these sizes will be brought up and have commands run across it + # They must be smaller than the Kubernetes cluster size or not possible to run! + size: [2, 4] + +# Under jobs should be named jobs (output organized by name) where +# each is required to have a command and image. Repeats is the number +# of times to run each job +jobs: + reaxc-hns: + command: 'lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + working_dir: /home/flux/examples/reaxff/HNS + sleep: + command: 'sleep 5' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + working_dir: /home/flux/examples/reaxff/HNS + hello-world: + command: 'echo hello world' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + working_dir: /home/flux/examples/reaxff/HNS +``` + +
## list @@ -46,9 +139,86 @@ And this will run across sizes. To ask for a specific size: $ flux-cloud apply -e k8s-size-8-m5.large --size 2 ``` -## run +### up + +Here is how to bring up a cluster (with the operator installed). For this command, +we will either select the first in the matrix (default): + +```bash +$ flux-cloud up +``` +```console +No experiment ID provided, assuming first experiment n1-standard-1-2. +``` + +or if you want to specify an experiment identifier based on the machine and size, you can do that: + +```bash +$ flux-cloud up -e n1-standard-1-2 +``` +```console +Selected experiment n1-standard-1-2. +``` + +And to force up without a prompt: + +```bash +$ flux-cloud up -e n1-standard-1-2 --force-cluster +``` + +## Ways to run jobs + +The following commands are provided by Flux Cloud. For running jobs, you can either do: + +- **apply**/**run**: A single/multi job submission intended for different containers to re-create pods each time. +- **batch**/**submit**: A batch mode, where we submit / schedule many jobs on the fewest MiniClusters + +Both are described in the following sections. + +### apply / run + +> Ideal for running multiple jobs with different containers. + +An apply assumes that you want to create a separate MiniCluster each time, meaning +bringing up an entire set of pods, running a single command, and then bringing everything +down. This is ideal for longing running experiments, but note that it does not take advantage +of using Flux as a scheduler. Flux is basically running one job and going away. + +#### apply + +After "up" you can choose to run experiments (as you feel) with "apply." + +```bash +$ flux-cloud apply +``` + +The same convention applies - not providing the identifier runs the +first entry, otherwise we use the identifier you provide. 
+ +```bash +$ flux-cloud apply -e n1-standard-1-2 +``` + +To force overwrite of existing results (by default they are skipped) + +```bash +$ flux-cloud apply -e n1-standard-1-2 --force +``` + +Apply is going to be creating one CRD per job, so that's a lot of +pod creation and deletion. This is in comparison to "submit" that +brings up a MiniCluster once, and then executes commands to it, allowing +Flux to serve as the scheduler. Note that by default, we always wait for a previous run to be cleaned up +before continuing. If you don't want apply to be interactive (e.g., it will +ask you before cleaning up) you can do: + +```bash +$ flux-cloud apply --non-interactive +``` + +By default, apply via a "run" is non-interactive. -> Up, apply, down in one command, ideal for completely headless runs and jobs with different containers. +#### run The main command is a "run" that is going to, for each cluster: @@ -112,67 +282,18 @@ $ flux-cloud apply -e n1-standard-1-2 $ flux-cloud down -e n1-standard-1-2 ``` -These commands are discussed in more next. - -### up - -Here is how to bring up a cluster (with the operator installed). For this command, -we will either select the first in the matrix (default): - -```bash -$ flux-cloud up -``` -```console -No experiment ID provided, assuming first experiment n1-standard-1-2. -``` - -or if you want to specify an experiment identifier based on the machine and size, you can do that: - -```bash -$ flux-cloud up -e n1-standard-1-2 -``` -```console -Selected experiment n1-standard-1-2. -``` - -And to force up without a prompt: - -```bash -$ flux-cloud up -e n1-standard-1-2 --force-cluster -``` - -## apply +### submit / batch -> Ideal for running multiple jobs with different containers. +> Ideal for one or more commands and/or containers across persistent MiniClusters. -After "up" you can choose to run experiments (as you feel) with "apply." +These commands submit multiple jobs to the same MiniCluster and actually use Flux +as a scheduler! 
This means we get the unique set of images and MiniCluster sizes for +your experiments, and then bring up each one, submitting the matching jobs to it. +We submit all jobs at once, and then poll Flux until they are completed to get output. -```bash -$ flux-cloud apply -``` +#### submit -The same convention applies - not providing the identifier runs the -first entry, otherwise we use the identifier you provide. - -```bash -$ flux-cloud apply -e n1-standard-1-2 -``` - -To force overwrite of existing results (by default they are skipped) - -```bash -$ flux-cloud apply -e n1-standard-1-2 --force -``` - -Apply is going to be creating on CRD per job, so that's a lot of -pod creation and deletion. This is in comparison to "submit" that -brings up a MiniCluster once, and then executes commands to it, allowing -Flux to serve as the scheduler. Note that by default, we always wait for a previous run to be cleaned up -before continuing. - -## submit - -> Ideal for one or more commands across the same container(s) and MiniCluster size. +The entire flow might look like: ```bash $ flux-cloud up --cloud minikube @@ -185,27 +306,31 @@ to submit jobs. For submit (and the equivalent to bring it up and down with batc your commands aren't provided in the CRD, but rather to the Flux Restful API. Submit / batch will also generate one CRD per MiniCluster size, but use the same MiniCluster across jobs. This is different -from apply, which generates one CRD per job to run. +from apply, which generates one CRD per job to run. If you don't want submit to be interactive +(e.g., it will ask you before cleaning up) you can do: -## batch +```bash +$ flux-cloud submit --non-interactive +``` -> Up, submit, down in one command, ideal for jobs with the same container(s) +By default, submit run with batch is non-interactive. + +#### batch + +This is the equivalent of "submit" but includes the up and down for the larger +Kubernetes cluster. 
+ +```bash +$ flux-cloud batch --cloud aws +``` -The "batch" command is comparable to "run" except we are running commands -across the same set of containers. We don't need to bring pods up/down each time, -and we are using Flux in our cluster to handle scheduling. This command is going to: 1. Create the cluster 2. Run each of the experiments, saving output and timing, on the same pods 3. Bring down the cluster -The output is organized in the same way, and as before, you can choose to run a single -command with "submit" - -```bash -$ flux-cloud batch --cloud aws -``` +The output is organized in the same way, Note that since we are communicating with the FluxRestful API, you are required to provide a `FLUX_USER` and `FLUX_TOKEN` for the API. If you are running this programmatically, @@ -219,32 +344,6 @@ $ flux-cloud submit $ flux-cloud down ``` -## ui - -If you are interested in interactive submission on your own, either in the user interface -or via one of our client SDKs, you can bring up the MiniCluster and it's interface with -the Flux Restful API with `ui`: - -```bash -$ flux-cloud ui --cloud minikube -``` - -If you have many sizes of MiniClusters, you'll need to specify the one that you want: - -```bash -$ flux-cloud ui --cloud minikube --size 4 -``` - -By default, it will use your single MiniCluster size. - - - -Which then looks like this in the browser, available for submission via the interface itself -or the restful API until the user presses control+c to close the port forward and delete -the MiniCluster. - -![img/ui.png](img/ui.png) - ## down And then bring down your first (or named) cluster: @@ -266,7 +365,6 @@ You can also use `--force-cluster` here: $ flux-cloud down --force-cluster ``` - ## debug For any command, you can add `--debug` as a main client argument to see additional information. 
E.g., @@ -297,11 +395,10 @@ managedNodeGroups: ## scripts -By default, flux cloud keeps all scripts that the job renders in the experiment output directory under `.scripts`. If you -want to cleanup instead, you can add the `--cleanup` flag. We do this so you can inspect a script to debug, or if you -just want to keep them for reproducibility. As an example, here is outfrom from a run with multiple repeats of the -same command, across two MiniCluster cluster sizes (2 and 4). As of version `0.1.17` the data is also organized -by the runner (e.g., minikube vs google) so you can run the experiments across multiple clouds without conflict. +Flux cloud (prior to version 0.2.0) ran each job with a script, and it would save each script. Since version 0.2.0, +we refactored to do everything with Python APIs/SDKs, so we no longer save submit scripts. However, we still save +scripts for bringing up and down each cluster, along with node and pod metadata (as json). We save this in the +hidden `.scripts` directory. ```console $ tree -a ./data/ @@ -314,17 +411,11 @@ $ tree -a ./data/ ā”‚ ā””ā”€ā”€ log.out ā”œā”€ā”€ meta.json ā””ā”€ā”€ .scripts - ā”œā”€ā”€ cluster-create-minikube.sh - ā”œā”€ā”€ flux-operator.yaml - ā”œā”€ā”€ kubectl-version.yaml - ā”œā”€ā”€ minicluster-run-lmp-size-2-minicluster-size-2.sh - ā”œā”€ā”€ minicluster-run-lmp-size-4-minicluster-size-4.sh - ā”œā”€ā”€ minicluster-size-2.yaml - ā”œā”€ā”€ minicluster-size-4.yaml - ā”œā”€ā”€ minikube-version.json - ā”œā”€ā”€ nodes-size-4.json - ā””ā”€ā”€ nodes-size-4.txt -``` - -And that's it! I think there might be a more elegant way to determine what cluster is running, -however if the user decides to launch more than one, it might be harder. More thinking / docs / examples coming soon. 
+ ā”œā”€ā”€ minicluster-size-2-lammps-job-ghcr.io-rse-ops-lammps-flux-sched-focal-v0.24.0.json + ā”œā”€ā”€ nodes-2-lammps-job-ghcr.io-rse-ops-lammps-flux-sched-focal-v0.24.0.json + ā””ā”€ā”€ pods-size-2-lammps-job-ghcr.io-rse-ops-lammps-flux-sched-focal-v0.24.0.json +``` + +And that's it! We recommend you look at [examples](examples.md) or [tutorials](../tutorials/index.md) for +getting started. If you are brave, just run `flux-cloud experiment init --cloud ` to create +your own experiment from scratch. diff --git a/docs/getting_started/debugging.md b/docs/getting_started/debugging.md new file mode 100644 index 0000000..53bea0a --- /dev/null +++ b/docs/getting_started/debugging.md @@ -0,0 +1,163 @@ +# Debugging + +> Oh no, my MiniCluster jobs aren't running! + +Kubernetes is a complex beast, so here are some debugging tips that might help you figure out what +is going on. We are generally going to be looking at objects owned by the Flux Operator - pods, +config maps, and (sometimes volumes or services). Note that the object deployed by the Flux Operator +custom resource definition is called a `minicluster`: + +```bash +$ kubectl get -n flux-operator minicluster +``` +```console +NAME AGE +osu-benchmarks 57s +``` + +## 0. kubectl pro tips + +These tips come from the amazing [Claudia](https://github.com/cmisale)! + +It's fairly arduous to copy paste or type complete pod names, especially for indexed jobs where there is a random +set of characters. You can enable kubectl to autocomplete by adding this to your bash profile (`~/.bashrc`): + +```bash +source <(kubectl completion bash) +``` + +Another shortcut that is nice to have is to make an alias for `kubectl` to just be `k`: + +```bash +alias k=kubectl +``` + +Another tip is how to get an interactive session to a pod: + +```bash +$ kubectl exec -n flux-operator -it -- bash +``` + +Yes, it's very docker-like! I've found I'm much faster having these tricks than before. + + +## 1. 
Start with logs

+
+You can usually first look to pod logs to see what pods are there, and their various states:
+
+```bash
+$ kubectl get -n flux-operator pods
+```
+
+Remember that if you use `flux-cloud apply` without debug, you won't see output after it finds the broker pod,
+but you'll see it being printed to logs in your `data` folder. If you want to see output, either add `--debug`
+after `flux-cloud` or look at the log and add `-f` to keep it hanging:
+
+```bash
+# See instant of a log
+$ kubectl logs -n flux-operator osu-benchmarks-0-vxnfq
+
+# Stream to the terminal until the container is done
+$ kubectl logs -n flux-operator osu-benchmarks-0-vxnfq -f
+```
+
+Here is looking at output for the certificate generator pod:
+
+```bash
+$ kubectl logs -n flux-operator osu-benchmarks-cert-generator
+```
+
+For `flux-cloud apply` if you want to see output consistently, it's suggested to add `--debug`,
+as the miniclusters are going to be created / deleted and you'd need to grab the pod logs
+multiple times!
+
+### What should I expect to see?
+
+The certificate generator pod runs first. Its output should *only* be
+the certificate:
+
+```console
+# **** Generated on 2023-03-04 04:24:46 by CZMQ ****
+# ZeroMQ CURVE **Secret** Certificate
+# DO NOT PROVIDE THIS FILE TO OTHER USERS nor change its permissions.
+
+metadata
+ name = "osu-benchmarks-cert-generator"
+ time = "2023-03-04T04:24:46"
+ userid = "0"
+ hostname = "osu-benchmarks-cert-generator"
+curve
+ public-key = "l12&OlN-DwF*6rhx##Y#ZQ^9w1zON039Vxh2&+8r"
+ secret-key = "o^(dM0R96q-d=2Jk-tEjgh=syRjW?q6%Kq{Q8Y4H"
+```
+
+If you see any error message about "invalid curve cert" this means that something was incorrectly
+generated. As an example, you should use `preCommand` for any logic that is shared between
+the certificate generator and worker/broker pods (e.g., sourcing an environment for Flux) and commands->pre
+for anything else that is just for the worker/broker pods (printing to debug, etc.)
+ +For the broker pod, you should expect to see debugging output (if logging->debug is true) and then the +Flux Broker starting. The quorum should be reported to be full. E.g., + +```console +šŸŒ€ flux start -o --config /etc/flux/config -Scron.directory=/etc/flux/system/cron.d -Stbon.fanout=256 -Srundir=/run/flux -Sstatedir=/var/lib/flux -Slocal-uri=local:///run/flux/local -Slog-stderr-level=6 -Slog-stderr-mode=local +broker.info[1]: start: none->join 13.3684ms +broker.info[1]: parent-ready: join->init 1.14525s +broker.info[1]: configuration updated +broker.info[1]: rc1.0: running /etc/flux/rc1.d/01-sched-fluxion +broker.info[1]: rc1.0: running /etc/flux/rc1.d/02-cron +broker.info[1]: rc1.0: /etc/flux/rc1 Exited (rc=0) 0.2s +broker.info[1]: rc1-success: init->quorum 0.234173s +broker.info[1]: quorum-full: quorum->run 0.204937s +``` + +If you see any error messages from the broker, this should be looked into. +Warnings can sometimes be OK. Ask if you aren't sure. + +## 2. Use describe + +You can describe any object in Kubernetes space to debug. Describe is especially important when you are debugging +storage and want to figure out why something isn't mounting. Typically you might start by looking at pods in all +namespaces: + +```bash +$ kubectl get pods --all-namespaces -o wide +``` + +The wide format is useful because it will show you the node each pod is assigned to, which can be useful +for debugging resource limits and requests. You then might want to describe a particular pod, +maybe to look at annotations or volume mounts: + +```bash +$ kubectl describe pod -n flux-operator osu-benchmarks-1-tj6bt +``` + +You can get json output with a get for the pod (or object): + +```bash +$ kubectl get pod -n flux-operator osu-benchmarks-1-tj6bt -o json +``` + +And pipe that into `jq` to look for specific attributes! So let's say you see that a volume +failed for your pod. 
You likely want to next check your persistent volumes "pv" and claims "pvc": + +```bash +$ kubectl describe -n flux-operator pv +$ kubectl describe -n flux-operator pvc +``` + +For volumes, if you are using a container storage interface (CSI) you likely are using a daemon set that +deploys pods. Try looking at the logs for the pods, and/or the daemonset for issues: + +```bash +$ kubectl describe daemonset --all-namespaces +``` + +Finally, services (svc) can be useful if you suspect a permission or credential is wonky. + +## 3. Advanced + +Often when I'm debugging something complex I try to create the object I'm interested in so it is in a +continuously running state. As an example, to test a pod for a daemonset, I will get the raw YAML +for the daemonset and change the entrypoint to `sleep infinity`. I can then shell in and manually run +commands to see their output. diff --git a/docs/getting_started/examples.md b/docs/getting_started/examples.md index b2e0cdd..7ddd9be 100644 --- a/docs/getting_started/examples.md +++ b/docs/getting_started/examples.md @@ -3,24 +3,24 @@ The easiest thing to do is arguably to start with an example, and then customize it. Here we will add examples as we create them. -- [up-apply-down](https://github.com/converged-computing/flux-cloud/tree/main/examples/up-apply-down): shows using `flux-cloud apply` for individual CRD submission. -- [osu-benchmarks](https://github.com/converged-computing/flux-cloud/tree/main/examples/osu-benchmarks) -- [up-submit-down](https://github.com/converged-computing/flux-cloud/tree/main/examples/up-submit-down): shows using `flux-cloud submit` for batch submission. -- [aws-lammps](https://github.com/converged-computing/flux-cloud/tree/main/examples/aws-lammps): a simple lammps run on AWS. - -The above example runs a single command in a single Kubernetes cluster and MiniCluster, -and it's lammps! 
-
-## Demo
-
-Here is a quick demo from the [up-apply-down](https://github.com/converged-computing/flux-cloud/tree/main/examples/up-apply-down) in the repository.
-
-
-
-which was actually run as:
+- [minikube](https://github.com/converged-computing/flux-cloud/tree/main/examples/minikube)
+ - [basic](https://github.com/converged-computing/flux-cloud/tree/main/examples/minikube/basic)
+ - [volumes](https://github.com/converged-computing/flux-cloud/tree/main/examples/minikube/volumes)
+ - [resources](https://github.com/converged-computing/flux-cloud/tree/main/examples/minikube/resources)
+ - [osu-benchmarks](https://github.com/converged-computing/flux-cloud/tree/main/examples/minikube/osu-benchmarks)
+ - [persistent](https://github.com/converged-computing/flux-cloud/tree/main/examples/minikube/persistent)
+- [google](https://github.com/converged-computing/flux-cloud/tree/main/examples/google)
+ - [osu-benchmarks](https://github.com/converged-computing/flux-cloud/tree/main/examples/google/osu-benchmarks)
+
+All of the examples above (for MiniKube) are tested, and can be adapted for another cloud typically by adding
+the "machines" directive under "matrix" and then any custom variables. As a reminder, you can generate
+a blank template for any cloud (including variables) via:

```bash
-$ flux-cloud run
+$ flux-cloud experiment init --cloud minikube
+$ flux-cloud experiment init --cloud aws
+$ flux-cloud experiment init --cloud google
```
-for the purposes of the demo, and runs a lammps job on two tiny nodes!
+
+New examples for AWS will be coming soon - I didn't have credits to test when I wrote these.
diff --git a/docs/getting_started/experiments.md b/docs/getting_started/experiments.md
index 2736cfb..fa31abf 100644
--- a/docs/getting_started/experiments.md
+++ b/docs/getting_started/experiments.md
@@ -3,12 +3,21 @@ Welcome to the Flux Cloud experiments user guide! 
If you come here, we are assuming you want to run jobs with the Flux Operator on GKE, and that you have [installed](install.md) flux-cloud. Note this project is early in development so this could change or bugs could be introduced. -Let's get started with talking about experiments. Your experiments will typically be defined by two files: +Let's get started with talking about experiments. As of version 0.2.0, your experiments will be defined by one file: - - experiments.yaml: a yaml file that describes sizes, machines, and jobs to run - - minicluster-template.yaml: a completely or partially filled template custom resource definition. + - experiments.yaml: a yaml file that describes sizes, machines, miniclusters, and jobs to run We will walk through example experiment files here, along with a full set of fields you can use. +Note that to get an example experiments.yaml template for any cloud, you can always do: + +```bash +$ flux-cloud experiment init --cloud minikube +$ flux-cloud experiment init --cloud aws +$ flux-cloud experiment init --cloud google +``` + +The documentation here outlines the sections in details, however the above is the best +means to get an entire, holistic file. ## Experiment Definition @@ -29,6 +38,7 @@ matrix: size: [2, 4] machine: ["n1-standard-1", "n1-standard-2"] ``` + Note that the sizes at this level indicate *the size of the Kubernetes cluster*. We will expand on this idea later. This would run each size across each machine, for a total of 4 Kubernetes clusters created. The number of custom resource (CRD) definitions applied to each one would vary based on the number of jobs. @@ -167,78 +177,85 @@ jobs: osu_get_latency: command: './osu_get_latency' image: ghcr.io/awesome/science:latest - workdir: /path/to/science + working_dir: /path/to/science repeats: 3 ``` For repeats, we add another level to the output directory, and represent the result data as -subdirectories of the machine and size from 1..N. 
Note also that likely in the future we -can provide a default template and require all these variables -defined. For now we require you to provide the template. - - -## Custom Resource Definition - -> minicluster-template.yaml +subdirectories of the machine and size from 1..N. -The custom resource definition template "CRD" is currently suggested so you can customize exactly to your liking, -but it's not required. It is used by flux-cloud to populate your job metadata and then submit one or more jobs to your Kubernetes cluster. +#### Flux Options -### Use Your Own - -Here is an example that uses a shared working directory (so it's hard coded) and a variable -for the command: +How do job parameters map to Flux, in the case of using `flux-cloud submit`? Good question! Here is the mapping: ```yaml -apiVersion: flux-framework.org/v1alpha1 -kind: MiniCluster -metadata: - name: {{ minicluster.name }} - namespace: {{ minicluster.namespace }} -spec: - # Number of pods to create for MiniCluster - size: {{ minicluster.size }} - - # Disable verbose output - logging: - quiet: true - - # This is a list because a pod can support multiple containers - containers: - # The container URI to pull (currently needs to be public) - - image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 - - # You can set the working directory if your container WORKDIR is not correct. - workingDir: /home/flux/examples/reaxff/HNS - command: {{ job.command }} +jobs: + example-job: + command: './osu_get_latency' + flux_option_flags: "-ompi=openmpi@5" + working_dir: /opt/osu-benchmark/build.openmpi/libexec/osu-micro-benchmarks/mpi/one-sided # workdir + image: ghcr.io/rse-ops/osu-microbench:test + + # osu benchmarks requires exactly 2 processes + tasks: 2 # num_tasks + cores: 1 # cores_per_task + gpus: 0 # gpus_per_task + nodes: 1 # num_nodes ``` -### Use The Default - -To use the default, you want to make sure that you provide all variables that are required. 
-The following are required (and have defaults or are otherwise generated by flux cloud -so you could leave them out of your experiments.yaml): - -- minicluster.name -- minicluster.namespace -- minicluster.local_deploy (defaults to false) -- minicluster.verbose (default to false to run in test mode) - -It's recommended to set your listing of sizes for miniclusters: +#### Yaml Tricks -- minicluster.size +For your jobs, you likely will want to re-use parameters. There is a trick with YAML +to define a named section, and then re-use it. Here is an example running the OSU +benchmarks. -The following are specific to the job and required: +```yaml +# matrix of experiments to run - machine types and sizes are required +# This can obviously be expanded to more sizes or machines, +matrix: + size: [2] + machine: ["n1-standard-1", "n1-standard-2"] -- job.image -- job.command +# An example of shared container options! +x-container-options: &options + fluxOptionFlags: "-ompi=openmpi@5" + working_dir: /opt/osu-benchmark/build.openmpi/libexec/osu-micro-benchmarks/mpi/one-sided + image: ghcr.io/rse-ops/osu-microbench:app-latest + # This MUST be run for the certificate generator and workers/broker + pre_command: source /etc/profile.d/z10_spack_environment.sh -The following are specific to the job but not required: +# Flux Mini Cluster experiment attributes +minicluster: + name: osu-benchmarks + namespace: flux-operator -- job.workdir -- job.tasks (recommended for better control of flux, as this would default to 1) -- job.flux_option_flags (e.g., "-ompi=openmpi@5") -- job.cores (defaults to 1 if not set, likely not ideal for your experiment) -- job.limits (key value pairs) -- job.requests (key value pairs) -- job.pre_command: the job pre-command (usually multiple lines) but not required. 
+# Each job can have a command and working directory +jobs: + osu_get_latency: + command: './osu_get_latency' + <<: *options + osu_acc_latency: + command: './osu_acc_latency' + <<: *options + osu_fop_latency: + command: './osu_fop_latency' + <<: *options + osu_get_bw: + command: './osu_get_bw' + <<: *options + osu_put_bibw: + command: './osu_put_bibw' + <<: *options + osu_put_latency: + command: './osu_put_latency' + <<: *options + osu_cas_latency: + command: './osu_cas_latency' + <<: *options + osu_get_acc_latency: + command: './osu_get_acc_latency' + <<: *options + osu_put_bw: + command: './osu_put_bw' + <<: *options +``` diff --git a/docs/getting_started/google.md b/docs/getting_started/google.md deleted file mode 100644 index 9f7c96c..0000000 --- a/docs/getting_started/google.md +++ /dev/null @@ -1,74 +0,0 @@ -# Google Cloud - -> Running on Google Kubernetes Engine, GKE - -The main functionality that flux-cloud provides are easy wrappers (and templates) to running -the Flux Operator on GKE. The main steps of running experiments are: - - - **up** to bring up a cluster - - **apply** to apply one or more experiments defined by an experiments.yaml - - **down** to destroy a cluster - -Each of these commands can be run in isolation, and we provide a single command **run** to -automate the entire thing. We emphasize the term "wrapper" as we are using scripts on your -machine to do the work (e.g., kubectl and gcloud) and importantly, for every step we show -you the command, and if it fails, give you a chance to bail out. We do this so if you -want to remove the abstraction at any point and run the commands on your own, you can. 
- -## Pre-requisites - -You should first [install gcloud](https://cloud.google.com/sdk/docs/quickstarts) -and ensure you are logged in and have kubectl installed: - -```bash -$ gcloud auth login -``` - -Depending on your install, you can either install with gcloud: - -```bash -$ gcloud components install kubectl -``` -or just [on your own](https://kubernetes.io/docs/tasks/tools/). - -## Cloud - -Finally, ensure that google is either your default cloud (the `default_cloud` in your settings.yml) -or you specify it with `--cloud` when you do run. - - -## Custom Variables - -The following custom variables are supported in the "variables" section (key value pairs) -for Google in an `experiments.yaml` - -```yaml -variables: - # Customize zone just for this experiment - zone: us-central1-a -``` - - -## Run Experiments - -Each experiment is defined by the matrix and variables in an `experiment.yaml` that is used to -populate a `minicluster-template.yaml` that you can either provide, or use a template provided by the -library. One of the goals of the Flux Cloud Experiment runner is not just to run things, but to -provide this library for you to easily edit and use! Take a look at the [examples](https://github.com/converged-computing/flux-cloud/tree/main/examples) -directory for a few that we provide. We will walk through a generic one here to launch -an experiment on a Kubernetes cluster. Note that before doing this step you should -have installed flux-cloud, along with kubectl and gcloud, and set your defaults (e.g., project zone) -in your settings. - -```bash -$ flux-cloud run experiments.yaml -``` - -Note that since the experiments file defaults to that name, you can also just do: - -```bash -$ flux-cloud run -``` - -Given an experiments.yaml in the present working directory. Take a look at an `experients.yaml` in an example directory. -Note that machines and size are required for the matrix, and variables get piped into all experiments (in full). 
diff --git a/docs/getting_started/index.md b/docs/getting_started/index.md index 30879f8..9fc263c 100644 --- a/docs/getting_started/index.md +++ b/docs/getting_started/index.md @@ -10,8 +10,6 @@ install commands examples experiments +debugging settings -google -aws -minikube ``` diff --git a/docs/getting_started/minikube.md b/docs/getting_started/minikube.md deleted file mode 100644 index f851b51..0000000 --- a/docs/getting_started/minikube.md +++ /dev/null @@ -1,134 +0,0 @@ -# MiniKube - -> Running on a local MiniKube cluster - -Flux Cloud (as of version 0.1.0) can run on MiniKube! The main steps of running experiments with -different container bases are: - - - **up** to bring up a cluster - - **apply** to apply one or more CRDs from experiments defined by an experiments.yaml - - **down** to destroy a cluster - -or one or more commands with the same container base(s): - - - **up** to bring up a cluster - - **submit** to submit one or more experiments to the same set of pods defined by an experiments.yaml - - **down** to destroy a cluster - -Each of these commands can be run in isolation, and we provide a single command **run** to -automate the entire thing. We emphasize the term "wrapper" as we are using scripts on your -machine to do the work (e.g., minikube and kubectl) and importantly, for every step we show -you the command, and if it fails, give you a chance to bail out. We do this so if you -want to remove the abstraction at any point and run the commands on your own, you can. - -## Pre-requisites - -You should first [install minikube](https://minikube.sigs.k8s.io/docs/start/) -and kubectl. - -## Run Experiments - -Each experiment is defined by the matrix and variables in an `experiment.yaml` that is used to -populate a `minicluster-template.yaml` that you can either provide, or use a template provided by the -library. One of the goals of the Flux Cloud Experiment runner is not just to run things, but to -provide this library for you to easily edit and use! 
Take a look at the [examples](https://github.com/converged-computing/flux-cloud/tree/main/examples) -directory for a few that we provide. We will walk through a generic one here to launch -an experiment on a MiniKube Kubernetes cluster. Note that before doing this step you should -have installed flux-cloud, along with kubectl and minikube. Note that if it's not the default, -you'll need to specify using MiniKube - -### Apply / Run - -> Ideal if you need to run multiple jobs on different containers - -```bash -$ flux-cloud run --cloud minikube experiments.yaml -``` - -Or set to the default: - -```bash -$ flux-cloud config set default_cloud:minikube -``` - -Given MiniKube is the default, since the experiments file defaults to that name, you can also just do: - -```bash -$ flux-cloud run -``` - -Given an experiments.yaml in the present working directory. Take a look at an `experients.yaml` in an example directory. -Note that only size is required for the matrix for MiniKube (there is currently no concept of a machine, -although there could be), and variables get piped into all experiments (in full). Under variables, -both "commands" and "ids" are required, and must be equal in length (each command is assigned to one id -for output). To just run the first entry in the matrix (test mode) do: - -```bash -$ flux-cloud run experiments.yaml --test -``` - -Note that you can also use the other commands in place of a single run, notably "up" "apply" and "down." -By default, results will be written to a temporary output directory, but you can customize this with `--outdir`. -Finally, since MiniKube often has trouble pulling images, we recommend you include the container image as a variable -in the experiment.yaml so it can be pulled before the experiment is run. 
E.g., this experiment: - -```yaml -matrix: - size: [4] - -# Flux Mini Cluster experiment attributes -minicluster: - name: lammps - namespace: flux-operator - size: [2, 4] - -# Each job can have a command and working directory -jobs: - lmp: - command: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite - repeats: 2 - image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 -``` - -And this config file: - -```yaml -apiVersion: flux-framework.org/v1alpha1 -kind: MiniCluster -metadata: - name: {{ minicluster.name }} - namespace: {{ minicluster.namespace }} -spec: - # Number of pods to create for MiniCluster - size: {{ minicluster.size }} - - # Disable verbose output - logging: - quiet: true - - # This is a list because a pod can support multiple containers - containers: - # The container URI to pull (currently needs to be public) - - image: {{ job.image }} - - # You can set the working directory if your container WORKDIR is not correct. - workingDir: /home/flux/examples/reaxff/HNS - command: {{ job.command }} -``` - -### Submit - -> Ideal for one or more commands across the same container(s) and MiniCluster size. - -```bash -$ flux-cloud up --cloud minikube -$ flux-cloud submit --cloud minikube -$ flux-cloud down --cloud minikube -``` - -The submit will always check if the MiniCluster is already created, and if not, create it -to submit jobs. For submit (and the equivalent to bring it up and down with batch) -your commands aren't provided in the CRD, -but rather to the Flux Restful API. Submit / batch will also generate one CRD -per MiniCluster size, but use the same MiniCluster across jobs. This is different -from apply, which generates one CRD per job to run. diff --git a/docs/index.rst b/docs/index.rst index efdb1a1..d9804fb 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -52,15 +52,15 @@ to unite the worlds and technologies typical of cloud computing and high performance computing. To get started, check out the links below! 
-Would you like to request a feature or contribute?
-[Open an issue](https://github.com/flux-framework/flux-cloud/issues).
+Would you like to request a feature or contribute? `Open an issue <https://github.com/flux-framework/flux-cloud/issues>`_.

.. toctree::
 :caption: Getting Started
- :maxdepth: 1
+ :maxdepth: 2

 getting_started/index.md
+ tutorials/index.md

.. toctree::
 :caption: About
diff --git a/docs/getting_started/aws.md b/docs/tutorials/aws.md
similarity index 54%
rename from docs/getting_started/aws.md
rename to docs/tutorials/aws.md
index 2ef4a6c..be1a0b3 100644
--- a/docs/getting_started/aws.md
+++ b/docs/tutorials/aws.md
@@ -1,19 +1,14 @@
# AWS

> Running on Amazon Elastic Kubernetes Service EKS

+Flux Cloud can run on Amazon EKS! There are two primary use cases for using flux-cloud:
-The flux-cloud software provides are easy wrappers (and templates) to running
-the Flux Operator on Amazon. The main steps of running experiments are:
+ - **apply** is good for many larger experiments that require different container bases and / or take a longer time to run.
+ - **submit** is good for smaller experiments that might use the same container bases and / or take a shorter time to run.
- - **up** to bring up a cluster
- - **apply** to apply one or more experiments defined by an experiments.yaml
- - **down** to destroy a cluster
-
-Each of these commands can be run in isolation, and we provide a single command **run** to
-automate the entire thing. We emphasize the term "wrapper" as we are using scripts on your
-machine to do the work (e.g., kubectl and gcloud) and importantly, for every step we show
-you the command, and if it fails, give you a chance to bail out. We do this so if you
-want to remove the abstraction at any point and run the commands on your own, you can.
+For the latter (submit) we will bring up the minimum number of MiniClusters required (unique based on container image size)
+and launch all jobs across them, using Flux as a scheduler. 
As of version 0.2.0 both commands use the fluxoperator Python
+SDK, so we only use bash scripts to bring up and down cloud-specific clusters.

## Pre-requisites

@@ -45,6 +40,52 @@ This is used so you can ssh (connect) to your workers!

Finally, ensure that aws is either your default cloud (the `default_cloud` in your settings.yml)
or you specify it with `--cloud` when you do run.

+## Run Experiments
+
+**IMPORTANT** for any experiment when you choose an instance type, you absolutely
+need to choose a size that has [IsTrunkingCompatible](https://github.com/aws/amazon-vpc-resource-controller-k8s/blob/master/pkg/aws/vpc/limits.go)
+true. E.g., `m5.large` has it set to true so it would work.

Each experiment is defined by the matrix and variables in an `experiment.yaml`. It's recommended you
+start with a template populated for aws:
+
+```bash
+$ flux-cloud experiment init --cloud aws
+```
+
+And see the [custom variables](#custom-variables) defined below to learn more about them,
+or the [examples](https://github.com/converged-computing/flux-cloud/tree/main/examples)
+directory for a few examples that we provide. We will walk through a generic one here to launch
+an experiment on a Kubernetes cluster. Note that before doing this step you should
+have installed flux-cloud, along with eksctl, and set your defaults (e.g., project zone)
+in your settings. 
+ +Given an experiments.yaml in the present working directory, you can do an apply, +meaning creating a separate MiniCluster per job: + +```bash +# Up / apply / down +$ flux-cloud run --cloud aws + +# Manual up / apply / down (recommended) +$ flux-cloud up --cloud aws +$ flux-cloud apply --cloud aws +$ flux-cloud down --cloud aws +``` + +Or submit, creating shared MiniClusters to submit multiple jobs to: + +```bash +# Up / submit / down +$ flux-cloud batch --cloud aws + +# Manual up / submit / down (recommended) +$ flux-cloud up --cloud aws +$ flux-cloud submit --cloud aws +$ flux-cloud down --cloud aws +``` + +Note that machines and size are required for the matrix. + + ## Custom Variables The following custom variables are supported in the "variables" section (key value pairs) @@ -74,39 +115,3 @@ variables: Note that we currently take a simple approach for boolean values - if it's present (e.g., the examples) above) it will be rendered as true. Don't put False in there, but rather just delete the key. - -## Run Experiments - -**IMPORTANT** for any experiment when you choose an instance type, you absolutely -need to choose a size that has [IsTrunkingCompatible](https://github.com/aws/amazon-vpc-resource-controller-k8s/blob/master/pkg/aws/vpc/limits.go) -true. E.g., `m5.large` has it set to true so it would work. - -Each experiment is defined by the matrix and variables in an `experiment.yaml` that is used to -populate a `minicluster-template.yaml` that you can either provide, or use a template provided by the -library. One of the goals of the Flux Cloud Experiment runner is not just to run things, but to -provide this library for you to easily edit and use! Take a look at the [examples](https://github.com/converged-computing/flux-cloud/tree/main/examples) -directory for a few that we provide. We will walk through a generic one here to launch -an experiment on a Kubernetes cluster. 
Note that before doing this step you should -have installed flux-cloud, along with kubectl and gcloud, and set your defaults (e.g., project zone) -in your settings. - -```bash -$ flux-cloud run experiments.yaml -``` - -Note that since the experiments file defaults to that name, you can also just do: - -```bash -$ flux-cloud run -``` - -Or for more control and/or verbosity: - -```bash -$ flux-cloud --debug up --cloud aws -$ flux-cloud --debug apply --cloud aws -$ flux-cloud --debug down --cloud aws -``` - -Given an experiments.yaml in the present working directory. Take a look at an `experients.yaml` in an example directory. -Note that machines and size are required for the matrix, and variables get piped into all experiments (in full). diff --git a/docs/tutorials/google.md b/docs/tutorials/google.md new file mode 100644 index 0000000..cf4616a --- /dev/null +++ b/docs/tutorials/google.md @@ -0,0 +1,91 @@ +# Google Cloud + +> Running on Google Kubernetes Engine, GKE + +The main functionality that flux-cloud provides are easy wrappers (and templates) to running +the Flux Operator on GKE. The main steps of running experiments are: + + - **up** to bring up a cluster + - **apply/submit** to apply or submit one or more experiments defined by an experiments.yaml + - **down** to destroy a cluster + +Each of these commands can be run in isolation, and we provide single commands **run/batch** to +automate the entire thing. For Google Cloud, you can see a small collection of [examples here](https://github.com/converged-computing/flux-cloud/tree/main/examples/google). + +## Pre-requisites + +You should first [install gcloud](https://cloud.google.com/sdk/docs/quickstarts) +and ensure you are logged in and have kubectl installed: + +```bash +$ gcloud auth login +``` + +Depending on your install, you can either install with gcloud: + +```bash +$ gcloud components install kubectl +``` +or just [on your own](https://kubernetes.io/docs/tasks/tools/). 
+
+## Cloud
+
+Finally, ensure that google is either your default cloud (the `default_cloud` in your settings.yml)
+or you specify it with `--cloud` when you do run.
+
+## Custom Variables
+
+The following custom variables are supported in the "variables" section (key value pairs)
+for Google in an `experiments.yaml`
+
+```yaml
+variables:
+ # Customize zone just for this experiment
+ zone: us-central1-a
+```
+
+
+## Run Experiments
+
+You can create an empty experiment template as follows:
+
+```bash
+$ flux-cloud experiment init --cloud google
+```
+
+Each experiment is defined by the matrix and variables in an `experiment.yaml`.
+One of the goals of the Flux Cloud Experiment runner is not just to run things, but to
+provide this library for you to easily edit and use! Take a look at the [examples](https://github.com/converged-computing/flux-cloud/tree/main/examples)
+directory for a few that we provide. We will walk through a generic one here to launch
+an experiment on a Kubernetes cluster. Note that before doing this step you should
+have installed flux-cloud, along with gcloud, and set your defaults (e.g., project zone)
+in your settings.
+
+Given an experiments.yaml in the present working directory, you can do an apply,
+meaning creating a separate MiniCluster per job:
+
+```bash
+# Up / apply / down
+$ flux-cloud run --cloud google
+
+# Manual up / apply / down (recommended)
+$ flux-cloud --debug up --cloud google
+$ flux-cloud --debug apply --cloud google
+$ flux-cloud --debug down --cloud google
+```
+
+For any of the commands here, add `--debug` after `flux-cloud` to see more verbosity.
+Or submit, creating shared MiniClusters to submit multiple jobs to:
+
+```bash
+# Up / submit / down
+$ flux-cloud batch --cloud google
+
+# Manual up / submit / down (recommended)
+$ flux-cloud --debug up --cloud google
+$ flux-cloud --debug submit --cloud google
+$ flux-cloud --debug down --cloud google
+```
+
+Note that machines and size are required for the matrix. 
See our [debugging guide](../getting_started/debugging.md) +for the Flux Operator for interacting with Flux Operator containers or debugging. diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md new file mode 100644 index 0000000..b545d8f --- /dev/null +++ b/docs/tutorials/index.md @@ -0,0 +1,11 @@ +# Tutorials + +These tutorials will walk through common use cases for Flux Cloud! If you have +any questions or issues, please [let us know](https://github.com/converged-computing/flux-cloud/issues) + +```{toctree} +:maxdepth: 3 +minikube +google +aws +``` diff --git a/docs/tutorials/minikube.md b/docs/tutorials/minikube.md new file mode 100644 index 0000000..4e68e4e --- /dev/null +++ b/docs/tutorials/minikube.md @@ -0,0 +1,313 @@ +# MiniKube + +> Running on a local MiniKube cluster + +Flux Cloud (as of version 0.1.0) can run on MiniKube! There are two primary use cases for using flux-cloud: + + - **apply** is good for many larger experiments that require different container bases and / or take a longer time to run. + - **submit** is good for smaller experiments that might use the same container bases and / or take a shorter time to run. + +For the latter (submit) we will bring up the minimum number of MiniClusters required (unique based on container image and size) +and launch all jobs across them, using Flux as a scheduler. As of version 0.2.0 both commands use the fluxoperator Python +SDK, so we only use bash scripts to bring up and down cloud-specific clusters. + + +## Pre-requisites + +You should first [install minikube](https://minikube.sigs.k8s.io/docs/start/) +and kubectl. + +## Run Experiments + +Let's start with a simple `experiments.yaml` file, where we have defined a number of different +experiments to run on MiniKube.
`flux-cloud submit` relies entirely on this experiment file, +and programmatically generates the MiniCluster [custom resource definitions](https://flux-framework.org/flux-operator/getting_started/custom-resource-definition.html#workingdir) +for you, so you don't need to provide any kind of template. + +
+ +How does it work? + +A YAML file (such as the experiments.yaml) can be serialized to JSON, so each section under "jobs" is +also json, or actually (in Python) a dictionary of values. Since the values are passed to the +[Flux Operator Python SDK](https://github.com/flux-framework/flux-operator/tree/main/sdk/python/v1alpha1), +we can map them easily according to the following convention. Let's say we have a job in the experiments listing: + +```yaml +jobs: + # This is the start of the named job + reaxc-hns: + + # These are attributes for the MiniCluster (minus repeats) + command: 'lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + working_dir: /home/flux/examples/reaxff/HNS +``` + +The content under the job name "reaxc-hns" would be mapped to the MiniCluster container as follows: + +```python +from fluxoperator.models import MiniClusterContainer + +container = MiniClusterContainer( + image="ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", + working_dir="/home/flux/examples/reaxff/HNS", + command="lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite", + run_flux=True, +) +``` + +Note that in the above, since Go is in camel case and the Python SDK turns it into snake case, +`workingDir` is changed to `working_dir`. + +
+ + +Let's start with this set of experiments. Note that we've provided the same container +for all of them, meaning that we will only be creating one MiniCluster with that container. +If you provide jobs with separate containers, they will be brought up as separate clusters +to run (per each unique container, with all jobs matched to it). + +```yaml +# This is intended for MiniKube, so no machine needed. +# We will create a MiniKube cluster of size 2 +matrix: + size: [2] + +# Flux Mini Cluster experiment attributes +minicluster: + name: submit-jobs + namespace: flux-operator + # Each of these sizes will be brought up and have commands run across it + size: [2] + +# Each of command and image are required to do a submit! +jobs: + reaxc-hns: + command: 'lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + working_dir: /home/flux/examples/reaxff/HNS + sleep: + command: 'sleep 5' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + working_dir: /home/flux/examples/reaxff/HNS + hello-world: + command: 'echo hello world' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + working_dir: /home/flux/examples/reaxff/HNS +``` + +Each experiment is defined by the matrix and variables in an `experiment.yaml`, as shown above. +Note that the easiest way to get started is to use an existing example, or run: + +```bash +$ flux-cloud experiment init --cloud minikube +``` + +In the example above, we are targeting minikube. + + +### Apply / Run + +> Ideal if you need to run multiple jobs on different containers + +This apply/run workflow will create a new MiniCluster each time (pods up and down) +and not use Flux as a scheduler proper. 
A workflow might look like: + +```bash +$ flux-cloud up --cloud minikube +$ flux-cloud apply --cloud minikube +$ flux-cloud down --cloud minikube +``` +Or achieve all three with: + +```bash +$ flux-cloud run --cloud minikube +``` + +Let's run this with our `experiments.yaml` above in the present working directory, +and after having already run `up`: + +```bash +# Also print output to the terminal (so you can watch!) +$ flux-cloud --debug apply --cloud minikube + +# Only save output to output files +$ flux-cloud apply --cloud minikube +``` + +At the end of the run, you'll have an organized output directory with all of your +output logs, along with saved metadata about the minicluster, pods, and nodes. + +```bash + +``` + +### Submit + +> Ideal for one or more commands across the one or more containers and MiniCluster sizes + +The idea behind a submit is that we are going to create the minimal number of MiniClusters you +need (across the set of unique sizes and images) and then submit all jobs to Flux within +the MiniCluster. The submit mode is actually using Flux as a scheduler and not just a +"one job" running machine. A basic submit workflow using the config above might look like this: + +```bash +$ flux-cloud up --cloud minikube +$ flux-cloud submit --cloud minikube +$ flux-cloud down --cloud minikube +``` + +Instead of running one job at a time and waiting for output (e.g., apply) we instead +submit all the jobs, and then poll every 30 seconds to get job statuses. + +
+ +View full output of submit command + +```bash +$ flux-cloud --debug submit --cloud minikube +``` +```console +No experiment ID provided, assuming first experiment k8s-size-4-n1-standard-1. +Job experiments file generated 1 MiniCluster(s). + +šŸŒ€ Bringing up MiniCluster of size 2 with image ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 +All pods are in states "Running" or "Completed" +šŸ’¾ Creating output directory /home/vanessa/Desktop/Code/flux/flux-cloud/examples/up-submit-down/data/minikube +MiniCluster created with credentials: + FLUX_USER=fluxuser + FLUX_TOKEN=d467215d-d07d-4c32-b2b9-41643cda3d7d +All pods are in states "Running" or "Completed" +Found broker pod lammps-job-0-ng8pz + +Waiting for http://lammps-job-0-ng8pz.pod.flux-operator.kubernetes:5000 to be ready +šŸŖ…ļø RestFUL API server is ready! +. +Port forward opened to http://lammps-job-0-ng8pz.pod.flux-operator.kubernetes:5000 +Submitting reaxc-hns-1-minicluster-size-2: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite +Submitting reaxc-hns-2-minicluster-size-2: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite +Submitting reaxc-hns-3-minicluster-size-2: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite +Submitting reaxc-hns-4-minicluster-size-2: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite +Submitting reaxc-hns-5-minicluster-size-2: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite +Submitting sleep-1-minicluster-size-2: sleep 5 +Submitting sleep-2-minicluster-size-2: sleep 5 +Submitting sleep-3-minicluster-size-2: sleep 5 +Submitting sleep-4-minicluster-size-2: sleep 5 +Submitting sleep-5-minicluster-size-2: sleep 5 +Submitting hello-world-1-minicluster-size-2: echo hello world +Submitting hello-world-2-minicluster-size-2: echo hello world +Submitting hello-world-3-minicluster-size-2: echo hello world +Submitting hello-world-4-minicluster-size-2: echo hello world +Submitting hello-world-5-minicluster-size-2: echo hello world +Submit 15 jobs! Waiting for completion... +15 are active. 
+ lmp is in state RUN + lmp is in state RUN + lmp is in state SCHED + lmp is in state SCHED + lmp is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED +15 are active. + lmp is finished COMPLETED in 28.64 seconds. + lmp is finished COMPLETED in 29.1 seconds. + lmp is in state RUN + lmp is in state RUN + lmp is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED +13 are active. + lmp is in state RUN + lmp is in state RUN + lmp is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED +13 are active. + lmp is finished COMPLETED in 36.56 seconds. + lmp is finished COMPLETED in 35.89 seconds. + lmp is in state RUN + sleep is finished COMPLETED in 5.02 seconds. + sleep is finished COMPLETED in 5.02 seconds. + sleep is finished COMPLETED in 5.02 seconds. + sleep is in state RUN + sleep is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED +8 are active. + lmp is finished COMPLETED in 24.6 seconds. + sleep is finished COMPLETED in 5.02 seconds. + sleep is finished COMPLETED in 5.02 seconds. + echo is finished COMPLETED in 0.01 seconds. + echo is finished COMPLETED in 0.02 seconds. + echo is finished COMPLETED in 0.02 seconds. + echo is finished COMPLETED in 0.01 seconds. + echo is finished COMPLETED in 0.01 seconds. +All jobs are complete! 
Cleaning up MiniCluster... +All pods are terminated. +``` + +
+ +After submit, you will still have an organized output directory with job output files +and metadata. + +```bash +$ tree -a data/minikube/ +data/minikube/ +ā””ā”€ā”€ k8s-size-4-n1-standard-1 + ā”œā”€ā”€ hello-world-1-minicluster-size-2 + ā”‚ ā””ā”€ā”€ log.out + ā”œā”€ā”€ hello-world-2-minicluster-size-2 + ā”‚ ā””ā”€ā”€ log.out + ā”œā”€ā”€ hello-world-3-minicluster-size-2 + ā”‚ ā””ā”€ā”€ log.out + ā”œā”€ā”€ hello-world-4-minicluster-size-2 + ā”‚ ā””ā”€ā”€ log.out + ā”œā”€ā”€ hello-world-5-minicluster-size-2 + ā”‚ ā””ā”€ā”€ log.out + ā”œā”€ā”€ meta.json + ā”œā”€ā”€ reaxc-hns-1-minicluster-size-2 + ā”‚ ā””ā”€ā”€ log.out + ā”œā”€ā”€ reaxc-hns-2-minicluster-size-2 + ā”‚ ā””ā”€ā”€ log.out + ā”œā”€ā”€ reaxc-hns-3-minicluster-size-2 + ā”‚ ā””ā”€ā”€ log.out + ā”œā”€ā”€ reaxc-hns-4-minicluster-size-2 + ā”‚ ā””ā”€ā”€ log.out + ā”œā”€ā”€ reaxc-hns-5-minicluster-size-2 + ā”‚ ā””ā”€ā”€ log.out + ā””ā”€ā”€ .scripts + ā””ā”€ā”€ minicluster-size-2-lammps-job-ghcr.io-rse-ops-lammps-flux-sched-focal-v0.24.0.json +``` diff --git a/examples/aws-lammps/README.md b/examples/aws-lammps/README.md deleted file mode 100644 index 99b7d2c..0000000 --- a/examples/aws-lammps/README.md +++ /dev/null @@ -1,71 +0,0 @@ -# Lammps on Amazon Cloud - -In this set of experiments we will run the Flux Operator on Amazon Cloud at size N=2 -(the benchmarks require this) and multiple machine types. - -## Pre-requisites - -You should first [install eksctrl](https://github.com/weaveworks/eksctl) and make sure you have access to an AWS cloud (e.g., -with credentials or similar in your environment). E.g.,: - -```bash -export AWS_ACCESS_KEY_ID=xxxxxxxxxxxxxxxxxxx -export AWS_SECRET_ACCESS_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx -export AWS_SESSION_TOKEN=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx -``` - -The last session token may not be required depending on your setup. -We assume you also have [kubectl](https://kubernetes.io/docs/tasks/tools/). - -### Setup SSH - -You'll need an ssh key for EKS. 
Here is how to generate it: - -```bash -ssh-keygen -# Ensure you enter the path to ~/.ssh/id_eks -``` - -This is used so you can ssh (connect) to your workers! - -### Cloud - -we will be using [Flux Cloud](https://github.com/converged-computing/flux-cloud) -to run the Operator on Google Cloud Kubernetes engine. - -```bash -$ pip install flux-cloud -``` - -Note that these experiments were run with version 0.1.0. -Ensure that aws is either your default cloud (the `default_cloud` in your settings.yml) -or you specify it with `--cloud` when you do run. - - -## Run Experiments - -Each experiment here is defined by the matrix and variables in [experiments.yaml](experiment.yaml) that is used to -populate a [minicluster-template.yaml](minicluster-template.yaml) and launch a Kubernetes cluster. -You can read the documentation for flux-cloud to understand the variables available. -This tutorial assumes you have flux-cloud installed and configured. See all unique Kubernetes clusters -we will run the jobs on: - -```bash -$ flux-cloud list -``` - -Then you can either run all at once: - -```bash -$ flux-cloud run --force-cluster -``` - -Or (for testing) to bring up just the first cluster and then manually apply: - -```bash -$ flux-cloud --debug up --cloud aws -$ flux-cloud --debug apply --cloud aws -$ flux-cloud --debug down --cloud aws -``` - -By default, results will be written to a [./data](data) directory, but you can customize this with `--outdir`. 
diff --git a/examples/aws-lammps/experiments.yaml b/examples/aws-lammps/experiments.yaml deleted file mode 100644 index e694254..0000000 --- a/examples/aws-lammps/experiments.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# matrix of experiments to run - machine types and sizes are required - -# These are mini runs intended for testing -matrix: - size: [8] - machine: ["m5.large"] - -# Flux Mini Cluster experiment attributes -minicluster: - name: lammps - namespace: flux-operator - size: [2, 4, 6, 8] - -# Each job can have a command and working directory -jobs: - lmp: - command: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite - repeats: 3 diff --git a/examples/aws-lammps/minicluster-template.yaml b/examples/aws-lammps/minicluster-template.yaml deleted file mode 100644 index 7591645..0000000 --- a/examples/aws-lammps/minicluster-template.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: flux-framework.org/v1alpha1 -kind: MiniCluster - -metadata: - name: {{ minicluster.name }} - namespace: {{ minicluster.namespace }} -spec: - # Number of pods to create for MiniCluster - size: {{ minicluster.size }} - - # This is a list because a pod can support multiple containers - containers: - - image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 - workingDir: /home/flux/examples/reaxff/HNS - command: {{ job.command }} diff --git a/examples/google/osu-benchmarks/README.md b/examples/google/osu-benchmarks/README.md new file mode 100644 index 0000000..0bd718c --- /dev/null +++ b/examples/google/osu-benchmarks/README.md @@ -0,0 +1,5 @@ +# OSU Benchmarks + +This example demonstrates how to setup an [experiments.yaml](experiments.yaml) +to run on Google Cloud. See the [Google Cloud tutorials](https://converged-computing.github.io/flux-cloud/tutorials/google.html) +for how to run this tutorial. 
diff --git a/examples/google/osu-benchmarks/experiments.yaml b/examples/google/osu-benchmarks/experiments.yaml new file mode 100644 index 0000000..14e6806 --- /dev/null +++ b/examples/google/osu-benchmarks/experiments.yaml @@ -0,0 +1,59 @@ +# matrix of experiments to run - machine types and sizes are required +# This can obviously be expanded to more sizes or machines, +matrix: + size: [2] + machine: ["n1-standard-1", "n1-standard-2"] + +# An example of shared container options +x-container-options: &options + flux_option_flags: "-ompi=openmpi@5" + working_dir: /opt/osu-benchmark/build.openmpi/libexec/osu-micro-benchmarks/mpi/one-sided + image: ghcr.io/rse-ops/osu-microbench:test + + # This will get passed during a flux submit + tasks: 2 + +# res = cli.submit(command=job["command"], num_tasks=2, cores_per_task=1, workdir=kwargs['workdir']) + +# Flux Mini Cluster experiment attributes +minicluster: + name: osu-benchmarks + namespace: flux-operator + + # provide credentials if you want to re-use a minicluster + flux_restful: + username: fluxuser + token: "123456" + + # osu benchmarks requires exactly 2 processes + tasks: 2 + +# Each job can have a command and working directory +jobs: + osu_get_latency: + command: './osu_get_latency' + <<: *options + osu_acc_latency: + command: './osu_acc_latency' + <<: *options + osu_fop_latency: + command: './osu_fop_latency' + <<: *options + osu_get_bw: + command: './osu_get_bw' + <<: *options + osu_put_bibw: + command: './osu_put_bibw' + <<: *options + osu_put_latency: + command: './osu_put_latency' + <<: *options + osu_cas_latency: + command: './osu_cas_latency' + <<: *options + osu_get_acc_latency: + command: './osu_get_acc_latency' + <<: *options + osu_put_bw: + command: './osu_put_bw' + <<: *options diff --git a/examples/minikube/basic/README.md b/examples/minikube/basic/README.md new file mode 100644 index 0000000..0a2a6aa --- /dev/null +++ b/examples/minikube/basic/README.md @@ -0,0 +1,3 @@ +# Up, Submit, Down + +See the 
[minikube tutorials](https://converged-computing.github.io/flux-cloud/tutorials/minikube.html) for how to run this tutorial. diff --git a/examples/up-submit-down/experiments.yaml b/examples/minikube/basic/experiments.yaml similarity index 64% rename from examples/up-submit-down/experiments.yaml rename to examples/minikube/basic/experiments.yaml index 880e652..ec0ce6a 100644 --- a/examples/up-submit-down/experiments.yaml +++ b/examples/minikube/basic/experiments.yaml @@ -1,7 +1,6 @@ # This is intended for MiniKube, so no machine needed matrix: size: [4] - machine: [n1-standard-1] # Flux Mini Cluster experiment attributes minicluster: @@ -10,22 +9,19 @@ minicluster: # Each of these sizes will be brought up and have commands run across it size: [2] -# Since we are creating a minicluster here to submit commands across -# on the same container, the container is required here. If you specify -# a size here, the image must be the same across sizes jobs: reaxc-hns: command: 'lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite' image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 repeats: 5 - workdir: /home/flux/examples/reaxff/HNS + working_dir: /home/flux/examples/reaxff/HNS sleep: command: 'sleep 5' image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 repeats: 5 - workdir: /home/flux/examples/reaxff/HNS + working_dir: /home/flux/examples/reaxff/HNS hello-world: command: 'echo hello world' image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 repeats: 5 - workdir: /home/flux/examples/reaxff/HNS + working_dir: /home/flux/examples/reaxff/HNS diff --git a/examples/minikube/logging/README.md b/examples/minikube/logging/README.md new file mode 100644 index 0000000..779e469 --- /dev/null +++ b/examples/minikube/logging/README.md @@ -0,0 +1,3 @@ +# Logging + +This experiments.yaml shows how to customize the MiniCluster logging. 
diff --git a/examples/minikube/logging/experiments.yaml b/examples/minikube/logging/experiments.yaml new file mode 100644 index 0000000..d457541 --- /dev/null +++ b/examples/minikube/logging/experiments.yaml @@ -0,0 +1,24 @@ +# This is intended for MiniKube, so no machine needed +matrix: + size: [4] + +# Flux Mini Cluster experiment attributes +minicluster: + name: lammps-job + namespace: flux-operator + # Each of these sizes will be brought up and have commands run across it + size: [2] + + # How to set logging attributes + logging: + debug: False # defaults to False + quiet: True # defaults to False + strict: False # defaults to True + timed: False # defaults to False, requires time in containers + +jobs: + reaxc-hns: + command: 'lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 2 + working_dir: /home/flux/examples/reaxff/HNS diff --git a/examples/minikube/osu-benchmarks/README.md b/examples/minikube/osu-benchmarks/README.md new file mode 100644 index 0000000..8b8ec6b --- /dev/null +++ b/examples/minikube/osu-benchmarks/README.md @@ -0,0 +1,5 @@ +# OSU Benchmarks + +This example demonstrates how to setup an [experiments.yaml](experiments.yaml) +to run on MiniKube. See the [MiniKube tutorials](https://converged-computing.github.io/flux-cloud/tutorials/minikube.html) +for how to run this tutorial. 
diff --git a/examples/minikube/osu-benchmarks/experiments.yaml b/examples/minikube/osu-benchmarks/experiments.yaml new file mode 100644 index 0000000..d278aa5 --- /dev/null +++ b/examples/minikube/osu-benchmarks/experiments.yaml @@ -0,0 +1,65 @@ +# matrix of experiments to run - machine types and sizes are required + +# This can obviously be expanded to more sizes or machines, +matrix: + size: [2] + #machine: ["n1-standard-1", "n1-standard-2"] + +# TODO +# when get this working, save to experiments-full.yaml, move to minkube, have shortened version run for test +# then test this on google cloud +# flux operator / python api still need to be released - maybe only allow pam for auth? + +# An example of shared container options +x-container-options: &options + flux_option_flags: "-ompi=openmpi@5" + working_dir: /opt/osu-benchmark/build.openmpi/libexec/osu-micro-benchmarks/mpi/one-sided + image: ghcr.io/rse-ops/osu-microbench:test + + # This will get passed during a flux submit + tasks: 2 + +# res = cli.submit(command=job["command"], num_tasks=2, cores_per_task=1, workdir=kwargs['workdir']) + +# Flux Mini Cluster experiment attributes +minicluster: + name: osu-benchmarks + namespace: flux-operator + + # provide credentials if you want to re-use a minicluster + flux_restful: + username: fluxuser + token: "123456" + + # osu benchmarks requires exactly 2 processes + tasks: 2 + +# Each job can have a command and working directory +jobs: + osu_get_latency: + command: './osu_get_latency' + <<: *options + osu_acc_latency: + command: './osu_acc_latency' + <<: *options + osu_fop_latency: + command: './osu_fop_latency' + <<: *options + osu_get_bw: + command: './osu_get_bw' + <<: *options + osu_put_bibw: + command: './osu_put_bibw' + <<: *options + osu_put_latency: + command: './osu_put_latency' + <<: *options + osu_cas_latency: + command: './osu_cas_latency' + <<: *options + osu_get_acc_latency: + command: './osu_get_acc_latency' + <<: *options + osu_put_bw: + command: 
'./osu_put_bw' + <<: *options diff --git a/examples/minikube/persistent/README.md b/examples/minikube/persistent/README.md new file mode 100644 index 0000000..5636c77 --- /dev/null +++ b/examples/minikube/persistent/README.md @@ -0,0 +1,49 @@ +# Persistent + +This is a trick to get a MiniCluster up and running (and have it stay running)! + + - For **submit** we run a job that will never complete + - For **apply** we do the same! + +I typically use this case to debug one or the other. E.g., (given MiniKube is running with the operator installed): + +```bash +$ flux-cloud --debug submit --cloud minikube +``` + +Then get the pod + +```bash +$ kubectl get -n flux-operator pods +NAME READY STATUS RESTARTS AGE +sleep-job-0-pm28c 1/1 Running 0 73s +sleep-job-1-h824z 1/1 Running 0 73s +sleep-job-cert-generator 0/1 Completed 0 73s +``` + +And ssh in! + +```bash +$ kubectl exec -it -n flux-operator sleep-job-0-pm28c -- bash +``` + +For either submit or apply, we can connect to the instance with the broker URI + +```bash +$ export FLUX_URI=local:///run/flux/local +$ sudo -u flux flux proxy $FLUX_URI +``` +and then see our infinite flux job! + +```bash +$ flux jobs -a + JOBID USER NAME ST NTASKS NNODES TIME INFO + ʒCvGx8CX flux sleep R 1 1 2.432m sleep-job-1 +``` + +The main difference is that submit is going to periodically ping the restful API to check +on the job. So you are probably better off with apply in that it's almost the same +thing (a flux start -> flux submit instead of starting the flux broker) without +the poll. + +See the [minikube tutorials](https://converged-computing.github.io/flux-cloud/tutorials/minikube.html) for how to run this tutorial. 
diff --git a/examples/minikube/persistent/experiments.yaml b/examples/minikube/persistent/experiments.yaml new file mode 100644 index 0000000..aa6b649 --- /dev/null +++ b/examples/minikube/persistent/experiments.yaml @@ -0,0 +1,16 @@ +# This is intended for MiniKube, so no machine needed +matrix: + size: [4] + +# Flux Mini Cluster experiment attributes +minicluster: + name: sleep-job + namespace: flux-operator + # Each of these sizes will be brought up and have commands run across it + size: [2] + +# This will bring up a cluster to stay online (until you kill it) as the job will never end +jobs: + sleep: + command: 'sleep infinity' + image: ghcr.io/flux-framework/flux-restful-api:latest diff --git a/examples/minikube/resources/README.md b/examples/minikube/resources/README.md new file mode 100644 index 0000000..54eab30 --- /dev/null +++ b/examples/minikube/resources/README.md @@ -0,0 +1,3 @@ +# Resources + +This experiments.yaml shows how to customize MiniCluster resources. diff --git a/examples/minikube/resources/experiments.yaml b/examples/minikube/resources/experiments.yaml new file mode 100644 index 0000000..846ef81 --- /dev/null +++ b/examples/minikube/resources/experiments.yaml @@ -0,0 +1,25 @@ +# This is intended for MiniKube, so no machine needed +matrix: + size: [4] + +# Flux Mini Cluster experiment attributes +minicluster: + name: lammps-job + namespace: flux-operator + # Each of these sizes will be brought up and have commands run across it + size: [2] + +jobs: + reaxc-hns: + command: 'lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 2 + working_dir: /home/flux/examples/reaxff/HNS + + # Resources for the specific container job + resources: + limits: + cpu: 1 + + requests: + cpu: 1 diff --git a/examples/minikube/volumes/README.md b/examples/minikube/volumes/README.md new file mode 100644 index 0000000..918f351 --- /dev/null +++ b/examples/minikube/volumes/README.md @@ -0,0 +1,3 @@ +# 
Volumes + +This experiments.yaml shows how to customize MiniCluster volumes. diff --git a/examples/minikube/volumes/experiments.yaml b/examples/minikube/volumes/experiments.yaml new file mode 100644 index 0000000..c2af90e --- /dev/null +++ b/examples/minikube/volumes/experiments.yaml @@ -0,0 +1,31 @@ +# This is intended for MiniKube, so no machine needed +matrix: + size: [4] + +# Flux Mini Cluster experiment attributes +minicluster: + name: lammps-job + namespace: flux-operator + # Each of these sizes will be brought up and have commands run across it + size: [2] + + # How to create MiniCluster volumes - this is a volume named "data" + volumes: + data: + storageClass: hostpath + path: /tmp/data + labels: + type: "local" + + +jobs: + reaxc-hns: + command: 'lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 2 + working_dir: /home/flux/examples/reaxff/HNS + + # The volume named "data" above should be bound to "/data" + volumes: + data: + path: /data diff --git a/examples/osu-benchmarks/README.md b/examples/osu-benchmarks/README.md deleted file mode 100644 index 8fbd44c..0000000 --- a/examples/osu-benchmarks/README.md +++ /dev/null @@ -1,60 +0,0 @@ -# OSU Benchmarks on Google Kubernetes Engine - -In this set of experiments we will run the Flux Operator on Google Cloud at size N=2 -(the benchmarks require this) and multiple machine types. - -## Pre-requisites - -You should first [install gcloud](https://cloud.google.com/sdk/docs/quickstarts) -and ensure you are logged in and have kubectl installed: - -```bash -$ gcloud auth login -``` - -Depending on your install, you can either install with gcloud: - -```bash -$ gcloud components install kubectl -``` -or just [on your own](https://kubernetes.io/docs/tasks/tools/). 
- -## Run Experiments - -Each experiment here is defined by the matrix and variables in [experiments.yaml](experiment.yaml) that is used to -populate a [minicluster-template.yaml](minicluster-template.yaml) and launch a Kubernetes cluster. -You can read the documentation for flux-cloud to understand the variables available. -This tutorial assumes you have flux-cloud installed and configured. See all unique Kubernetes clusters -we will run the jobs on: - -```bash -$ flux-cloud list -``` - -Then you can either run all at once: - -```bash -$ flux-cloud run --force-cluster -``` - -Or (for testing) to bring up just the first cluster and then manually apply: - -```bash -$ flux-cloud up -$ flux-cloud apply -$ flux-cloud down -``` - -or do the same for a targeted Kubernetes cluster: - -```bash -$ flux-cloud up -e n1-standard-2-2 -$ flux-cloud apply -e n1-standard-2-2 -$ flux-cloud down -e n1-standard-2-2 -``` - - -The latter will either use a single experiment you've defined under `experiment` in your experiments.yaml file, -or select the first in your matrix (as we have here). - -By default, results will be written to a [./data](data) directory, but you can customize this with `--outdir`. 
diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/meta.json b/examples/osu-benchmarks/data/n1-standard-1-2/meta.json deleted file mode 100644 index 994b8cd..0000000 --- a/examples/osu-benchmarks/data/n1-standard-1-2/meta.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "size": 2, - "machine": "n1-standard-1", - "minicluster": { - "name": "osu-benchmarks", - "namespace": "flux-operator" - }, - "jobs": { - "osu_get_latency": { - "command": "./osu_get_latency" - }, - "osu_acc_latency": { - "command": "./osu_acc_latency" - }, - "osu_fop_latency": { - "command": "./osu_fop_latency" - }, - "osu_get_bw": { - "command": "./osu_get_bw" - }, - "osu_put_bibw": { - "command": "./osu_put_bibw" - }, - "osu_put_latency": { - "command": "./osu_put_latency" - }, - "osu_cas_latency": { - "command": "./osu_cas_latency" - }, - "osu_get_acc_latency": { - "command": "./osu_get_acc_latency" - }, - "osu_put_bw": { - "command": "./osu_put_bw" - } - }, - "id": "n1-standard-1-2", - "times": { - "create-cluster": 356.4845640659332, - "minicluster-run-osu_get_latency": 538.4266033172607, - "minicluster-run-osu_acc_latency": 346.2248685359955, - "minicluster-run-osu_fop_latency": 30.376757621765137, - "minicluster-run-osu_get_bw": 69.91457080841064, - "minicluster-run-osu_put_bibw": 121.5233302116394, - "minicluster-run-osu_put_latency": 347.232608795166, - "minicluster-run-osu_cas_latency": 30.295669078826904, - "minicluster-run-osu_get_acc_latency": 675.3228597640991, - "minicluster-run-osu_put_bw": 65.65373682975769 - } -} diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_acc_latency/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_acc_latency/log.out deleted file mode 100644 index 33df75e..0000000 --- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_acc_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Accumulate latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -1 2026.56 -2 1971.25 -4 1969.97 -8 
2033.46 -16 1975.18 -32 2007.49 -64 1958.49 -128 2003.40 -256 2009.72 -512 1974.10 -1024 2027.20 -2048 2040.70 -4096 1958.00 -8192 2026.39 -16384 1962.29 -32768 2014.61 -65536 3992.00 -131072 4587.00 -262144 4074.00 -524288 4244.08 -1048576 4722.99 -2097152 9259.00 -4194304 18870.00 diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_cas_latency/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_cas_latency/log.out deleted file mode 100644 index 13c2d88..0000000 --- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_cas_latency/log.out +++ /dev/null @@ -1,5 +0,0 @@ -# OSU MPI_Compare_and_swap latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -8 2040.58 diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_fop_latency/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_fop_latency/log.out deleted file mode 100644 index fb575bb..0000000 --- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_fop_latency/log.out +++ /dev/null @@ -1,5 +0,0 @@ -# OSU MPI_Fetch_and_op latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -8 2025.38 diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_acc_latency/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_acc_latency/log.out deleted file mode 100644 index 89c06a4..0000000 --- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_acc_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Get_accumulate latency Test v5.8 -# Window creation: MPI_Win_create -# Synchronization: MPI_Win_lock/unlock -# Size Latency (us) -1 4028.65 -2 4036.95 -4 3977.30 -8 3959.60 -16 3999.67 -32 3974.93 -64 3965.61 -128 3921.54 -256 4020.49 -512 3987.41 -1024 3950.50 -2048 4023.82 -4096 4024.50 -8192 4032.61 -16384 4321.01 -32768 4077.98 -65536 6086.01 -131072 6358.00 -262144 6235.36 -524288 7140.15 -1048576 9408.58 -2097152 18535.45 -4194304 36929.51 diff --git 
a/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_bw/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_bw/log.out deleted file mode 100644 index 498e3af..0000000 --- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_bw/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Get Bandwidth Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Bandwidth (MB/s) -1 0.02 -2 0.04 -4 0.08 -8 0.17 -16 0.32 -32 0.66 -64 1.14 -128 2.58 -256 6.05 -512 9.96 -1024 19.80 -2048 35.15 -4096 64.85 -8192 126.64 -16384 174.69 -32768 205.94 -65536 220.74 -131072 220.53 -262144 173.80 -524288 227.82 -1048576 215.52 -2097152 226.24 -4194304 219.46 diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_latency/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_latency/log.out deleted file mode 100644 index 4bd281e..0000000 --- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Get latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -1 2129.50 -2 2057.45 -4 2013.89 -8 2015.55 -16 1979.00 -32 2024.10 -64 1983.17 -128 2008.34 -256 2023.70 -512 2008.37 -1024 2057.49 -2048 2030.20 -4096 2039.00 -8192 2027.52 -16384 1879.26 -32768 2086.65 -65536 3961.84 -131072 4195.01 -262144 4327.77 -524288 4295.87 -1048576 5230.83 -2097152 9040.55 -4194304 18364.76 diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_bibw/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_bibw/log.out deleted file mode 100644 index c6d659e..0000000 --- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_bibw/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Put Bi-directional Bandwidth Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_post/start/complete/wait -# Size Bandwidth (MB/s) -1 0.02 -2 0.13 -4 0.26 -8 0.43 -16 1.06 -32 2.27 -64 3.77 -128 9.69 -256 15.68 -512 28.37 -1024 
58.54 -2048 105.42 -4096 119.90 -8192 147.82 -16384 151.82 -32768 212.67 -65536 220.28 -131072 221.41 -262144 222.46 -524288 223.21 -1048576 207.12 -2097152 223.48 -4194304 223.16 diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_bw/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_bw/log.out deleted file mode 100644 index 5b8c58a..0000000 --- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_bw/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Put Bandwidth Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Bandwidth (MB/s) -1 0.04 -2 0.07 -4 0.12 -8 0.25 -16 0.53 -32 1.10 -64 2.32 -128 3.69 -256 9.53 -512 17.77 -1024 28.00 -2048 56.37 -4096 67.47 -8192 93.29 -16384 147.11 -32768 222.45 -65536 205.60 -131072 227.43 -262144 232.69 -524288 229.48 -1048576 216.91 -2097152 219.29 -4194304 223.36 diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_latency/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_latency/log.out deleted file mode 100644 index 4bd184c..0000000 --- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Put Latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -1 1907.18 -2 2021.62 -4 1932.61 -8 1984.30 -16 2022.26 -32 1931.50 -64 2016.32 -128 2010.00 -256 1979.04 -512 1993.74 -1024 1990.06 -2048 1982.00 -4096 1983.60 -8192 2014.80 -16384 2079.00 -32768 1999.49 -65536 4068.88 -131072 3994.00 -262144 4146.00 -524288 4276.83 -1048576 5456.03 -2097152 9407.04 -4194304 19134.00 diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/meta.json b/examples/osu-benchmarks/data/n1-standard-2-2/meta.json deleted file mode 100644 index 431de1e..0000000 --- a/examples/osu-benchmarks/data/n1-standard-2-2/meta.json +++ /dev/null @@ -1,51 +0,0 @@ -{ - "size": 2, - "machine": "n1-standard-2", - "minicluster": { - "name": "osu-benchmarks", - 
"namespace": "flux-operator" - }, - "jobs": { - "osu_get_latency": { - "command": "./osu_get_latency" - }, - "osu_acc_latency": { - "command": "./osu_acc_latency" - }, - "osu_fop_latency": { - "command": "./osu_fop_latency" - }, - "osu_get_bw": { - "command": "./osu_get_bw" - }, - "osu_put_bibw": { - "command": "./osu_put_bibw" - }, - "osu_put_latency": { - "command": "./osu_put_latency" - }, - "osu_cas_latency": { - "command": "./osu_cas_latency" - }, - "osu_get_acc_latency": { - "command": "./osu_get_acc_latency" - }, - "osu_put_bw": { - "command": "./osu_put_bw" - } - }, - "id": "n1-standard-2-2", - "times": { - "create-cluster": 1367.3097712993622, - "destroy-cluster": 2073.518306493759, - "minicluster-run-osu_get_latency": 437.91792845726013, - "minicluster-run-osu_acc_latency": 38.31566119194031, - "minicluster-run-osu_fop_latency": 10.17687702178955, - "minicluster-run-osu_get_bw": 150.1252703666687, - "minicluster-run-osu_put_bibw": 38.277549743652344, - "minicluster-run-osu_put_latency": 36.958292961120605, - "minicluster-run-osu_cas_latency": 8.383898735046387, - "minicluster-run-osu_get_acc_latency": 64.05685710906982, - "minicluster-run-osu_put_bw": 19.466553211212158 - } -} diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_acc_latency/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_acc_latency/log.out deleted file mode 100644 index bfb53a4..0000000 --- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_acc_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Accumulate latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -1 141.11 -2 131.90 -4 123.68 -8 117.55 -16 121.55 -32 127.79 -64 114.15 -128 126.75 -256 131.81 -512 118.65 -1024 125.17 -2048 143.56 -4096 142.86 -8192 157.31 -16384 181.50 -32768 199.33 -65536 453.33 -131072 453.50 -262144 560.16 -524288 771.15 -1048576 1167.18 -2097152 1929.84 -4194304 4272.84 diff --git 
a/examples/osu-benchmarks/data/n1-standard-2-2/osu_cas_latency/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_cas_latency/log.out deleted file mode 100644 index ae44363..0000000 --- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_cas_latency/log.out +++ /dev/null @@ -1,5 +0,0 @@ -# OSU MPI_Compare_and_swap latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -8 163.29 diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_fop_latency/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_fop_latency/log.out deleted file mode 100644 index 6ec7d3c..0000000 --- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_fop_latency/log.out +++ /dev/null @@ -1,5 +0,0 @@ -# OSU MPI_Fetch_and_op latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -8 145.01 diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_acc_latency/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_acc_latency/log.out deleted file mode 100644 index 536ce73..0000000 --- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_acc_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Get_accumulate latency Test v5.8 -# Window creation: MPI_Win_create -# Synchronization: MPI_Win_lock/unlock -# Size Latency (us) -1 287.65 -2 286.34 -4 266.98 -8 297.64 -16 283.17 -32 282.60 -64 263.15 -128 282.87 -256 330.35 -512 295.81 -1024 292.13 -2048 311.83 -4096 323.84 -8192 329.55 -16384 319.65 -32768 341.19 -65536 589.78 -131072 712.14 -262144 887.74 -524288 1315.61 -1048576 2054.79 -2097152 3533.33 -4194304 5818.77 diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_bw/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_bw/log.out deleted file mode 100644 index 142ad29..0000000 --- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_bw/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Get Bandwidth Test v5.8 -# Window creation: 
MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Bandwidth (MB/s) -1 0.06 -2 0.13 -4 0.27 -8 0.54 -16 1.02 -32 1.95 -64 3.75 -128 8.50 -256 16.98 -512 30.28 -1024 64.48 -2048 115.08 -4096 245.00 -8192 464.01 -16384 585.10 -32768 754.99 -65536 828.35 -131072 890.66 -262144 1042.80 -524288 955.92 -1048576 1142.67 -2097152 1169.05 -4194304 1172.25 diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_latency/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_latency/log.out deleted file mode 100644 index d226e0d..0000000 --- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Get latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -1 156.10 -2 146.43 -4 135.73 -8 141.91 -16 151.82 -32 151.16 -64 154.75 -128 149.68 -256 149.83 -512 147.85 -1024 141.86 -2048 153.64 -4096 152.21 -8192 165.91 -16384 204.98 -32768 196.08 -65536 343.46 -131072 452.29 -262144 519.62 -524288 1094.88 -1048576 1724.75 -2097152 1880.69 -4194304 4034.85 diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_bibw/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_bibw/log.out deleted file mode 100644 index 8882672..0000000 --- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_bibw/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Put Bi-directional Bandwidth Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_post/start/complete/wait -# Size Bandwidth (MB/s) -1 0.85 -2 1.81 -4 3.81 -8 7.37 -16 14.24 -32 24.82 -64 47.65 -128 87.96 -256 161.55 -512 262.90 -1024 355.17 -2048 464.07 -4096 407.02 -8192 406.54 -16384 789.37 -32768 1210.71 -65536 915.45 -131072 835.05 -262144 600.15 -524288 762.38 -1048576 747.83 -2097152 1065.77 -4194304 873.67 diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_bw/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_bw/log.out deleted file mode 
100644 index a731da2..0000000 --- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_bw/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Put Bandwidth Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Bandwidth (MB/s) -1 0.42 -2 0.72 -4 1.49 -8 2.95 -16 5.55 -32 11.92 -64 22.03 -128 40.32 -256 79.76 -512 139.34 -1024 181.55 -2048 283.18 -4096 207.81 -8192 242.69 -16384 534.98 -32768 611.93 -65536 705.35 -131072 899.41 -262144 1065.81 -524288 1192.52 -1048576 989.94 -2097152 1189.93 -4194304 1089.70 diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_latency/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_latency/log.out deleted file mode 100644 index 7a0c2e5..0000000 --- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Put Latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -1 121.19 -2 128.06 -4 130.09 -8 126.55 -16 121.53 -32 124.01 -64 124.88 -128 134.59 -256 131.45 -512 134.15 -1024 138.92 -2048 144.37 -4096 135.79 -8192 176.68 -16384 171.74 -32768 207.59 -65536 382.16 -131072 447.84 -262144 573.51 -524288 686.69 -1048576 972.05 -2097152 1942.87 -4194304 3791.28 diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/meta.json b/examples/osu-benchmarks/data/n1-standard-4-2/meta.json deleted file mode 100644 index 9fd47b7..0000000 --- a/examples/osu-benchmarks/data/n1-standard-4-2/meta.json +++ /dev/null @@ -1,49 +0,0 @@ -{ - "size": 2, - "machine": "n1-standard-4", - "minicluster": { - "name": "osu-benchmarks", - "namespace": "flux-operator" - }, - "jobs": { - "osu_get_latency": { - "command": "./osu_get_latency" - }, - "osu_acc_latency": { - "command": "./osu_acc_latency" - }, - "osu_fop_latency": { - "command": "./osu_fop_latency" - }, - "osu_get_bw": { - "command": "./osu_get_bw" - }, - "osu_put_bibw": { - "command": "./osu_put_bibw" - }, - "osu_put_latency": { - 
"command": "./osu_put_latency" - }, - "osu_cas_latency": { - "command": "./osu_cas_latency" - }, - "osu_get_acc_latency": { - "command": "./osu_get_acc_latency" - }, - "osu_put_bw": { - "command": "./osu_put_bw" - } - }, - "id": "n1-standard-4-2", - "times": { - "minicluster-run-osu_get_latency": 277.4993796348572, - "minicluster-run-osu_acc_latency": 32.00839829444885, - "minicluster-run-osu_fop_latency": 137.7638008594513, - "minicluster-run-osu_get_bw": 149.44539713859558, - "minicluster-run-osu_put_bibw": 33.21780180931091, - "minicluster-run-osu_put_latency": 31.217578411102295, - "minicluster-run-osu_cas_latency": 138.34734511375427, - "minicluster-run-osu_get_acc_latency": 175.93916821479797, - "minicluster-run-osu_put_bw": 17.256979942321777 - } -} diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_acc_latency/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_acc_latency/log.out deleted file mode 100644 index dc9e63d..0000000 --- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_acc_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Accumulate latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -1 114.54 -2 89.25 -4 109.61 -8 120.59 -16 115.39 -32 115.15 -64 115.98 -128 117.26 -256 112.32 -512 113.18 -1024 116.00 -2048 123.82 -4096 122.91 -8192 131.24 -16384 151.30 -32768 166.14 -65536 313.66 -131072 359.74 -262144 444.67 -524288 648.49 -1048576 976.66 -2097152 1724.47 -4194304 3490.93 diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_cas_latency/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_cas_latency/log.out deleted file mode 100644 index df10d69..0000000 --- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_cas_latency/log.out +++ /dev/null @@ -1,5 +0,0 @@ -# OSU MPI_Compare_and_swap latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -8 115.93 diff --git 
a/examples/osu-benchmarks/data/n1-standard-4-2/osu_fop_latency/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_fop_latency/log.out deleted file mode 100644 index 37a557b..0000000 --- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_fop_latency/log.out +++ /dev/null @@ -1,5 +0,0 @@ -# OSU MPI_Fetch_and_op latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -8 105.31 diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_acc_latency/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_acc_latency/log.out deleted file mode 100644 index b49b29b..0000000 --- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_acc_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Get_accumulate latency Test v5.8 -# Window creation: MPI_Win_create -# Synchronization: MPI_Win_lock/unlock -# Size Latency (us) -1 187.83 -2 185.87 -4 187.54 -8 187.00 -16 189.64 -32 187.64 -64 187.10 -128 189.27 -256 195.68 -512 190.57 -1024 194.59 -2048 205.99 -4096 216.30 -8192 205.38 -16384 220.19 -32768 250.27 -65536 472.44 -131072 552.09 -262144 698.95 -524288 872.33 -1048576 1280.24 -2097152 2214.98 -4194304 3719.21 diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_bw/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_bw/log.out deleted file mode 100644 index 66eaef6..0000000 --- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_bw/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Get Bandwidth Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Bandwidth (MB/s) -1 0.09 -2 0.20 -4 0.39 -8 0.79 -16 1.57 -32 3.01 -64 5.50 -128 12.36 -256 25.06 -512 48.87 -1024 96.36 -2048 187.58 -4096 364.00 -8192 657.03 -16384 1121.71 -32768 880.91 -65536 1266.43 -131072 1237.42 -262144 1222.58 -524288 1220.72 -1048576 1217.06 -2097152 1214.67 -4194304 1213.09 diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_latency/log.out 
b/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_latency/log.out deleted file mode 100644 index 54baa05..0000000 --- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Get latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -1 105.67 -2 106.76 -4 105.98 -8 104.79 -16 108.18 -32 104.17 -64 107.93 -128 104.04 -256 104.24 -512 100.87 -1024 106.00 -2048 106.41 -4096 107.10 -8192 116.07 -16384 121.87 -32768 153.42 -65536 287.48 -131072 304.28 -262144 394.81 -524288 542.64 -1048576 850.54 -2097152 1754.19 -4194304 4854.39 diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_bibw/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_bibw/log.out deleted file mode 100644 index 0e481f7..0000000 --- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_bibw/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Put Bi-directional Bandwidth Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_post/start/complete/wait -# Size Bandwidth (MB/s) -1 0.92 -2 1.89 -4 3.58 -8 7.28 -16 13.10 -32 26.02 -64 52.44 -128 96.39 -256 165.92 -512 295.72 -1024 426.83 -2048 511.34 -4096 424.07 -8192 457.66 -16384 881.99 -32768 1144.43 -65536 909.92 -131072 672.95 -262144 632.42 -524288 546.17 -1048576 683.42 -2097152 1031.23 -4194304 1128.50 diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_bw/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_bw/log.out deleted file mode 100644 index 14fccbf..0000000 --- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_bw/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Put Bandwidth Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Bandwidth (MB/s) -1 0.51 -2 1.02 -4 2.14 -8 4.00 -16 7.66 -32 16.11 -64 29.96 -128 53.30 -256 104.51 -512 164.96 -1024 213.10 -2048 271.31 -4096 257.33 -8192 271.38 -16384 387.78 -32768 684.16 
-65536 671.73 -131072 871.74 -262144 1241.50 -524288 1226.63 -1048576 1220.23 -2097152 1169.76 -4194304 1166.01 diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_latency/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_latency/log.out deleted file mode 100644 index 2ea21e2..0000000 --- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Put Latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -1 101.02 -2 109.73 -4 109.01 -8 108.02 -16 111.83 -32 108.72 -64 109.09 -128 115.13 -256 112.40 -512 114.17 -1024 115.96 -2048 119.93 -4096 123.87 -8192 139.41 -16384 152.70 -32768 183.49 -65536 328.75 -131072 387.64 -262144 452.87 -524288 594.75 -1048576 871.06 -2097152 1714.35 -4194304 3572.91 diff --git a/examples/osu-benchmarks/experiments.yaml b/examples/osu-benchmarks/experiments.yaml deleted file mode 100644 index 2bc569c..0000000 --- a/examples/osu-benchmarks/experiments.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# matrix of experiments to run - machine types and sizes are required - -# This can obviously be expanded to more sizes or machines, -matrix: - size: [2] - machine: ["n1-standard-1", "n1-standard-2", "n1-standard-4"] - -# Flux Mini Cluster experiment attributes -minicluster: - name: osu-benchmarks - namespace: flux-operator - -# Each job can have a command and working directory -jobs: - osu_get_latency: - command: './osu_get_latency' - osu_acc_latency: - command: './osu_acc_latency' - osu_fop_latency: - command: './osu_fop_latency' - osu_get_bw: - command: './osu_get_bw' - osu_put_bibw: - command: './osu_put_bibw' - osu_put_latency: - command: './osu_put_latency' - osu_cas_latency: - command: './osu_cas_latency' - osu_get_acc_latency: - command: './osu_get_acc_latency' - osu_put_bw: - command: './osu_put_bw' diff --git a/examples/osu-benchmarks/minicluster-template.yaml 
b/examples/osu-benchmarks/minicluster-template.yaml deleted file mode 100644 index 8004cc8..0000000 --- a/examples/osu-benchmarks/minicluster-template.yaml +++ /dev/null @@ -1,31 +0,0 @@ -apiVersion: flux-framework.org/v1alpha1 -kind: MiniCluster -metadata: - name: {{ minicluster.name }} - namespace: {{ minicluster.namespace }} -spec: - - # Number of pods to create for MiniCluster - size: {{ size }} - - # Disable verbose output - logging: - quiet: true - - # This is a list because a pod can support multiple containers - containers: - # The container URI to pull (currently needs to be public) - - image: ghcr.io/rse-ops/osu-microbench:app-latest - - # Option Flags for this flux runner wait.sh entrypoint - fluxOptionFlags: "-ompi=openmpi@5" - - # custom preCommand logic (run at start of script) - commands: - pre: | - source /etc/profile.d/z10_spack_environment.sh - asFlux="sudo -u flux -E PYTHONPATH=$PYTHONPATH" - - # All osu-benchmark experiments share the same working directory - workingDir: /opt/osu-benchmark/build.openmpi/libexec/osu-micro-benchmarks/mpi/one-sided - command: {{ job.command }} diff --git a/examples/up-apply-down/README.md b/examples/up-apply-down/README.md deleted file mode 100644 index e4cfc6c..0000000 --- a/examples/up-apply-down/README.md +++ /dev/null @@ -1,37 +0,0 @@ -# Up and Down - -This is an example of using flux cloud to bring up a cluster, install the Flux Operator -(and then you would use it as you please) and then bring it down. -You should have kubectl and gcloud installed for this demo. Note that -we use the [experiments.yaml](experiments.yaml) file as a default, -and we only provide basic metadata needed for a single experiment. - -## Up - -```bash -$ flux-cloud up -``` - -This will bring up your cluster, per the size and machine type defined -in your experiments file, and install the operator. - -## Apply - -An "apply" means running the single (or multiple) experiments defined in your -experiments.yaml. 
While these don't need to be in the same file, for simplicity -we have also defined our experiment metadata and template (provided at [minicluster-template.yaml](minicluster-template.yaml)) -in this directory. For this application we will run a simple llamps application. - -```bash -$ flux-cloud apply -``` - -Note that apply will work for a single experiment OR a matrix, so be careful! - -## Down - -To bring it down: - -```bash -$ flux-cloud down -``` diff --git a/examples/up-apply-down/data/meta.json b/examples/up-apply-down/data/meta.json deleted file mode 100644 index be7b873..0000000 --- a/examples/up-apply-down/data/meta.json +++ /dev/null @@ -1,19 +0,0 @@ -[ - { - "size": 2, - "machine": "n1-standard-1", - "minicluster": { - "name": "lammps-job", - "namespace": "flux-operator" - }, - "jobs": { - "reaxc-hns": { - "command": "lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite" - } - }, - "times": { - "minicluster-run-reaxc-hns": 198.465562582016, - "create-cluster": 367.33847880363464 - } - } -] diff --git a/examples/up-apply-down/data/n1-standard-1-2/reaxc-hns/log.out b/examples/up-apply-down/data/n1-standard-1-2/reaxc-hns/log.out deleted file mode 100644 index 71de171..0000000 --- a/examples/up-apply-down/data/n1-standard-1-2/reaxc-hns/log.out +++ /dev/null @@ -1,80 +0,0 @@ -LAMMPS (29 Sep 2021 - Update 2) -OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98) - using 1 OpenMP thread(s) per MPI task -Reading data file ... - triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000) - 2 by 1 by 1 MPI processor grid - reading atoms ... - 304 atoms - reading velocities ... - 304 velocities - read_data CPU = 0.029 seconds -Replicating atoms ... 
- triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000) - 2 by 1 by 1 MPI processor grid - bounding box image = (0 -1 -1) to (0 1 1) - bounding box extra memory = 0.03 MB - average # of replicas added to proc = 5.00 out of 8 (62.50%) - 2432 atoms - replicate CPU = 0.002 seconds -Neighbor list info ... - update every 20 steps, delay 0 steps, check no - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 11 - ghost atom cutoff = 11 - binsize = 5.5, bins = 10 5 6 - 2 neighbor lists, perpetual/occasional/extra = 2 0 0 - (1) pair reax/c, perpetual - attributes: half, newton off, ghost - pair build: half/bin/newtoff/ghost - stencil: full/ghost/bin/3d - bin: standard - (2) fix qeq/reax, perpetual, copy from (1) - attributes: half, newton off, ghost - pair build: copy - stencil: none - bin: none -Setting up Verlet run ... - Unit style : real - Current step : 0 - Time step : 0.1 -Per MPI rank memory allocation (min/avg/max) = 143.9 | 143.9 | 143.9 Mbytes -Step Temp PotEng Press E_vdwl E_coul Volume - 0 300 -113.27833 437.52118 -111.57687 -1.7014647 27418.867 - 10 299.38517 -113.27631 1439.2824 -111.57492 -1.7013813 27418.867 - 20 300.27107 -113.27884 3764.342 -111.57762 -1.7012247 27418.867 - 30 302.21063 -113.28428 7007.6629 -111.58335 -1.7009363 27418.867 - 40 303.52265 -113.28799 9844.8245 -111.58747 -1.7005186 27418.867 - 50 301.87059 -113.28324 9663.0973 -111.58318 -1.7000523 27418.867 - 60 296.67807 -113.26777 7273.8119 -111.56815 -1.6996137 27418.867 - 70 292.19999 -113.25435 5533.5522 -111.55514 -1.6992158 27418.867 - 80 293.58677 -113.25831 5993.4438 -111.55946 -1.6988533 27418.867 - 90 300.62635 -113.27925 7202.8369 -111.58069 -1.6985592 27418.867 - 100 305.38276 -113.29357 10085.805 -111.59518 -1.6983874 27418.867 -Loop time of 20.075 on 2 procs for 100 steps with 2432 atoms - -Performance: 0.043 ns/day, 557.640 hours/ns, 4.981 timesteps/s -84.6% CPU use with 2 MPI 
tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 12.399 | 13.154 | 13.91 | 20.8 | 65.53 -Neigh | 0.40351 | 0.40416 | 0.4048 | 0.1 | 2.01 -Comm | 0.33357 | 1.0872 | 1.8408 | 72.3 | 5.42 -Output | 0.004412 | 0.0045916 | 0.0047713 | 0.3 | 0.02 -Modify | 5.4218 | 5.4219 | 5.422 | 0.0 | 27.01 -Other | | 0.002887 | | | 0.01 - -Nlocal: 1216.00 ave 1216 max 1216 min -Histogram: 2 0 0 0 0 0 0 0 0 0 -Nghost: 7591.50 ave 7597 max 7586 min -Histogram: 1 0 0 0 0 0 0 0 0 1 -Neighs: 432912.0 ave 432942 max 432882 min -Histogram: 1 0 0 0 0 0 0 0 0 1 - -Total # of neighbors = 865824 -Ave neighs/atom = 356.01316 -Neighbor list builds = 5 -Dangerous builds not checked -Total wall time: 0:00:20 diff --git a/examples/up-apply-down/experiments.yaml b/examples/up-apply-down/experiments.yaml deleted file mode 100644 index ddcfdac..0000000 --- a/examples/up-apply-down/experiments.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Minimum required to bring up a cluster -experiment: - size: 2 - machine: n1-standard-1 - -# Flux Mini Cluster experiment attributes -minicluster: - name: lammps-job - namespace: flux-operator - -# If your jobs share the same variables you can just put them in the template directly! 
-jobs: - reaxc-hns: - command: 'lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite' diff --git a/examples/up-apply-down/minicluster-template.yaml b/examples/up-apply-down/minicluster-template.yaml deleted file mode 100644 index 6b34bdd..0000000 --- a/examples/up-apply-down/minicluster-template.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: flux-framework.org/v1alpha1 -kind: MiniCluster -metadata: - name: {{ minicluster.name }} - namespace: {{ minicluster.namespace }} -spec: - # Number of pods to create for MiniCluster - size: {{ size }} - - # Disable verbose output - logging: - quiet: true - - # This is a list because a pod can support multiple containers - containers: - # The container URI to pull (currently needs to be public) - - image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 - - # You can set the working directory if your container WORKDIR is not correct. - workingDir: /home/flux/examples/reaxff/HNS - command: {{ job.command }} - - # This only matters if test is false - fluxLogLevel: 7 diff --git a/examples/up-submit-down/README.md b/examples/up-submit-down/README.md deleted file mode 100644 index 818e95d..0000000 --- a/examples/up-submit-down/README.md +++ /dev/null @@ -1,72 +0,0 @@ -``# Up, Submit, Down - -This is an example of using flux cloud to bring up a cluster, install the Flux Operator -(and then you would use it as you please) and run jobs with submit (on the same -MiniCluster) and then bring it down. -You should have kubectl and gcloud OR minikube installed for this demo. Note that -we use the [experiments.yaml](experiments.yaml) file as a default, -and we only provide basic metadata needed for a single experiment. - -## Up - -```bash -$ flux-cloud up -``` - -This will bring up your cluster, per the size and machine type defined -in your experiments file, and install the operator. 
- -## Submit - -A "submit" means running the single (or multiple) experiments defined in your -experiments.yaml on the same MiniCluster, without bringing it down between jobs. -This means we are using Flux as the scheduler proper, and we don't need to bring pods -up and down unecessarily (and submit a gazillion YAML files). There is only the number -of YAML CRD needed to correspond to the sizes of MiniClusters you run across. - -```bash -$ flux-cloud submit --cloud minikube -$ flux-cloud submit --cloud google -``` - -## Down - -To bring it down: - -```bash -$ flux-cloud down -``` - -## Batch - -Run all three with one command: - -```bash -$ flux-cloud batch --cloud minikube -$ flux-cloud batch --cloud google -``` - -## UI - -If you want to just bring up the cluster and open the user interface to interact with: - -```bash -$ flux-cloud up --cloud minikube -$ flux-cloud ui --cloud minikube -$ flux-cloud down --cloud minikube -``` - - -## Plot - -I threw together a script to compare running times with info and output times, -where: - -running time < info < output - -```bash -$ pip install pandas matplotlib seaborn -``` -```bash -$ python plot_results.py data/k8s-size-4-n1-standard-1/meta.json -``` diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/broker-id.sh b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/broker-id.sh deleted file mode 100755 index c6fb8e0..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/broker-id.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -NAMESPACE="flux-operator" -JOB="lammps-job" -brokerPrefix="${JOB}-0" - -for pod in $(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=jsonpath='{.items[*].metadata.name}'); do - if [[ "${pod}" == ${brokerPrefix}* ]]; then - echo ${pod} - break - fi -done diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-create.sh 
b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-create.sh deleted file mode 100755 index bdace99..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-create.sh +++ /dev/null @@ -1,204 +0,0 @@ -#!/bin/bash - -# Source shared helper scripts -# Colors -red='\033[0;31m' -green='\033[0;32m' -yellow='\033[0;33m' -blue='\033[0;34m' -magenta='\033[0;35m' -cyan='\033[0;36m' -clear='\033[0m' - -function print_red() { - echo -e "${red}$@${clear}" -} -function print_yellow() { - echo -e "${yellow}$@${clear}" -} -function print_green() { - echo -e "${green}$@${clear}" -} -function print_blue() { - echo -e "${blue}$@${clear}" -} -function print_magenta() { - echo -e "${magenta}$@${clear}" -} -function print_cyan() { - echo -e "${cyan}$@${clear}" -} - -function is_installed () { - # Determine if a command is available for use! - cmd="${1}" - if command -v $cmd >/dev/null; then - echo "$cmd is installed" - else - echo "$cmd could not be found" - exit 1 - fi -} - -function install_operator() { - # Shared function to install the operator from a specific repository branch and cleanup - script_dir=${1} - repository=${2} - branch=${3} - tmpfile="${script_dir}/flux-operator.yaml" - run_echo wget -O $tmpfile https://raw.githubusercontent.com/${repository}/${branch}/examples/dist/flux-operator.yaml - kubectl apply -f $tmpfile -} - - -function run_echo() { - # Show the user the command then run it - echo - print_green "$@" - retry $@ -} - -function run_echo_allow_fail() { - echo - print_green "$@" - $@ || true -} - -function retry() { - # Retry an unsuccessful user command, per request - while true - do - $@ - retval=$? - if [[ "${retval}" == "0" ]]; then - return - fi - print_blue "That command was not successful. Do you want to try again? šŸ¤”ļø" - read -p " (yes/no) " answer - # Exit with non-zero response so we know to stop in script. 
- case ${answer} in - yes ) continue;; - no ) echo exiting...; - exit 1;; - * ) echo invalid response; - exit 1;; - esac - done -} - - -function prompt() { - # Prompt the user with a yes/no command to continue or exit - print_blue "$@ šŸ¤”ļø" - read -p " (yes/no) " answer - case ${answer} in - yes ) echo ok, we will proceed;; - no ) echo exiting...; - exit 1;; - * ) echo invalid response; - exit 1;; - esac -} - - -function with_exponential_backoff { - # Run with exponential backoff - assume containers take a while to pull - local max_attempts=100 - local timeout=1 - local attempt=0 - local exitcode=0 - - while [[ $attempt < $max_attempts ]]; do - "$@" - exitcode=$? - - if [[ $exitcode == 0 ]]; then - break - fi - - echo "Failure! Retrying in $timeout.." 1>&2 - sleep $timeout - attempt=$(( attempt + 1 )) - timeout=$(( timeout * 2 )) - done - - if [[ $exitCode != 0 ]]; then - echo "You've failed me for the last time! ($@)" 1>&2 - fi - return $exitcode -} - -# Defaults - these are in the config but left here for information -CLUSTER_NAME="flux-cluster" -ZONE="us-central1-a" -CLUSTER_VERSION="1.23" -MACHINE_TYPE="n1-standard-1" -FORCE_CLUSTER="false" -SIZE=4 -TAGS="flux-cluster" -REPOSITORY="flux-framework/flux-operator" -BRANCH="main" -GOOGLE_PROJECT="dinodev" -SCRIPT_DIR="/home/vanessa/Desktop/Code/flux/flux-cloud/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts" - -# Required arguments -if [ -z ${GOOGLE_PROJECT+x} ]; then - echo "Missing Google Project template variable as GOOGLE_PROJECT"; - exit 1 -fi - -if [ -z ${ZONE+x} ]; then - echo "Missing Google Cloud zone template variable as ZONE"; - exit 1 -fi - -if [ -z ${MACHINE_TYPE+x} ]; then - echo "Missing Google Cloud machine type template variable as MACHINE_TYPE"; - exit 1 -fi - -print_magenta " cluster : ${CLUSTER_NAME}" -print_magenta " version : ${CLUSTER_VERSION}" -print_magenta " project : ${GOOGLE_PROJECT}" -print_magenta " machine : ${MACHINE_TYPE}" -print_magenta " zone : ${ZONE}" 
-print_magenta " tags : ${TAGS}" -print_magenta " size : ${SIZE}" -print_magenta "repository : ${REPOSITORY}" -print_magenta " branch : ${BRANCH}" - -is_installed kubectl -is_installed gcloud -is_installed wget - -# Check if it already exists -gcloud container clusters list --zone ${ZONE} | grep ${CLUSTER_NAME} -retval=$? -if [[ "${retval}" == "0" ]]; then - print_blue "${CLUSTER_NAME} in ${ZONE} already exists." - echo - exit 0 -fi - -if [[ "${FORCE_CLUSTER}" != "true" ]]; then - prompt "Do you want to create this cluster?" -fi - -# Create the cluster -run_echo gcloud container clusters create ${CLUSTER_NAME} --project $GOOGLE_PROJECT \ - --zone ${ZONE} --cluster-version ${CLUSTER_VERSION} --machine-type ${MACHINE_TYPE} \ - --num-nodes=${SIZE} --enable-network-policy --tags=${TAGS} --enable-intra-node-visibility - -# Get credentials so kubectl will work -run_echo gcloud container clusters get-credentials ${CLUSTER_NAME} --zone ${ZONE} --project $GOOGLE_PROJECT -run_echo kubectl create clusterrolebinding cluster-admin-binding --clusterrole cluster-admin --user $(gcloud config get-value core/account) - -# Show nodes -run_echo kubectl get nodes - -# Deploy the operator -mkdir -p ${SCRIPT_DIR} -install_operator ${SCRIPT_DIR} ${REPOSITORY} ${BRANCH} - -run_echo kubectl get namespace -run_echo kubectl describe namespace operator-system diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-destroy.sh b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-destroy.sh deleted file mode 100755 index de8988e..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-destroy.sh +++ /dev/null @@ -1,161 +0,0 @@ -#!/bin/bash - -# Source shared helper scripts -# Colors -red='\033[0;31m' -green='\033[0;32m' -yellow='\033[0;33m' -blue='\033[0;34m' -magenta='\033[0;35m' -cyan='\033[0;36m' -clear='\033[0m' - -function print_red() { - echo -e "${red}$@${clear}" -} -function print_yellow() { - echo -e 
"${yellow}$@${clear}" -} -function print_green() { - echo -e "${green}$@${clear}" -} -function print_blue() { - echo -e "${blue}$@${clear}" -} -function print_magenta() { - echo -e "${magenta}$@${clear}" -} -function print_cyan() { - echo -e "${cyan}$@${clear}" -} - -function is_installed () { - # Determine if a command is available for use! - cmd="${1}" - if command -v $cmd >/dev/null; then - echo "$cmd is installed" - else - echo "$cmd could not be found" - exit 1 - fi -} - -function install_operator() { - # Shared function to install the operator from a specific repository branch and cleanup - script_dir=${1} - repository=${2} - branch=${3} - tmpfile="${script_dir}/flux-operator.yaml" - run_echo wget -O $tmpfile https://raw.githubusercontent.com/${repository}/${branch}/examples/dist/flux-operator.yaml - kubectl apply -f $tmpfile -} - - -function run_echo() { - # Show the user the command then run it - echo - print_green "$@" - retry $@ -} - -function run_echo_allow_fail() { - echo - print_green "$@" - $@ || true -} - -function retry() { - # Retry an unsuccessful user command, per request - while true - do - $@ - retval=$? - if [[ "${retval}" == "0" ]]; then - return - fi - print_blue "That command was not successful. Do you want to try again? šŸ¤”ļø" - read -p " (yes/no) " answer - # Exit with non-zero response so we know to stop in script. 
- case ${answer} in - yes ) continue;; - no ) echo exiting...; - exit 1;; - * ) echo invalid response; - exit 1;; - esac - done -} - - -function prompt() { - # Prompt the user with a yes/no command to continue or exit - print_blue "$@ šŸ¤”ļø" - read -p " (yes/no) " answer - case ${answer} in - yes ) echo ok, we will proceed;; - no ) echo exiting...; - exit 1;; - * ) echo invalid response; - exit 1;; - esac -} - - -function with_exponential_backoff { - # Run with exponential backoff - assume containers take a while to pull - local max_attempts=100 - local timeout=1 - local attempt=0 - local exitcode=0 - - while [[ $attempt < $max_attempts ]]; do - "$@" - exitcode=$? - - if [[ $exitcode == 0 ]]; then - break - fi - - echo "Failure! Retrying in $timeout.." 1>&2 - sleep $timeout - attempt=$(( attempt + 1 )) - timeout=$(( timeout * 2 )) - done - - if [[ $exitCode != 0 ]]; then - echo "You've failed me for the last time! ($@)" 1>&2 - fi - return $exitcode -} - -# Defaults - these are in the config but left here for information -CLUSTER_NAME="flux-cluster" -FORCE_CLUSTER="false" -ZONE="us-central1-a" - -if [ -z ${ZONE+x} ]; then - echo "Google Cloud zone template missing as ZONE"; - exit 1 -fi - -echo " cluster : ${CLUSTER_NAME}" -echo " zone : ${ZONE}" - -is_installed gcloud -is_installed yes || FORCE_CLUSTER="false" - -# Check if it already exists -gcloud container clusters list --zone ${ZONE} | grep ${CLUSTER_NAME} -retval=$? -if [[ "${retval}" != "0" ]]; then - print_blue "${CLUSTER_NAME} in ${ZONE} does not exist." 
- echo - exit 0 -fi - -# This command has a confirmation already -if [[ "${FORCE_CLUSTER}" == "true" ]]; then - yes | gcloud container clusters delete --zone ${ZONE} ${CLUSTER_NAME} -else - run_echo gcloud container clusters delete --zone ${ZONE} ${CLUSTER_NAME} -fi diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/flux-operator.yaml b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/flux-operator.yaml deleted file mode 100644 index b4bc03e..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/flux-operator.yaml +++ /dev/null @@ -1,848 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - labels: - control-plane: controller-manager - name: operator-system ---- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - annotations: - controller-gen.kubebuilder.io/version: v0.9.0 - creationTimestamp: null - name: miniclusters.flux-framework.org -spec: - group: flux-framework.org - names: - kind: MiniCluster - listKind: MiniClusterList - plural: miniclusters - singular: minicluster - scope: Namespaced - versions: - - name: v1alpha1 - schema: - openAPIV3Schema: - description: MiniCluster is the Schema for a Flux job launcher on K8s - properties: - apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' - type: string - kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' - type: string - metadata: - type: object - spec: - description: MiniCluster defines the desired state of a Flux MiniCluster - "I am a Flux user and I want to launch a MiniCluster for my job!" A - MiniCluster corresponds to a Batch Job -> StatefulSet + ConfigMaps A - "task" within that cluster is flux running something. - properties: - containers: - description: Containers is one or more containers to be created in - a pod. There should only be one container to run flux with runFlux - items: - properties: - command: - description: 'Single user executable to provide to flux start - IMPORTANT: This is left here, but not used in favor of exposing - Flux via a Restful API. We Can remove this when that is finalized.' - type: string - cores: - description: Cores the container should use - format: int32 - type: integer - diagnostics: - description: Run flux diagnostics on start instead of command - type: boolean - environment: - additionalProperties: - type: string - description: Key/value pairs for the environment - type: object - fluxLogLevel: - default: 6 - description: Log level to use for flux logging (only in non - TestMode) - format: int32 - type: integer - fluxOptionFlags: - description: Flux option flags, usually provided with -o optional - - if needed, default option flags for the server These can - also be set in the user interface to override here. This is - only valid for a FluxRunner - type: string - image: - default: fluxrm/flux-sched:focal - description: Container image must contain flux and flux-sched - install - type: string - imagePullSecret: - description: Allow the user to pull authenticated images By - default no secret is selected. Setting this with the name - of an already existing imagePullSecret will specify that secret - in the pod spec. 
- type: string - name: - description: Container name is only required for non flux runners - type: string - ports: - description: Ports to be exposed to other containers in the - cluster We take a single list of integers and map to the same - items: - format: int32 - type: integer - type: array - postStartExec: - description: Lifecycle can handle post start commands, etc. - type: string - preCommand: - description: Special command to run at beginning of script, - directly after asFlux is defined as sudo -u flux -E (so you - can change that if desired.) This is only valid if FluxRunner - is set (that writes a wait.sh script) - type: string - pullAlways: - default: false - description: Allow the user to dictate pulling By default we - pull if not present. Setting this to true will indicate to - pull always - type: boolean - resources: - description: Resources include limits and requests - properties: - limits: - additionalProperties: - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - type: object - requests: - additionalProperties: - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - type: object - type: object - runFlux: - description: Main container to run flux (only should be one) - type: boolean - volumes: - additionalProperties: - description: A Container volume must reference one defined - for the MiniCluster The path here is in the container - properties: - path: - type: string - readOnly: - default: true - type: boolean - required: - - path - type: object - description: Volumes that can be mounted (must be defined in - volumes) - type: object - workingDir: - description: Working directory to run command from - type: string - required: - - image - type: object - type: array - deadlineSeconds: - default: 31500000 - description: Should the job be limited to a particular number of seconds? - Approximately one year. 
This cannot be zero or job won't start - format: int64 - type: integer - fluxRestful: - description: Customization to Flux Restful API There should only be - one container to run flux with runFlux - properties: - branch: - default: main - description: Branch to clone Flux Restful API from - type: string - port: - default: 5000 - description: Port to run Flux Restful Server On - format: int32 - type: integer - token: - description: Token to use for RestFul API - type: string - username: - description: These two should not actually be set by a user, but - rather generated by tools and provided Username to use for RestFul - API - type: string - type: object - jobLabels: - additionalProperties: - type: string - description: Labels for the job - type: object - localDeploy: - default: false - description: localDeploy should be true for development, or deploying - in the case that there isn't an actual kubernetes cluster (e.g., - you are not using make deploy. It uses a persistent volume instead - of a claim - type: boolean - logging: - description: Logging modes determine the output you see in the job - log - properties: - quiet: - default: false - description: Quiet mode silences all output so the job only shows - the test running - type: boolean - timed: - default: false - description: Timed mode adds timing to Flux commands - type: boolean - type: object - pod: - description: Pod spec details - properties: - resources: - additionalProperties: - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - description: Resources include limits and requests - type: object - type: object - podLabels: - additionalProperties: - type: string - description: Labels for each pod - type: object - size: - default: 1 - description: Size (number of job pods to run, size of minicluster - in pods) - format: int32 - type: integer - tasks: - default: 1 - description: Total number of CPUs being run across entire cluster - format: int32 - type: integer - volumes: - 
additionalProperties: - description: Mini Cluster local volumes available to mount (these - are on the host) - properties: - path: - type: string - required: - - path - type: object - description: Volumes on the host (named) accessible to containers - type: object - required: - - containers - type: object - status: - description: MiniClusterStatus defines the observed state of Flux - properties: - conditions: - description: conditions hold the latest Flux Job and MiniCluster states - items: - description: "Condition contains details for one aspect of the current - state of this API Resource. --- This struct is intended for direct - use as an array at the field path .status.conditions. For example, - \n type FooStatus struct{ // Represents the observations of a - foo's current state. // Known .status.conditions.type are: \"Available\", - \"Progressing\", and \"Degraded\" // +patchMergeKey=type // +patchStrategy=merge - // +listType=map // +listMapKey=type Conditions []metav1.Condition - `json:\"conditions,omitempty\" patchStrategy:\"merge\" patchMergeKey:\"type\" - protobuf:\"bytes,1,rep,name=conditions\"` \n // other fields }" - properties: - lastTransitionTime: - description: lastTransitionTime is the last time the condition - transitioned from one status to another. This should be when - the underlying condition changed. If that is not known, then - using the time when the API field changed is acceptable. - format: date-time - type: string - message: - description: message is a human readable message indicating - details about the transition. This may be an empty string. - maxLength: 32768 - type: string - observedGeneration: - description: observedGeneration represents the .metadata.generation - that the condition was set based upon. For instance, if .metadata.generation - is currently 12, but the .status.conditions[x].observedGeneration - is 9, the condition is out of date with respect to the current - state of the instance. 
- format: int64 - minimum: 0 - type: integer - reason: - description: reason contains a programmatic identifier indicating - the reason for the condition's last transition. Producers - of specific condition types may define expected values and - meanings for this field, and whether the values are considered - a guaranteed API. The value should be a CamelCase string. - This field may not be empty. - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - description: status of the condition, one of True, False, Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. - --- Many .condition.type values are consistent across resources - like Available, but because arbitrary conditions can be useful - (see .node.status.conditions), the ability to deconflict is - important. The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - type: array - jobid: - description: The JobUid is set internally to associate to a miniCluster - type: string - required: - - jobid - type: object - type: object - served: true - storage: true - subresources: - status: {} ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: operator-controller-manager - namespace: operator-system ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: operator-leader-election-role - namespace: operator-system -rules: -- apiGroups: - - "" - resources: - - configmaps - verbs: - - get - - list - - watch - - create - - update - - patch - - delete -- apiGroups: - - coordination.k8s.io - resources: - - leases - verbs: - - get - - list - - watch - - create - - update - - patch - - 
delete -- apiGroups: - - "" - resources: - - events - verbs: - - create - - patch ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - creationTimestamp: null - name: operator-manager-role -rules: -- apiGroups: - - "" - resources: - - events - verbs: - - create - - update - - watch -- apiGroups: - - "" - resources: - - events - - nodes - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - batch - resources: - - jobs - verbs: - - create - - delete - - exec - - get - - list - - patch - - update - - watch -- apiGroups: - - batch - resources: - - jobs/status - verbs: - - create - - delete - - exec - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - "" - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - batch - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - configmaps - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - events - verbs: - - create - - patch -- apiGroups: - - "" - resources: - - jobs - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - networks - verbs: - - create - - patch -- apiGroups: - - "" - resources: - - persistentvolumeclaims - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - persistentvolumes - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - pods - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - pods/exec - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - pods/log - verbs: - - create - - delete - - get - - list - - patch - - 
update - - watch -- apiGroups: - - "" - resources: - - secrets - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - services - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - statefulsets - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - flux-framework.org - resources: - - clusters - - clusters/status - verbs: - - get - - list - - watch -- apiGroups: - - flux-framework.org - resources: - - machineclasses - - machinedeployments - - machinedeployments/status - - machines - - machines/status - - machinesets - - machinesets/status - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - flux-framework.org - resources: - - miniclusters - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - flux-framework.org - resources: - - miniclusters/finalizers - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - flux-framework.org - resources: - - miniclusters/status - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - networking.k8s.io - resources: - - ingresses - verbs: - - create - - delete - - get - - list - - patch - - update - - watch ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: operator-metrics-reader -rules: -- nonResourceURLs: - - /metrics - verbs: - - get ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: operator-proxy-role -rules: -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: operator-leader-election-rolebinding - namespace: 
operator-system -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: operator-leader-election-role -subjects: -- kind: ServiceAccount - name: operator-controller-manager - namespace: operator-system ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: operator-manager-rolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: operator-manager-role -subjects: -- kind: ServiceAccount - name: operator-controller-manager - namespace: operator-system ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: operator-proxy-rolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: operator-proxy-role -subjects: -- kind: ServiceAccount - name: operator-controller-manager - namespace: operator-system ---- -apiVersion: v1 -data: - controller_manager_config.yaml: | - apiVersion: controller-runtime.sigs.k8s.io/v1alpha1 - kind: ControllerManagerConfig - health: - healthProbeBindAddress: :8081 - metrics: - bindAddress: 127.0.0.1:8080 - webhook: - port: 9443 - leaderElection: - leaderElect: true - resourceName: 14dde902.flux-framework.org -kind: ConfigMap -metadata: - name: operator-manager-config - namespace: operator-system ---- -apiVersion: v1 -kind: Service -metadata: - labels: - control-plane: controller-manager - name: operator-controller-manager-metrics-service - namespace: operator-system -spec: - ports: - - name: https - port: 8443 - protocol: TCP - targetPort: https - selector: - control-plane: controller-manager ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - labels: - control-plane: controller-manager - name: operator-controller-manager - namespace: operator-system -spec: - replicas: 1 - selector: - matchLabels: - control-plane: controller-manager - template: - metadata: - annotations: - kubectl.kubernetes.io/default-container: manager - labels: - control-plane: controller-manager - spec: - containers: - - args: - 
- --secure-listen-address=0.0.0.0:8443 - - --upstream=http://127.0.0.1:8080/ - - --logtostderr=true - - --v=0 - image: gcr.io/kubebuilder/kube-rbac-proxy:v0.11.0 - name: kube-rbac-proxy - ports: - - containerPort: 8443 - name: https - protocol: TCP - resources: - limits: - cpu: 500m - memory: 128Mi - requests: - cpu: 5m - memory: 64Mi - securityContext: - allowPrivilegeEscalation: false - - args: - - --health-probe-bind-address=:8081 - - --metrics-bind-address=127.0.0.1:8080 - - --leader-elect - command: - - /manager - image: ghcr.io/flux-framework/flux-operator:latest - livenessProbe: - httpGet: - path: /healthz - port: 8081 - initialDelaySeconds: 15 - periodSeconds: 20 - name: manager - readinessProbe: - httpGet: - path: /readyz - port: 8081 - initialDelaySeconds: 5 - periodSeconds: 10 - resources: - limits: - cpu: 500m - memory: 128Mi - requests: - cpu: 10m - memory: 64Mi - securityContext: - allowPrivilegeEscalation: false - securityContext: - runAsNonRoot: true - serviceAccountName: operator-controller-manager - terminationGracePeriodSeconds: 10 diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-size-2.yaml b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-size-2.yaml deleted file mode 100644 index b3b2e17..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-size-2.yaml +++ /dev/null @@ -1,29 +0,0 @@ -apiVersion: flux-framework.org/v1alpha1 -kind: MiniCluster - -metadata: - name: lammps-job - namespace: flux-operator -spec: - # localDeploy needs to be false - localDeploy: false - - # Number of pods to create for MiniCluster - size: 2 - tasks: 1 - - # Disable verbose output - - - # Optional credentials if running the flux restful api - fluxRestful: - token: "6b8a7393-129b-4e2d-83a7-795a5a7c9d9b" - username: "fluxuser" - - # TODO add pod resources, if needed - containers: - - image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 - - - - cores: 1 diff --git 
a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-submit-size-2.sh b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-submit-size-2.sh deleted file mode 100755 index 0bd72c3..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-submit-size-2.sh +++ /dev/null @@ -1,219 +0,0 @@ -#!/bin/bash - -# This is a template that will be populated with variables by Flux-Cloud -# We only run it to check if a MiniCluster is running. An apply is only -# needed if the MiniCluster is not created yet. - -# Include shared helper scripts -# Colors -red='\033[0;31m' -green='\033[0;32m' -yellow='\033[0;33m' -blue='\033[0;34m' -magenta='\033[0;35m' -cyan='\033[0;36m' -clear='\033[0m' - -function print_red() { - echo -e "${red}$@${clear}" -} -function print_yellow() { - echo -e "${yellow}$@${clear}" -} -function print_green() { - echo -e "${green}$@${clear}" -} -function print_blue() { - echo -e "${blue}$@${clear}" -} -function print_magenta() { - echo -e "${magenta}$@${clear}" -} -function print_cyan() { - echo -e "${cyan}$@${clear}" -} - -function is_installed () { - # Determine if a command is available for use! - cmd="${1}" - if command -v $cmd >/dev/null; then - echo "$cmd is installed" - else - echo "$cmd could not be found" - exit 1 - fi -} - -function install_operator() { - # Shared function to install the operator from a specific repository branch and cleanup - script_dir=${1} - repository=${2} - branch=${3} - tmpfile="${script_dir}/flux-operator.yaml" - run_echo wget -O $tmpfile https://raw.githubusercontent.com/${repository}/${branch}/examples/dist/flux-operator.yaml - kubectl apply -f $tmpfile -} - - -function run_echo() { - # Show the user the command then run it - echo - print_green "$@" - retry $@ -} - -function run_echo_allow_fail() { - echo - print_green "$@" - $@ || true -} - -function retry() { - # Retry an unsuccessful user command, per request - while true - do - $@ - retval=$? 
- if [[ "${retval}" == "0" ]]; then - return - fi - print_blue "That command was not successful. Do you want to try again? šŸ¤”ļø" - read -p " (yes/no) " answer - # Exit with non-zero response so we know to stop in script. - case ${answer} in - yes ) continue;; - no ) echo exiting...; - exit 1;; - * ) echo invalid response; - exit 1;; - esac - done -} - - -function prompt() { - # Prompt the user with a yes/no command to continue or exit - print_blue "$@ šŸ¤”ļø" - read -p " (yes/no) " answer - case ${answer} in - yes ) echo ok, we will proceed;; - no ) echo exiting...; - exit 1;; - * ) echo invalid response; - exit 1;; - esac -} - - -function with_exponential_backoff { - # Run with exponential backoff - assume containers take a while to pull - local max_attempts=100 - local timeout=1 - local attempt=0 - local exitcode=0 - - while [[ $attempt < $max_attempts ]]; do - "$@" - exitcode=$? - - if [[ $exitcode == 0 ]]; then - break - fi - - echo "Failure! Retrying in $timeout.." 1>&2 - sleep $timeout - attempt=$(( attempt + 1 )) - timeout=$(( timeout * 2 )) - done - - if [[ $exitCode != 0 ]]; then - echo "You've failed me for the last time! ($@)" 1>&2 - fi - return $exitcode -} - -NAMESPACE="flux-operator" -CRD="/home/vanessa/Desktop/Code/flux/flux-cloud/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-size-2.yaml" -JOB="lammps-job" - -# Size -1 to account for certificate generator -SIZE=2 - -print_magenta " apply : ${CRD}" -print_magenta " job : ${JOB}" - -is_installed kubectl - -# Create the namespace (ok if already exists) -run_echo_allow_fail kubectl create namespace ${NAMESPACE} - -# Always cleanup a previous one so tokens don't get stale -run_echo_allow_fail kubectl delete -f ${CRD} -echo -podsCleaned="false" -print_blue "Waiting for previous MiniCluster to be cleaned up..." -while [[ "${podsCleaned}" == "false" ]]; do - echo -n "." 
- sleep 2 - state=$(kubectl get pods --namespace ${NAMESPACE} 2>&1) - lines=$(echo $state | wc -l) - if [[ "${lines}" == "1" ]] && [[ "${state}" == *"No resources found in"* ]]; then - echo - print_green "šŸŒ€ļø Previous pods are cleaned up." - podsCleaned="true" - break - fi -done - -# Ensure we have a MiniCluster of the right namespace running -echo -print_green "šŸŒ€ļø Creating MiniCluster in ${NAMESPACE}" -# Apply the job, get pods -run_echo kubectl apply -f ${CRD} -run_echo kubectl get -n ${NAMESPACE} pods - -# continue until we find the index-0 pod -podsReady="false" - -echo -print_blue "Waiting for MiniCluster of size ${SIZE} to be ready..." -while [[ "${podsReady}" == "false" ]]; do - echo -n "." - sleep 2 - pods=$(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=name | wc -l) - if [[ "${pods}" == "${SIZE}" ]]; then - echo - print_green "šŸŒ€ļø All pods are running." - podsReady="true" - break - fi -done - -echo -brokerPod="" -brokerPrefix="${JOB}-0" -while [[ "${brokerPod}" == "" ]]; do - for pod in $(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=jsonpath='{.items[*].metadata.name}'); do - if [[ "${pod}" == ${brokerPrefix}* ]]; then - echo - brokerPod=${pod} - break - fi - done -done - -echo -serverReady="false" -print_blue "Waiting for Flux Restful API Server to be ready..." -while [[ "${serverReady}" == "false" ]]; do - echo -n "." - sleep 2 - logs=$(kubectl logs --namespace ${NAMESPACE} ${brokerPod} | grep "Uvicorn running") - retval=$? - if [[ "${retval}" == "0" ]]; then - echo - serverReady="true" - print_green "šŸŒ€ļø Flux RestFul API Server is Ready." 
- break - fi -done diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-1-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-1-minicluster-size-2/log.out deleted file mode 100644 index 3b18e51..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-1-minicluster-size-2/log.out +++ /dev/null @@ -1 +0,0 @@ -hello world diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-2-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-2-minicluster-size-2/log.out deleted file mode 100644 index 3b18e51..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-2-minicluster-size-2/log.out +++ /dev/null @@ -1 +0,0 @@ -hello world diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-3-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-3-minicluster-size-2/log.out deleted file mode 100644 index 3b18e51..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-3-minicluster-size-2/log.out +++ /dev/null @@ -1 +0,0 @@ -hello world diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-4-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-4-minicluster-size-2/log.out deleted file mode 100644 index 3b18e51..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-4-minicluster-size-2/log.out +++ /dev/null @@ -1 +0,0 @@ -hello world diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-5-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-5-minicluster-size-2/log.out deleted file mode 100644 index 3b18e51..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-5-minicluster-size-2/log.out +++ /dev/null @@ -1 +0,0 @@ -hello world diff 
--git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/meta.json b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/meta.json deleted file mode 100644 index b7b654b..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/meta.json +++ /dev/null @@ -1,698 +0,0 @@ -{ - "times": { - "destroy-cluster": 324.709, - "create-cluster": 86.521, - "minicluster-submit-size-2": 183.626, - "reaxc-hns-1-minicluster-size-2": 32.1847505569458, - "reaxc-hns-2-minicluster-size-2": 33.41048860549927, - "reaxc-hns-3-minicluster-size-2": 30.96457529067993, - "reaxc-hns-4-minicluster-size-2": 30.777089595794678, - "reaxc-hns-5-minicluster-size-2": 31.048890829086304, - "sleep-1-minicluster-size-2": 5.028888463973999, - "sleep-2-minicluster-size-2": 5.045725584030151, - "sleep-3-minicluster-size-2": 5.072444677352905, - "sleep-4-minicluster-size-2": 5.034207582473755, - "sleep-5-minicluster-size-2": 5.025948762893677, - "hello-world-1-minicluster-size-2": 0.07241106033325195, - "hello-world-2-minicluster-size-2": 0.052734375, - "hello-world-3-minicluster-size-2": 0.04248523712158203, - "hello-world-4-minicluster-size-2": 0.045003652572631836, - "hello-world-5-minicluster-size-2": 0.05110311508178711, - "minicluster-destroy-size-2": 0.277, - "minicluster-create-persistent-size-2": 42.606, - "minicluster-persistent-destroy-size-2": 0.164 - }, - "size": 4, - "machine": "n1-standard-1", - "minicluster": { - "name": "lammps-job", - "namespace": "flux-operator", - "size": [ - 2 - ] - }, - "jobs": { - "reaxc-hns-1": { - "command": "lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "reaxc-hns-2": { - "command": "lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "reaxc-hns-3": { - "command": "lmp -v x 2 -v y 2 
-v z 2 -in in.reaxc.hns -nocite", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "reaxc-hns-4": { - "command": "lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "reaxc-hns-5": { - "command": "lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "sleep-1": { - "command": "sleep 5", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "sleep-2": { - "command": "sleep 5", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "sleep-3": { - "command": "sleep 5", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "sleep-4": { - "command": "sleep 5", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "sleep-5": { - "command": "sleep 5", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "hello-world-1": { - "command": "echo hello world", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "hello-world-2": { - "command": "echo hello world", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "hello-world-3": { - "command": "echo hello world", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "hello-world-4": 
{ - "command": "echo hello world", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "hello-world-5": { - "command": "echo hello world", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - } - }, - "info": { - "reaxc-hns-1-minicluster-size-2": { - "id": 130073755648, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674444768.0517902, - "t_depend": 1674444768.0517902, - "t_run": 1674444768.100832, - "t_cleanup": 1674444800.2855825, - "t_inactive": 1674444800.290403, - "state": "INACTIVE", - "name": "lmp", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675049568.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 32.1847505569458, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 40.13091278076172, - "start_to_output_seconds": 43.215059757232666 - }, - "reaxc-hns-2-minicluster-size-2": { - "id": 816932978688, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674444808.9904723, - "t_depend": 1674444808.9904723, - "t_run": 1674444809.0098114, - "t_cleanup": 1674444842.4203, - "t_inactive": 1674444842.4249685, - "state": "INACTIVE", - "name": "lmp", - "ntasks": 1, - "nnodes": 1, - "ranks": "0", - "nodelist": "lammps-job-0", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675049609.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 33.41048860549927, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 97.17731666564941, - 
"start_to_output_seconds": 97.31685972213745 - }, - "reaxc-hns-3-minicluster-size-2": { - "id": 2450245287936, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674444906.3438601, - "t_depend": 1674444906.3438601, - "t_run": 1674444906.3633585, - "t_cleanup": 1674444937.3279338, - "t_inactive": 1674444937.33689, - "state": "INACTIVE", - "name": "lmp", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675049706.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 30.96457529067993, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 67.29511857032776, - "start_to_output_seconds": 67.40737009048462 - }, - "reaxc-hns-4-minicluster-size-2": { - "id": 3581969170432, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674444973.8004916, - "t_depend": 1674444973.8004916, - "t_run": 1674444973.8231413, - "t_cleanup": 1674445004.600231, - "t_inactive": 1674445004.6049078, - "state": "INACTIVE", - "name": "lmp", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675049773.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 30.777089595794678, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 62.43251633644104, - "start_to_output_seconds": 62.51574635505676 - }, - "reaxc-hns-5-minicluster-size-2": { - "id": 4631065264128, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674445036.3308098, - "t_depend": 1674445036.3308098, - "t_run": 1674445036.3509514, - "t_cleanup": 1674445067.3998423, - "t_inactive": 
1674445067.4045572, - "state": "INACTIVE", - "name": "lmp", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675049836.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 31.048890829086304, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 92.83428883552551, - "start_to_output_seconds": 92.92412114143372 - }, - "sleep-1-minicluster-size-2": { - "id": 461004341248, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674677415.8718548, - "t_depend": 1674677415.8718548, - "t_run": 1674677415.8845603, - "t_cleanup": 1674677420.9134488, - "t_inactive": 1674677420.9152129, - "state": "INACTIVE", - "name": "sleep", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675282215.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 5.028888463973999, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 14.840466737747192, - "start_to_output_seconds": 17.383413314819336 - }, - "sleep-2-minicluster-size-2": { - "id": 717628637184, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674677431.16695, - "t_depend": 1674677431.16695, - "t_run": 1674677431.1903481, - "t_cleanup": 1674677436.2360737, - "t_inactive": 1674677436.2395134, - "state": "INACTIVE", - "name": "sleep", - "ntasks": 1, - "nnodes": 1, - "ranks": "0", - "nodelist": "lammps-job-0", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675282231.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - 
"returncode": 0, - "runtime": 5.045725584030151, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 12.824117660522461, - "start_to_output_seconds": 15.347451210021973 - }, - "sleep-3-minicluster-size-2": { - "id": 975108571136, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674677446.5178363, - "t_depend": 1674677446.5178363, - "t_run": 1674677446.534995, - "t_cleanup": 1674677451.6074398, - "t_inactive": 1674677451.613382, - "state": "INACTIVE", - "name": "sleep", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675282246.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 5.072444677352905, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 12.840857744216919, - "start_to_output_seconds": 15.384143352508545 - }, - "sleep-4-minicluster-size-2": { - "id": 1234333335552, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674677461.9656863, - "t_depend": 1674677461.9656863, - "t_run": 1674677461.9789429, - "t_cleanup": 1674677467.0131505, - "t_inactive": 1674677467.0233643, - "state": "INACTIVE", - "name": "sleep", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675282261.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 5.034207582473755, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 12.951504468917847, - "start_to_output_seconds": 15.509077787399292 - }, - "sleep-5-minicluster-size-2": { - "id": 1495168712704, - 
"userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674677477.5129235, - "t_depend": 1674677477.5129235, - "t_run": 1674677477.5259533, - "t_cleanup": 1674677482.551902, - "t_inactive": 1674677482.555279, - "state": "INACTIVE", - "name": "sleep", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675282277.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 5.025948762893677, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 12.880193948745728, - "start_to_output_seconds": 15.410512447357178 - }, - "hello-world-1-minicluster-size-2": { - "id": 8356177641472, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674445258.3653252, - "t_depend": 1674445258.3653252, - "t_run": 1674445258.3868065, - "t_cleanup": 1674445258.4592175, - "t_inactive": 1674445258.46398, - "state": "INACTIVE", - "name": "echo", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675050058.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 0.07241106033325195, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 13.482953310012817, - "start_to_output_seconds": 16.53845739364624 - }, - "hello-world-2-minicluster-size-2": { - "id": 8635753168896, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674445275.028449, - "t_depend": 1674445275.028449, - "t_run": 1674445275.0489655, - "t_cleanup": 1674445275.1016998, - "t_inactive": 1674445275.1059186, - "state": "INACTIVE", - "name": "echo", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - 
"nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675050075.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 0.052734375, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 0.5918288230895996, - "start_to_output_seconds": 0.6222965717315674 - }, - "hello-world-3-minicluster-size-2": { - "id": 8641507753984, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674445275.3710968, - "t_depend": 1674445275.3710968, - "t_run": 1674445275.3893383, - "t_cleanup": 1674445275.4318235, - "t_inactive": 1674445275.4359808, - "state": "INACTIVE", - "name": "echo", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675050075.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 0.04248523712158203, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 0.17513155937194824, - "start_to_output_seconds": 0.21306657791137695 - }, - "hello-world-4-minicluster-size-2": { - "id": 8646121488384, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674445275.6465385, - "t_depend": 1674445275.6465385, - "t_run": 1674445275.6643715, - "t_cleanup": 1674445275.7093751, - "t_inactive": 1674445275.7134967, - "state": "INACTIVE", - "name": "echo", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675050075.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 0.045003652572631836, - "exception": { - "occurred": false, - 
"severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 0.19276666641235352, - "start_to_output_seconds": 0.2307295799255371 - }, - "hello-world-5-minicluster-size-2": { - "id": 8649946693632, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674445275.8740122, - "t_depend": 1674445275.8740122, - "t_run": 1674445275.8942568, - "t_cleanup": 1674445275.94536, - "t_inactive": 1674445275.95746, - "state": "INACTIVE", - "name": "echo", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675050075.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 0.05110311508178711, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 0.17215561866760254, - "start_to_output_seconds": 0.19998478889465332 - } - } -} diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-1-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-1-minicluster-size-2/log.out deleted file mode 100644 index 647c484..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-1-minicluster-size-2/log.out +++ /dev/null @@ -1,80 +0,0 @@ -LAMMPS (29 Sep 2021 - Update 2) -OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98) - using 1 OpenMP thread(s) per MPI task -Reading data file ... - triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000) - 1 by 1 by 1 MPI processor grid - reading atoms ... - 304 atoms - reading velocities ... - 304 velocities - read_data CPU = 0.005 seconds -Replicating atoms ... 
- triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000) - 1 by 1 by 1 MPI processor grid - bounding box image = (0 -1 -1) to (0 1 1) - bounding box extra memory = 0.03 MB - average # of replicas added to proc = 8.00 out of 8 (100.00%) - 2432 atoms - replicate CPU = 0.001 seconds -Neighbor list info ... - update every 20 steps, delay 0 steps, check no - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 11 - ghost atom cutoff = 11 - binsize = 5.5, bins = 10 5 6 - 2 neighbor lists, perpetual/occasional/extra = 2 0 0 - (1) pair reax/c, perpetual - attributes: half, newton off, ghost - pair build: half/bin/newtoff/ghost - stencil: full/ghost/bin/3d - bin: standard - (2) fix qeq/reax, perpetual, copy from (1) - attributes: half, newton off, ghost - pair build: copy - stencil: none - bin: none -Setting up Verlet run ... - Unit style : real - Current step : 0 - Time step : 0.1 -Per MPI rank memory allocation (min/avg/max) = 215.0 | 215.0 | 215.0 Mbytes -Step Temp PotEng Press E_vdwl E_coul Volume - 0 300 -113.27833 437.52122 -111.57687 -1.7014647 27418.867 - 10 299.38517 -113.27631 1439.2857 -111.57492 -1.7013813 27418.867 - 20 300.27107 -113.27884 3764.3739 -111.57762 -1.7012246 27418.867 - 30 302.21063 -113.28428 7007.6914 -111.58335 -1.7009363 27418.867 - 40 303.52265 -113.28799 9844.84 -111.58747 -1.7005186 27418.867 - 50 301.87059 -113.28324 9663.0443 -111.58318 -1.7000524 27418.867 - 60 296.67807 -113.26777 7273.7928 -111.56815 -1.6996137 27418.867 - 70 292.19999 -113.25435 5533.6428 -111.55514 -1.6992157 27418.867 - 80 293.58677 -113.25831 5993.4151 -111.55946 -1.6988533 27418.867 - 90 300.62636 -113.27925 7202.8651 -111.58069 -1.6985591 27418.867 - 100 305.38276 -113.29357 10085.748 -111.59518 -1.6983875 27418.867 -Loop time of 29.8322 on 1 procs for 100 steps with 2432 atoms - -Performance: 0.029 ns/day, 828.671 hours/ns, 3.352 timesteps/s -94.2% CPU use with 1 MPI 
tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 22.21 | 22.21 | 22.21 | 0.0 | 74.45 -Neigh | 0.61723 | 0.61723 | 0.61723 | 0.0 | 2.07 -Comm | 0.010007 | 0.010007 | 0.010007 | 0.0 | 0.03 -Output | 0.0004328 | 0.0004328 | 0.0004328 | 0.0 | 0.00 -Modify | 6.9933 | 6.9933 | 6.9933 | 0.0 | 23.44 -Other | | 0.00162 | | | 0.01 - -Nlocal: 2432.00 ave 2432 max 2432 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 10685.0 ave 10685 max 10685 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 823958.0 ave 823958 max 823958 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 823958 -Ave neighs/atom = 338.79852 -Neighbor list builds = 5 -Dangerous builds not checked -Total wall time: 0:00:30 diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-2-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-2-minicluster-size-2/log.out deleted file mode 100644 index 0b9df79..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-2-minicluster-size-2/log.out +++ /dev/null @@ -1,80 +0,0 @@ -LAMMPS (29 Sep 2021 - Update 2) -OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98) - using 1 OpenMP thread(s) per MPI task -Reading data file ... - triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000) - 1 by 1 by 1 MPI processor grid - reading atoms ... - 304 atoms - reading velocities ... - 304 velocities - read_data CPU = 0.010 seconds -Replicating atoms ... 
- triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000) - 1 by 1 by 1 MPI processor grid - bounding box image = (0 -1 -1) to (0 1 1) - bounding box extra memory = 0.03 MB - average # of replicas added to proc = 8.00 out of 8 (100.00%) - 2432 atoms - replicate CPU = 0.001 seconds -Neighbor list info ... - update every 20 steps, delay 0 steps, check no - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 11 - ghost atom cutoff = 11 - binsize = 5.5, bins = 10 5 6 - 2 neighbor lists, perpetual/occasional/extra = 2 0 0 - (1) pair reax/c, perpetual - attributes: half, newton off, ghost - pair build: half/bin/newtoff/ghost - stencil: full/ghost/bin/3d - bin: standard - (2) fix qeq/reax, perpetual, copy from (1) - attributes: half, newton off, ghost - pair build: copy - stencil: none - bin: none -Setting up Verlet run ... - Unit style : real - Current step : 0 - Time step : 0.1 -Per MPI rank memory allocation (min/avg/max) = 215.0 | 215.0 | 215.0 Mbytes -Step Temp PotEng Press E_vdwl E_coul Volume - 0 300 -113.27833 437.52122 -111.57687 -1.7014647 27418.867 - 10 299.38517 -113.27631 1439.2857 -111.57492 -1.7013813 27418.867 - 20 300.27107 -113.27884 3764.3739 -111.57762 -1.7012246 27418.867 - 30 302.21063 -113.28428 7007.6914 -111.58335 -1.7009363 27418.867 - 40 303.52265 -113.28799 9844.84 -111.58747 -1.7005186 27418.867 - 50 301.87059 -113.28324 9663.0443 -111.58318 -1.7000524 27418.867 - 60 296.67807 -113.26777 7273.7928 -111.56815 -1.6996137 27418.867 - 70 292.19999 -113.25435 5533.6428 -111.55514 -1.6992157 27418.867 - 80 293.58677 -113.25831 5993.4151 -111.55946 -1.6988533 27418.867 - 90 300.62636 -113.27925 7202.8651 -111.58069 -1.6985591 27418.867 - 100 305.38276 -113.29357 10085.748 -111.59518 -1.6983875 27418.867 -Loop time of 31.2338 on 1 procs for 100 steps with 2432 atoms - -Performance: 0.028 ns/day, 867.606 hours/ns, 3.202 timesteps/s -91.3% CPU use with 1 MPI 
tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 23.353 | 23.353 | 23.353 | 0.0 | 74.77 -Neigh | 0.62616 | 0.62616 | 0.62616 | 0.0 | 2.00 -Comm | 0.0096617 | 0.0096617 | 0.0096617 | 0.0 | 0.03 -Output | 0.00044694 | 0.00044694 | 0.00044694 | 0.0 | 0.00 -Modify | 7.2429 | 7.2429 | 7.2429 | 0.0 | 23.19 -Other | | 0.001518 | | | 0.00 - -Nlocal: 2432.00 ave 2432 max 2432 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 10685.0 ave 10685 max 10685 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 823958.0 ave 823958 max 823958 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 823958 -Ave neighs/atom = 338.79852 -Neighbor list builds = 5 -Dangerous builds not checked -Total wall time: 0:00:32 diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-3-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-3-minicluster-size-2/log.out deleted file mode 100644 index b6380b6..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-3-minicluster-size-2/log.out +++ /dev/null @@ -1,80 +0,0 @@ -LAMMPS (29 Sep 2021 - Update 2) -OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98) - using 1 OpenMP thread(s) per MPI task -Reading data file ... - triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000) - 1 by 1 by 1 MPI processor grid - reading atoms ... - 304 atoms - reading velocities ... - 304 velocities - read_data CPU = 0.002 seconds -Replicating atoms ... 
- triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000) - 1 by 1 by 1 MPI processor grid - bounding box image = (0 -1 -1) to (0 1 1) - bounding box extra memory = 0.03 MB - average # of replicas added to proc = 8.00 out of 8 (100.00%) - 2432 atoms - replicate CPU = 0.001 seconds -Neighbor list info ... - update every 20 steps, delay 0 steps, check no - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 11 - ghost atom cutoff = 11 - binsize = 5.5, bins = 10 5 6 - 2 neighbor lists, perpetual/occasional/extra = 2 0 0 - (1) pair reax/c, perpetual - attributes: half, newton off, ghost - pair build: half/bin/newtoff/ghost - stencil: full/ghost/bin/3d - bin: standard - (2) fix qeq/reax, perpetual, copy from (1) - attributes: half, newton off, ghost - pair build: copy - stencil: none - bin: none -Setting up Verlet run ... - Unit style : real - Current step : 0 - Time step : 0.1 -Per MPI rank memory allocation (min/avg/max) = 215.0 | 215.0 | 215.0 Mbytes -Step Temp PotEng Press E_vdwl E_coul Volume - 0 300 -113.27833 437.52122 -111.57687 -1.7014647 27418.867 - 10 299.38517 -113.27631 1439.2857 -111.57492 -1.7013813 27418.867 - 20 300.27107 -113.27884 3764.3739 -111.57762 -1.7012246 27418.867 - 30 302.21063 -113.28428 7007.6914 -111.58335 -1.7009363 27418.867 - 40 303.52265 -113.28799 9844.84 -111.58747 -1.7005186 27418.867 - 50 301.87059 -113.28324 9663.0443 -111.58318 -1.7000524 27418.867 - 60 296.67807 -113.26777 7273.7928 -111.56815 -1.6996137 27418.867 - 70 292.19999 -113.25435 5533.6428 -111.55514 -1.6992157 27418.867 - 80 293.58677 -113.25831 5993.4151 -111.55946 -1.6988533 27418.867 - 90 300.62636 -113.27925 7202.8651 -111.58069 -1.6985591 27418.867 - 100 305.38276 -113.29357 10085.748 -111.59518 -1.6983875 27418.867 -Loop time of 29.6229 on 1 procs for 100 steps with 2432 atoms - -Performance: 0.029 ns/day, 822.859 hours/ns, 3.376 timesteps/s -94.4% CPU use with 1 MPI 
tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 22.175 | 22.175 | 22.175 | 0.0 | 74.86 -Neigh | 0.63724 | 0.63724 | 0.63724 | 0.0 | 2.15 -Comm | 0.0097153 | 0.0097153 | 0.0097153 | 0.0 | 0.03 -Output | 0.00041342 | 0.00041342 | 0.00041342 | 0.0 | 0.00 -Modify | 6.799 | 6.799 | 6.799 | 0.0 | 22.95 -Other | | 0.001424 | | | 0.00 - -Nlocal: 2432.00 ave 2432 max 2432 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 10685.0 ave 10685 max 10685 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 823958.0 ave 823958 max 823958 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 823958 -Ave neighs/atom = 338.79852 -Neighbor list builds = 5 -Dangerous builds not checked -Total wall time: 0:00:30 diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-4-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-4-minicluster-size-2/log.out deleted file mode 100644 index 6c889f5..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-4-minicluster-size-2/log.out +++ /dev/null @@ -1,80 +0,0 @@ -LAMMPS (29 Sep 2021 - Update 2) -OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98) - using 1 OpenMP thread(s) per MPI task -Reading data file ... - triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000) - 1 by 1 by 1 MPI processor grid - reading atoms ... - 304 atoms - reading velocities ... - 304 velocities - read_data CPU = 0.002 seconds -Replicating atoms ... 
- triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000) - 1 by 1 by 1 MPI processor grid - bounding box image = (0 -1 -1) to (0 1 1) - bounding box extra memory = 0.03 MB - average # of replicas added to proc = 8.00 out of 8 (100.00%) - 2432 atoms - replicate CPU = 0.001 seconds -Neighbor list info ... - update every 20 steps, delay 0 steps, check no - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 11 - ghost atom cutoff = 11 - binsize = 5.5, bins = 10 5 6 - 2 neighbor lists, perpetual/occasional/extra = 2 0 0 - (1) pair reax/c, perpetual - attributes: half, newton off, ghost - pair build: half/bin/newtoff/ghost - stencil: full/ghost/bin/3d - bin: standard - (2) fix qeq/reax, perpetual, copy from (1) - attributes: half, newton off, ghost - pair build: copy - stencil: none - bin: none -Setting up Verlet run ... - Unit style : real - Current step : 0 - Time step : 0.1 -Per MPI rank memory allocation (min/avg/max) = 215.0 | 215.0 | 215.0 Mbytes -Step Temp PotEng Press E_vdwl E_coul Volume - 0 300 -113.27833 437.52122 -111.57687 -1.7014647 27418.867 - 10 299.38517 -113.27631 1439.2857 -111.57492 -1.7013813 27418.867 - 20 300.27107 -113.27884 3764.3739 -111.57762 -1.7012246 27418.867 - 30 302.21063 -113.28428 7007.6914 -111.58335 -1.7009363 27418.867 - 40 303.52265 -113.28799 9844.84 -111.58747 -1.7005186 27418.867 - 50 301.87059 -113.28324 9663.0443 -111.58318 -1.7000524 27418.867 - 60 296.67807 -113.26777 7273.7928 -111.56815 -1.6996137 27418.867 - 70 292.19999 -113.25435 5533.6428 -111.55514 -1.6992157 27418.867 - 80 293.58677 -113.25831 5993.4151 -111.55946 -1.6988533 27418.867 - 90 300.62636 -113.27925 7202.8651 -111.58069 -1.6985591 27418.867 - 100 305.38276 -113.29357 10085.748 -111.59518 -1.6983875 27418.867 -Loop time of 29.7805 on 1 procs for 100 steps with 2432 atoms - -Performance: 0.029 ns/day, 827.235 hours/ns, 3.358 timesteps/s -94.2% CPU use with 1 MPI 
tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 22.214 | 22.214 | 22.214 | 0.0 | 74.59 -Neigh | 0.62414 | 0.62414 | 0.62414 | 0.0 | 2.10 -Comm | 0.01756 | 0.01756 | 0.01756 | 0.0 | 0.06 -Output | 0.00041921 | 0.00041921 | 0.00041921 | 0.0 | 0.00 -Modify | 6.9226 | 6.9226 | 6.9226 | 0.0 | 23.25 -Other | | 0.00152 | | | 0.01 - -Nlocal: 2432.00 ave 2432 max 2432 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 10685.0 ave 10685 max 10685 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 823958.0 ave 823958 max 823958 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 823958 -Ave neighs/atom = 338.79852 -Neighbor list builds = 5 -Dangerous builds not checked -Total wall time: 0:00:30 diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-5-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-5-minicluster-size-2/log.out deleted file mode 100644 index 9c9d4df..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-5-minicluster-size-2/log.out +++ /dev/null @@ -1,80 +0,0 @@ -LAMMPS (29 Sep 2021 - Update 2) -OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98) - using 1 OpenMP thread(s) per MPI task -Reading data file ... - triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000) - 1 by 1 by 1 MPI processor grid - reading atoms ... - 304 atoms - reading velocities ... - 304 velocities - read_data CPU = 0.002 seconds -Replicating atoms ... 
- triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000) - 1 by 1 by 1 MPI processor grid - bounding box image = (0 -1 -1) to (0 1 1) - bounding box extra memory = 0.03 MB - average # of replicas added to proc = 8.00 out of 8 (100.00%) - 2432 atoms - replicate CPU = 0.001 seconds -Neighbor list info ... - update every 20 steps, delay 0 steps, check no - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 11 - ghost atom cutoff = 11 - binsize = 5.5, bins = 10 5 6 - 2 neighbor lists, perpetual/occasional/extra = 2 0 0 - (1) pair reax/c, perpetual - attributes: half, newton off, ghost - pair build: half/bin/newtoff/ghost - stencil: full/ghost/bin/3d - bin: standard - (2) fix qeq/reax, perpetual, copy from (1) - attributes: half, newton off, ghost - pair build: copy - stencil: none - bin: none -Setting up Verlet run ... - Unit style : real - Current step : 0 - Time step : 0.1 -Per MPI rank memory allocation (min/avg/max) = 215.0 | 215.0 | 215.0 Mbytes -Step Temp PotEng Press E_vdwl E_coul Volume - 0 300 -113.27833 437.52122 -111.57687 -1.7014647 27418.867 - 10 299.38517 -113.27631 1439.2857 -111.57492 -1.7013813 27418.867 - 20 300.27107 -113.27884 3764.3739 -111.57762 -1.7012246 27418.867 - 30 302.21063 -113.28428 7007.6914 -111.58335 -1.7009363 27418.867 - 40 303.52265 -113.28799 9844.84 -111.58747 -1.7005186 27418.867 - 50 301.87059 -113.28324 9663.0443 -111.58318 -1.7000524 27418.867 - 60 296.67807 -113.26777 7273.7928 -111.56815 -1.6996137 27418.867 - 70 292.19999 -113.25435 5533.6428 -111.55514 -1.6992157 27418.867 - 80 293.58677 -113.25831 5993.4151 -111.55946 -1.6988533 27418.867 - 90 300.62636 -113.27925 7202.8651 -111.58069 -1.6985591 27418.867 - 100 305.38276 -113.29357 10085.748 -111.59518 -1.6983875 27418.867 -Loop time of 30.0677 on 1 procs for 100 steps with 2432 atoms - -Performance: 0.029 ns/day, 835.214 hours/ns, 3.326 timesteps/s -93.3% CPU use with 1 MPI 
tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 22.337 | 22.337 | 22.337 | 0.0 | 74.29 -Neigh | 0.73472 | 0.73472 | 0.73472 | 0.0 | 2.44 -Comm | 0.009731 | 0.009731 | 0.009731 | 0.0 | 0.03 -Output | 0.00041722 | 0.00041722 | 0.00041722 | 0.0 | 0.00 -Modify | 6.9844 | 6.9844 | 6.9844 | 0.0 | 23.23 -Other | | 0.001495 | | | 0.00 - -Nlocal: 2432.00 ave 2432 max 2432 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 10685.0 ave 10685 max 10685 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 823958.0 ave 823958 max 823958 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 823958 -Ave neighs/atom = 338.79852 -Neighbor list builds = 5 -Dangerous builds not checked -Total wall time: 0:00:30 diff --git a/examples/up-submit-down/plot_results.py b/examples/up-submit-down/plot_results.py deleted file mode 100644 index 6395f83..0000000 --- a/examples/up-submit-down/plot_results.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import json -import os -import sys - -import matplotlib.pyplot as plt -import pandas -import seaborn as sns - - -def read_json(filename): - """ - Read a file into a text blob. - """ - with open(filename, "r") as fd: - content = json.loads(fd.read()) - return content - - -def plot_outputs(raw, plotname, ext="pdf"): - """ - Parse results.json into dataframe and plots to save. 
- """ - # Let's save the following, with runid as index - columns = ["minicluster_size", "job_type", "time_seconds", "time_type"] - - # Let's first organize distributions of times - data = [] - index = [] - for jobname, item in raw["info"].items(): - index += [jobname, jobname, jobname] - jobtype = jobname.split("-minicluster-size")[0].rsplit("-", 1)[0] - - # This is how flux-cloud organized the output - minicluster_size = int(jobname.rsplit("size-", 1)[-1]) - - # Manual melt :) - data.append([minicluster_size, jobtype, item["runtime"], "runtime"]) - data.append( - [ - minicluster_size, - jobtype, - item["start_to_output_seconds"], - "output_seconds", - ] - ) - data.append( - [minicluster_size, jobtype, item["start_to_info_seconds"], "info_seconds"] - ) - - # Assemble the data frame, index is the runids - df = pandas.DataFrame(data, columns=columns) - df.index = index - - # Save raw data - df.to_csv("results-df.csv") - - # We need colors! - colors = sns.color_palette("hls", 8) - hexcolors = colors.as_hex() - - palette = {} - for size in df.time_type.unique(): - palette[size] = hexcolors.pop(0) - - # Sort by size - palette = dict(sorted(palette.items())) - - # Let's make a plot that shows distributions of the times by the cluster size, across all - make_plot( - df, - title="Flux MiniCluster Time Variation", - tag="minicluster_times", - ydimension="time_seconds", - palette=palette, - ext=ext, - plotname=plotname, - ) - - -def make_plot(df, title, tag, ydimension, palette, ext="pdf", plotname="lammps"): - """ - Helper function to make common plots. 
- """ - ext = ext.strip(".") - plt.figure(figsize=(12, 12)) - sns.set_style("dark") - ax = sns.boxplot( - x="job_type", - y=ydimension, - hue="time_type", - data=df, - whis=[5, 95], - palette=palette, - ) - plt.title(title) - plt.legend([], [], frameon=False) - ax.set_xlabel("Job Type", fontsize=16) - ax.set_ylabel("Time (seconds)", fontsize=16) - ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize=14) - ax.set_yticklabels(ax.get_yticks(), fontsize=14) - handles, _ = ax.get_legend_handles_labels() - ax.legend(handles, list(palette)) - plt.savefig(f"{tag}_{plotname}.{ext}") - plt.clf() - - -def get_parser(): - """ - Process results file into plots. - """ - parser = argparse.ArgumentParser(description="Plot LAMMPS outputs") - parser.add_argument("results_json", help="results json file", nargs="?") - parser.add_argument( - "-p", - "--plotname", - default="lammps", - help="base name for plot output files", - ) - parser.add_argument( - "-e", - "--extension", - dest="extension", - default="pdf", - help="image extension to use (defaults to pdf)", - ) - return parser - - -def main(): - """ - Read in results json, and make plots. 
- """ - parser = get_parser() - args = parser.parse_args() - if not os.path.exists(args.results_json): - sys.exit(f"{args.results_json} does not exist.") - data = read_json(args.results_json) - plot_outputs(data, args.plotname, ext=args.extension) - - -if __name__ == "__main__": - main() diff --git a/fluxcloud/client/__init__.py b/fluxcloud/client/__init__.py index 673a585..1007f64 100644 --- a/fluxcloud/client/__init__.py +++ b/fluxcloud/client/__init__.py @@ -130,11 +130,6 @@ def get_parser(): description="Bring the cluster up, run experiments via applying CRDs, and bring it down.", formatter_class=argparse.RawTextHelpFormatter, ) - ui = subparsers.add_parser( - "ui", - description="Once the cluster is up, create/open the user interface.", - formatter_class=argparse.RawTextHelpFormatter, - ) batch = subparsers.add_parser( "batch", description="Bring the cluster up, run experiments via a Flux Restful API submit, and bring it down.", @@ -167,13 +162,38 @@ def get_parser(): help="Bring down all experiment clusters", dest="down_all", ) + for command in submit, apply: + command.add_argument( + "--non-interactive", + "--ni", + default=False, + action="store_true", + help="Don't ask before bringing miniclusters down or re-creating.", + dest="non_interactive", + ) + + experiment = subparsers.add_parser( + "experiment", + description="Experiment controller.", + formatter_class=argparse.RawTextHelpFormatter, + ) + experiment.add_argument( + "experiment_command", + help="Command for experiment (defaults to init)", + ) + experiment.add_argument( + "-c", + "--cloud", + help="cloud to use", + choices=clouds.cloud_names, + ) listing = subparsers.add_parser( "list", description="List experiment ids available.", formatter_class=argparse.RawTextHelpFormatter, ) - for command in run, up, down, apply, listing, batch, submit, ui: + for command in run, up, down, apply, listing, batch, submit: command.add_argument( "experiments", default="experiments.yaml", @@ -188,7 +208,7 @@ def 
get_parser(): choices=clouds.cloud_names, ) - for command in apply, up, down, run, batch, submit, ui: + for command in apply, up, down, run, batch, submit: command.add_argument( "--force-cluster", dest="force_cluster", @@ -228,11 +248,6 @@ def get_parser(): default=False, action="store_true", ) - command.add_argument( - "--template", - help="minicluster yaml template to populate for experiments (defaults to minicluster-template.yaml", - default="minicluster-template.yaml", - ) command.add_argument( "--force", help="force re-run if experiment already exists.", @@ -287,22 +302,22 @@ def help(return_code=0): # Does the user want a shell? if args.command == "apply": from .apply import main - elif args.command == "submit": - from .apply import submit as main - elif args.command == "list": - from .listing import main - elif args.command == "run": - from .run import main elif args.command == "batch": from .run import batch as main elif args.command == "config": from .config import main - elif args.command == "ui": - from .ui import main - elif args.command == "up": - from .up import main elif args.command == "down": from .down import main + elif args.command == "experiment": + from .experiment import main + elif args.command == "list": + from .listing import main + elif args.command == "run": + from .run import main + elif args.command == "submit": + from .apply import submit as main + elif args.command == "up": + from .up import main # Pass on to the correct parser return_code = 0 diff --git a/fluxcloud/client/apply.py b/fluxcloud/client/apply.py index 13d1369..db6d0ee 100644 --- a/fluxcloud/client/apply.py +++ b/fluxcloud/client/apply.py @@ -11,7 +11,7 @@ def main(args, parser, extra, subparser): apply parser submits via separate CRDs. 
""" cli, setup, experiment = prepare_client(args, extra) - cli.apply(setup, experiment=experiment) + cli.apply(setup, experiment=experiment, interactive=not args.non_interactive) setup.cleanup(setup.matrices) @@ -20,5 +20,5 @@ def submit(args, parser, extra, subparser): submit parser submits via the Flux Restful API to one cluster """ cli, setup, experiment = prepare_client(args, extra) - cli.submit(setup, experiment=experiment) + cli.submit(setup, experiment=experiment, interactive=not args.non_interactive) setup.cleanup(setup.matrices) diff --git a/fluxcloud/client/experiment.py b/fluxcloud/client/experiment.py new file mode 100644 index 0000000..0229135 --- /dev/null +++ b/fluxcloud/client/experiment.py @@ -0,0 +1,27 @@ +# Copyright 2023 Lawrence Livermore National Security, LLC and other +# This is part of Flux Framework. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: Apache-2.0 + +import fluxcloud.main.template as templates +from fluxcloud.logger import logger +from fluxcloud.main import get_experiment_client + + +def main(args, parser, extra, subparser): + """ + apply parser submits via separate CRDs. + """ + cli = get_experiment_client(args.cloud) + if args.experiment_command == "init": + if cli.name == "aws": + print(templates.aws_experiment_template) + elif cli.name in ["google", "gcp"]: + print(templates.google_experiment_template) + elif cli.name == "minikube": + print(templates.minikube_experiment_template) + else: + logger.error(f"Client {cli.name} is not a recognized cloud") + + else: + logger.exit(f'{args.experiment_command} is not recognized. 
Try "init"') diff --git a/fluxcloud/client/helpers.py b/fluxcloud/client/helpers.py index d9973d4..1aba57c 100644 --- a/fluxcloud/client/helpers.py +++ b/fluxcloud/client/helpers.py @@ -17,11 +17,10 @@ def prepare_client(args, extra): """ utils.ensure_no_extra(extra) - cli = get_experiment_client(args.cloud) + cli = get_experiment_client(args.cloud, debug=args.debug) setup = ExperimentSetup( args.experiments, force_cluster=args.force_cluster, - template=args.template, cleanup=args.cleanup, # Ensure the output directory is namespaced by the cloud name outdir=os.path.join(args.output_dir, cli.name), diff --git a/fluxcloud/client/ui.py b/fluxcloud/client/ui.py deleted file mode 100644 index 40ae1d1..0000000 --- a/fluxcloud/client/ui.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2022 Lawrence Livermore National Security, LLC and other -# This is part of Flux Framework. See the COPYRIGHT file for details. -# -# SPDX-License-Identifier: Apache-2.0 - -from fluxcloud.logger import logger - -from .helpers import prepare_client - - -def main(args, parser, extra, subparser): - """ - open the ui by starting flux - """ - cli, setup, experiment = prepare_client(args, extra) - size = args.size - if not size and len(experiment.minicluster.get("size")) != 1: - logger.exit( - "Your MiniCluster has more than one size - please define the targer size with --size." 
- ) - elif not size: - size = experiment.minicluster["size"][0] - logger.info(f"Selected size {size} MiniCluster to open user interface.") - cli.open_ui(setup, experiment=experiment, size=size, persistent=True) diff --git a/fluxcloud/defaults.py b/fluxcloud/defaults.py index fb64369..d321073 100644 --- a/fluxcloud/defaults.py +++ b/fluxcloud/defaults.py @@ -13,9 +13,6 @@ # The default settings file in the install root default_settings_file = os.path.join(reps["$install_dir"], "settings.yml") -# Default template if one is not provided -default_minicluster_template = os.path.join(install_dir, "minicluster-template.yaml") - # User home userhome = os.path.expanduser("~/.fluxcloud") diff --git a/fluxcloud/main/__init__.py b/fluxcloud/main/__init__.py index 836c786..ce6c478 100644 --- a/fluxcloud/main/__init__.py +++ b/fluxcloud/main/__init__.py @@ -1,10 +1,10 @@ -# Copyright 2022 Lawrence Livermore National Security, LLC and other +# Copyright 2022-2023 Lawrence Livermore National Security, LLC and other # This is part of Flux Framework. See the COPYRIGHT file for details. # # SPDX-License-Identifier: Apache-2.0 -def get_experiment_client(cloud=None, force_cluster=False): +def get_experiment_client(cloud=None, **kwargs): """ Create the cloud experiment client. 
""" @@ -19,4 +19,4 @@ def get_experiment_client(cloud=None, force_cluster=False): cloud = clouds.get_cloud(cloud) else: cloud = clients.ExperimentClient - return cloud(force_cluster=force_cluster) + return cloud(**kwargs) diff --git a/fluxcloud/main/api.py b/fluxcloud/main/api.py index 24c693e..860217d 100644 --- a/fluxcloud/main/api.py +++ b/fluxcloud/main/api.py @@ -3,167 +3,319 @@ # # SPDX-License-Identifier: Apache-2.0 -import atexit -import logging import os +import re import shutil -import subprocess -import threading import time import uuid from flux_restful_client.main import get_client +from fluxoperator.client import FluxOperator import fluxcloud.utils as utils from fluxcloud.logger import logger here = os.path.dirname(os.path.abspath(__file__)) -exit_event = threading.Event() - class APIClient: - def __init__(self, token=None, user=None): + def __init__(self, token=None, user=None, secret_key=None): """ API client wrapper. """ - self.user = token or os.environ.get("FLUX_USER") or "fluxuser" + self.user = token or os.environ.get("FLUX_USER") or user or "fluxuser" self.token = token or os.environ.get("FLUX_TOKEN") or str(uuid.uuid4()) - self.cli = get_client(user=self.user, token=self.token) + self.secret_key = ( + secret_key or os.environ.get("FLUX_SECRET_KEY") or str(uuid.uuid4()) + ) self.proc = None self.broker_pod = None - def check(self, experiment): + def show_credentials(self): """ - Set the basic auth for username and password and check it works + Show the token and user, if requested. """ - minicluster = experiment.minicluster - get_broker_pod = experiment.get_shared_script( - "broker-id", {"minicluster": minicluster} - ) - - logger.info("Waiting for id of running broker pod...") - - # We've already waited for them to be running - broker_pod = None - while not broker_pod: - result = utils.run_capture(["/bin/bash", get_broker_pod], stream=True) - - # Save the broker pod, or exit on failure. 
- if result["message"]: - broker_pod = result["message"].strip() - - self.broker_pod = broker_pod - self.port_forward(minicluster["namespace"], self.broker_pod) + logger.info("MiniCluster created with credentials:") + logger.info(f" FLUX_USER={self.user}") + logger.info(f" FLUX_TOKEN={self.token}") - def port_forward(self, namespace, broker_pod): + def _set_minicluster_credentials(self, minicluster): """ - Ask user to open port to forward + If the user provided credentials, use """ - command = ["kubectl", "port-forward", "-n", namespace, broker_pod, "5000:5000"] + if "flux_restful" not in minicluster: + minicluster["flux_restful"] = {} - # This is detached - we can kill but not interact - logger.info(" ".join(command)) - self.proc = proc = subprocess.Popen( - command, - stdout=subprocess.DEVNULL if logger.level >= logging.DEBUG else None, - ) + if "username" not in minicluster["flux_restful"]: + minicluster["flux_restful"]["username"] = self.user + + if "token" not in minicluster["flux_restful"]: + minicluster["flux_restful"]["token"] = self.token - def cleanup(): - proc.kill() + if "secret_key" not in minicluster["flux_restful"]: + minicluster["flux_restful"]["secret_key"] = self.secret_key - # Ensure we cleanup if anything goes wrong - atexit.register(cleanup) + # Update credentials + self.user = minicluster["flux_restful"]["username"] + self.token = minicluster["flux_restful"]["token"] + self.secret_key = minicluster["flux_restful"]["secret_key"] + return minicluster - def submit(self, setup, experiment, size): + def _create_minicluster( + self, operator, minicluster, experiment, job, interactive=True + ): """ - Use the client to submit the jobs programatically. + Shared function to take an operator handle and create the minicluster. + + This can be used for apply or submit! We separate minicluster (gets + piped into the MiniClusterSpec) from job (gets piped into a + MiniClusterContainer spec). """ - # Submit jobs! 
- - # Sleep time will be time of last job, assuming they are similar - sleep_time = 5 - for jobname, job in experiment.jobs.items(): - # Do we want to run this job for this size and machine? - if not experiment.check_job_run(job, size): - logger.debug( - f"Skipping job {jobname} as does not match inclusion criteria." - ) - continue + namespace = minicluster["namespace"] + image = job["image"] + name = minicluster["name"] + size = minicluster["size"] + + self._set_minicluster_credentials(minicluster) + + try: + # The operator will time creation through pods being ready + result = operator.create_minicluster(**minicluster, container=job) + except Exception as e: + # Give the user the option to delete and recreate or just exit + logger.error(f"There was an issue creating the MiniCluster: {e}") + if interactive and not utils.confirm_action( + "Would you like to submit jobs to the current cluster? You will need to have provided the same username as password." + ): + if utils.confirm_action( + "Would you like to delete this mini cluster and re-create?" 
+ ): + logger.info("Cleaning up MiniCluster...") + operator.delete_minicluster(name=name, namespace=namespace) + return self._create_minicluster( + operator, minicluster, experiment, job, interactive=interactive + ) + else: + logger.exit( + f"Try: 'kubectl delete -n {namespace} minicluster {name}'" + ) + elif not interactive: + logger.exit(f"Try: 'kubectl delete -n {namespace} minicluster {name}'") + return + + # Wait for pods to be ready to include in minicluster up time + self.show_credentials() + + # Save MiniCluster metadata + image_slug = re.sub("(:|/)", "-", image) + uid = f"{size}-{name}-{image_slug}" + experiment.save_json(result, f"minicluster-size-{uid}.json") + + # This is a good point to also save nodes metadata + nodes = operator.get_nodes() + operator.wait_pods(quiet=True) + pods = operator.get_pods() + + experiment.save_file(nodes.to_str(), f"nodes-{uid}.json") + experiment.save_file(pods.to_str(), f"pods-size-{uid}.json") + return result + + def apply( + self, + experiment, + minicluster, + job=None, + outfile=None, + stdout=True, + interactive=True, + ): + """ + Use the client to apply (1:1 job,minicluster) the jobs programatically. + """ + namespace = minicluster["namespace"] + name = minicluster["name"] - if "command" not in job: - logger.debug(f"Skipping job {jobname} as does not have a command.") - continue + # Interact with the Flux Operator Python SDK + operator = FluxOperator(namespace) - # The experiment is defined by the machine type and size - experiment_dir = experiment.root_dir + self._create_minicluster( + operator, minicluster, experiment, job, interactive=interactive + ) - # Add the size - jobname = f"{jobname}-minicluster-size-{size}" - job_output = os.path.join(experiment_dir, jobname) - logfile = os.path.join(job_output, "log.out") + # Get the broker pod (this would also wait for all pods to be ready) + broker = operator.get_broker_pod() - # Do we have output? 
- if os.path.exists(logfile) and not setup.force: - relpath = os.path.relpath(logfile, experiment_dir) - logger.warning( - f"{relpath} already exists and force is False, skipping." - ) - continue - - elif os.path.exists(logfile) and setup.force: - logger.warning(f"Cleaning up previous run in {job_output}.") - shutil.rmtree(job_output) - - # Create job directory anew - utils.mkdir_p(job_output) - - kwargs = dict(job) - del kwargs["command"] - - # Assume the task gets all nodes, unless specified in job - # Also assume the flux restful server is using one node - if "nodes" not in kwargs: - kwargs["nodes"] = size - 1 - if "tasks" not in kwargs: - kwargs["tasks"] = size - 1 - - # Ensure we convert - map between job params and the flux restful api - for convert in ( - ["num_tasks", "tasks"], - ["cores_per_task", "cores"], - ["gpus_per_task", "gpus"], - ["num_nodes", "nodes"], - ): - if convert[1] in kwargs: - kwargs[convert[0]] = kwargs[convert[1]] + # Time from when broker pod (and all pods are ready) + start = time.time() - # Let's also keep track of actual time to get logs, info, etc. - start = time.time() + # Get the pod to stream output from directly + if outfile is not None: + operator.stream_output(outfile, pod=broker, stdout=stdout) - # Run and block output until job is done - res = self.cli.submit(command=job["command"], **kwargs) + # When output done streaming, job is done + end = time.time() + logger.info(f"Job {name} is complete! Cleaning up MiniCluster...") - logger.info(f"Submitting {jobname}: {job['command']}") - info = self.cli.jobs(res["id"]) + # This also waits for termination (and pods to be gone) and times it + operator.delete_minicluster(name=name, namespace=namespace) - while info["returncode"] == "": - info = self.cli.jobs(res["id"]) - time.sleep(sleep_time) + # TODO likely need to separate minicluster up/down times. 
+ results = {"times": operator.times} + results["times"][name] = end - start + return results - end1 = time.time() - output = self.cli.output(res["id"]).get("Output") - if output: - utils.write_file("".join(output), logfile) - end2 = time.time() + def submit( + self, setup, experiment, minicluster, job, poll_seconds=20, interactive=True + ): + """ + Use the client to submit the jobs programatically. + """ + namespace = minicluster["namespace"] + image = job["image"] + name = minicluster["name"] + size = minicluster["size"] + + # Interact with the Flux Operator Python SDK + operator = FluxOperator(namespace) - # Get the full job info, and add some wrapper times - info = self.cli.jobs(res["id"]) - info["start_to_info_seconds"] = end1 - start - info["start_to_output_seconds"] = end2 - start + self._create_minicluster( + operator, minicluster, experiment, job, interactive=interactive + ) - yield jobname, info - sleep_time = info["runtime"] + # Get the broker pod (this would also wait for all pods to be ready) + broker = operator.get_broker_pod() + + # Return results (and times) to calling client + results = {} + + # Submit jobs via port forward - this waits until the server is ready + with operator.port_forward(broker) as forward_url: + print(f"Port forward opened to {forward_url}") + + # See https://flux-framework.org/flux-restful-api/getting_started/api.html + cli = get_client( + host=forward_url, + user=self.user, + token=self.token, + secret_key=self.secret_key, + ) + cli.set_basic_auth(self.user, self.token) + + # Keep a lookup of jobid and output files. + # We will try waiting for all jobs to finish and then save output + jobs = [] + for jobname, job in experiment.jobs.items(): + # Do we want to run this job for this size, image? + if not experiment.check_job_run(job, size=size, image=image): + logger.debug( + f"Skipping job {jobname} as does not match inclusion criteria." 
+ ) + continue + + if "command" not in job: + logger.debug(f"Skipping job {jobname} as does not have a command.") + continue + + # Here we submit all jobs to the scheduler. Let the scheduler handle it! + submit_job = self.submit_job( + cli, experiment, setup, minicluster, job, jobname + ) + if not submit_job: + continue + jobs.append(submit_job) + + logger.info(f"Submit {len(jobs)} jobs! Waiting for completion...") + + # Poll once every 30 seconds + # This could be improved with some kind of notification / pubsub thing + completed = [] + while jobs: + logger.info(f"{len(jobs)} are active.") + time.sleep(poll_seconds) + unfinished = [] + for job in jobs: + if "id" not in job: + logger.warning( + f"Job {job} is missing an id or name, likely an issue or not ready, skipping." + ) + continue + + info = cli.jobs(job["id"]) + + # If we don't have a name yet, it's still pending + if "name" not in info: + unfinished.append(job) + continue + + jobname = info["name"].rjust(15) + if info["state"] == "INACTIVE": + finish_time = round(info["runtime"], 2) + logger.debug( + f"{jobname} is finished {info['result']} in {finish_time} seconds." + ) + job["info"] = info + job["output"] = cli.output(job["id"]).get("Output") + completed.append(job) + else: + logger.debug(f"{jobname} is in state {info['state']}") + unfinished.append(job) + jobs = unfinished + + logger.info("All jobs are complete!") + + # This also waits for termination (and pods to be gone) and times it + if not interactive or utils.confirm_action( + "Would you like to delete this mini cluster?" 
+ ): + logger.info("Cleaning up MiniCluster...") + operator.delete_minicluster(name=name, namespace=namespace) + + # Get times recorded by FluxOperator Python SDK + results["jobs"] = completed + results["times"] = operator.times + return results + + def submit_job(self, cli, experiment, setup, minicluster, job, jobname): + """ + Submit the job (if appropriate for the minicluster) - # Kill the connection to the service - self.proc.kill() + Return an appended Flux Restful API job result with the expected + output file. + """ + # The experiment is defined by the machine type and size + experiment_dir = experiment.root_dir + + jobname = f"{jobname}-minicluster-size-{minicluster['size']}" + job_output = os.path.join(experiment_dir, jobname) + logfile = os.path.join(job_output, "log.out") + + # Do we have output? + if os.path.exists(logfile) and not setup.force: + relpath = os.path.relpath(logfile, experiment_dir) + logger.warning(f"{relpath} already exists and force is False, skipping.") + return + + if os.path.exists(logfile) and setup.force: + logger.warning(f"Cleaning up previous run in {job_output}.") + shutil.rmtree(job_output) + + kwargs = dict(job) + del kwargs["command"] + + # Ensure we convert - map between job params and the flux restful api + for convert in ( + ["num_tasks", "tasks"], + ["cores_per_task", "cores"], + ["gpus_per_task", "gpus"], + ["num_nodes", "nodes"], + ["workdir", "working_dir"], + ): + if convert[1] in kwargs: + kwargs[convert[0]] = kwargs[convert[1]] + del kwargs[convert[1]] + + # Submit the job, add the expected output file, and return + logger.info(f"Submitting {jobname}: {job['command']}") + res = cli.submit(command=job["command"], **kwargs) + res["job_output"] = logfile + return res diff --git a/fluxcloud/main/client.py b/fluxcloud/main/client.py index e5a8db5..49cdbb8 100644 --- a/fluxcloud/main/client.py +++ b/fluxcloud/main/client.py @@ -3,13 +3,13 @@ # # SPDX-License-Identifier: Apache-2.0 +import copy import os import shutil 
-import time +import fluxcloud.main.api as api import fluxcloud.utils as utils from fluxcloud.logger import logger -from fluxcloud.main.api import APIClient from fluxcloud.main.decorator import save_meta, timed here = os.path.dirname(os.path.abspath(__file__)) @@ -26,9 +26,10 @@ def __init__(self, *args, **kwargs): self.settings = settings.Settings self.info = {} self.times = {} + self.debug = kwargs.get("debug", False) # Job prefix is used for organizing time entries - self.job_prefix = "minicluster-run" + self.job_prefix = "job_" def __repr__(self): return str(self) @@ -67,7 +68,7 @@ def run(self, setup): # Each experiment has its own cluster size and machine type for experiment in setup.iter_experiments(): self.up(setup, experiment=experiment) - self.apply(setup, experiment=experiment) + self.apply(setup, experiment=experiment, interactive=False) self.down(setup, experiment=experiment) @save_meta @@ -82,7 +83,7 @@ def batch(self, setup): # Each experiment has its own cluster size and machine type for experiment in setup.iter_experiments(): self.up(setup, experiment=experiment) - self.submit(setup, experiment=experiment) + self.submit(setup, experiment=experiment, interactive=False) self.down(setup, experiment=experiment) @save_meta @@ -93,81 +94,7 @@ def down(self, *args, **kwargs): raise NotImplementedError @save_meta - def open_ui(self, setup, experiment, size, api=None, persistent=False): - """ - Launch a CRD that opens the UI only. 
- """ - # The MiniCluster can vary on size - minicluster = experiment.minicluster - - # Create a FluxRestful API to submit to - created = False - if api is None: - api = APIClient() - created = True - - logger.info(f"\nšŸŒ€ Bringing up MiniCluster of size {size}") - - # Get persistent variables for this job size, image is required - job = experiment.get_persistent_variables(size, required=["image"]) - job.update({"token": api.token, "user": api.user}) - - # We can't have a command - if "command" in job: - del job["command"] - - # Pre-pull containers, etc. - if hasattr(self, "pre_apply"): - self.pre_apply(experiment, "global-job", job=job) - - # Create the minicluster via a CRD without a command - crd = experiment.generate_crd(job, size) - - # Create one MiniCluster CRD (without a command) to run the Flux Restful API - kwargs = { - "minicluster": minicluster, - "crd": crd, - "token": api.token, - "user": api.user, - "size": size, - } - submit_script = experiment.get_shared_script( - "minicluster-create-persistent", kwargs, suffix=f"-size-{size}" - ) - # Start the MiniCluster! This should probably be done better... - self.run_timed( - f"minicluster-create-persistent-size-{size}", ["/bin/bash", submit_script] - ) - - # Ensure our credentials still work, and open port forward - api.check(experiment) - logger.info(f"\nšŸŒ€ MiniCluster of size {size} is up.\n") - - # If created for the first time, show credentials - if created: - logger.info( - "Save these if you want to log into the Flux RESTFul interface, there are specific to the MiniCluster" - ) - logger.info(f"export FLUX_USER={api.user}") - logger.info(f"export FLUX_TOKEN={api.token}") - - # If we exit, the port forward will close. 
- if persistent: - try: - logger.info("Press Control+c to Disconnect.") - while True: - time.sleep(10) - except KeyboardInterrupt: - logger.info("šŸ§½ļø Cleaning up!") - self.run_timed( - f"minicluster-persistent-destroy-size-{size}", - ["kubectl", "delete", "-f", crd], - ) - - return api, kwargs - - @save_meta - def submit(self, setup, experiment): + def submit(self, setup, experiment, interactive=True): """ Submit a Job via the Restful API """ @@ -177,8 +104,6 @@ def submit(self, setup, experiment): ) return - api = None - # Iterate through all the cluster sizes for size in experiment.minicluster["size"]: # We can't run if the minicluster > the experiment size @@ -188,24 +113,49 @@ def submit(self, setup, experiment): ) continue - # Open the api for the size - api, uiattrs = self.open_ui(setup, experiment, size, api) - logger.info(f"\nšŸŒ€ Bringing up MiniCluster of size {size}") + # Launch a unique Minicluster per container image. E.g., + # if the user provides 2 images for size 4, we create two MiniClusters + # This will provide all shared volumes across the jobs + for minicluster, job in experiment.get_submit_miniclusters(size): + logger.info( + f"\nšŸŒ€ Bringing up MiniCluster of size {size} with image {job['image']}" + ) + + # Create the API client (creates the user and token for the cluster) + cli = api.APIClient() - # Save times (and logs in submit) as we go - for jobid, info in api.submit(setup, experiment, size): - logger.info(f"{jobid} took {info['runtime']} seconds.") - self.times[jobid] = info["runtime"] - self.info[jobid] = info + # Pre-pull containers, etc. 
+ if hasattr(self, "pre_apply"): + self.pre_apply(experiment, minicluster["name"], job=job) - logger.info(f"\nšŸŒ€ MiniCluster of size {size} is finished") - self.run_timed( - f"minicluster-persistent-destroy-size-{size}", - ["kubectl", "delete", "-f", uiattrs["crd"]], - ) + # Get back results with times (for minicluster assets) and jobs + results = cli.submit( + setup, experiment, minicluster, job=job, interactive=interactive + ) + + # Save times and output files for jobs + for job in results.get("jobs", []): + self.save_job(job) + + def save_job(self, job): + """ + Save the job and add times to our times listing. + """ + jobid = f"{self.job_prefix}{job['id']}" + self.times[jobid] = job["info"]["runtime"] + + # Do we have an output file and output? + if job["output"]: + # Save to our output directory! + logfile = job["job_output"] + utils.mkdir_p(os.path.dirname(logfile)) + utils.write_file(job["output"], logfile) + + del job["output"] + self.info[jobid] = job @save_meta - def apply(self, setup, experiment): + def apply(self, setup, experiment, interactive=True): """ Apply a CRD to run the experiment and wait for output. 
@@ -246,22 +196,24 @@ def apply(self, setup, experiment): # Create job directory anew utils.mkdir_p(job_output) - # Generate the populated crd from the template - crd = experiment.generate_crd(job, size) - - # Prepare specific .crd for template - # Note the output directory is already specific to the job index - kwargs = { - "minicluster": experiment.minicluster, - "logfile": logfile, - "crd": crd, - } - apply_script = experiment.get_shared_script( - "minicluster-run", kwargs, suffix=f"-{jobname}" + # Prepare the client for one minicluster + cli = api.APIClient() + + # Prepare a specific MiniCluster for this size + minicluster = copy.deepcopy(experiment.minicluster) + minicluster["size"] = size + + # Get back results with times (for minicluster assets) and jobs + # If debug level, print job output to terminal too :) + results = cli.apply( + experiment=experiment, + minicluster=minicluster, + outfile=logfile, + stdout=self.debug, + job=job, + interactive=interactive, ) - - # Apply the job, and save to output directory - self.run_timed(f"{self.job_prefix}-{jobname}", ["/bin/bash", apply_script]) + self.times[jobname] = results["times"] # Save times between experiment runs experiment.save_metadata(self.times, self.info) diff --git a/fluxcloud/main/clouds/aws/scripts/cluster-create b/fluxcloud/main/clouds/aws/scripts/cluster-create index 3a3cd8e..14855d5 100755 --- a/fluxcloud/main/clouds/aws/scripts/cluster-create +++ b/fluxcloud/main/clouds/aws/scripts/cluster-create @@ -5,6 +5,7 @@ # Defaults - these are in the config but left here for information CLUSTER_NAME="{% if experiment.cluster_name %}{{ experiment.cluster_name }}{% else %}flux-cluster{% endif %}" +NAMESPACE="{% if experiment.minicluster_namespace %}{{ experiment.minicluster_namespace }}{% else %}flux-operator{% endif %}" REGION="{% if region %}{{ region }}{% else %}us-east-1{% endif %}" CLUSTER_VERSION="{% if experiment.kubernetes_version %}{{ experiment.kubernetes_version }}{% else %}1.23{% endif %}" 
MACHINE_TYPE="{% if experiment.machine %}{{ experiment.machine }}{% else %}m5.large{% endif %}" @@ -33,6 +34,7 @@ if [ -z ${MACHINE_TYPE+x} ]; then exit 1 fi +print_magenta " namespace: ${NAMESPACE}" print_magenta " cluster : ${CLUSTER_NAME}" print_magenta " version : ${CLUSTER_VERSION}" print_magenta " machine : ${MACHINE_TYPE}" @@ -64,6 +66,8 @@ run_echo eksctl create cluster -f ${CONFIG_FILE} # Deploy the operator install_operator ${SCRIPT_DIR} ${REPOSITORY} ${BRANCH} + +run_echo kubectl create namespace ${NAMESPACE} || true run_echo kubectl get namespace run_echo kubectl describe namespace operator-system diff --git a/fluxcloud/main/clouds/google/scripts/cluster-create b/fluxcloud/main/clouds/google/scripts/cluster-create index c33b78f..51c2562 100755 --- a/fluxcloud/main/clouds/google/scripts/cluster-create +++ b/fluxcloud/main/clouds/google/scripts/cluster-create @@ -5,6 +5,7 @@ # Defaults - these are in the config but left here for information CLUSTER_NAME="{% if experiment.cluster_name %}{{ experiment.cluster_name }}{% else %}flux{% endif %}" +NAMESPACE="{% if experiment.minicluster_namespace %}{{ experiment.minicluster_namespace }}{% else %}flux-operator{% endif %}" ZONE="{% if zone %}{{ zone }}{% else %}us-central1-a{% endif %}" CLUSTER_VERSION="{% if experiment.kubernetes_version %}{{ experiment.kubernetes_version }}{% else %}1.23{% endif %}" MACHINE_TYPE="{% if experiment.machine %}{{ experiment.machine }}{% else %}n1-standard-1{% endif %}" @@ -32,6 +33,7 @@ if [ -z ${MACHINE_TYPE+x} ]; then exit 1 fi +print_magenta " namespace: ${NAMESPACE}" print_magenta " cluster : ${CLUSTER_NAME}" print_magenta " version : ${CLUSTER_VERSION}" print_magenta " project : ${GOOGLE_PROJECT}" @@ -74,7 +76,7 @@ run_echo kubectl get nodes # Deploy the operator mkdir -p ${SCRIPT_DIR} install_operator ${SCRIPT_DIR} ${REPOSITORY} ${BRANCH} - +run_echo kubectl create namespace ${NAMESPACE} || true run_echo kubectl get namespace run_echo kubectl describe namespace 
operator-system diff --git a/fluxcloud/main/clouds/local/scripts/cluster-create-minikube b/fluxcloud/main/clouds/local/scripts/cluster-create-minikube index 35fb21c..116047d 100755 --- a/fluxcloud/main/clouds/local/scripts/cluster-create-minikube +++ b/fluxcloud/main/clouds/local/scripts/cluster-create-minikube @@ -5,6 +5,7 @@ # Defaults - these are in the config but left here for information CLUSTER_NAME="{% if experiment.cluster_name %}{{ experiment.cluster_name }}{% else %}flux-cluster{% endif %}" +NAMESPACE="{% if experiment.minicluster_namespace %}{{ experiment.minicluster_namespace }}{% else %}flux-operator{% endif %}" CLUSTER_VERSION="{% if experiment.kubernetes_version %}{{ experiment.kubernetes_version }}{% else %}1.23{% endif %}" FORCE_CLUSTER="{% if setup.force_cluster %}true{% else %}false{% endif %}" SIZE={% if experiment.size %}{{ experiment.size }}{% else %}4{% endif %} @@ -12,6 +13,7 @@ REPOSITORY="{% if experiment.operator_repository %}{{ experiment.operator_reposi BRANCH="{% if experiment.operator_branch %}{{ experiment.operator_branch }}{% else %}main{% endif %}" SCRIPT_DIR="{{ experiment.script_dir }}" +print_magenta " namespace: ${NAMESPACE}" print_magenta " cluster : ${CLUSTER_NAME}" print_magenta " version : ${CLUSTER_VERSION}" print_magenta " size : ${SIZE}" @@ -51,7 +53,7 @@ install_operator ${SCRIPT_DIR} ${REPOSITORY} ${BRANCH} # Show nodes run_echo kubectl get nodes - +run_echo kubectl create namespace ${NAMESPACE} || true run_echo kubectl get namespace run_echo kubectl describe namespace operator-system save_versions ${SCRIPT_DIR} ${SIZE} diff --git a/fluxcloud/main/clouds/shared/scripts/broker-id b/fluxcloud/main/clouds/shared/scripts/broker-id deleted file mode 100755 index a45ba8c..0000000 --- a/fluxcloud/main/clouds/shared/scripts/broker-id +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -NAMESPACE="{% if minicluster.namespace %}{{ minicluster.namespace }}{% else %}flux-operator{% endif %}" -JOB="{{ minicluster.name }}" 
-brokerPrefix="${JOB}-0" - -for pod in $(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=jsonpath='{.items[*].metadata.name}'); do - if [[ "${pod}" == ${brokerPrefix}* ]]; then - echo ${pod} - break - fi -done diff --git a/fluxcloud/main/clouds/shared/scripts/minicluster-create-persistent b/fluxcloud/main/clouds/shared/scripts/minicluster-create-persistent deleted file mode 100755 index 3b2db0a..0000000 --- a/fluxcloud/main/clouds/shared/scripts/minicluster-create-persistent +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -# This is a template that will be populated with variables by Flux-Cloud -# We only run it to check if a MiniCluster is running. An apply is only -# needed if the MiniCluster is not created yet. - -# Include shared helper scripts -{% include "helpers.sh" %} - -NAMESPACE="{% if minicluster.namespace %}{{ minicluster.namespace }}{% else %}flux-operator{% endif %}" -CRD="{{ crd }}" -JOB="{{ minicluster.name }}" - -# Size -1 to account for certificate generator -SIZE={{ size }} - -print_magenta " apply : ${CRD}" -print_magenta " job : ${JOB}" - -is_installed kubectl - -# Create the namespace (ok if already exists) -run_echo_allow_fail kubectl create namespace ${NAMESPACE} - -# Always cleanup a previous one so tokens don't get stale -run_echo_allow_fail kubectl delete -f ${CRD} -{% include "wait_for_cleanup.sh" %} - -# Ensure we have a MiniCluster of the right namespace running -echo -print_green "šŸŒ€ļø Creating MiniCluster in ${NAMESPACE}" -{% include "wait_for_all.sh" %} -{% include "wait_for_flux_restful.sh" %} diff --git a/fluxcloud/main/clouds/shared/scripts/minicluster-run b/fluxcloud/main/clouds/shared/scripts/minicluster-run deleted file mode 100755 index b7f14ce..0000000 --- a/fluxcloud/main/clouds/shared/scripts/minicluster-run +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -# This is a template that will be populated with variables by Flux-Cloud -# It used to be a script proper with getopt, but in 
practice this was -# erroneous on different operating systems. - -# Include shared helper scripts -{% include "helpers.sh" %} - -NAMESPACE="{% if minicluster.namespace %}{{ minicluster.namespace }}{% else %}flux-operator{% endif %}" -CRD="{{ crd }}" -JOB="{{ minicluster.name }}" -LOGFILE="{{ logfile }}" - -print_magenta " apply : ${CRD}" -print_magenta " job : ${JOB}" -print_magenta "logfile : ${LOGFILE}" - -is_installed kubectl - -# Ensure we wait for the space to be cleaned up -{% include "wait_for_cleanup.sh" %} - -# Create the namespace (ok if already exists) -run_echo_allow_fail kubectl create namespace ${NAMESPACE} - -{% include "wait_for_broker.sh" %} - -# Get the name of the pods -pods=($(kubectl get pods --selector=job-name=${JOB} --namespace ${NAMESPACE} --output=jsonpath='{.items[*].metadata.name}')) -brokerpod=${pods[0]} - -# This will hang like this until the job finishes running -echo -print_green "kubectl -n ${NAMESPACE} logs ${brokerpod} -f > ${LOGFILE}" -kubectl -n ${NAMESPACE} logs ${brokerpod} -f > ${LOGFILE} - -for exitcode in $(kubectl get -n ${NAMESPACE} pod --selector=job-name=${JOB} --output=jsonpath={.items...containerStatuses..state.terminated.exitCode}); do - if [[ ${exitcode} -ne 0 ]]; then - echo "Container in ${JOB} had nonzero exit code" - fi -done - -run_echo kubectl delete -f ${CRD} diff --git a/fluxcloud/main/clouds/shared/scripts/wait_for_all.sh b/fluxcloud/main/clouds/shared/scripts/wait_for_all.sh deleted file mode 100644 index ddf5cc7..0000000 --- a/fluxcloud/main/clouds/shared/scripts/wait_for_all.sh +++ /dev/null @@ -1,20 +0,0 @@ -# Apply the job, get pods -run_echo kubectl apply -f ${CRD} -run_echo kubectl get -n ${NAMESPACE} pods - -# continue until we find the index-0 pod -podsReady="false" - -echo -print_blue "Waiting for MiniCluster of size ${SIZE} to be ready..." -while [[ "${podsReady}" == "false" ]]; do - echo -n "." 
- sleep 2 - pods=$(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=name | wc -l) - if [[ ${pods} -eq ${SIZE} ]]; then - echo - print_green "šŸŒ€ļø All pods are running." - podsReady="true" - break - fi -done diff --git a/fluxcloud/main/clouds/shared/scripts/wait_for_broker.sh b/fluxcloud/main/clouds/shared/scripts/wait_for_broker.sh deleted file mode 100644 index 9335313..0000000 --- a/fluxcloud/main/clouds/shared/scripts/wait_for_broker.sh +++ /dev/null @@ -1,40 +0,0 @@ -# Apply the job, get pods -run_echo kubectl apply -f ${CRD} -run_echo kubectl get -n ${NAMESPACE} pods - -# continue until we find the index-0 pod -brokerPrefix="${JOB}-0" -brokerReady="false" - -echo -print_blue "Waiting for broker pod with prefix ${brokerPrefix} to be created..." -while [[ "${brokerReady}" == "false" ]]; do - echo -n "." - sleep 2 - for pod in $(kubectl get pods --selector=job-name=${JOB} --namespace ${NAMESPACE} --output=jsonpath='{.items[*].metadata.name}'); do - if [[ "${pod}" == ${brokerPrefix}* ]]; then - echo - print_green "šŸŒ€ļø Broker pod is created." - brokerReady="true" - break - fi - done -done - -# Now broker pod needs to be running -echo -print_blue "Waiting for broker pod with prefix ${brokerPrefix} to be running..." -brokerReady="false" -while [[ "${brokerReady}" == "false" ]]; do - echo -n "." - - # TODO - we likely want to check for running OR completed, it's rare but sometimes they can complete too fast. - for pod in $(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=jsonpath='{.items[*].metadata.name}'); do - if [[ "${pod}" == ${brokerPrefix}* ]]; then - echo - print_green "šŸŒ€ļø Broker pod is running." 
- brokerReady="true" - break - fi - done -done diff --git a/fluxcloud/main/clouds/shared/scripts/wait_for_cleanup.sh b/fluxcloud/main/clouds/shared/scripts/wait_for_cleanup.sh deleted file mode 100644 index 466482f..0000000 --- a/fluxcloud/main/clouds/shared/scripts/wait_for_cleanup.sh +++ /dev/null @@ -1,15 +0,0 @@ -echo -podsCleaned="false" -print_blue "Waiting for previous MiniCluster to be cleaned up..." -while [[ "${podsCleaned}" == "false" ]]; do - echo -n "." - sleep 2 - state=$(kubectl get pods --namespace ${NAMESPACE} 2>&1) - lines=$(echo $state | wc -l) - if [[ ${lines} -eq 1 ]] && [[ "${state}" == *"No resources found in"* ]]; then - echo - print_green "šŸŒ€ļø Previous pods are cleaned up." - podsCleaned="true" - break - fi -done diff --git a/fluxcloud/main/clouds/shared/scripts/wait_for_flux_restful.sh b/fluxcloud/main/clouds/shared/scripts/wait_for_flux_restful.sh deleted file mode 100644 index 6c27ba7..0000000 --- a/fluxcloud/main/clouds/shared/scripts/wait_for_flux_restful.sh +++ /dev/null @@ -1,29 +0,0 @@ - -echo -brokerPod="" -brokerPrefix="${JOB}-0" -while [[ "${brokerPod}" == "" ]]; do - for pod in $(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=jsonpath='{.items[*].metadata.name}'); do - if [[ "${pod}" == ${brokerPrefix}* ]]; then - echo - brokerPod=${pod} - break - fi - done -done - -echo -serverReady="false" -print_blue "Waiting for Flux Restful API Server to be ready..." -while [[ "${serverReady}" == "false" ]]; do - echo -n "." - sleep 2 - logs=$(kubectl logs --namespace ${NAMESPACE} ${brokerPod} | grep "Uvicorn running") - retval=$? - if [[ ${retval} -eq 0 ]]; then - echo - serverReady="true" - print_green "šŸŒ€ļø Flux RestFul API Server is Ready." 
- break - fi -done diff --git a/fluxcloud/main/experiment.py b/fluxcloud/main/experiment.py index c50d8b7..44a65c8 100644 --- a/fluxcloud/main/experiment.py +++ b/fluxcloud/main/experiment.py @@ -8,7 +8,6 @@ import os import shutil -import jinja2 import jsonschema import fluxcloud.defaults as defaults @@ -22,7 +21,6 @@ class ExperimentSetup: def __init__( self, experiments, - template=None, outdir=None, validate=True, cleanup=True, @@ -34,21 +32,12 @@ def __init__( An experiment setup is a light wrapper around a group of experiments. """ self.experiment_file = os.path.abspath(experiments) - self.template = ( - os.path.abspath(template) - if template is not None and os.path.exists(template) - else None - ) self.outdir = outdir self.test = test self.settings = settings.Settings self.quiet = quiet self.run_cleanup = cleanup - # Show the user the template file - if template: - logger.debug(f"Using template {self.template}") - # Rewrite existing outputs self.force = kwargs.get("force") or False # Don't ask for confirmation to create/destroy @@ -99,7 +88,7 @@ def prepare_matrices(self): validate_experiments(self.spec) # Sploot out into matrices - matrices = expand_experiments(self.spec, self.outdir, self.template) + matrices = expand_experiments(self.spec, self.outdir) if not matrices: raise ValueError( "No matrices generated. Did you include any empty variables in your matrix?" @@ -134,11 +123,10 @@ class Experiment: An experiment wrapper to make it easy to get variables in templates. 
""" - def __init__(self, experiment, outdir=None, template=None): + def __init__(self, experiment, outdir=None): self.experiment = experiment self.settings = settings.Settings self._outdir = outdir - self.template = template or defaults.default_minicluster_template @property def outdir(self): @@ -191,31 +179,60 @@ def iter_jobs(self): yield size, jobname, job - def get_persistent_variables(self, size, required=None): + def get_submit_miniclusters(self, size): """ - Get persistent variables that should be used across the MiniCluster + Return Miniclusters organized by unique sizes and containers + + For each, we return a faux job that includes (potentially) the job volumes. """ - jobvars = {} - for _, job in self.jobs.items(): - # Skip jobs targeted for a different size + # A faux job is provided that includes all volumes + images = {} + for name, job in self.jobs.items(): if "size" in job and job["size"] != size: continue - - for key, value in job.items(): - if key not in jobvars or (key in jobvars and jobvars[key] == value): - jobvars[key] = value - continue - logger.warning( - f'Inconsistent job variable between MiniCluster jobs: {value} vs. {jobvars["value"]}' - ) - - # If we get here and we don't have an image - for req in required or []: - if req not in jobvars: - raise ValueError( - f'Submit requires a "{req}" field under at least one job spec to create the MiniCluster.' 
- ) - return jobvars + if "image" not in job: + logger.warning(f"Job {name} is missing an image and cannot be run.") + + # Add the image if we don't know about it already + # This is where we can define shared minicluster container attributes (the job) + if job["image"] not in images: + images[job["image"]] = copy.deepcopy(job) + + # Update the job and warn the user for differences + else: + for k, v in job.items(): + # Skip the command + if k == "command": + continue + + # This shared job for the image doesn't have the attribute defined yet + if k not in images[job["image"]]: + images[job["image"]][k] = v + continue + current = images[job["image"]][k] + + # If it's a dictionary, just update + if isinstance(current, dict) and isinstance(v, dict): + images[job["image"]][k].update(v) + + # Otherwise give a warning we won't be updating + elif current != v: + logger.warning( + f"Found different definition of {k}, {v}. Using first discovered {current}" + ) + + logger.debug(f"Job experiments file generated {len(images)} MiniCluster(s).") + + # Prepare a MiniCluster and job for each image + for image in images: + minicluster = copy.deepcopy(self.minicluster) + minicluster["size"] = size + job = images[image] + + # A shared MiniCluster starts with no command to start flux restful + if "command" in job: + del job["command"] + yield minicluster, job @property def script_dir(self): @@ -238,15 +255,6 @@ def get_script(self, name, cloud, render_kwargs=None, ext="sh", suffix=""): utils.mkdir_p(outdir) return script.render(outfile=outfile, **render_kwargs) - def get_shared_script(self, name, render_kwargs=None, suffix="", ext="sh"): - """ - Get a named shared script - """ - render_kwargs = render_kwargs or {} - return self.get_script( - name, cloud="shared", render_kwargs=render_kwargs, suffix=suffix, ext=ext - ) - def cleanup(self): """ Cleanup the scripts directory for the experiment! 
@@ -255,36 +263,6 @@ def cleanup(self): logger.debug(f"Cleaning up {self.script_dir}") shutil.rmtree(self.script_dir) - def generate_crd(self, job, minicluster_size): - """ - Generate a custom resource definition for the experiment - """ - template = jinja2.Template(utils.read_file(self.template)) - experiment = copy.deepcopy(self.experiment) - - # If the experiment doesn't define a minicluster, add our default - if "minicluster" not in experiment: - experiment["minicluster"] = self.settings.minicluster - - # Update minicluster size to the one we want - experiment["minicluster"]["size"] = minicluster_size - - if "jobs" in experiment: - del experiment["jobs"] - experiment["job"] = job - result = template.render(**experiment).strip(" ") - logger.debug(result) - - # Write to output directory - outfile = os.path.join( - self.script_dir, f"minicluster-size-{minicluster_size}.yaml" - ) - outdir = os.path.dirname(outfile) - if not os.path.exists(outdir): - logger.info(f"Creating output directory for scripts {outdir}") - utils.mkdir_p(outdir) - return utils.write_file(result, outfile) - @property def jobs(self): return self.experiment.get("jobs", {}) @@ -325,10 +303,12 @@ def is_run(self): return False return True - def check_job_run(self, job, size): + def check_job_run(self, job, size, image=None): """ Determine if a job is marked for a MiniCluster size. """ + if "image" in job and image is not None and job["image"] != image: + return False if "sizes" in job and size not in job["sizes"]: return False if "size" in job and job["size"] != size: @@ -339,6 +319,27 @@ def check_job_run(self, job, size): return False return True + def save_file(self, obj, filename, is_json=False): + """ + Save a json dump of something to a filename in the experiment directory. 
+ """ + experiment_dir = self.root_dir + save_file = os.path.join(experiment_dir, ".scripts", filename) + save_dir = os.path.dirname(save_file) + if not os.path.exists(save_dir): + utils.mkdir_p(save_dir) + if is_json: + utils.write_json(obj, save_file) + else: + utils.write_file(obj, save_file) + return save_file + + def save_json(self, obj, filename): + """ + Save a json dump of something to a filename in the experiment directory. + """ + return self.save_file(obj, filename, is_json=True) + def save_metadata(self, times, info=None): """ Save experiment metadata, loading an existing meta.json, if present. @@ -421,8 +422,17 @@ def minicluster(self): minicluster = self.experiment.get("minicluster") or self.settings.minicluster if "namespace" not in minicluster or not minicluster["namespace"]: minicluster["namespace"] = defaults.default_namespace + if "size" not in minicluster: + minicluster["size"] = [self.experiment.get("size")] return minicluster + @property + def minicluster_namespace(self): + """ + Get mini cluster namespace + """ + return self.minicluster["namespace"] + @property def machine(self): return self.experiment.get("machine") or self.settings.google["machine"] @@ -455,7 +465,7 @@ def kubernetes_version(self): ) -def expand_experiments(experiments, outdir, template=None): +def expand_experiments(experiments, outdir): """ Given a valid experiments.yaml, expand out into experiments """ @@ -484,7 +494,7 @@ def expand_experiments(experiments, outdir, template=None): # Put in final matrix form final = [] for entry in matrix: - final.append(Experiment(entry, outdir, template)) + final.append(Experiment(entry, outdir)) return final diff --git a/fluxcloud/main/schemas.py b/fluxcloud/main/schemas.py index 8556347..5902448 100644 --- a/fluxcloud/main/schemas.py +++ b/fluxcloud/main/schemas.py @@ -24,14 +24,14 @@ "properties": { "command": {"type": "string"}, "repeats": {"type": "number"}, - "workdir": {"type": "string"}, + "working_dir": {"type": "string"}, 
"image": {"type": "string"}, "machine": {"type": "string"}, "machines": {"type": "array", "items": {"type": "string"}}, "size": {"type": "number"}, "sizes": {"type": "array", "items": {"type": "number"}}, }, - "required": ["command"], + "required": ["command", "image"], } jobs_properties = { @@ -187,6 +187,9 @@ "required": ["size"], }, }, + "patternProperties": { + "x-*": {"type": "object"}, + }, "additionalProperties": False, } diff --git a/fluxcloud/main/template.py b/fluxcloud/main/template.py new file mode 100644 index 0000000..e176500 --- /dev/null +++ b/fluxcloud/main/template.py @@ -0,0 +1,91 @@ +# Copyright 2023 Lawrence Livermore National Security, LLC and other +# This is part of Flux Framework. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: Apache-2.0 + +experiment_base = """ +# Flux MiniCluster experiment attributes +minicluster: + name: my-job + namespace: flux-operator + # Each of these sizes will be brought up and have commands run across it + # They must be smaller than the Kubernetes cluster size or not possible to run! + size: [2, 4] + +# Under jobs should be named jobs (output orgainzed by name) where +# each is required to have a command and image. 
Repeats is the number +# of times to run each job +jobs: + reaxc-hns: + command: 'lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + working_dir: /home/flux/examples/reaxff/HNS + sleep: + command: 'sleep 5' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + working_dir: /home/flux/examples/reaxff/HNS + hello-world: + command: 'echo hello world' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + working_dir: /home/flux/examples/reaxff/HNS +""" + +google_experiment_template = f""" +matrix: + size: [4] + + # This is a Google Cloud machine + machine: [n1-standard-1] + +variables: + # Customize zone just for this experiment + # otherwise defaults to your settings.yml + zone: us-central1-a + +{experiment_base} +""" + +minikube_experiment_template = f""" +# This is intended for MiniKube, so no machine needed +matrix: + + # This is the size of the MiniKube cluster (aka Kubernetes cluster) to bring up + size: [4] + +{experiment_base} +""" + +aws_experiment_template = f""" +matrix: + + # This is the size of the MiniKube cluster (aka Kubernetes cluster) to bring up + size: [4] + + # This is an EC2 machine + machine: [m5.large] + +variables: + # Enable private networking + private_networking: false + + # Enable efa (requires efa also set under the container limits) + efa_enabled: false + + # Add a custom placement group name to your workers managed node group + placement_group: eks-efa-testing + + # Customize region just for this experiment + region: us-east-2 + + # Customize availability zones for this experiment + availability_zones: [us-east-1a, us-east-1b] + + # Important for instance types only in one zone (hpc instances) + # Select your node group availability zone: + node_group_availability_zone: us-east-2b + +{experiment_base} +""" diff --git a/fluxcloud/minicluster-template.yaml b/fluxcloud/minicluster-template.yaml deleted file mode 100644 index 
ede959d..0000000 --- a/fluxcloud/minicluster-template.yaml +++ /dev/null @@ -1,42 +0,0 @@ -apiVersion: flux-framework.org/v1alpha1 -kind: MiniCluster - -metadata: - name: {{ minicluster.name }} - namespace: {{ minicluster.namespace }} -spec: - # localDeploy needs to be false - localDeploy: {% if minicluster.local_deploy %}true{% else %}false{% endif %} - - # Number of pods to create for MiniCluster - size: {{ minicluster.size }} - tasks: {% if job.tasks %}{{ job.tasks }}{% else %}1{% endif %} - - # Disable verbose output - {% if job.quiet or job.timed %}logging: - {% if job.quiet %}quiet: true{% endif %} - {% if job.timed %}timed: true{% endif %}{% endif %} - - # Optional credentials if running the flux restful api - {% if job.token or job.user %}fluxRestful: - {% if job.token %}token: "{{ job.token }}"{% endif %} - {% if job.user %}username: "{{ job.user }}"{% endif %}{% endif %} - - # TODO add pod resources, if needed - containers: - - image: {{ job.image }} - {% if job.workdir %}workingDir: {{ job.workdir }}{% endif %} - {% if job.command %}command: {{ job.command }}{% endif %} - {% if job.flux_option_flags %}fluxOptionFlags: "-ompi=openmpi@5"{% endif %} - cores: {% if job.cores %}{{ job.cores }}{% else %}1{% endif %} - {% if job.limits or job.resources %}resources:{% endif %} - {% if job.limits %}limits: - {% for limit in job.limits %} - {{ limit[0] }}: {{ limit[1] }} - {% endfor %}{% endif %} - {% if job.requests %}requests: - {% for limit in job.requests %} - {{ limit[0] }}: {{ limit[1] }} - {% endfor %}{% endif %} - {% if job.pre_command %}preCommand: | - {{ job.pre_command }}{% endif %} diff --git a/fluxcloud/tests/__init__.py b/fluxcloud/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/fluxcloud/tests/helpers.py b/fluxcloud/tests/helpers.py new file mode 100644 index 0000000..b9f8330 --- /dev/null +++ b/fluxcloud/tests/helpers.py @@ -0,0 +1,47 @@ +#!/usr/bin/python + +# Copyright (C) 2022 Vanessa Sochat. 
+ +# This Source Code Form is subject to the terms of the +# Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed +# with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import os +import shlex +import shutil + +from fluxcloud.client import get_parser +from fluxcloud.main.client import ExperimentClient +from fluxcloud.main import get_experiment_client + +here = os.path.dirname(os.path.abspath(__file__)) +root = os.path.dirname(here) + + +def parse_args(argstr): + """ + Given an argument string for a test, parse it. + """ + parser = get_parser() + parser.prog = "fluxcloud" + args = parser.parse_args(shlex.split(argstr)) + args.debug = True + return args + + +def get_settings(tmpdir): + """ + Create a temporary settings file + """ + settings_file = os.path.join(root, "settings.yml") + new_settings = os.path.join(tmpdir, "settings.yml") + shutil.copyfile(settings_file, new_settings) + return new_settings + + +def init_client(tmpdir, cloud=None): + """ + Get a common client for some container technology and module system + """ + new_settings = get_settings(tmpdir) + return get_experiment_client(cloud, debug=True, settings_file=new_settings) \ No newline at end of file diff --git a/fluxcloud/tests/test_examples.py b/fluxcloud/tests/test_examples.py new file mode 100644 index 0000000..b5d2e17 --- /dev/null +++ b/fluxcloud/tests/test_examples.py @@ -0,0 +1,181 @@ +#!/usr/bin/python + +# Copyright 2022-2023 Lawrence Livermore National Security, LLC and other +# This is part of Flux Framework. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: Apache-2.0 + +from glob import glob +import os + +import fluxcloud.utils as utils +from fluxcloud.main.experiment import ExperimentSetup + +from .helpers import here, init_client + +here = os.path.abspath(os.path.dirname(__file__)) +root = os.path.dirname(os.path.dirname(here)) + +def check_lammps(minicluster_file): + """ + Checks for examples that run lammps. 
+ """ + expected_outdir = os.path.dirname(os.path.dirname(minicluster_file)) + for out in utils.recursive_find(expected_outdir, "log.out"): + content = utils.read_file(out) + assert "Total wall time" in content + assert "LAMMPS" in content + + +def _test_example(dirname, tmp_path, check, test_apply=True): + """ + Shared function to test an example in a dirname, with a check function + """ + client = init_client(str(tmp_path), cloud="minikube") + experiment_file = os.path.join( + root, "examples", "minikube", dirname, "experiments.yaml" + ) + + # Create a new experiment directory to work from + experiment_dir = os.path.join(tmp_path, "experiment") + outdir = os.path.join(experiment_dir, "data") + utils.mkdir_p(experiment_dir) + setup = ExperimentSetup(experiment_file, outdir=outdir, force_cluster=True, quiet=False) + + # Select the first (only) experiment! + experiment = setup.matrices[0] + client.up(setup, experiment=experiment) + + # Expected output directory + expected_outdir = os.path.join(outdir, f"k8s-size-{experiment.size}-local") + expected_scripts = os.path.join(expected_outdir, ".scripts") + + def shared_checks(info=True): + assert os.path.exists(expected_outdir) + assert "meta.json" in os.listdir(expected_outdir) + meta = utils.read_json(os.path.join(expected_outdir, "meta.json")) + assert meta["times"] + assert meta["minicluster"] + assert meta["jobs"] + + # Info is only present for submit + if info: + assert meta["info"] + + # Run the experiment in the working directory + with utils.working_dir(experiment_dir): + # This won't work in the CI it seems + client.submit(setup, experiment, interactive=False) + shared_checks() + + files = glob(os.path.join(expected_scripts, "minicluster-size*.json")) + minicluster_file = files[0] + print(f'Found minicluster metadata file {minicluster_file}') + + check(minicluster_file, experiment) + + # Now do the same for apply + # shutil.rmtree(expected_outdir) + if test_apply: + client.apply(setup, experiment, 
interactive=False) + shared_checks(info=False) + check(minicluster_file, experiment) + + client.down(setup, experiment=experiment) + + +def test_minicluster_logging(tmp_path): + """ + Ensure that the logging example returns expected logging params set + in the minicluster output. + """ + + def check(minicluster_file, experiment): + assert os.path.exists(minicluster_file) + + # Assert that the logging spec matches + minicluster = utils.read_json(minicluster_file) + for level, value in experiment.minicluster["logging"].items(): + assert level in minicluster["spec"]["logging"] + assert minicluster["spec"]["logging"][level] == value + + check_lammps(minicluster_file) + + # Run the example for submit and apply, with check + _test_example("logging", tmp_path, check) + + +def test_minicluster_volumes(tmp_path): + """ + Ensure that the volumes example produces the expected Minicluster spec + """ + + def check(minicluster_file, experiment): + assert os.path.exists(minicluster_file) + + # Assert that the logging spec matches + minicluster = utils.read_json(minicluster_file) + assert "volumes" in minicluster["spec"] + + check_lammps(minicluster_file) + + # And container level volumes + assert "volumes" in minicluster["spec"]["containers"][0] + container_volumes = minicluster["spec"]["containers"][0]["volumes"] + + # This checks the cluster level volumes + for name, volume in experiment.minicluster["volumes"].items(): + assert name in minicluster["spec"]["volumes"] + generated_volume = minicluster["spec"]["volumes"][name] + + for attr, value in volume.items(): + if attr in generated_volume: + assert value == generated_volume[attr] + + assert name in container_volumes + + for vname, containervol in experiment.jobs["reaxc-hns-1"][ + "volumes" + ].items(): + assert vname in container_volumes + for attr, val in containervol.items(): + assert attr in container_volumes[vname] + assert container_volumes[vname][attr] == val + + # Run the example for submit and apply, with check + 
_test_example("volumes", tmp_path, check) + + +def test_osu_benchmarks(tmp_path): + """ + Ensure we can explicitly specify resources + """ + def check(minicluster_file, experiment): + assert os.path.exists(minicluster_file) + + + # Run the example for submit and apply, with check + _test_example("osu-benchmarks", tmp_path, check, test_apply=False) + + +def test_minicluster_resources(tmp_path): + """ + Ensure that the resources example works as expected. + """ + + def check(minicluster_file, experiment): + assert os.path.exists(minicluster_file) + + # Assert that the logging spec matches + minicluster = utils.read_json(minicluster_file) + check_lammps(minicluster_file) + + assert "resources" in minicluster["spec"]["containers"][0] + resources = minicluster["spec"]["containers"][0]["resources"] + + for rtype, rvalue in experiment.jobs["reaxc-hns-1"]["resources"].items(): + assert rtype in resources + assert resources[rtype] == rvalue + + # Run the example for submit and apply, with check + _test_example("resources", tmp_path, check) diff --git a/fluxcloud/tests/test_settings.py b/fluxcloud/tests/test_settings.py new file mode 100644 index 0000000..9d0b162 --- /dev/null +++ b/fluxcloud/tests/test_settings.py @@ -0,0 +1,50 @@ +#!/usr/bin/python + +# Copyright 2022 Lawrence Livermore National Security, LLC and other +# This is part of Flux Framework. See the COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import os + +import pytest + +from fluxcloud.main.settings import UserSettings + +here = os.path.dirname(os.path.abspath(__file__)) +root = os.path.dirname(here) + +from .helpers import get_settings # noqa + + +def test_invalid_properties(tmp_path): + """ + Test invalid setting property + """ + settings = UserSettings(get_settings(tmp_path)) + assert settings.config_editor == "vim" + settings.set("config_editor", "code") + with pytest.raises(SystemExit): + settings.set("invalid_key", "invalid_value") + assert settings.config_editor == "code" + + +def test_set_get(tmp_path): + """ + Test variable set/get + """ + settings = UserSettings(get_settings(tmp_path)) + + zone = "us-central1-a" + assert settings.google["zone"] == zone + + # Cannot add invalid parameter + with pytest.raises(SystemExit): + settings.set("cache_only", True) + + found_zone = settings.get("google:zone") + assert isinstance(zone, str) + assert zone == found_zone + + # Just check the first in the list + assert settings.google["zone"] == zone diff --git a/fluxcloud/tests/test_utils.py b/fluxcloud/tests/test_utils.py new file mode 100644 index 0000000..b10c97d --- /dev/null +++ b/fluxcloud/tests/test_utils.py @@ -0,0 +1,133 @@ +#!/usr/bin/python + +# Copyright (C) 2021-2022 Vanessa Sochat. + +# This Source Code Form is subject to the terms of the +# Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed +# with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +import json +import os +import shutil + +import pytest + +import fluxcloud.utils as utils + + +def test_write_read_files(tmp_path): + """ + test_write_read_files will test the functions write_file and read_file + """ + print("Testing utils.write_file...") + + tmpfile = str(tmp_path / "written_file.txt") + assert not os.path.exists(tmpfile) + utils.write_file("hello!", tmpfile) + assert os.path.exists(tmpfile) + + print("Testing utils.read_file...") + content = utils.read_file(tmpfile) + assert content == "hello!" + + +def test_write_bad_json(tmp_path): + bad_json = {"Wakkawakkawakka'}": [{True}, "2", 3]} + tmpfile = str(tmp_path / "json_file.txt") + assert not os.path.exists(tmpfile) + with pytest.raises(TypeError): + utils.write_json(bad_json, tmpfile) + + +def test_write_json(tmp_path): + good_json = {"Wakkawakkawakka": [True, "2", 3]} + tmpfile = str(tmp_path / "good_json_file.txt") + + assert not os.path.exists(tmpfile) + utils.write_json(good_json, tmpfile) + with open(tmpfile, "r") as f: + content = json.loads(f.read()) + assert isinstance(content, dict) + assert "Wakkawakkawakka" in content + content = utils.read_json(tmpfile) + assert "Wakkawakkawakka" in content + + +def test_check_install(): + """ + check install is used to check if a particular software is installed. 
+ If no command is provided, singularity is assumed to be the test case + """ + print("Testing utils.check_install") + + is_installed = utils.check_install("echo") + assert is_installed + is_not_installed = utils.check_install("fakesoftwarename") + assert not is_not_installed + + +def test_get_installdir(): + """ + Get install directory should return the base of where fluxcloud + is installed + """ + print("Testing utils.get_installdir") + + whereami = utils.get_installdir() + print(whereami) + assert whereami.endswith("fluxcloud") + + +def test_get_file_hash(): + print("Testing utils.get_file_hash") + here = os.path.dirname(os.path.abspath(__file__)) + testdata = os.path.join(here, "testdata", "hashtest.txt") + assert ( + utils.get_file_hash(testdata) + == "6bb92117bded3da774363713657a629a9f38eac2e57cd47e1dcda21d3445c67d" + ) + assert utils.get_file_hash(testdata, "md5") == "e5d376ca96081dd561ff303c3a631fd5" + + +def test_copyfile(tmp_path): + print("Testing utils.copyfile") + original = str(tmp_path / "location1.txt") + dest = str(tmp_path / "location2.txt") + print(original) + print(dest) + utils.write_file("CONTENT IN FILE", original) + utils.copyfile(original, dest) + assert os.path.exists(original) + assert os.path.exists(dest) + + +def test_get_tmpdir_tmpfile(): + print("Testing utils.get_tmpdir, get_tmpfile") + tmpdir = utils.get_tmpdir() + assert os.path.exists(tmpdir) + assert os.path.basename(tmpdir).startswith("fluxcloud") + shutil.rmtree(tmpdir) + tmpdir = utils.get_tmpdir(prefix="name") + assert os.path.basename(tmpdir).startswith("name") + shutil.rmtree(tmpdir) + tmpfile = utils.get_tmpfile() + assert "fluxcloud" in tmpfile + os.remove(tmpfile) + tmpfile = utils.get_tmpfile(prefix="pancakes") + assert "pancakes" in tmpfile + os.remove(tmpfile) + + +def test_mkdir_p(tmp_path): + print("Testing utils.mkdir_p") + dirname = str(tmp_path / "input") + result = os.path.join(dirname, "level1", "level2", "level3") + utils.mkdir_p(result) + 
utils.mkdirp([result]) + assert os.path.exists(result) + + +def test_print_json(): + print("Testing utils.print_json") + result = utils.print_json({1: 1}) + assert result == '{\n "1": 1\n}' diff --git a/fluxcloud/tests/testdata/hashtest.txt b/fluxcloud/tests/testdata/hashtest.txt new file mode 100644 index 0000000..e85812c --- /dev/null +++ b/fluxcloud/tests/testdata/hashtest.txt @@ -0,0 +1,2 @@ +This is a file that exists purely to test the functions to generate +hashes. Please don't modify, thank you! diff --git a/fluxcloud/utils/__init__.py b/fluxcloud/utils/__init__.py index b079912..10c9291 100644 --- a/fluxcloud/utils/__init__.py +++ b/fluxcloud/utils/__init__.py @@ -18,7 +18,7 @@ write_json, write_yaml, ) -from .misc import chunks, get_hash, mb_to_bytes, print_bytes, slugify +from .misc import chunks, get_hash, mb_to_bytes, print_bytes, slugify, working_dir from .terminal import ( check_install, confirm_action, diff --git a/fluxcloud/utils/misc.py b/fluxcloud/utils/misc.py index acfcc9c..0bee595 100644 --- a/fluxcloud/utils/misc.py +++ b/fluxcloud/utils/misc.py @@ -4,6 +4,21 @@ # SPDX-License-Identifier: Apache-2.0 import copy +import os +from contextlib import contextmanager + + +@contextmanager +def working_dir(path): + """ + Sets the cwd within the context + """ + here = os.getcwd() + try: + os.chdir(path) + yield + finally: + os.chdir(here) def chunks(listing, chunk_size): diff --git a/fluxcloud/version.py b/fluxcloud/version.py index c3655ca..409163d 100644 --- a/fluxcloud/version.py +++ b/fluxcloud/version.py @@ -1,7 +1,7 @@ # Copyright 2022-2023 Lawrence Livermore National Security, LLC # SPDX-License-Identifier: Apache-2.0 -__version__ = "0.1.19" +__version__ = "0.2.0" AUTHOR = "Vanessa Sochat" EMAIL = "vsoch@users.noreply.github.com" NAME = "flux-cloud" @@ -14,6 +14,8 @@ # Global requirements INSTALL_REQUIRES = ( + ("kubernetes", {"min_version": None}), + ("fluxoperator", {"min_version": "0.0.12"}), ("ruamel.yaml", {"min_version": None}), 
    ("jsonschema", {"min_version": None}),
    ("requests", {"min_version": None}),
diff --git a/tests/test.sh b/tests/test.sh
index ca485cb..7c1ac3c 100755
--- a/tests/test.sh
+++ b/tests/test.sh
@@ -37,39 +37,10 @@ echo "flux-cloud run --cloud minikube --output ${output} --force-cluster"
 flux-cloud run --cloud minikube --output ${output} --force-cluster
 retval=$?
-if [[ "${retval}" != "0" ]]; then
+if [[ ${retval} -ne 0 ]]; then
     echo "Issue running Flux Cloud, return value ${retval}"
     exit ${retval}
 fi
-# Check output
-for filename in $(find ./data -type f -print); do
-    echo "Checking $filename";
-    filebase=$(basename ${filename})
-
-    # Don't check these files, likely to change
-    if [[ "${filebase}" == "flux-operator.yaml" ]]; then
-        continue
-    fi
-    if [[ "${filebase}" == "nodes-size"* ]]; then
-        continue
-    fi
-    suffix=$(echo ${filename:7})
-    outfile="$output/$suffix"
-    if [[ ! -e "${outfile}" ]]; then
-        echo "Expected output $outfile does not exist."
-        exit 1
-    fi
-    # Check the length
-    actual=$(cat $filename | wc -l)
-    found=$(cat $outfile | wc -l)
-
-    if [[ "${actual}" != "${found}" ]]; then
-        echo "Incorrect output length found for ${filename}: expected ${actual} vs found ${found}"
-        cat ${outfile}
-        exit 1
-    fi
-done
-
 echo ${output}
 rm -rf ${output}