diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 3739a40..04aa71f 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -24,13 +24,74 @@ jobs:
pip install -r .github/dev-requirements.txt
pre-commit run --all-files
- test-runs:
+ test-python:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Clone the code
+ uses: actions/checkout@v3
+
+ - name: Install flux-cloud
+ run: |
+ conda create --quiet --name fc jinja2
+ export PATH="/usr/share/miniconda/bin:$PATH"
+ source activate fc
+ pip install .[all]
+
+ - name: Test Python
+ run: |
+ export PATH="/usr/share/miniconda/bin:$PATH"
+ source activate fc
+ export SHELL=/bin/bash
+ # This will bring MiniKube up/down
+ pytest -xs ./fluxcloud/tests/test_settings.py
+ pytest -xs ./fluxcloud/tests/test_utils.py
+
+ test-examples:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
- test: ["lammps"]
+ test: ["test_minicluster_logging", "test_minicluster_volumes",
+ "test_minicluster_resources"]
+ steps:
+ - name: Clone the code
+ uses: actions/checkout@v3
+
+ - name: Setup Go
+ uses: actions/setup-go@v3
+ with:
+ go-version: ^1.18
+
+ - name: Install flux-cloud
+ run: |
+ conda create --quiet --name fc jinja2
+ export PATH="/usr/share/miniconda/bin:$PATH"
+ source activate fc
+ pip install .[all]
+ pip install kubernetes
+ - name: Start minikube
+ uses: medyagh/setup-minikube@697f2b7aaed5f70bf2a94ee21a4ec3dde7b12f92 # v0.0.9
+
+ - name: Test Example
+ env:
+ test: ${{ matrix.test }}
+ run: |
+ export PATH="/usr/share/miniconda/bin:$PATH"
+ source activate fc
+ export SHELL=/bin/bash
+ eval $(minikube -p minikube docker-env)
+ # We need to delete the minikube cluster to bring it up again
+ minikube delete
+ # This will bring MiniKube up/down
+ pytest -xs ./fluxcloud/tests/test_examples.py::${test}
+
+ test-runs:
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ command: [["apply", "lammps"], ["submit", "./examples/minikube/basic"]]
steps:
- name: Clone the code
uses: actions/checkout@v3
@@ -50,9 +111,10 @@ jobs:
- name: Start minikube
uses: medyagh/setup-minikube@697f2b7aaed5f70bf2a94ee21a4ec3dde7b12f92 # v0.0.9
- - name: Test ${{ matrix.test }}
+ - name: Test ${{ matrix.command }}
env:
- name: ${{ matrix.test }}
+ name: ${{ matrix.command[1] }}
+ if: (matrix.command[0] == 'apply')
run: |
export PATH="/usr/share/miniconda/bin:$PATH"
source activate fc
@@ -61,3 +123,18 @@ jobs:
# We need to delete the minikube cluster to bring it up again
minikube delete
/bin/bash ./tests/test.sh ${name}
+
+ - name: Test ${{ matrix.command }}
+ env:
+ workdir: ${{ matrix.command[1] }}
+ if: (matrix.command[0] == 'submit')
+ run: |
+ export PATH="/usr/share/miniconda/bin:$PATH"
+ source activate fc
+ export SHELL=/bin/bash
+ eval $(minikube -p minikube docker-env)
+ minikube delete
+ cd ${workdir}
+ flux-cloud up --cloud minikube --force-cluster
+ flux-cloud --debug submit --non-interactive
+ flux-cloud down --cloud minikube
diff --git a/.gitignore b/.gitignore
index 50fc771..50592df 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,5 @@ env
.env
dist
__pycache__
+examples/**/data
+examples/**/_data
diff --git a/CHANGELOG.md b/CHANGELOG.md
index aa1b003..b231b2e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,10 @@ and **Merged pull requests**. Critical items to know are:
The versions coincide with releases on pip. Only major versions will be released as tags on Github.
## [0.0.x](https://github.com/converged-computing/flux-cloud/tree/main) (0.0.x)
+ - refactor flux submit and apply to use fluxoperator Python SDK (0.2.0)
+ - This reduces scripts in output folder, but is a good tradeoff for fewer errors
+ - remove "ui" command, flux-cloud is intended mostly for automation
+ - command and image will always be required.
- fix bash script bugs (0.1.19)
- support for node group level aws avail. zones, save times on each experiment apply (0.1.18)
- data should be namespaced by cloud type (so multiple experiments can be run alongside) (0.1.17)
diff --git a/README.md b/README.md
index 4e9ae0c..8cd28ea 100644
--- a/README.md
+++ b/README.md
@@ -23,11 +23,6 @@ It will be expanded as needed.
Code is under development and likely to change!
In the meantime, for early documentation, see our ⭐️ [Documentation](https://converged-computing.github.io/flux-cloud/) ⭐️
-## TODO
-
- - test for list of experiments
- - cloud-select could estimate the cost?
- - run and add more cluster examples
## 😁️ Contributors 😁️
diff --git a/docs/getting_started/commands.md b/docs/getting_started/commands.md
index 863e9ae..0c7f846 100644
--- a/docs/getting_started/commands.md
+++ b/docs/getting_started/commands.md
@@ -1,11 +1,104 @@
# Commands
-The following commands are provided by Flux Cloud. For running jobs, you can either do:
+Welcome to the commands section! You can learn the details of each command below, or
+check out an [example](examples.md) or [cloud tutorial](../tutorials/index.md).
+The general steps you want to take are:
-- **apply**/**run**: A single/multi job submission intended for different containers to re-create pods each time.
-- **batch**/**submit**: A single/multi job submission intended for a common container base where we use the same set of pods.
+1. Generate or find an `experiments.yaml` configuration.
+2. Decide if you want to use `submit` or `apply`
+3. Create the cluster, run experiments, and clean up.
-Both are described in the following sections.
+If you don't want to use an existing example, see [experiment init](#init) for how to create an `experiments.yaml` from scratch.
+
+> What's the difference between submit and apply?
+
+For `apply`, we are running one job per Minicluster (the Flux Operator custom resource definition). This means
+we bring up an entire set of pods for each container (each entry under "jobs" in your experiment.yaml),
+run the single job directly with `flux start -> flux submit` to provide the command to the broker, and then
+when it finished the container will exit and the job clean up. This approach likely is suited to fewer jobs
+that are longer running, and if you want to see output appear as it's available (we stream the log from the broker pod).
+For `apply` we also skip creating the [Flux RESTFul API](https;//github.com/flux-framework/flux-restful-api) server,
+so it's one less dependency to worry about, and you also don't need to think about exposing an API or users.
+
+For `submit`, we take advantage of Flux as a scheduler, bringing up the fewest number of MiniClusters we can
+derive based on the unique containers and sizes in your `experiments.yaml`. This means that, for each unique
+set, we bring up one MiniCluster, and then submit all your jobs at once, allowing Flux to act as a scheduler.
+We poll the server every 30 seconds to get an update on running jobs, and when they are all complete, jobs
+output and results are saved. This approach is more ideal for many smaller jobs, as the MiniClusters are
+only brought up once (and you don't need to wait for pods to go up and down for each job). The cons of this
+approach are getting logs at the end, unless you decide to interact with the Flux RESTFul API on your own
+earlier.
+
+Next, read about how to use these commands in detail.
+
+## experiment
+
+### init
+
+When you want to create a new experiment, do:
+
+```bash
+$ mkdir -p my-experiment
+$ cd my-experiment
+
+# Create a new experiment for minikube
+$ flux-cloud experiment init --cloud minikube
+$ flux-cloud experiment init --cloud aws
+$ flux-cloud experiment init --cloud google
+```
+
+This will create an `experiments.yaml` template with custom variables for your
+cloud of choice, and robustly commented.
+
+
+
+View Example Output of flux-cloud experiment init
+
+```bash
+$ flux-cloud experiment init --cloud google > experiments.yaml
+```
+```yaml
+matrix:
+ size: [4]
+
+ # This is a Google Cloud machine
+ machine: [n1-standard-1]
+
+variables:
+ # Customize zone just for this experiment
+ # otherwise defaults to your settings.yml
+ zone: us-central1-a
+
+# Flux MiniCluster experiment attributes
+minicluster:
+ name: my-job
+ namespace: flux-operator
+ # Each of these sizes will be brought up and have commands run across it
+ # They must be smaller than the Kubernetes cluster size or not possible to run!
+ size: [2, 4]
+
+# Under jobs should be named jobs (output organized by name) where
+# each is required to have a command and image. Repeats is the number
+# of times to run each job
+jobs:
+ reaxc-hns:
+ command: 'lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite'
+ image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
+ repeats: 5
+ working_dir: /home/flux/examples/reaxff/HNS
+ sleep:
+ command: 'sleep 5'
+ image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
+ repeats: 5
+ working_dir: /home/flux/examples/reaxff/HNS
+ hello-world:
+ command: 'echo hello world'
+ image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
+ repeats: 5
+ working_dir: /home/flux/examples/reaxff/HNS
+```
+
+
## list
@@ -46,9 +139,86 @@ And this will run across sizes. To ask for a specific size:
$ flux-cloud apply -e k8s-size-8-m5.large --size 2
```
-## run
+### up
+
+Here is how to bring up a cluster (with the operator installed). For this command,
+we will either select the first in the matrix (default):
+
+```bash
+$ flux-cloud up
+```
+```console
+No experiment ID provided, assuming first experiment n1-standard-1-2.
+```
+
+or if you want to specify an experiment identifier based on the machine and size, you can do that:
+
+```bash
+$ flux-cloud up -e n1-standard-1-2
+```
+```console
+Selected experiment n1-standard-1-2.
+```
+
+And to force up without a prompt:
+
+```bash
+$ flux-cloud up -e n1-standard-1-2 --force-cluster
+```
+
+## Ways to run jobs
+
+The following commands are provided by Flux Cloud. For running jobs, you can either do:
+
+- **apply**/**run**: A single/multi job submission intended for different containers to re-create pods each time.
+- **batch**/**submit**: A batch mode, where we submit / schedule many jobs on the fewest MiniClusters
+
+Both are described in the following sections.
+
+### apply / run
+
+> Ideal for running multiple jobs with different containers.
+
+An apply assumes that you want to create a separate MiniCluster each time, meaning
+bringing up an entire set of pods, running a single command, and then bringing everything
+down. This is ideal for longer-running experiments, but note that it does not take advantage
+of using Flux as a scheduler. Flux is basically running one job and going away.
+
+#### apply
+
+After "up" you can choose to run experiments (as you feel) with "apply."
+
+```bash
+$ flux-cloud apply
+```
+
+The same convention applies - not providing the identifier runs the
+first entry, otherwise we use the identifier you provide.
+
+```bash
+$ flux-cloud apply -e n1-standard-1-2
+```
+
+To force overwrite of existing results (by default they are skipped)
+
+```bash
+$ flux-cloud apply -e n1-standard-1-2 --force
+```
+
+Apply is going to be creating one CRD per job, so that's a lot of
+pod creation and deletion. This is in comparison to "submit" that
+brings up a MiniCluster once, and then executes commands to it, allowing
+Flux to serve as the scheduler. Note that by default, we always wait for a previous run to be cleaned up
+before continuing. If you don't want apply to be interactive (e.g., it will
+ask you before cleaning up) you can do:
+
+```bash
+$ flux-cloud apply --non-interactive
+```
+
+By default, apply via a "run" is non-interactive.
-> Up, apply, down in one command, ideal for completely headless runs and jobs with different containers.
+#### run
The main command is a "run" that is going to, for each cluster:
@@ -112,67 +282,18 @@ $ flux-cloud apply -e n1-standard-1-2
$ flux-cloud down -e n1-standard-1-2
```
-These commands are discussed in more next.
-
-### up
-
-Here is how to bring up a cluster (with the operator installed). For this command,
-we will either select the first in the matrix (default):
-
-```bash
-$ flux-cloud up
-```
-```console
-No experiment ID provided, assuming first experiment n1-standard-1-2.
-```
-
-or if you want to specify an experiment identifier based on the machine and size, you can do that:
-
-```bash
-$ flux-cloud up -e n1-standard-1-2
-```
-```console
-Selected experiment n1-standard-1-2.
-```
-
-And to force up without a prompt:
-
-```bash
-$ flux-cloud up -e n1-standard-1-2 --force-cluster
-```
-
-## apply
+### submit / batch
-> Ideal for running multiple jobs with different containers.
+> Ideal for one or more commands and/or containers across persistent MiniClusters.
-After "up" you can choose to run experiments (as you feel) with "apply."
+These commands submit multiple jobs to the same MiniCluster and actually use Flux
+as a scheduler! This means we get the unique set of images and MiniCluster sizes for
+your experiments, and then bring up each one, submitting the matching jobs to it.
+We submit all jobs at once, and then poll Flux until they are completed to get output.
-```bash
-$ flux-cloud apply
-```
+#### submit
-The same convention applies - not providing the identifier runs the
-first entry, otherwise we use the identifier you provide.
-
-```bash
-$ flux-cloud apply -e n1-standard-1-2
-```
-
-To force overwrite of existing results (by default they are skipped)
-
-```bash
-$ flux-cloud apply -e n1-standard-1-2 --force
-```
-
-Apply is going to be creating on CRD per job, so that's a lot of
-pod creation and deletion. This is in comparison to "submit" that
-brings up a MiniCluster once, and then executes commands to it, allowing
-Flux to serve as the scheduler. Note that by default, we always wait for a previous run to be cleaned up
-before continuing.
-
-## submit
-
-> Ideal for one or more commands across the same container(s) and MiniCluster size.
+The entire flow might look like:
```bash
$ flux-cloud up --cloud minikube
@@ -185,27 +306,31 @@ to submit jobs. For submit (and the equivalent to bring it up and down with batc
your commands aren't provided in the CRD,
but rather to the Flux Restful API. Submit / batch will also generate one CRD
per MiniCluster size, but use the same MiniCluster across jobs. This is different
-from apply, which generates one CRD per job to run.
+from apply, which generates one CRD per job to run. If you don't want submit to be interactive
+(e.g., it will ask you before cleaning up) you can do:
-## batch
+```bash
+$ flux-cloud submit --non-interactive
+```
-> Up, submit, down in one command, ideal for jobs with the same container(s)
+By default, submit run with batch is non-interactive.
+
+#### batch
+
+This is the equivalent of "submit" but includes the up and down for the larger
+Kubernetes cluster.
+
+```bash
+$ flux-cloud batch --cloud aws
+```
-The "batch" command is comparable to "run" except we are running commands
-across the same set of containers. We don't need to bring pods up/down each time,
-and we are using Flux in our cluster to handle scheduling.
This command is going to:
1. Create the cluster
2. Run each of the experiments, saving output and timing, on the same pods
3. Bring down the cluster
-The output is organized in the same way, and as before, you can choose to run a single
-command with "submit"
-
-```bash
-$ flux-cloud batch --cloud aws
-```
+The output is organized in the same way.
Note that since we are communicating with the FluxRestful API, you are required to
provide a `FLUX_USER` and `FLUX_TOKEN` for the API. If you are running this programmatically,
@@ -219,32 +344,6 @@ $ flux-cloud submit
$ flux-cloud down
```
-## ui
-
-If you are interested in interactive submission on your own, either in the user interface
-or via one of our client SDKs, you can bring up the MiniCluster and it's interface with
-the Flux Restful API with `ui`:
-
-```bash
-$ flux-cloud ui --cloud minikube
-```
-
-If you have many sizes of MiniClusters, you'll need to specify the one that you want:
-
-```bash
-$ flux-cloud ui --cloud minikube --size 4
-```
-
-By default, it will use your single MiniCluster size.
-
-
-
-Which then looks like this in the browser, available for submission via the interface itself
-or the restful API until the user presses control+c to close the port forward and delete
-the MiniCluster.
-
-![img/ui.png](img/ui.png)
-
## down
And then bring down your first (or named) cluster:
@@ -266,7 +365,6 @@ You can also use `--force-cluster` here:
$ flux-cloud down --force-cluster
```
-
## debug
For any command, you can add `--debug` as a main client argument to see additional information. E.g.,
@@ -297,11 +395,10 @@ managedNodeGroups:
## scripts
-By default, flux cloud keeps all scripts that the job renders in the experiment output directory under `.scripts`. If you
-want to cleanup instead, you can add the `--cleanup` flag. We do this so you can inspect a script to debug, or if you
-just want to keep them for reproducibility. As an example, here is outfrom from a run with multiple repeats of the
-same command, across two MiniCluster cluster sizes (2 and 4). As of version `0.1.17` the data is also organized
-by the runner (e.g., minikube vs google) so you can run the experiments across multiple clouds without conflict.
+Flux cloud (prior to version 0.2.0) ran each job with a script, and it would save each script. Since version 0.2.0,
+we refactored to do everything with Python APIs/SDKs, so we no longer save submit scripts. However, we still save
+scripts for bringing up and down each cluster, along with node and pod metadata (as json). We save this in the
+hidden `.scripts` directory.
```console
$ tree -a ./data/
@@ -314,17 +411,11 @@ $ tree -a ./data/
│ └── log.out
├── meta.json
└── .scripts
- ├── cluster-create-minikube.sh
- ├── flux-operator.yaml
- ├── kubectl-version.yaml
- ├── minicluster-run-lmp-size-2-minicluster-size-2.sh
- ├── minicluster-run-lmp-size-4-minicluster-size-4.sh
- ├── minicluster-size-2.yaml
- ├── minicluster-size-4.yaml
- ├── minikube-version.json
- ├── nodes-size-4.json
- └── nodes-size-4.txt
-```
-
-And that's it! I think there might be a more elegant way to determine what cluster is running,
-however if the user decides to launch more than one, it might be harder. More thinking / docs / examples coming soon.
+ ├── minicluster-size-2-lammps-job-ghcr.io-rse-ops-lammps-flux-sched-focal-v0.24.0.json
+ ├── nodes-2-lammps-job-ghcr.io-rse-ops-lammps-flux-sched-focal-v0.24.0.json
+ └── pods-size-2-lammps-job-ghcr.io-rse-ops-lammps-flux-sched-focal-v0.24.0.json
+```
+
+And that's it! We recommend you look at [examples](examples.md) or [tutorials](../tutorials/index.md) for
+getting started. If you are brave, just run `flux-cloud experiment init --cloud <cloud>` to create
+your own experiment from scratch.
diff --git a/docs/getting_started/debugging.md b/docs/getting_started/debugging.md
new file mode 100644
index 0000000..53bea0a
--- /dev/null
+++ b/docs/getting_started/debugging.md
@@ -0,0 +1,163 @@
+# Debugging
+
+> Oh no, my MiniCluster jobs aren't running!
+
+Kubernetes is a complex beast, so here are some debugging tips that might help you figure out what
+is going on. We are generally going to be looking at objects owned by the Flux Operator - pods,
+config maps, and (sometimes volumes or services). Note that the object deployed by the Flux Operator
+custom resource definition is called a `minicluster`:
+
+```bash
+$ kubectl get -n flux-operator minicluster
+```
+```console
+NAME AGE
+osu-benchmarks 57s
+```
+
+## 0. kubectl pro tips
+
+These tips come from the amazing [Claudia](https://github.com/cmisale)!
+
+It's fairly arduous to copy paste or type complete pod names, especially for indexed jobs where there is a random
+set of characters. You can enable kubectl to autocomplete by adding this to your bash profile (`~/.bashrc`):
+
+```bash
+source <(kubectl completion bash)
+```
+
+Another shortcut that is nice to have is to make an alias for `kubectl` to just be `k`:
+
+```bash
+alias k=kubectl
+```
+
+Another tip is how to get an interactive session to a pod:
+
+```bash
+$ kubectl exec -n flux-operator -it <pod-name> -- bash
+```
+
+Yes, it's very docker-like! I've found I'm much faster having these tricks than before.
+
+
+## 1. Start with logs
+
+You can usually first look to pod logs to see what pods are there, and their various states:
+
+```bash
+$ kubectl get -n flux-operator pods
+```
+
+Remember that if you use `flux-cloud` apply without debug, you won't see output after it finds the broker pod,
+but you'll see it being printed to logs in your `data` folder. If you want to see output, either add `--debug`
+after `flux-cloud` or look at the log and add `-f` to keep it hanging:
+
+```bash
+# See instant of a log
+$ kubectl logs -n flux-operator osu-benchmarks-0-vxnfq
+
+# Stream to the terminal until the container is done
+$ kubectl logs -n flux-operator osu-benchmarks-0-vxnfq -f
+```
+
+Here is looking at output for the certificate generator pod:
+
+```bash
+$ kubectl logs -n flux-operator osu-benchmarks-cert-generator
+```
+
+For `flux-cloud apply` if you want to see output consistently, it's suggested to add `--debug`,
+as the miniclusters are going to be created / deleted and you'd need to grab the pod logs
+multiple times!
+
+### What should I expect to see?
+
+The certificate generator pod runs first. Its output should *only* be
+the certificate:
+
+```console
+# **** Generated on 2023-03-04 04:24:46 by CZMQ ****
+# ZeroMQ CURVE **Secret** Certificate
+# DO NOT PROVIDE THIS FILE TO OTHER USERS nor change its permissions.
+
+metadata
+ name = "osu-benchmarks-cert-generator"
+ time = "2023-03-04T04:24:46"
+ userid = "0"
+ hostname = "osu-benchmarks-cert-generator"
+curve
+ public-key = "l12&OlN-DwF*6rhx##Y#ZQ^9w1zON039Vxh2&+8r"
+ secret-key = "o^(dM0R96q-d=2Jk-tEjgh=syRjW?q6%Kq{Q8Y4H"
+```
+
+If you see any error message about "invalid curve cert" this means that something was incorrectly
+generated. As an example, you should use `preCommand` for any logic that is shared between
+the certificate generator and worker/broker pods (e.g., sourcing an environment for Flux) and commands->pre
+for anything else that is just for the worker/broker pods (printing to debug, etc.)
+
+For the broker pod, you should expect to see debugging output (if logging->debug is true) and then the
+Flux Broker starting. The quorum should be reported to be full. E.g.,
+
+```console
+🌀 flux start -o --config /etc/flux/config -Scron.directory=/etc/flux/system/cron.d -Stbon.fanout=256 -Srundir=/run/flux -Sstatedir=/var/lib/flux -Slocal-uri=local:///run/flux/local -Slog-stderr-level=6 -Slog-stderr-mode=local
+broker.info[1]: start: none->join 13.3684ms
+broker.info[1]: parent-ready: join->init 1.14525s
+broker.info[1]: configuration updated
+broker.info[1]: rc1.0: running /etc/flux/rc1.d/01-sched-fluxion
+broker.info[1]: rc1.0: running /etc/flux/rc1.d/02-cron
+broker.info[1]: rc1.0: /etc/flux/rc1 Exited (rc=0) 0.2s
+broker.info[1]: rc1-success: init->quorum 0.234173s
+broker.info[1]: quorum-full: quorum->run 0.204937s
+```
+
+If you see any error messages from the broker, this should be looked into.
+Warnings can sometimes be OK. Ask if you aren't sure.
+
+## 2. Use describe
+
+You can describe any object in Kubernetes space to debug. Describe is especially important when you are debugging
+storage and want to figure out why something isn't mounting. Typically you might start by looking at pods in all
+namespaces:
+
+```bash
+$ kubectl get pods --all-namespaces -o wide
+```
+
+The wide format is useful because it will show you the node each pod is assigned to, which can be useful
+for debugging resource limits and requests. You then might want to describe a particular pod,
+maybe to look at annotations or volume mounts:
+
+```bash
+$ kubectl describe pod -n flux-operator osu-benchmarks-1-tj6bt
+```
+
+You can get json output with a get for the pod (or object):
+
+```bash
+$ kubectl get pod -n flux-operator osu-benchmarks-1-tj6bt -o json
+```
+
+And pipe that into `jq` to look for specific attributes! So let's say you see that a volume
+failed for your pod. You likely want to next check your persistent volumes "pv" and claims "pvc":
+
+```bash
+$ kubectl describe -n flux-operator pv
+$ kubectl describe -n flux-operator pvc
+```
+
+For volumes, if you are using a container storage interface (CSI) you likely are using a daemon set that
+deploys pods. Try looking at the logs for the pods, and/or the daemonset for issues:
+
+```bash
+$ kubectl describe daemonset --all-namespaces
+```
+
+Finally, services (svc) can be useful if you suspect a permission or credential is wonky.
+
+## 3. Advanced
+
+Often when I'm debugging something complex I try to create the object I'm interested in so it is in a
+continuously running state. As an example, to test a pod for a daemonset, I will get the raw YAML
+for the daemonset and change the entrypoint to `sleep infinity`. I can then shell in and manually run
+commands to see their output.
diff --git a/docs/getting_started/examples.md b/docs/getting_started/examples.md
index b2e0cdd..7ddd9be 100644
--- a/docs/getting_started/examples.md
+++ b/docs/getting_started/examples.md
@@ -3,24 +3,24 @@
The easiest thing to do is arguably to start with an example,
and then customize it. Here we will add examples as we create them.
-- [up-apply-down](https://github.com/converged-computing/flux-cloud/tree/main/examples/up-apply-down): shows using `flux-cloud apply` for individual CRD submission.
-- [osu-benchmarks](https://github.com/converged-computing/flux-cloud/tree/main/examples/osu-benchmarks)
-- [up-submit-down](https://github.com/converged-computing/flux-cloud/tree/main/examples/up-submit-down): shows using `flux-cloud submit` for batch submission.
-- [aws-lammps](https://github.com/converged-computing/flux-cloud/tree/main/examples/aws-lammps): a simple lammps run on AWS.
-
-The above example runs a single command in a single Kubernetes cluster and MiniCluster,
-and it's lammps!
-
-## Demo
-
-Here is a quick demo from the [up-apply-down](https://github.com/converged-computing/flux-cloud/tree/main/examples/up-apply-down) in the repository.
-
-
-
-which was actually run as:
+- [minikube](https://github.com/converged-computing/flux-cloud/tree/main/examples/minikube)
+ - [basic](https://github.com/converged-computing/flux-cloud/tree/main/examples/minikube/basic)
+ - [volumes](https://github.com/converged-computing/flux-cloud/tree/main/examples/minikube/volumes)
+ - [resources](https://github.com/converged-computing/flux-cloud/tree/main/examples/minikube/resources)
+ - [osu-benchmarks](https://github.com/converged-computing/flux-cloud/tree/main/examples/minikube/osu-benchmarks)
+ - [persistent](https://github.com/converged-computing/flux-cloud/tree/main/examples/minikube/persistent)
+- [google](https://github.com/converged-computing/flux-cloud/tree/main/examples/google)
+ - [osu-benchmarks](https://github.com/converged-computing/flux-cloud/tree/main/examples/google/osu-benchmarks)
+
+All of the examples above (for MiniKube) are tested, and can be adapted for another cloud typically by adding
+the "machines" directive under "matrix" and then any custom variables. As a reminder, you can generate
+a blank template for any cloud (including variables) via:
```bash
-$ flux-cloud run
+$ flux-cloud experiment init --cloud minikube
+$ flux-cloud experiment init --cloud aws
+$ flux-cloud experiment init --cloud google
```
-for the purposes of the demo, and runs a lammps job on two tiny nodes!
+
+New examples for AWS will be coming soon - I didn't have credits to test when I wrote these.
diff --git a/docs/getting_started/experiments.md b/docs/getting_started/experiments.md
index 2736cfb..fa31abf 100644
--- a/docs/getting_started/experiments.md
+++ b/docs/getting_started/experiments.md
@@ -3,12 +3,21 @@
Welcome to the Flux Cloud experiments user guide! If you come here, we are assuming you want
to run jobs with the Flux Operator on GKE, and that you have [installed](install.md) flux-cloud.
Note this project is early in development so this could change or bugs could be introduced.
-Let's get started with talking about experiments. Your experiments will typically be defined by two files:
+Let's get started with talking about experiments. As of version 0.2.0, your experiments will be defined by one file:
- - experiments.yaml: a yaml file that describes sizes, machines, and jobs to run
- - minicluster-template.yaml: a completely or partially filled template custom resource definition.
+ - experiments.yaml: a yaml file that describes sizes, machines, miniclusters, and jobs to run
We will walk through example experiment files here, along with a full set of fields you can use.
+Note that to get an example experiments.yaml template for any cloud, you can always do:
+
+```bash
+$ flux-cloud experiment init --cloud minikube
+$ flux-cloud experiment init --cloud aws
+$ flux-cloud experiment init --cloud google
+```
+
+The documentation here outlines the sections in details, however the above is the best
+means to get an entire, holistic file.
## Experiment Definition
@@ -29,6 +38,7 @@ matrix:
size: [2, 4]
machine: ["n1-standard-1", "n1-standard-2"]
```
+
Note that the sizes at this level indicate *the size of the Kubernetes cluster*. We
will expand on this idea later. This would run each size across each machine, for a total of 4 Kubernetes clusters created.
The number of custom resource (CRD) definitions applied to each one would vary based on the number of jobs.
@@ -167,78 +177,85 @@ jobs:
osu_get_latency:
command: './osu_get_latency'
image: ghcr.io/awesome/science:latest
- workdir: /path/to/science
+ working_dir: /path/to/science
repeats: 3
```
For repeats, we add another level to the output directory, and represent the result data as
-subdirectories of the machine and size from 1..N. Note also that likely in the future we
-can provide a default template and require all these variables
-defined. For now we require you to provide the template.
-
-
-## Custom Resource Definition
-
-> minicluster-template.yaml
+subdirectories of the machine and size from 1..N.
-The custom resource definition template "CRD" is currently suggested so you can customize exactly to your liking,
-but it's not required. It is used by flux-cloud to populate your job metadata and then submit one or more jobs to your Kubernetes cluster.
+#### Flux Options
-### Use Your Own
-
-Here is an example that uses a shared working directory (so it's hard coded) and a variable
-for the command:
+How do job parameters map to Flux, in the case of using `flux-cloud submit`? Good question! Here is the mapping:
```yaml
-apiVersion: flux-framework.org/v1alpha1
-kind: MiniCluster
-metadata:
- name: {{ minicluster.name }}
- namespace: {{ minicluster.namespace }}
-spec:
- # Number of pods to create for MiniCluster
- size: {{ minicluster.size }}
-
- # Disable verbose output
- logging:
- quiet: true
-
- # This is a list because a pod can support multiple containers
- containers:
- # The container URI to pull (currently needs to be public)
- - image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
-
- # You can set the working directory if your container WORKDIR is not correct.
- workingDir: /home/flux/examples/reaxff/HNS
- command: {{ job.command }}
+jobs:
+ example-job:
+ command: './osu_get_latency'
+ flux_option_flags: "-ompi=openmpi@5"
+ working_dir: /opt/osu-benchmark/build.openmpi/libexec/osu-micro-benchmarks/mpi/one-sided # workdir
+ image: ghcr.io/rse-ops/osu-microbench:test
+
+ # osu benchmarks requires exactly 2 processes
+ tasks: 2 # num_tasks
+ cores: 1 # cores_per_task
+ gpus: 0 # gpus_per_task
+ nodes: 1 # num_nodes
```
-### Use The Default
-
-To use the default, you want to make sure that you provide all variables that are required.
-The following are required (and have defaults or are otherwise generated by flux cloud
-so you could leave them out of your experiments.yaml):
-
-- minicluster.name
-- minicluster.namespace
-- minicluster.local_deploy (defaults to false)
-- minicluster.verbose (default to false to run in test mode)
-
-It's recommended to set your listing of sizes for miniclusters:
+#### Yaml Tricks
-- minicluster.size
+For your jobs, you likely will want to re-use parameters. There is a trick with YAML
+to define a named section, and then re-use it. Here is an example running the OSU
+benchmarks.
-The following are specific to the job and required:
+```yaml
+# matrix of experiments to run - machine types and sizes are required
+# This can obviously be expanded to more sizes or machines,
+matrix:
+ size: [2]
+ machine: ["n1-standard-1", "n1-standard-2"]
-- job.image
-- job.command
+# An example of shared container options!
+x-container-options: &options
+ fluxOptionFlags: "-ompi=openmpi@5"
+ working_dir: /opt/osu-benchmark/build.openmpi/libexec/osu-micro-benchmarks/mpi/one-sided
+ image: ghcr.io/rse-ops/osu-microbench:app-latest
+ # This MUST be run for the certificate generator and workers/broker
+ pre_command: source /etc/profile.d/z10_spack_environment.sh
-The following are specific to the job but not required:
+# Flux Mini Cluster experiment attributes
+minicluster:
+ name: osu-benchmarks
+ namespace: flux-operator
-- job.workdir
-- job.tasks (recommended for better control of flux, as this would default to 1)
-- job.flux_option_flags (e.g., "-ompi=openmpi@5")
-- job.cores (defaults to 1 if not set, likely not ideal for your experiment)
-- job.limits (key value pairs)
-- job.requests (key value pairs)
-- job.pre_command: the job pre-command (usually multiple lines) but not required.
+# Each job can have a command and working directory
+jobs:
+ osu_get_latency:
+ command: './osu_get_latency'
+ <<: *options
+ osu_acc_latency:
+ command: './osu_acc_latency'
+ <<: *options
+ osu_fop_latency:
+ command: './osu_fop_latency'
+ <<: *options
+ osu_get_bw:
+ command: './osu_get_bw'
+ <<: *options
+ osu_put_bibw:
+ command: './osu_put_bibw'
+ <<: *options
+ osu_put_latency:
+ command: './osu_put_latency'
+ <<: *options
+ osu_cas_latency:
+ command: './osu_cas_latency'
+ <<: *options
+ osu_get_acc_latency:
+ command: './osu_get_acc_latency'
+ <<: *options
+ osu_put_bw:
+ command: './osu_put_bw'
+ <<: *options
+```
diff --git a/docs/getting_started/google.md b/docs/getting_started/google.md
deleted file mode 100644
index 9f7c96c..0000000
--- a/docs/getting_started/google.md
+++ /dev/null
@@ -1,74 +0,0 @@
-# Google Cloud
-
-> Running on Google Kubernetes Engine, GKE
-
-The main functionality that flux-cloud provides are easy wrappers (and templates) to running
-the Flux Operator on GKE. The main steps of running experiments are:
-
- - **up** to bring up a cluster
- - **apply** to apply one or more experiments defined by an experiments.yaml
- - **down** to destroy a cluster
-
-Each of these commands can be run in isolation, and we provide a single command **run** to
-automate the entire thing. We emphasize the term "wrapper" as we are using scripts on your
-machine to do the work (e.g., kubectl and gcloud) and importantly, for every step we show
-you the command, and if it fails, give you a chance to bail out. We do this so if you
-want to remove the abstraction at any point and run the commands on your own, you can.
-
-## Pre-requisites
-
-You should first [install gcloud](https://cloud.google.com/sdk/docs/quickstarts)
-and ensure you are logged in and have kubectl installed:
-
-```bash
-$ gcloud auth login
-```
-
-Depending on your install, you can either install with gcloud:
-
-```bash
-$ gcloud components install kubectl
-```
-or just [on your own](https://kubernetes.io/docs/tasks/tools/).
-
-## Cloud
-
-Finally, ensure that google is either your default cloud (the `default_cloud` in your settings.yml)
-or you specify it with `--cloud` when you do run.
-
-
-## Custom Variables
-
-The following custom variables are supported in the "variables" section (key value pairs)
-for Google in an `experiments.yaml`
-
-```yaml
-variables:
- # Customize zone just for this experiment
- zone: us-central1-a
-```
-
-
-## Run Experiments
-
-Each experiment is defined by the matrix and variables in an `experiment.yaml` that is used to
-populate a `minicluster-template.yaml` that you can either provide, or use a template provided by the
-library. One of the goals of the Flux Cloud Experiment runner is not just to run things, but to
-provide this library for you to easily edit and use! Take a look at the [examples](https://github.com/converged-computing/flux-cloud/tree/main/examples)
-directory for a few that we provide. We will walk through a generic one here to launch
-an experiment on a Kubernetes cluster. Note that before doing this step you should
-have installed flux-cloud, along with kubectl and gcloud, and set your defaults (e.g., project zone)
-in your settings.
-
-```bash
-$ flux-cloud run experiments.yaml
-```
-
-Note that since the experiments file defaults to that name, you can also just do:
-
-```bash
-$ flux-cloud run
-```
-
-Given an experiments.yaml in the present working directory. Take a look at an `experients.yaml` in an example directory.
-Note that machines and size are required for the matrix, and variables get piped into all experiments (in full).
diff --git a/docs/getting_started/index.md b/docs/getting_started/index.md
index 30879f8..9fc263c 100644
--- a/docs/getting_started/index.md
+++ b/docs/getting_started/index.md
@@ -10,8 +10,6 @@ install
commands
examples
experiments
+debugging
settings
-google
-aws
-minikube
```
diff --git a/docs/getting_started/minikube.md b/docs/getting_started/minikube.md
deleted file mode 100644
index f851b51..0000000
--- a/docs/getting_started/minikube.md
+++ /dev/null
@@ -1,134 +0,0 @@
-# MiniKube
-
-> Running on a local MiniKube cluster
-
-Flux Cloud (as of version 0.1.0) can run on MiniKube! The main steps of running experiments with
-different container bases are:
-
- - **up** to bring up a cluster
- - **apply** to apply one or more CRDs from experiments defined by an experiments.yaml
- - **down** to destroy a cluster
-
-or one or more commands with the same container base(s):
-
- - **up** to bring up a cluster
- - **submit** to submit one or more experiments to the same set of pods defined by an experiments.yaml
- - **down** to destroy a cluster
-
-Each of these commands can be run in isolation, and we provide a single command **run** to
-automate the entire thing. We emphasize the term "wrapper" as we are using scripts on your
-machine to do the work (e.g., minikube and kubectl) and importantly, for every step we show
-you the command, and if it fails, give you a chance to bail out. We do this so if you
-want to remove the abstraction at any point and run the commands on your own, you can.
-
-## Pre-requisites
-
-You should first [install minikube](https://minikube.sigs.k8s.io/docs/start/)
-and kubectl.
-
-## Run Experiments
-
-Each experiment is defined by the matrix and variables in an `experiment.yaml` that is used to
-populate a `minicluster-template.yaml` that you can either provide, or use a template provided by the
-library. One of the goals of the Flux Cloud Experiment runner is not just to run things, but to
-provide this library for you to easily edit and use! Take a look at the [examples](https://github.com/converged-computing/flux-cloud/tree/main/examples)
-directory for a few that we provide. We will walk through a generic one here to launch
-an experiment on a MiniKube Kubernetes cluster. Note that before doing this step you should
-have installed flux-cloud, along with kubectl and minikube. Note that if it's not the default,
-you'll need to specify using MiniKube
-
-### Apply / Run
-
-> Ideal if you need to run multiple jobs on different containers
-
-```bash
-$ flux-cloud run --cloud minikube experiments.yaml
-```
-
-Or set to the default:
-
-```bash
-$ flux-cloud config set default_cloud:minikube
-```
-
-Given MiniKube is the default, since the experiments file defaults to that name, you can also just do:
-
-```bash
-$ flux-cloud run
-```
-
-Given an experiments.yaml in the present working directory. Take a look at an `experients.yaml` in an example directory.
-Note that only size is required for the matrix for MiniKube (there is currently no concept of a machine,
-although there could be), and variables get piped into all experiments (in full). Under variables,
-both "commands" and "ids" are required, and must be equal in length (each command is assigned to one id
-for output). To just run the first entry in the matrix (test mode) do:
-
-```bash
-$ flux-cloud run experiments.yaml --test
-```
-
-Note that you can also use the other commands in place of a single run, notably "up" "apply" and "down."
-By default, results will be written to a temporary output directory, but you can customize this with `--outdir`.
-Finally, since MiniKube often has trouble pulling images, we recommend you include the container image as a variable
-in the experiment.yaml so it can be pulled before the experiment is run. E.g., this experiment:
-
-```yaml
-matrix:
- size: [4]
-
-# Flux Mini Cluster experiment attributes
-minicluster:
- name: lammps
- namespace: flux-operator
- size: [2, 4]
-
-# Each job can have a command and working directory
-jobs:
- lmp:
- command: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite
- repeats: 2
- image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
-```
-
-And this config file:
-
-```yaml
-apiVersion: flux-framework.org/v1alpha1
-kind: MiniCluster
-metadata:
- name: {{ minicluster.name }}
- namespace: {{ minicluster.namespace }}
-spec:
- # Number of pods to create for MiniCluster
- size: {{ minicluster.size }}
-
- # Disable verbose output
- logging:
- quiet: true
-
- # This is a list because a pod can support multiple containers
- containers:
- # The container URI to pull (currently needs to be public)
- - image: {{ job.image }}
-
- # You can set the working directory if your container WORKDIR is not correct.
- workingDir: /home/flux/examples/reaxff/HNS
- command: {{ job.command }}
-```
-
-### Submit
-
-> Ideal for one or more commands across the same container(s) and MiniCluster size.
-
-```bash
-$ flux-cloud up --cloud minikube
-$ flux-cloud submit --cloud minikube
-$ flux-cloud down --cloud minikube
-```
-
-The submit will always check if the MiniCluster is already created, and if not, create it
-to submit jobs. For submit (and the equivalent to bring it up and down with batch)
-your commands aren't provided in the CRD,
-but rather to the Flux Restful API. Submit / batch will also generate one CRD
-per MiniCluster size, but use the same MiniCluster across jobs. This is different
-from apply, which generates one CRD per job to run.
diff --git a/docs/index.rst b/docs/index.rst
index efdb1a1..d9804fb 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -52,15 +52,15 @@ to unite the worlds and technologies typical of cloud computing and
high performance computing.
To get started, check out the links below!
-Would you like to request a feature or contribute?
-[Open an issue](https://github.com/flux-framework/flux-cloud/issues).
+Would you like to request a feature or contribute? `Open an issue <https://github.com/flux-framework/flux-cloud/issues>`_.
.. toctree::
:caption: Getting Started
- :maxdepth: 1
+ :maxdepth: 2
getting_started/index.md
+ tutorials/index.md
.. toctree::
:caption: About
diff --git a/docs/getting_started/aws.md b/docs/tutorials/aws.md
similarity index 54%
rename from docs/getting_started/aws.md
rename to docs/tutorials/aws.md
index 2ef4a6c..be1a0b3 100644
--- a/docs/getting_started/aws.md
+++ b/docs/tutorials/aws.md
@@ -1,19 +1,14 @@
# AWS
> Running on Amazon Elastic Kubernetes Service EKS
+Flux Cloud (as of version 0.1.0) can run on Amazon EKS! There are two primary use cases for using flux-cloud:
-The flux-cloud software provides are easy wrappers (and templates) to running
-the Flux Operator on Amazon. The main steps of running experiments are:
+ - **apply** is good for many larger experiments that require different container bases and / or take a longer time to run.
+ - **submit** is good for smaller experiments that might use the same container bases and / or take a shorter time to run.
- - **up** to bring up a cluster
- - **apply** to apply one or more experiments defined by an experiments.yaml
- - **down** to destroy a cluster
-
-Each of these commands can be run in isolation, and we provide a single command **run** to
-automate the entire thing. We emphasize the term "wrapper" as we are using scripts on your
-machine to do the work (e.g., kubectl and gcloud) and importantly, for every step we show
-you the command, and if it fails, give you a chance to bail out. We do this so if you
-want to remove the abstraction at any point and run the commands on your own, you can.
+For the latter (submit) we will bring up the minimum number of MiniClusters required (unique based on container image size)
+and launch all jobs across them, using Flux as a scheduler. As of version 0.2.0 both commands use the fluxoperator Python
+SDK, so we only use bash scripts to bring up and down cloud-specific clusters.
## Pre-requisites
@@ -45,6 +40,52 @@ This is used so you can ssh (connect) to your workers!
Finally, ensure that aws is either your default cloud (the `default_cloud` in your settings.yml)
or you specify it with `--cloud` when you do run.
+## Run Experiments
+
+**IMPORTANT** for any experiment when you choose an instance type, you absolutely
+need to choose a size that has [IsTrunkingCompatible](https://github.com/aws/amazon-vpc-resource-controller-k8s/blob/master/pkg/aws/vpc/limits.go)
+true. E.g., `m5.large` has it set to true so it would work. Each experiment is defined by the matrix and variables in an `experiment.yaml`. It's recommended you
+start with a template populated for aws:
+
+```bash
+$ flux-cloud experiment init --cloud aws
+```
+
+And see the [custom variables](#custom-variables) defined below to learn more about them,
+or the [examples](https://github.com/converged-computing/flux-cloud/tree/main/examples)
+directory for a few examples that we provide. We will walk through a generic one here to launch
+an experiment on a Kubernetes cluster. Note that before doing this step you should
+have installed flux-cloud, along with eksctl, and set your defaults (e.g., project zone)
+in your settings.
+
+Given an experiments.yaml in the present working directory, you can do an apply,
+meaning creating a separate MiniCluster per job:
+
+```bash
+# Up / apply / down
+$ flux-cloud run --cloud aws
+
+# Manual up / apply / down (recommended)
+$ flux-cloud up --cloud aws
+$ flux-cloud apply --cloud aws
+$ flux-cloud down --cloud aws
+```
+
+Or submit, creating shared MiniClusters to submit multiple jobs to:
+
+```bash
+# Up / submit / down
+$ flux-cloud batch --cloud aws
+
+# Manual up / submit / down (recommended)
+$ flux-cloud up --cloud aws
+$ flux-cloud submit --cloud aws
+$ flux-cloud down --cloud aws
+```
+
+Note that machines and size are required for the matrix.
+
+
## Custom Variables
The following custom variables are supported in the "variables" section (key value pairs)
@@ -74,39 +115,3 @@ variables:
Note that we currently take a simple approach for boolean values - if it's present (e.g., the examples)
above) it will be rendered as true. Don't put False in there, but rather just delete the key.
-
-## Run Experiments
-
-**IMPORTANT** for any experiment when you choose an instance type, you absolutely
-need to choose a size that has [IsTrunkingCompatible](https://github.com/aws/amazon-vpc-resource-controller-k8s/blob/master/pkg/aws/vpc/limits.go)
-true. E.g., `m5.large` has it set to true so it would work.
-
-Each experiment is defined by the matrix and variables in an `experiment.yaml` that is used to
-populate a `minicluster-template.yaml` that you can either provide, or use a template provided by the
-library. One of the goals of the Flux Cloud Experiment runner is not just to run things, but to
-provide this library for you to easily edit and use! Take a look at the [examples](https://github.com/converged-computing/flux-cloud/tree/main/examples)
-directory for a few that we provide. We will walk through a generic one here to launch
-an experiment on a Kubernetes cluster. Note that before doing this step you should
-have installed flux-cloud, along with kubectl and gcloud, and set your defaults (e.g., project zone)
-in your settings.
-
-```bash
-$ flux-cloud run experiments.yaml
-```
-
-Note that since the experiments file defaults to that name, you can also just do:
-
-```bash
-$ flux-cloud run
-```
-
-Or for more control and/or verbosity:
-
-```bash
-$ flux-cloud --debug up --cloud aws
-$ flux-cloud --debug apply --cloud aws
-$ flux-cloud --debug down --cloud aws
-```
-
-Given an experiments.yaml in the present working directory. Take a look at an `experients.yaml` in an example directory.
-Note that machines and size are required for the matrix, and variables get piped into all experiments (in full).
diff --git a/docs/tutorials/google.md b/docs/tutorials/google.md
new file mode 100644
index 0000000..cf4616a
--- /dev/null
+++ b/docs/tutorials/google.md
@@ -0,0 +1,91 @@
+# Google Cloud
+
+> Running on Google Kubernetes Engine, GKE
+
+The main functionality that flux-cloud provides are easy wrappers (and templates) to running
+the Flux Operator on GKE. The main steps of running experiments are:
+
+ - **up** to bring up a cluster
+ - **apply/submit** to apply or submit one or more experiments defined by an experiments.yaml
+ - **down** to destroy a cluster
+
+Each of these commands can be run in isolation, and we provide single commands **run/batch** to
+automate the entire thing. For Google Cloud, you can see a small collection of [examples here](https://github.com/converged-computing/flux-cloud/tree/main/examples/google).
+
+## Pre-requisites
+
+You should first [install gcloud](https://cloud.google.com/sdk/docs/quickstarts)
+and ensure you are logged in and have kubectl installed:
+
+```bash
+$ gcloud auth login
+```
+
+Depending on your install, you can either install with gcloud:
+
+```bash
+$ gcloud components install kubectl
+```
+or just [on your own](https://kubernetes.io/docs/tasks/tools/).
+
+## Cloud
+
+Finally, ensure that google is either your default cloud (the `default_cloud` in your settings.yml)
+or you specify it with `--cloud` when you do run.
+
+## Custom Variables
+
+The following custom variables are supported in the "variables" section (key value pairs)
+for Google in an `experiments.yaml`
+
+```yaml
+variables:
+ # Customize zone just for this experiment
+ zone: us-central1-a
+```
+
+
+## Run Experiments
+
+You can create an empty experiment template as follows:
+
+```bash
+$ flux-cloud experiment init --cloud google
+```
+
+Each experiment is defined by the matrix and variables in an `experiment.yaml`
+One of the goals of the Flux Cloud Experiment runner is not just to run things, but to
+provide this library for you to easily edit and use! Take a look at the [examples](https://github.com/converged-computing/flux-cloud/tree/main/examples)
+directory for a few that we provide. We will walk through a generic one here to launch
+an experiment on a Kubernetes cluster. Note that before doing this step you should
+have installed flux-cloud, along with gcloud, and set your defaults (e.g., project zone)
+in your settings.
+
+Given an experiments.yaml in the present working directory, you can do an apply,
+meaning creating a separate MiniCluster per job:
+
+```bash
+# Up / apply / down
+$ flux-cloud run --cloud google
+
+# Manual up / apply / down (recommended)
+$ flux-cloud --debug up --cloud google
+$ flux-cloud --debug apply --cloud google
+$ flux-cloud --debug down --cloud google
+```
+
+For any of the commands here, add `--debug` after `flux-cloud` to see more verbosity.
+Or submit, creating shared MiniClusters to submit multiple jobs to:
+
+```bash
+# Up / submit / down
+$ flux-cloud batch --cloud google
+
+# Manual up / submit / down (recommended)
+$ flux-cloud --debug up --cloud google
+$ flux-cloud --debug submit --cloud google
+$ flux-cloud --debug down --cloud google
+```
+
+Note that machines and size are required for the matrix. See our [debugging guide](../getting_started/debugging.md)
+for the Flux Operator for interacting with Flux Operator containers or debugging.
diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md
new file mode 100644
index 0000000..b545d8f
--- /dev/null
+++ b/docs/tutorials/index.md
@@ -0,0 +1,11 @@
+# Tutorials
+
+These tutorials will walk through common use cases for Flux Cloud! If you have
+any questions or issues, please [let us know](https://github.com/flux-framework/flux-cloud/issues)
+
+```{toctree}
+:maxdepth: 3
+minikube
+google
+aws
+```
diff --git a/docs/tutorials/minikube.md b/docs/tutorials/minikube.md
new file mode 100644
index 0000000..4e68e4e
--- /dev/null
+++ b/docs/tutorials/minikube.md
@@ -0,0 +1,313 @@
+# MiniKube
+
+> Running on a local MiniKube cluster
+
+Flux Cloud (as of version 0.1.0) can run on MiniKube! There are two primary use cases for using flux-cloud:
+
+ - **apply** is good for many larger experiments that require different container bases and / or take a longer time to run.
+ - **submit** is good for smaller experiments that might use the same container bases and / or take a shorter time to run.
+
+For the latter (submit) we will bring up the minimum number of MiniClusters required (unique based on container image size)
+and launch all jobs across them, using Flux as a scheduler. As of version 0.2.0 both commands use the fluxoperator Python
+SDK, so we only use bash scripts to bring up and down cloud-specific clusters.
+
+
+## Pre-requisites
+
+You should first [install minikube](https://minikube.sigs.k8s.io/docs/start/)
+and kubectl.
+
+## Run Experiments
+
+Let's start with a simple `experiments.yaml` file, where we have defined a number of different
+experiments to run on MiniKube. `flux-cloud submit` relies entirely on this experiment file,
+and programmatically generates the MiniCluster [custom resource definitions](https://flux-framework.org/flux-operator/getting_started/custom-resource-definition.html#workingdir)
+for you, so you don't need to provide any kind of template.
+
+
+
+How does it work?
+
+A YAML file (such as the experiments.yaml) can be serialized to JSON, so each section under "jobs" is
+also json, or actually (in Python) a dictionary of values. Since the values are passed to the
+[Flux Operator Python SDK](https://github.com/flux-framework/flux-operator/tree/main/sdk/python/v1alpha1),
+we can map them easily according to the following convention. Let's say we have a job in the experiments listing:
+
+```yaml
+jobs:
+ # This is the start of the named job
+ reaxc-hns:
+
+ # These are attributes for the MiniCluster (minus repeats)
+ command: 'lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite'
+ image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
+ repeats: 5
+ working_dir: /home/flux/examples/reaxff/HNS
+```
+
+The content under the job name "reaxc-hns" would be mapped to the MiniCluster container as follows:
+
+```python
+from fluxoperator.models import MiniClusterContainer
+
+container = MiniClusterContainer(
+ image="ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0",
+ working_dir="/home/flux/examples/reaxff/HNS",
+ command="lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite",
+ run_flux=True,
+)
+```
+
+Note that in the above, since Go is in camel case and the Python SDK turns it into snake case,
+`workingDir` is changed to `working_dir`.
+
+
+
+
+Let's start with this set of experiments. Note that we've provided the same container
+for all of them, meaning that we will only be creating one MiniCluster with that container.
+If you provide jobs with separate containers, they will be brought up as separate clusters
+to run (per each unique container, with all jobs matched to it).
+
+```yaml
+# This is intended for MiniKube, so no machine needed.
+# We will create a MiniKube cluster of size 2
+matrix:
+ size: [2]
+
+# Flux Mini Cluster experiment attributes
+minicluster:
+ name: submit-jobs
+ namespace: flux-operator
+ # Each of these sizes will be brought up and have commands run across it
+ size: [2]
+
+# Each of command and image are required to do a submit!
+jobs:
+ reaxc-hns:
+ command: 'lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite'
+ image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
+ repeats: 5
+ working_dir: /home/flux/examples/reaxff/HNS
+ sleep:
+ command: 'sleep 5'
+ image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
+ repeats: 5
+ working_dir: /home/flux/examples/reaxff/HNS
+ hello-world:
+ command: 'echo hello world'
+ image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
+ repeats: 5
+ working_dir: /home/flux/examples/reaxff/HNS
+```
+
+Each experiment is defined by the matrix and variables in an `experiment.yaml`, as shown above.
+Note that the easiest way to get started is to use an existing example, or run:
+
+```bash
+$ flux-cloud experiment init --cloud minikube
+```
+
+In the example above, we are targeting minikube.
+
+
+### Apply / Run
+
+> Ideal if you need to run multiple jobs on different containers
+
+This apply/run workflow will create a new MiniCluster each time (pods up and down)
+and not use Flux as a scheduler proper. A workflow might look like:
+
+```bash
+$ flux-cloud up --cloud minikube
+$ flux-cloud apply --cloud minikube
+$ flux-cloud down --cloud minikube
+```
+Or achieve all three with:
+
+```bash
+$ flux-cloud run --cloud minikube
+```
+
+Let's run this with our `experiments.yaml` above in the present working directory,
+and after having already run `up`:
+
+```bash
+# Also print output to the terminal (so you can watch!)
+$ flux-cloud --debug apply --cloud minikube
+
+# Only save output to output files
+$ flux-cloud apply --cloud minikube
+```
+
+At the end of the run, you'll have an organized output directory with all of your
+output logs, along with saved metadata about the minicluster, pods, and nodes.
+
+```bash
+
+```
+
+### Submit
+
+> Ideal for one or more commands across one or more containers and MiniCluster sizes
+
+The idea behind a submit is that we are going to create the minimal number of MiniClusters you
+need (across the set of unique sizes and images) and then submit all jobs to Flux within
+the MiniCluster. The submit mode is actually using Flux as a scheduler and not just a
+"one job" running machine. A basic submit workflow using the config above might look like this:
+
+```bash
+$ flux-cloud up --cloud minikube
+$ flux-cloud submit --cloud minikube
+$ flux-cloud down --cloud minikube
+```
+
+Instead of running one job at a time and waiting for output (e.g., apply), we
+submit all the jobs and then poll every 30 seconds to get job statuses.
+
+
+
+View full output of submit command
+
+```bash
+$ flux-cloud --debug submit --cloud minikube
+```
+```console
+No experiment ID provided, assuming first experiment k8s-size-4-n1-standard-1.
+Job experiments file generated 1 MiniCluster(s).
+
+🌀 Bringing up MiniCluster of size 2 with image ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
+All pods are in states "Running" or "Completed"
+💾 Creating output directory /home/vanessa/Desktop/Code/flux/flux-cloud/examples/up-submit-down/data/minikube
+MiniCluster created with credentials:
+ FLUX_USER=fluxuser
+ FLUX_TOKEN=d467215d-d07d-4c32-b2b9-41643cda3d7d
+All pods are in states "Running" or "Completed"
+Found broker pod lammps-job-0-ng8pz
+
+Waiting for http://lammps-job-0-ng8pz.pod.flux-operator.kubernetes:5000 to be ready
+🪅️ RestFUL API server is ready!
+.
+Port forward opened to http://lammps-job-0-ng8pz.pod.flux-operator.kubernetes:5000
+Submitting reaxc-hns-1-minicluster-size-2: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite
+Submitting reaxc-hns-2-minicluster-size-2: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite
+Submitting reaxc-hns-3-minicluster-size-2: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite
+Submitting reaxc-hns-4-minicluster-size-2: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite
+Submitting reaxc-hns-5-minicluster-size-2: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite
+Submitting sleep-1-minicluster-size-2: sleep 5
+Submitting sleep-2-minicluster-size-2: sleep 5
+Submitting sleep-3-minicluster-size-2: sleep 5
+Submitting sleep-4-minicluster-size-2: sleep 5
+Submitting sleep-5-minicluster-size-2: sleep 5
+Submitting hello-world-1-minicluster-size-2: echo hello world
+Submitting hello-world-2-minicluster-size-2: echo hello world
+Submitting hello-world-3-minicluster-size-2: echo hello world
+Submitting hello-world-4-minicluster-size-2: echo hello world
+Submitting hello-world-5-minicluster-size-2: echo hello world
+Submit 15 jobs! Waiting for completion...
+15 are active.
+ lmp is in state RUN
+ lmp is in state RUN
+ lmp is in state SCHED
+ lmp is in state SCHED
+ lmp is in state SCHED
+ sleep is in state SCHED
+ sleep is in state SCHED
+ sleep is in state SCHED
+ sleep is in state SCHED
+ sleep is in state SCHED
+ echo is in state SCHED
+ echo is in state SCHED
+ echo is in state SCHED
+ echo is in state SCHED
+ echo is in state SCHED
+15 are active.
+ lmp is finished COMPLETED in 28.64 seconds.
+ lmp is finished COMPLETED in 29.1 seconds.
+ lmp is in state RUN
+ lmp is in state RUN
+ lmp is in state SCHED
+ sleep is in state SCHED
+ sleep is in state SCHED
+ sleep is in state SCHED
+ sleep is in state SCHED
+ sleep is in state SCHED
+ echo is in state SCHED
+ echo is in state SCHED
+ echo is in state SCHED
+ echo is in state SCHED
+ echo is in state SCHED
+13 are active.
+ lmp is in state RUN
+ lmp is in state RUN
+ lmp is in state SCHED
+ sleep is in state SCHED
+ sleep is in state SCHED
+ sleep is in state SCHED
+ sleep is in state SCHED
+ sleep is in state SCHED
+ echo is in state SCHED
+ echo is in state SCHED
+ echo is in state SCHED
+ echo is in state SCHED
+ echo is in state SCHED
+13 are active.
+ lmp is finished COMPLETED in 36.56 seconds.
+ lmp is finished COMPLETED in 35.89 seconds.
+ lmp is in state RUN
+ sleep is finished COMPLETED in 5.02 seconds.
+ sleep is finished COMPLETED in 5.02 seconds.
+ sleep is finished COMPLETED in 5.02 seconds.
+ sleep is in state RUN
+ sleep is in state SCHED
+ echo is in state SCHED
+ echo is in state SCHED
+ echo is in state SCHED
+ echo is in state SCHED
+ echo is in state SCHED
+8 are active.
+ lmp is finished COMPLETED in 24.6 seconds.
+ sleep is finished COMPLETED in 5.02 seconds.
+ sleep is finished COMPLETED in 5.02 seconds.
+ echo is finished COMPLETED in 0.01 seconds.
+ echo is finished COMPLETED in 0.02 seconds.
+ echo is finished COMPLETED in 0.02 seconds.
+ echo is finished COMPLETED in 0.01 seconds.
+ echo is finished COMPLETED in 0.01 seconds.
+All jobs are complete! Cleaning up MiniCluster...
+All pods are terminated.
+```
+
+
+
+After submit, you will still have an organized output directory with job output files
+and metadata.
+
+```bash
+$ tree -a data/minikube/
+data/minikube/
+└── k8s-size-4-n1-standard-1
+ ├── hello-world-1-minicluster-size-2
+ │ └── log.out
+ ├── hello-world-2-minicluster-size-2
+ │ └── log.out
+ ├── hello-world-3-minicluster-size-2
+ │ └── log.out
+ ├── hello-world-4-minicluster-size-2
+ │ └── log.out
+ ├── hello-world-5-minicluster-size-2
+ │ └── log.out
+ ├── meta.json
+ ├── reaxc-hns-1-minicluster-size-2
+ │ └── log.out
+ ├── reaxc-hns-2-minicluster-size-2
+ │ └── log.out
+ ├── reaxc-hns-3-minicluster-size-2
+ │ └── log.out
+ ├── reaxc-hns-4-minicluster-size-2
+ │ └── log.out
+ ├── reaxc-hns-5-minicluster-size-2
+ │ └── log.out
+ └── .scripts
+ └── minicluster-size-2-lammps-job-ghcr.io-rse-ops-lammps-flux-sched-focal-v0.24.0.json
+```
diff --git a/examples/aws-lammps/README.md b/examples/aws-lammps/README.md
deleted file mode 100644
index 99b7d2c..0000000
--- a/examples/aws-lammps/README.md
+++ /dev/null
@@ -1,71 +0,0 @@
-# Lammps on Amazon Cloud
-
-In this set of experiments we will run the Flux Operator on Amazon Cloud at size N=2
-(the benchmarks require this) and multiple machine types.
-
-## Pre-requisites
-
-You should first [install eksctrl](https://github.com/weaveworks/eksctl) and make sure you have access to an AWS cloud (e.g.,
-with credentials or similar in your environment). E.g.,:
-
-```bash
-export AWS_ACCESS_KEY_ID=xxxxxxxxxxxxxxxxxxx
-export AWS_SECRET_ACCESS_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
-export AWS_SESSION_TOKEN=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
-```
-
-The last session token may not be required depending on your setup.
-We assume you also have [kubectl](https://kubernetes.io/docs/tasks/tools/).
-
-### Setup SSH
-
-You'll need an ssh key for EKS. Here is how to generate it:
-
-```bash
-ssh-keygen
-# Ensure you enter the path to ~/.ssh/id_eks
-```
-
-This is used so you can ssh (connect) to your workers!
-
-### Cloud
-
-we will be using [Flux Cloud](https://github.com/converged-computing/flux-cloud)
-to run the Operator on Google Cloud Kubernetes engine.
-
-```bash
-$ pip install flux-cloud
-```
-
-Note that these experiments were run with version 0.1.0.
-Ensure that aws is either your default cloud (the `default_cloud` in your settings.yml)
-or you specify it with `--cloud` when you do run.
-
-
-## Run Experiments
-
-Each experiment here is defined by the matrix and variables in [experiments.yaml](experiment.yaml) that is used to
-populate a [minicluster-template.yaml](minicluster-template.yaml) and launch a Kubernetes cluster.
-You can read the documentation for flux-cloud to understand the variables available.
-This tutorial assumes you have flux-cloud installed and configured. See all unique Kubernetes clusters
-we will run the jobs on:
-
-```bash
-$ flux-cloud list
-```
-
-Then you can either run all at once:
-
-```bash
-$ flux-cloud run --force-cluster
-```
-
-Or (for testing) to bring up just the first cluster and then manually apply:
-
-```bash
-$ flux-cloud --debug up --cloud aws
-$ flux-cloud --debug apply --cloud aws
-$ flux-cloud --debug down --cloud aws
-```
-
-By default, results will be written to a [./data](data) directory, but you can customize this with `--outdir`.
diff --git a/examples/aws-lammps/experiments.yaml b/examples/aws-lammps/experiments.yaml
deleted file mode 100644
index e694254..0000000
--- a/examples/aws-lammps/experiments.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-# matrix of experiments to run - machine types and sizes are required
-
-# These are mini runs intended for testing
-matrix:
- size: [8]
- machine: ["m5.large"]
-
-# Flux Mini Cluster experiment attributes
-minicluster:
- name: lammps
- namespace: flux-operator
- size: [2, 4, 6, 8]
-
-# Each job can have a command and working directory
-jobs:
- lmp:
- command: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite
- repeats: 3
diff --git a/examples/aws-lammps/minicluster-template.yaml b/examples/aws-lammps/minicluster-template.yaml
deleted file mode 100644
index 7591645..0000000
--- a/examples/aws-lammps/minicluster-template.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-apiVersion: flux-framework.org/v1alpha1
-kind: MiniCluster
-
-metadata:
- name: {{ minicluster.name }}
- namespace: {{ minicluster.namespace }}
-spec:
- # Number of pods to create for MiniCluster
- size: {{ minicluster.size }}
-
- # This is a list because a pod can support multiple containers
- containers:
- - image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
- workingDir: /home/flux/examples/reaxff/HNS
- command: {{ job.command }}
diff --git a/examples/google/osu-benchmarks/README.md b/examples/google/osu-benchmarks/README.md
new file mode 100644
index 0000000..0bd718c
--- /dev/null
+++ b/examples/google/osu-benchmarks/README.md
@@ -0,0 +1,5 @@
+# OSU Benchmarks
+
+This example demonstrates how to setup an [experiments.yaml](experiments.yaml)
+to run on Google Cloud. See the [Google Cloud tutorials](https://converged-computing.github.io/flux-cloud/tutorials/google.html)
+for how to run this tutorial.
diff --git a/examples/google/osu-benchmarks/experiments.yaml b/examples/google/osu-benchmarks/experiments.yaml
new file mode 100644
index 0000000..14e6806
--- /dev/null
+++ b/examples/google/osu-benchmarks/experiments.yaml
@@ -0,0 +1,59 @@
+# matrix of experiments to run - machine types and sizes are required
+# This can obviously be expanded to more sizes or machines,
+matrix:
+ size: [2]
+ machine: ["n1-standard-1", "n1-standard-2"]
+
+# An example of shared container options
+x-container-options: &options
+ flux_option_flags: "-ompi=openmpi@5"
+ working_dir: /opt/osu-benchmark/build.openmpi/libexec/osu-micro-benchmarks/mpi/one-sided
+ image: ghcr.io/rse-ops/osu-microbench:test
+
+ # This will get passed during a flux submit
+ tasks: 2
+
+# res = cli.submit(command=job["command"], num_tasks=2, cores_per_task=1, workdir=kwargs['workdir'])
+
+# Flux Mini Cluster experiment attributes
+minicluster:
+ name: osu-benchmarks
+ namespace: flux-operator
+
+ # provide credentials if you want to re-use a minicluster
+ flux_restful:
+ username: fluxuser
+ token: "123456"
+
+ # osu benchmarks requires exactly 2 processes
+ tasks: 2
+
+# Each job can have a command and working directory
+jobs:
+ osu_get_latency:
+ command: './osu_get_latency'
+ <<: *options
+ osu_acc_latency:
+ command: './osu_acc_latency'
+ <<: *options
+ osu_fop_latency:
+ command: './osu_fop_latency'
+ <<: *options
+ osu_get_bw:
+ command: './osu_get_bw'
+ <<: *options
+ osu_put_bibw:
+ command: './osu_put_bibw'
+ <<: *options
+ osu_put_latency:
+ command: './osu_put_latency'
+ <<: *options
+ osu_cas_latency:
+ command: './osu_cas_latency'
+ <<: *options
+ osu_get_acc_latency:
+ command: './osu_get_acc_latency'
+ <<: *options
+ osu_put_bw:
+ command: './osu_put_bw'
+ <<: *options
diff --git a/examples/minikube/basic/README.md b/examples/minikube/basic/README.md
new file mode 100644
index 0000000..0a2a6aa
--- /dev/null
+++ b/examples/minikube/basic/README.md
@@ -0,0 +1,3 @@
+# Up, Submit, Down
+
+See the [minikube tutorials](https://converged-computing.github.io/flux-cloud/tutorials/minikube.html) for how to run this tutorial.
diff --git a/examples/up-submit-down/experiments.yaml b/examples/minikube/basic/experiments.yaml
similarity index 64%
rename from examples/up-submit-down/experiments.yaml
rename to examples/minikube/basic/experiments.yaml
index 880e652..ec0ce6a 100644
--- a/examples/up-submit-down/experiments.yaml
+++ b/examples/minikube/basic/experiments.yaml
@@ -1,7 +1,6 @@
# This is intended for MiniKube, so no machine needed
matrix:
size: [4]
- machine: [n1-standard-1]
# Flux Mini Cluster experiment attributes
minicluster:
@@ -10,22 +9,19 @@ minicluster:
# Each of these sizes will be brought up and have commands run across it
size: [2]
-# Since we are creating a minicluster here to submit commands across
-# on the same container, the container is required here. If you specify
-# a size here, the image must be the same across sizes
jobs:
reaxc-hns:
command: 'lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite'
image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
repeats: 5
- workdir: /home/flux/examples/reaxff/HNS
+ working_dir: /home/flux/examples/reaxff/HNS
sleep:
command: 'sleep 5'
image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
repeats: 5
- workdir: /home/flux/examples/reaxff/HNS
+ working_dir: /home/flux/examples/reaxff/HNS
hello-world:
command: 'echo hello world'
image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
repeats: 5
- workdir: /home/flux/examples/reaxff/HNS
+ working_dir: /home/flux/examples/reaxff/HNS
diff --git a/examples/minikube/logging/README.md b/examples/minikube/logging/README.md
new file mode 100644
index 0000000..779e469
--- /dev/null
+++ b/examples/minikube/logging/README.md
@@ -0,0 +1,3 @@
+# Logging
+
+This experiments.yaml shows how to customize the MiniCluster logging.
diff --git a/examples/minikube/logging/experiments.yaml b/examples/minikube/logging/experiments.yaml
new file mode 100644
index 0000000..d457541
--- /dev/null
+++ b/examples/minikube/logging/experiments.yaml
@@ -0,0 +1,24 @@
+# This is intended for MiniKube, so no machine needed
+matrix:
+ size: [4]
+
+# Flux Mini Cluster experiment attributes
+minicluster:
+ name: lammps-job
+ namespace: flux-operator
+ # Each of these sizes will be brought up and have commands run across it
+ size: [2]
+
+ # How to set logging attributes
+ logging:
+ debug: False # defaults to False
+ quiet: True # defaults to False
+ strict: False # defaults to True
+ timed: False # defaults to False, requires time in containers
+
+jobs:
+ reaxc-hns:
+ command: 'lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite'
+ image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
+ repeats: 2
+ working_dir: /home/flux/examples/reaxff/HNS
diff --git a/examples/minikube/osu-benchmarks/README.md b/examples/minikube/osu-benchmarks/README.md
new file mode 100644
index 0000000..8b8ec6b
--- /dev/null
+++ b/examples/minikube/osu-benchmarks/README.md
@@ -0,0 +1,5 @@
+# OSU Benchmarks
+
+This example demonstrates how to setup an [experiments.yaml](experiments.yaml)
+to run on MiniKube. See the [MiniKube tutorials](https://converged-computing.github.io/flux-cloud/tutorials/minikube.html)
+for how to run this tutorial.
diff --git a/examples/minikube/osu-benchmarks/experiments.yaml b/examples/minikube/osu-benchmarks/experiments.yaml
new file mode 100644
index 0000000..d278aa5
--- /dev/null
+++ b/examples/minikube/osu-benchmarks/experiments.yaml
@@ -0,0 +1,65 @@
+# matrix of experiments to run - machine types and sizes are required
+
+# This can obviously be expanded to more sizes or machines,
+matrix:
+ size: [2]
+ #machine: ["n1-standard-1", "n1-standard-2"]
+
+# TODO
+# when we get this working, save to experiments-full.yaml, move to minikube, have shortened version run for test
+# then test this on google cloud
+# flux operator / python api still need to be released - maybe only allow pam for auth?
+
+# An example of shared container options
+x-container-options: &options
+ flux_option_flags: "-ompi=openmpi@5"
+ working_dir: /opt/osu-benchmark/build.openmpi/libexec/osu-micro-benchmarks/mpi/one-sided
+ image: ghcr.io/rse-ops/osu-microbench:test
+
+ # This will get passed during a flux submit
+ tasks: 2
+
+# res = cli.submit(command=job["command"], num_tasks=2, cores_per_task=1, workdir=kwargs['workdir'])
+
+# Flux Mini Cluster experiment attributes
+minicluster:
+ name: osu-benchmarks
+ namespace: flux-operator
+
+ # provide credentials if you want to re-use a minicluster
+ flux_restful:
+ username: fluxuser
+ token: "123456"
+
+ # osu benchmarks requires exactly 2 processes
+ tasks: 2
+
+# Each job can have a command and working directory
+jobs:
+ osu_get_latency:
+ command: './osu_get_latency'
+ <<: *options
+ osu_acc_latency:
+ command: './osu_acc_latency'
+ <<: *options
+ osu_fop_latency:
+ command: './osu_fop_latency'
+ <<: *options
+ osu_get_bw:
+ command: './osu_get_bw'
+ <<: *options
+ osu_put_bibw:
+ command: './osu_put_bibw'
+ <<: *options
+ osu_put_latency:
+ command: './osu_put_latency'
+ <<: *options
+ osu_cas_latency:
+ command: './osu_cas_latency'
+ <<: *options
+ osu_get_acc_latency:
+ command: './osu_get_acc_latency'
+ <<: *options
+ osu_put_bw:
+ command: './osu_put_bw'
+ <<: *options
diff --git a/examples/minikube/persistent/README.md b/examples/minikube/persistent/README.md
new file mode 100644
index 0000000..5636c77
--- /dev/null
+++ b/examples/minikube/persistent/README.md
@@ -0,0 +1,49 @@
+# Persistent
+
+This is a trick to get a MiniCluster up and running (and have it stay running)!
+
+ - For **submit** we run a job that will never complete
+ - For **apply** we do the same!
+
+I typically use this case to debug one or the other. E.g., (given MiniKube is running with the operator installed):
+
+```bash
+$ flux-cloud --debug submit --cloud minikube
+```
+
+Then get the pod
+
+```bash
+$ kubectl get -n flux-operator pods
+NAME READY STATUS RESTARTS AGE
+sleep-job-0-pm28c 1/1 Running 0 73s
+sleep-job-1-h824z 1/1 Running 0 73s
+sleep-job-cert-generator 0/1 Completed 0 73s
+```
+
+And ssh in!
+
+```bash
+$ kubectl exec -it -n flux-operator sleep-job-0-pm28c -- bash
+```
+
+For either submit or apply, we can connect to the instance with the broker URI
+
+```bash
+$ export FLUX_URI=local:///run/flux/local
+$ sudo -u flux flux proxy $FLUX_URI
+```
+and then see our infinite flux job!
+
+```bash
+$ flux jobs -a
+ JOBID USER NAME ST NTASKS NNODES TIME INFO
+ ƒCvGx8CX flux sleep R 1 1 2.432m sleep-job-1
+```
+
+The main difference is that submit is going to periodically ping the restful API to check
+on the job. So you are probably better off with apply in that it's almost the same
+thing (a flux start -> flux submit instead of starting the flux broker) without
+the poll.
+
+See the [minikube tutorials](https://converged-computing.github.io/flux-cloud/tutorials/minikube.html) for how to run this tutorial.
diff --git a/examples/minikube/persistent/experiments.yaml b/examples/minikube/persistent/experiments.yaml
new file mode 100644
index 0000000..aa6b649
--- /dev/null
+++ b/examples/minikube/persistent/experiments.yaml
@@ -0,0 +1,16 @@
+# This is intended for MiniKube, so no machine needed
+matrix:
+ size: [4]
+
+# Flux Mini Cluster experiment attributes
+minicluster:
+ name: sleep-job
+ namespace: flux-operator
+ # Each of these sizes will be brought up and have commands run across it
+ size: [2]
+
+# This will bring up a cluster to stay online (until you kill it) as the job will never end
+jobs:
+ sleep:
+ command: 'sleep infinity'
+ image: ghcr.io/flux-framework/flux-restful-api:latest
diff --git a/examples/minikube/resources/README.md b/examples/minikube/resources/README.md
new file mode 100644
index 0000000..54eab30
--- /dev/null
+++ b/examples/minikube/resources/README.md
@@ -0,0 +1,3 @@
+# Resources
+
+This experiments.yaml shows how to customize MiniCluster resources.
diff --git a/examples/minikube/resources/experiments.yaml b/examples/minikube/resources/experiments.yaml
new file mode 100644
index 0000000..846ef81
--- /dev/null
+++ b/examples/minikube/resources/experiments.yaml
@@ -0,0 +1,25 @@
+# This is intended for MiniKube, so no machine needed
+matrix:
+ size: [4]
+
+# Flux Mini Cluster experiment attributes
+minicluster:
+ name: lammps-job
+ namespace: flux-operator
+ # Each of these sizes will be brought up and have commands run across it
+ size: [2]
+
+jobs:
+ reaxc-hns:
+ command: 'lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite'
+ image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
+ repeats: 2
+ working_dir: /home/flux/examples/reaxff/HNS
+
+ # Resources for the specific container job
+ resources:
+ limits:
+ cpu: 1
+
+ requests:
+ cpu: 1
diff --git a/examples/minikube/volumes/README.md b/examples/minikube/volumes/README.md
new file mode 100644
index 0000000..918f351
--- /dev/null
+++ b/examples/minikube/volumes/README.md
@@ -0,0 +1,3 @@
+# Volumes
+
+This experiments.yaml shows how to customize MiniCluster volumes.
diff --git a/examples/minikube/volumes/experiments.yaml b/examples/minikube/volumes/experiments.yaml
new file mode 100644
index 0000000..c2af90e
--- /dev/null
+++ b/examples/minikube/volumes/experiments.yaml
@@ -0,0 +1,31 @@
+# This is intended for MiniKube, so no machine needed
+matrix:
+ size: [4]
+
+# Flux Mini Cluster experiment attributes
+minicluster:
+ name: lammps-job
+ namespace: flux-operator
+ # Each of these sizes will be brought up and have commands run across it
+ size: [2]
+
+ # How to create MiniCluster volumes - this is a volume named "data"
+ volumes:
+ data:
+ storageClass: hostpath
+ path: /tmp/data
+ labels:
+ type: "local"
+
+
+jobs:
+ reaxc-hns:
+ command: 'lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite'
+ image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
+ repeats: 2
+ working_dir: /home/flux/examples/reaxff/HNS
+
+ # The volume named "data" above should be bound to "/data"
+ volumes:
+ data:
+ path: /data
diff --git a/examples/osu-benchmarks/README.md b/examples/osu-benchmarks/README.md
deleted file mode 100644
index 8fbd44c..0000000
--- a/examples/osu-benchmarks/README.md
+++ /dev/null
@@ -1,60 +0,0 @@
-# OSU Benchmarks on Google Kubernetes Engine
-
-In this set of experiments we will run the Flux Operator on Google Cloud at size N=2
-(the benchmarks require this) and multiple machine types.
-
-## Pre-requisites
-
-You should first [install gcloud](https://cloud.google.com/sdk/docs/quickstarts)
-and ensure you are logged in and have kubectl installed:
-
-```bash
-$ gcloud auth login
-```
-
-Depending on your install, you can either install with gcloud:
-
-```bash
-$ gcloud components install kubectl
-```
-or just [on your own](https://kubernetes.io/docs/tasks/tools/).
-
-## Run Experiments
-
-Each experiment here is defined by the matrix and variables in [experiments.yaml](experiment.yaml) that is used to
-populate a [minicluster-template.yaml](minicluster-template.yaml) and launch a Kubernetes cluster.
-You can read the documentation for flux-cloud to understand the variables available.
-This tutorial assumes you have flux-cloud installed and configured. See all unique Kubernetes clusters
-we will run the jobs on:
-
-```bash
-$ flux-cloud list
-```
-
-Then you can either run all at once:
-
-```bash
-$ flux-cloud run --force-cluster
-```
-
-Or (for testing) to bring up just the first cluster and then manually apply:
-
-```bash
-$ flux-cloud up
-$ flux-cloud apply
-$ flux-cloud down
-```
-
-or do the same for a targeted Kubernetes cluster:
-
-```bash
-$ flux-cloud up -e n1-standard-2-2
-$ flux-cloud apply -e n1-standard-2-2
-$ flux-cloud down -e n1-standard-2-2
-```
-
-
-The latter will either use a single experiment you've defined under `experiment` in your experiments.yaml file,
-or select the first in your matrix (as we have here).
-
-By default, results will be written to a [./data](data) directory, but you can customize this with `--outdir`.
diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/meta.json b/examples/osu-benchmarks/data/n1-standard-1-2/meta.json
deleted file mode 100644
index 994b8cd..0000000
--- a/examples/osu-benchmarks/data/n1-standard-1-2/meta.json
+++ /dev/null
@@ -1,50 +0,0 @@
-{
- "size": 2,
- "machine": "n1-standard-1",
- "minicluster": {
- "name": "osu-benchmarks",
- "namespace": "flux-operator"
- },
- "jobs": {
- "osu_get_latency": {
- "command": "./osu_get_latency"
- },
- "osu_acc_latency": {
- "command": "./osu_acc_latency"
- },
- "osu_fop_latency": {
- "command": "./osu_fop_latency"
- },
- "osu_get_bw": {
- "command": "./osu_get_bw"
- },
- "osu_put_bibw": {
- "command": "./osu_put_bibw"
- },
- "osu_put_latency": {
- "command": "./osu_put_latency"
- },
- "osu_cas_latency": {
- "command": "./osu_cas_latency"
- },
- "osu_get_acc_latency": {
- "command": "./osu_get_acc_latency"
- },
- "osu_put_bw": {
- "command": "./osu_put_bw"
- }
- },
- "id": "n1-standard-1-2",
- "times": {
- "create-cluster": 356.4845640659332,
- "minicluster-run-osu_get_latency": 538.4266033172607,
- "minicluster-run-osu_acc_latency": 346.2248685359955,
- "minicluster-run-osu_fop_latency": 30.376757621765137,
- "minicluster-run-osu_get_bw": 69.91457080841064,
- "minicluster-run-osu_put_bibw": 121.5233302116394,
- "minicluster-run-osu_put_latency": 347.232608795166,
- "minicluster-run-osu_cas_latency": 30.295669078826904,
- "minicluster-run-osu_get_acc_latency": 675.3228597640991,
- "minicluster-run-osu_put_bw": 65.65373682975769
- }
-}
diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_acc_latency/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_acc_latency/log.out
deleted file mode 100644
index 33df75e..0000000
--- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_acc_latency/log.out
+++ /dev/null
@@ -1,27 +0,0 @@
-# OSU MPI_Accumulate latency Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_flush
-# Size Latency (us)
-1 2026.56
-2 1971.25
-4 1969.97
-8 2033.46
-16 1975.18
-32 2007.49
-64 1958.49
-128 2003.40
-256 2009.72
-512 1974.10
-1024 2027.20
-2048 2040.70
-4096 1958.00
-8192 2026.39
-16384 1962.29
-32768 2014.61
-65536 3992.00
-131072 4587.00
-262144 4074.00
-524288 4244.08
-1048576 4722.99
-2097152 9259.00
-4194304 18870.00
diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_cas_latency/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_cas_latency/log.out
deleted file mode 100644
index 13c2d88..0000000
--- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_cas_latency/log.out
+++ /dev/null
@@ -1,5 +0,0 @@
-# OSU MPI_Compare_and_swap latency Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_flush
-# Size Latency (us)
-8 2040.58
diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_fop_latency/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_fop_latency/log.out
deleted file mode 100644
index fb575bb..0000000
--- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_fop_latency/log.out
+++ /dev/null
@@ -1,5 +0,0 @@
-# OSU MPI_Fetch_and_op latency Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_flush
-# Size Latency (us)
-8 2025.38
diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_acc_latency/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_acc_latency/log.out
deleted file mode 100644
index 89c06a4..0000000
--- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_acc_latency/log.out
+++ /dev/null
@@ -1,27 +0,0 @@
-# OSU MPI_Get_accumulate latency Test v5.8
-# Window creation: MPI_Win_create
-# Synchronization: MPI_Win_lock/unlock
-# Size Latency (us)
-1 4028.65
-2 4036.95
-4 3977.30
-8 3959.60
-16 3999.67
-32 3974.93
-64 3965.61
-128 3921.54
-256 4020.49
-512 3987.41
-1024 3950.50
-2048 4023.82
-4096 4024.50
-8192 4032.61
-16384 4321.01
-32768 4077.98
-65536 6086.01
-131072 6358.00
-262144 6235.36
-524288 7140.15
-1048576 9408.58
-2097152 18535.45
-4194304 36929.51
diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_bw/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_bw/log.out
deleted file mode 100644
index 498e3af..0000000
--- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_bw/log.out
+++ /dev/null
@@ -1,27 +0,0 @@
-# OSU MPI_Get Bandwidth Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_flush
-# Size Bandwidth (MB/s)
-1 0.02
-2 0.04
-4 0.08
-8 0.17
-16 0.32
-32 0.66
-64 1.14
-128 2.58
-256 6.05
-512 9.96
-1024 19.80
-2048 35.15
-4096 64.85
-8192 126.64
-16384 174.69
-32768 205.94
-65536 220.74
-131072 220.53
-262144 173.80
-524288 227.82
-1048576 215.52
-2097152 226.24
-4194304 219.46
diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_latency/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_latency/log.out
deleted file mode 100644
index 4bd281e..0000000
--- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_latency/log.out
+++ /dev/null
@@ -1,27 +0,0 @@
-# OSU MPI_Get latency Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_flush
-# Size Latency (us)
-1 2129.50
-2 2057.45
-4 2013.89
-8 2015.55
-16 1979.00
-32 2024.10
-64 1983.17
-128 2008.34
-256 2023.70
-512 2008.37
-1024 2057.49
-2048 2030.20
-4096 2039.00
-8192 2027.52
-16384 1879.26
-32768 2086.65
-65536 3961.84
-131072 4195.01
-262144 4327.77
-524288 4295.87
-1048576 5230.83
-2097152 9040.55
-4194304 18364.76
diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_bibw/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_bibw/log.out
deleted file mode 100644
index c6d659e..0000000
--- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_bibw/log.out
+++ /dev/null
@@ -1,27 +0,0 @@
-# OSU MPI_Put Bi-directional Bandwidth Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_post/start/complete/wait
-# Size Bandwidth (MB/s)
-1 0.02
-2 0.13
-4 0.26
-8 0.43
-16 1.06
-32 2.27
-64 3.77
-128 9.69
-256 15.68
-512 28.37
-1024 58.54
-2048 105.42
-4096 119.90
-8192 147.82
-16384 151.82
-32768 212.67
-65536 220.28
-131072 221.41
-262144 222.46
-524288 223.21
-1048576 207.12
-2097152 223.48
-4194304 223.16
diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_bw/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_bw/log.out
deleted file mode 100644
index 5b8c58a..0000000
--- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_bw/log.out
+++ /dev/null
@@ -1,27 +0,0 @@
-# OSU MPI_Put Bandwidth Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_flush
-# Size Bandwidth (MB/s)
-1 0.04
-2 0.07
-4 0.12
-8 0.25
-16 0.53
-32 1.10
-64 2.32
-128 3.69
-256 9.53
-512 17.77
-1024 28.00
-2048 56.37
-4096 67.47
-8192 93.29
-16384 147.11
-32768 222.45
-65536 205.60
-131072 227.43
-262144 232.69
-524288 229.48
-1048576 216.91
-2097152 219.29
-4194304 223.36
diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_latency/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_latency/log.out
deleted file mode 100644
index 4bd184c..0000000
--- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_latency/log.out
+++ /dev/null
@@ -1,27 +0,0 @@
-# OSU MPI_Put Latency Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_flush
-# Size Latency (us)
-1 1907.18
-2 2021.62
-4 1932.61
-8 1984.30
-16 2022.26
-32 1931.50
-64 2016.32
-128 2010.00
-256 1979.04
-512 1993.74
-1024 1990.06
-2048 1982.00
-4096 1983.60
-8192 2014.80
-16384 2079.00
-32768 1999.49
-65536 4068.88
-131072 3994.00
-262144 4146.00
-524288 4276.83
-1048576 5456.03
-2097152 9407.04
-4194304 19134.00
diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/meta.json b/examples/osu-benchmarks/data/n1-standard-2-2/meta.json
deleted file mode 100644
index 431de1e..0000000
--- a/examples/osu-benchmarks/data/n1-standard-2-2/meta.json
+++ /dev/null
@@ -1,51 +0,0 @@
-{
- "size": 2,
- "machine": "n1-standard-2",
- "minicluster": {
- "name": "osu-benchmarks",
- "namespace": "flux-operator"
- },
- "jobs": {
- "osu_get_latency": {
- "command": "./osu_get_latency"
- },
- "osu_acc_latency": {
- "command": "./osu_acc_latency"
- },
- "osu_fop_latency": {
- "command": "./osu_fop_latency"
- },
- "osu_get_bw": {
- "command": "./osu_get_bw"
- },
- "osu_put_bibw": {
- "command": "./osu_put_bibw"
- },
- "osu_put_latency": {
- "command": "./osu_put_latency"
- },
- "osu_cas_latency": {
- "command": "./osu_cas_latency"
- },
- "osu_get_acc_latency": {
- "command": "./osu_get_acc_latency"
- },
- "osu_put_bw": {
- "command": "./osu_put_bw"
- }
- },
- "id": "n1-standard-2-2",
- "times": {
- "create-cluster": 1367.3097712993622,
- "destroy-cluster": 2073.518306493759,
- "minicluster-run-osu_get_latency": 437.91792845726013,
- "minicluster-run-osu_acc_latency": 38.31566119194031,
- "minicluster-run-osu_fop_latency": 10.17687702178955,
- "minicluster-run-osu_get_bw": 150.1252703666687,
- "minicluster-run-osu_put_bibw": 38.277549743652344,
- "minicluster-run-osu_put_latency": 36.958292961120605,
- "minicluster-run-osu_cas_latency": 8.383898735046387,
- "minicluster-run-osu_get_acc_latency": 64.05685710906982,
- "minicluster-run-osu_put_bw": 19.466553211212158
- }
-}
diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_acc_latency/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_acc_latency/log.out
deleted file mode 100644
index bfb53a4..0000000
--- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_acc_latency/log.out
+++ /dev/null
@@ -1,27 +0,0 @@
-# OSU MPI_Accumulate latency Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_flush
-# Size Latency (us)
-1 141.11
-2 131.90
-4 123.68
-8 117.55
-16 121.55
-32 127.79
-64 114.15
-128 126.75
-256 131.81
-512 118.65
-1024 125.17
-2048 143.56
-4096 142.86
-8192 157.31
-16384 181.50
-32768 199.33
-65536 453.33
-131072 453.50
-262144 560.16
-524288 771.15
-1048576 1167.18
-2097152 1929.84
-4194304 4272.84
diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_cas_latency/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_cas_latency/log.out
deleted file mode 100644
index ae44363..0000000
--- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_cas_latency/log.out
+++ /dev/null
@@ -1,5 +0,0 @@
-# OSU MPI_Compare_and_swap latency Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_flush
-# Size Latency (us)
-8 163.29
diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_fop_latency/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_fop_latency/log.out
deleted file mode 100644
index 6ec7d3c..0000000
--- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_fop_latency/log.out
+++ /dev/null
@@ -1,5 +0,0 @@
-# OSU MPI_Fetch_and_op latency Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_flush
-# Size Latency (us)
-8 145.01
diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_acc_latency/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_acc_latency/log.out
deleted file mode 100644
index 536ce73..0000000
--- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_acc_latency/log.out
+++ /dev/null
@@ -1,27 +0,0 @@
-# OSU MPI_Get_accumulate latency Test v5.8
-# Window creation: MPI_Win_create
-# Synchronization: MPI_Win_lock/unlock
-# Size Latency (us)
-1 287.65
-2 286.34
-4 266.98
-8 297.64
-16 283.17
-32 282.60
-64 263.15
-128 282.87
-256 330.35
-512 295.81
-1024 292.13
-2048 311.83
-4096 323.84
-8192 329.55
-16384 319.65
-32768 341.19
-65536 589.78
-131072 712.14
-262144 887.74
-524288 1315.61
-1048576 2054.79
-2097152 3533.33
-4194304 5818.77
diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_bw/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_bw/log.out
deleted file mode 100644
index 142ad29..0000000
--- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_bw/log.out
+++ /dev/null
@@ -1,27 +0,0 @@
-# OSU MPI_Get Bandwidth Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_flush
-# Size Bandwidth (MB/s)
-1 0.06
-2 0.13
-4 0.27
-8 0.54
-16 1.02
-32 1.95
-64 3.75
-128 8.50
-256 16.98
-512 30.28
-1024 64.48
-2048 115.08
-4096 245.00
-8192 464.01
-16384 585.10
-32768 754.99
-65536 828.35
-131072 890.66
-262144 1042.80
-524288 955.92
-1048576 1142.67
-2097152 1169.05
-4194304 1172.25
diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_latency/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_latency/log.out
deleted file mode 100644
index d226e0d..0000000
--- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_latency/log.out
+++ /dev/null
@@ -1,27 +0,0 @@
-# OSU MPI_Get latency Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_flush
-# Size Latency (us)
-1 156.10
-2 146.43
-4 135.73
-8 141.91
-16 151.82
-32 151.16
-64 154.75
-128 149.68
-256 149.83
-512 147.85
-1024 141.86
-2048 153.64
-4096 152.21
-8192 165.91
-16384 204.98
-32768 196.08
-65536 343.46
-131072 452.29
-262144 519.62
-524288 1094.88
-1048576 1724.75
-2097152 1880.69
-4194304 4034.85
diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_bibw/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_bibw/log.out
deleted file mode 100644
index 8882672..0000000
--- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_bibw/log.out
+++ /dev/null
@@ -1,27 +0,0 @@
-# OSU MPI_Put Bi-directional Bandwidth Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_post/start/complete/wait
-# Size Bandwidth (MB/s)
-1 0.85
-2 1.81
-4 3.81
-8 7.37
-16 14.24
-32 24.82
-64 47.65
-128 87.96
-256 161.55
-512 262.90
-1024 355.17
-2048 464.07
-4096 407.02
-8192 406.54
-16384 789.37
-32768 1210.71
-65536 915.45
-131072 835.05
-262144 600.15
-524288 762.38
-1048576 747.83
-2097152 1065.77
-4194304 873.67
diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_bw/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_bw/log.out
deleted file mode 100644
index a731da2..0000000
--- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_bw/log.out
+++ /dev/null
@@ -1,27 +0,0 @@
-# OSU MPI_Put Bandwidth Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_flush
-# Size Bandwidth (MB/s)
-1 0.42
-2 0.72
-4 1.49
-8 2.95
-16 5.55
-32 11.92
-64 22.03
-128 40.32
-256 79.76
-512 139.34
-1024 181.55
-2048 283.18
-4096 207.81
-8192 242.69
-16384 534.98
-32768 611.93
-65536 705.35
-131072 899.41
-262144 1065.81
-524288 1192.52
-1048576 989.94
-2097152 1189.93
-4194304 1089.70
diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_latency/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_latency/log.out
deleted file mode 100644
index 7a0c2e5..0000000
--- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_latency/log.out
+++ /dev/null
@@ -1,27 +0,0 @@
-# OSU MPI_Put Latency Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_flush
-# Size Latency (us)
-1 121.19
-2 128.06
-4 130.09
-8 126.55
-16 121.53
-32 124.01
-64 124.88
-128 134.59
-256 131.45
-512 134.15
-1024 138.92
-2048 144.37
-4096 135.79
-8192 176.68
-16384 171.74
-32768 207.59
-65536 382.16
-131072 447.84
-262144 573.51
-524288 686.69
-1048576 972.05
-2097152 1942.87
-4194304 3791.28
diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/meta.json b/examples/osu-benchmarks/data/n1-standard-4-2/meta.json
deleted file mode 100644
index 9fd47b7..0000000
--- a/examples/osu-benchmarks/data/n1-standard-4-2/meta.json
+++ /dev/null
@@ -1,49 +0,0 @@
-{
- "size": 2,
- "machine": "n1-standard-4",
- "minicluster": {
- "name": "osu-benchmarks",
- "namespace": "flux-operator"
- },
- "jobs": {
- "osu_get_latency": {
- "command": "./osu_get_latency"
- },
- "osu_acc_latency": {
- "command": "./osu_acc_latency"
- },
- "osu_fop_latency": {
- "command": "./osu_fop_latency"
- },
- "osu_get_bw": {
- "command": "./osu_get_bw"
- },
- "osu_put_bibw": {
- "command": "./osu_put_bibw"
- },
- "osu_put_latency": {
- "command": "./osu_put_latency"
- },
- "osu_cas_latency": {
- "command": "./osu_cas_latency"
- },
- "osu_get_acc_latency": {
- "command": "./osu_get_acc_latency"
- },
- "osu_put_bw": {
- "command": "./osu_put_bw"
- }
- },
- "id": "n1-standard-4-2",
- "times": {
- "minicluster-run-osu_get_latency": 277.4993796348572,
- "minicluster-run-osu_acc_latency": 32.00839829444885,
- "minicluster-run-osu_fop_latency": 137.7638008594513,
- "minicluster-run-osu_get_bw": 149.44539713859558,
- "minicluster-run-osu_put_bibw": 33.21780180931091,
- "minicluster-run-osu_put_latency": 31.217578411102295,
- "minicluster-run-osu_cas_latency": 138.34734511375427,
- "minicluster-run-osu_get_acc_latency": 175.93916821479797,
- "minicluster-run-osu_put_bw": 17.256979942321777
- }
-}
diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_acc_latency/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_acc_latency/log.out
deleted file mode 100644
index dc9e63d..0000000
--- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_acc_latency/log.out
+++ /dev/null
@@ -1,27 +0,0 @@
-# OSU MPI_Accumulate latency Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_flush
-# Size Latency (us)
-1 114.54
-2 89.25
-4 109.61
-8 120.59
-16 115.39
-32 115.15
-64 115.98
-128 117.26
-256 112.32
-512 113.18
-1024 116.00
-2048 123.82
-4096 122.91
-8192 131.24
-16384 151.30
-32768 166.14
-65536 313.66
-131072 359.74
-262144 444.67
-524288 648.49
-1048576 976.66
-2097152 1724.47
-4194304 3490.93
diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_cas_latency/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_cas_latency/log.out
deleted file mode 100644
index df10d69..0000000
--- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_cas_latency/log.out
+++ /dev/null
@@ -1,5 +0,0 @@
-# OSU MPI_Compare_and_swap latency Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_flush
-# Size Latency (us)
-8 115.93
diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_fop_latency/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_fop_latency/log.out
deleted file mode 100644
index 37a557b..0000000
--- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_fop_latency/log.out
+++ /dev/null
@@ -1,5 +0,0 @@
-# OSU MPI_Fetch_and_op latency Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_flush
-# Size Latency (us)
-8 105.31
diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_acc_latency/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_acc_latency/log.out
deleted file mode 100644
index b49b29b..0000000
--- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_acc_latency/log.out
+++ /dev/null
@@ -1,27 +0,0 @@
-# OSU MPI_Get_accumulate latency Test v5.8
-# Window creation: MPI_Win_create
-# Synchronization: MPI_Win_lock/unlock
-# Size Latency (us)
-1 187.83
-2 185.87
-4 187.54
-8 187.00
-16 189.64
-32 187.64
-64 187.10
-128 189.27
-256 195.68
-512 190.57
-1024 194.59
-2048 205.99
-4096 216.30
-8192 205.38
-16384 220.19
-32768 250.27
-65536 472.44
-131072 552.09
-262144 698.95
-524288 872.33
-1048576 1280.24
-2097152 2214.98
-4194304 3719.21
diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_bw/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_bw/log.out
deleted file mode 100644
index 66eaef6..0000000
--- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_bw/log.out
+++ /dev/null
@@ -1,27 +0,0 @@
-# OSU MPI_Get Bandwidth Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_flush
-# Size Bandwidth (MB/s)
-1 0.09
-2 0.20
-4 0.39
-8 0.79
-16 1.57
-32 3.01
-64 5.50
-128 12.36
-256 25.06
-512 48.87
-1024 96.36
-2048 187.58
-4096 364.00
-8192 657.03
-16384 1121.71
-32768 880.91
-65536 1266.43
-131072 1237.42
-262144 1222.58
-524288 1220.72
-1048576 1217.06
-2097152 1214.67
-4194304 1213.09
diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_latency/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_latency/log.out
deleted file mode 100644
index 54baa05..0000000
--- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_latency/log.out
+++ /dev/null
@@ -1,27 +0,0 @@
-# OSU MPI_Get latency Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_flush
-# Size Latency (us)
-1 105.67
-2 106.76
-4 105.98
-8 104.79
-16 108.18
-32 104.17
-64 107.93
-128 104.04
-256 104.24
-512 100.87
-1024 106.00
-2048 106.41
-4096 107.10
-8192 116.07
-16384 121.87
-32768 153.42
-65536 287.48
-131072 304.28
-262144 394.81
-524288 542.64
-1048576 850.54
-2097152 1754.19
-4194304 4854.39
diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_bibw/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_bibw/log.out
deleted file mode 100644
index 0e481f7..0000000
--- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_bibw/log.out
+++ /dev/null
@@ -1,27 +0,0 @@
-# OSU MPI_Put Bi-directional Bandwidth Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_post/start/complete/wait
-# Size Bandwidth (MB/s)
-1 0.92
-2 1.89
-4 3.58
-8 7.28
-16 13.10
-32 26.02
-64 52.44
-128 96.39
-256 165.92
-512 295.72
-1024 426.83
-2048 511.34
-4096 424.07
-8192 457.66
-16384 881.99
-32768 1144.43
-65536 909.92
-131072 672.95
-262144 632.42
-524288 546.17
-1048576 683.42
-2097152 1031.23
-4194304 1128.50
diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_bw/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_bw/log.out
deleted file mode 100644
index 14fccbf..0000000
--- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_bw/log.out
+++ /dev/null
@@ -1,27 +0,0 @@
-# OSU MPI_Put Bandwidth Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_flush
-# Size Bandwidth (MB/s)
-1 0.51
-2 1.02
-4 2.14
-8 4.00
-16 7.66
-32 16.11
-64 29.96
-128 53.30
-256 104.51
-512 164.96
-1024 213.10
-2048 271.31
-4096 257.33
-8192 271.38
-16384 387.78
-32768 684.16
-65536 671.73
-131072 871.74
-262144 1241.50
-524288 1226.63
-1048576 1220.23
-2097152 1169.76
-4194304 1166.01
diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_latency/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_latency/log.out
deleted file mode 100644
index 2ea21e2..0000000
--- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_latency/log.out
+++ /dev/null
@@ -1,27 +0,0 @@
-# OSU MPI_Put Latency Test v5.8
-# Window creation: MPI_Win_allocate
-# Synchronization: MPI_Win_flush
-# Size Latency (us)
-1 101.02
-2 109.73
-4 109.01
-8 108.02
-16 111.83
-32 108.72
-64 109.09
-128 115.13
-256 112.40
-512 114.17
-1024 115.96
-2048 119.93
-4096 123.87
-8192 139.41
-16384 152.70
-32768 183.49
-65536 328.75
-131072 387.64
-262144 452.87
-524288 594.75
-1048576 871.06
-2097152 1714.35
-4194304 3572.91
diff --git a/examples/osu-benchmarks/experiments.yaml b/examples/osu-benchmarks/experiments.yaml
deleted file mode 100644
index 2bc569c..0000000
--- a/examples/osu-benchmarks/experiments.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
-# matrix of experiments to run - machine types and sizes are required
-
-# This can obviously be expanded to more sizes or machines,
-matrix:
- size: [2]
- machine: ["n1-standard-1", "n1-standard-2", "n1-standard-4"]
-
-# Flux Mini Cluster experiment attributes
-minicluster:
- name: osu-benchmarks
- namespace: flux-operator
-
-# Each job can have a command and working directory
-jobs:
- osu_get_latency:
- command: './osu_get_latency'
- osu_acc_latency:
- command: './osu_acc_latency'
- osu_fop_latency:
- command: './osu_fop_latency'
- osu_get_bw:
- command: './osu_get_bw'
- osu_put_bibw:
- command: './osu_put_bibw'
- osu_put_latency:
- command: './osu_put_latency'
- osu_cas_latency:
- command: './osu_cas_latency'
- osu_get_acc_latency:
- command: './osu_get_acc_latency'
- osu_put_bw:
- command: './osu_put_bw'
diff --git a/examples/osu-benchmarks/minicluster-template.yaml b/examples/osu-benchmarks/minicluster-template.yaml
deleted file mode 100644
index 8004cc8..0000000
--- a/examples/osu-benchmarks/minicluster-template.yaml
+++ /dev/null
@@ -1,31 +0,0 @@
-apiVersion: flux-framework.org/v1alpha1
-kind: MiniCluster
-metadata:
- name: {{ minicluster.name }}
- namespace: {{ minicluster.namespace }}
-spec:
-
- # Number of pods to create for MiniCluster
- size: {{ size }}
-
- # Disable verbose output
- logging:
- quiet: true
-
- # This is a list because a pod can support multiple containers
- containers:
- # The container URI to pull (currently needs to be public)
- - image: ghcr.io/rse-ops/osu-microbench:app-latest
-
- # Option Flags for this flux runner wait.sh entrypoint
- fluxOptionFlags: "-ompi=openmpi@5"
-
- # custom preCommand logic (run at start of script)
- commands:
- pre: |
- source /etc/profile.d/z10_spack_environment.sh
- asFlux="sudo -u flux -E PYTHONPATH=$PYTHONPATH"
-
- # All osu-benchmark experiments share the same working directory
- workingDir: /opt/osu-benchmark/build.openmpi/libexec/osu-micro-benchmarks/mpi/one-sided
- command: {{ job.command }}
diff --git a/examples/up-apply-down/README.md b/examples/up-apply-down/README.md
deleted file mode 100644
index e4cfc6c..0000000
--- a/examples/up-apply-down/README.md
+++ /dev/null
@@ -1,37 +0,0 @@
-# Up and Down
-
-This is an example of using flux cloud to bring up a cluster, install the Flux Operator
-(and then you would use it as you please) and then bring it down.
-You should have kubectl and gcloud installed for this demo. Note that
-we use the [experiments.yaml](experiments.yaml) file as a default,
-and we only provide basic metadata needed for a single experiment.
-
-## Up
-
-```bash
-$ flux-cloud up
-```
-
-This will bring up your cluster, per the size and machine type defined
-in your experiments file, and install the operator.
-
-## Apply
-
-An "apply" means running the single (or multiple) experiments defined in your
-experiments.yaml. While these don't need to be in the same file, for simplicity
-we have also defined our experiment metadata and template (provided at [minicluster-template.yaml](minicluster-template.yaml))
-in this directory. For this application we will run a simple llamps application.
-
-```bash
-$ flux-cloud apply
-```
-
-Note that apply will work for a single experiment OR a matrix, so be careful!
-
-## Down
-
-To bring it down:
-
-```bash
-$ flux-cloud down
-```
diff --git a/examples/up-apply-down/data/meta.json b/examples/up-apply-down/data/meta.json
deleted file mode 100644
index be7b873..0000000
--- a/examples/up-apply-down/data/meta.json
+++ /dev/null
@@ -1,19 +0,0 @@
-[
- {
- "size": 2,
- "machine": "n1-standard-1",
- "minicluster": {
- "name": "lammps-job",
- "namespace": "flux-operator"
- },
- "jobs": {
- "reaxc-hns": {
- "command": "lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite"
- }
- },
- "times": {
- "minicluster-run-reaxc-hns": 198.465562582016,
- "create-cluster": 367.33847880363464
- }
- }
-]
diff --git a/examples/up-apply-down/data/n1-standard-1-2/reaxc-hns/log.out b/examples/up-apply-down/data/n1-standard-1-2/reaxc-hns/log.out
deleted file mode 100644
index 71de171..0000000
--- a/examples/up-apply-down/data/n1-standard-1-2/reaxc-hns/log.out
+++ /dev/null
@@ -1,80 +0,0 @@
-LAMMPS (29 Sep 2021 - Update 2)
-OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98)
- using 1 OpenMP thread(s) per MPI task
-Reading data file ...
- triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000)
- 2 by 1 by 1 MPI processor grid
- reading atoms ...
- 304 atoms
- reading velocities ...
- 304 velocities
- read_data CPU = 0.029 seconds
-Replicating atoms ...
- triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000)
- 2 by 1 by 1 MPI processor grid
- bounding box image = (0 -1 -1) to (0 1 1)
- bounding box extra memory = 0.03 MB
- average # of replicas added to proc = 5.00 out of 8 (62.50%)
- 2432 atoms
- replicate CPU = 0.002 seconds
-Neighbor list info ...
- update every 20 steps, delay 0 steps, check no
- max neighbors/atom: 2000, page size: 100000
- master list distance cutoff = 11
- ghost atom cutoff = 11
- binsize = 5.5, bins = 10 5 6
- 2 neighbor lists, perpetual/occasional/extra = 2 0 0
- (1) pair reax/c, perpetual
- attributes: half, newton off, ghost
- pair build: half/bin/newtoff/ghost
- stencil: full/ghost/bin/3d
- bin: standard
- (2) fix qeq/reax, perpetual, copy from (1)
- attributes: half, newton off, ghost
- pair build: copy
- stencil: none
- bin: none
-Setting up Verlet run ...
- Unit style : real
- Current step : 0
- Time step : 0.1
-Per MPI rank memory allocation (min/avg/max) = 143.9 | 143.9 | 143.9 Mbytes
-Step Temp PotEng Press E_vdwl E_coul Volume
- 0 300 -113.27833 437.52118 -111.57687 -1.7014647 27418.867
- 10 299.38517 -113.27631 1439.2824 -111.57492 -1.7013813 27418.867
- 20 300.27107 -113.27884 3764.342 -111.57762 -1.7012247 27418.867
- 30 302.21063 -113.28428 7007.6629 -111.58335 -1.7009363 27418.867
- 40 303.52265 -113.28799 9844.8245 -111.58747 -1.7005186 27418.867
- 50 301.87059 -113.28324 9663.0973 -111.58318 -1.7000523 27418.867
- 60 296.67807 -113.26777 7273.8119 -111.56815 -1.6996137 27418.867
- 70 292.19999 -113.25435 5533.5522 -111.55514 -1.6992158 27418.867
- 80 293.58677 -113.25831 5993.4438 -111.55946 -1.6988533 27418.867
- 90 300.62635 -113.27925 7202.8369 -111.58069 -1.6985592 27418.867
- 100 305.38276 -113.29357 10085.805 -111.59518 -1.6983874 27418.867
-Loop time of 20.075 on 2 procs for 100 steps with 2432 atoms
-
-Performance: 0.043 ns/day, 557.640 hours/ns, 4.981 timesteps/s
-84.6% CPU use with 2 MPI tasks x 1 OpenMP threads
-
-MPI task timing breakdown:
-Section | min time | avg time | max time |%varavg| %total
----------------------------------------------------------------
-Pair | 12.399 | 13.154 | 13.91 | 20.8 | 65.53
-Neigh | 0.40351 | 0.40416 | 0.4048 | 0.1 | 2.01
-Comm | 0.33357 | 1.0872 | 1.8408 | 72.3 | 5.42
-Output | 0.004412 | 0.0045916 | 0.0047713 | 0.3 | 0.02
-Modify | 5.4218 | 5.4219 | 5.422 | 0.0 | 27.01
-Other | | 0.002887 | | | 0.01
-
-Nlocal: 1216.00 ave 1216 max 1216 min
-Histogram: 2 0 0 0 0 0 0 0 0 0
-Nghost: 7591.50 ave 7597 max 7586 min
-Histogram: 1 0 0 0 0 0 0 0 0 1
-Neighs: 432912.0 ave 432942 max 432882 min
-Histogram: 1 0 0 0 0 0 0 0 0 1
-
-Total # of neighbors = 865824
-Ave neighs/atom = 356.01316
-Neighbor list builds = 5
-Dangerous builds not checked
-Total wall time: 0:00:20
diff --git a/examples/up-apply-down/experiments.yaml b/examples/up-apply-down/experiments.yaml
deleted file mode 100644
index ddcfdac..0000000
--- a/examples/up-apply-down/experiments.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-# Minimum required to bring up a cluster
-experiment:
- size: 2
- machine: n1-standard-1
-
-# Flux Mini Cluster experiment attributes
-minicluster:
- name: lammps-job
- namespace: flux-operator
-
-# If your jobs share the same variables you can just put them in the template directly!
-jobs:
- reaxc-hns:
- command: 'lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite'
diff --git a/examples/up-apply-down/minicluster-template.yaml b/examples/up-apply-down/minicluster-template.yaml
deleted file mode 100644
index 6b34bdd..0000000
--- a/examples/up-apply-down/minicluster-template.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-apiVersion: flux-framework.org/v1alpha1
-kind: MiniCluster
-metadata:
- name: {{ minicluster.name }}
- namespace: {{ minicluster.namespace }}
-spec:
- # Number of pods to create for MiniCluster
- size: {{ size }}
-
- # Disable verbose output
- logging:
- quiet: true
-
- # This is a list because a pod can support multiple containers
- containers:
- # The container URI to pull (currently needs to be public)
- - image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
-
- # You can set the working directory if your container WORKDIR is not correct.
- workingDir: /home/flux/examples/reaxff/HNS
- command: {{ job.command }}
-
- # This only matters if test is false
- fluxLogLevel: 7
diff --git a/examples/up-submit-down/README.md b/examples/up-submit-down/README.md
deleted file mode 100644
index 818e95d..0000000
--- a/examples/up-submit-down/README.md
+++ /dev/null
@@ -1,72 +0,0 @@
-``# Up, Submit, Down
-
-This is an example of using flux cloud to bring up a cluster, install the Flux Operator
-(and then you would use it as you please) and run jobs with submit (on the same
-MiniCluster) and then bring it down.
-You should have kubectl and gcloud OR minikube installed for this demo. Note that
-we use the [experiments.yaml](experiments.yaml) file as a default,
-and we only provide basic metadata needed for a single experiment.
-
-## Up
-
-```bash
-$ flux-cloud up
-```
-
-This will bring up your cluster, per the size and machine type defined
-in your experiments file, and install the operator.
-
-## Submit
-
-A "submit" means running the single (or multiple) experiments defined in your
-experiments.yaml on the same MiniCluster, without bringing it down between jobs.
-This means we are using Flux as the scheduler proper, and we don't need to bring pods
-up and down unecessarily (and submit a gazillion YAML files). There is only the number
-of YAML CRD needed to correspond to the sizes of MiniClusters you run across.
-
-```bash
-$ flux-cloud submit --cloud minikube
-$ flux-cloud submit --cloud google
-```
-
-## Down
-
-To bring it down:
-
-```bash
-$ flux-cloud down
-```
-
-## Batch
-
-Run all three with one command:
-
-```bash
-$ flux-cloud batch --cloud minikube
-$ flux-cloud batch --cloud google
-```
-
-## UI
-
-If you want to just bring up the cluster and open the user interface to interact with:
-
-```bash
-$ flux-cloud up --cloud minikube
-$ flux-cloud ui --cloud minikube
-$ flux-cloud down --cloud minikube
-```
-
-
-## Plot
-
-I threw together a script to compare running times with info and output times,
-where:
-
-running time < info < output
-
-```bash
-$ pip install pandas matplotlib seaborn
-```
-```bash
-$ python plot_results.py data/k8s-size-4-n1-standard-1/meta.json
-```
diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/broker-id.sh b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/broker-id.sh
deleted file mode 100755
index c6fb8e0..0000000
--- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/broker-id.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-NAMESPACE="flux-operator"
-JOB="lammps-job"
-brokerPrefix="${JOB}-0"
-
-for pod in $(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=jsonpath='{.items[*].metadata.name}'); do
- if [[ "${pod}" == ${brokerPrefix}* ]]; then
- echo ${pod}
- break
- fi
-done
diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-create.sh b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-create.sh
deleted file mode 100755
index bdace99..0000000
--- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-create.sh
+++ /dev/null
@@ -1,204 +0,0 @@
-#!/bin/bash
-
-# Source shared helper scripts
-# Colors
-red='\033[0;31m'
-green='\033[0;32m'
-yellow='\033[0;33m'
-blue='\033[0;34m'
-magenta='\033[0;35m'
-cyan='\033[0;36m'
-clear='\033[0m'
-
-function print_red() {
- echo -e "${red}$@${clear}"
-}
-function print_yellow() {
- echo -e "${yellow}$@${clear}"
-}
-function print_green() {
- echo -e "${green}$@${clear}"
-}
-function print_blue() {
- echo -e "${blue}$@${clear}"
-}
-function print_magenta() {
- echo -e "${magenta}$@${clear}"
-}
-function print_cyan() {
- echo -e "${cyan}$@${clear}"
-}
-
-function is_installed () {
- # Determine if a command is available for use!
- cmd="${1}"
- if command -v $cmd >/dev/null; then
- echo "$cmd is installed"
- else
- echo "$cmd could not be found"
- exit 1
- fi
-}
-
-function install_operator() {
- # Shared function to install the operator from a specific repository branch and cleanup
- script_dir=${1}
- repository=${2}
- branch=${3}
- tmpfile="${script_dir}/flux-operator.yaml"
- run_echo wget -O $tmpfile https://raw.githubusercontent.com/${repository}/${branch}/examples/dist/flux-operator.yaml
- kubectl apply -f $tmpfile
-}
-
-
-function run_echo() {
- # Show the user the command then run it
- echo
- print_green "$@"
- retry $@
-}
-
-function run_echo_allow_fail() {
- echo
- print_green "$@"
- $@ || true
-}
-
-function retry() {
- # Retry an unsuccessful user command, per request
- while true
- do
- $@
- retval=$?
- if [[ "${retval}" == "0" ]]; then
- return
- fi
- print_blue "That command was not successful. Do you want to try again? 🤔️"
- read -p " (yes/no) " answer
- # Exit with non-zero response so we know to stop in script.
- case ${answer} in
- yes ) continue;;
- no ) echo exiting...;
- exit 1;;
- * ) echo invalid response;
- exit 1;;
- esac
- done
-}
-
-
-function prompt() {
- # Prompt the user with a yes/no command to continue or exit
- print_blue "$@ 🤔️"
- read -p " (yes/no) " answer
- case ${answer} in
- yes ) echo ok, we will proceed;;
- no ) echo exiting...;
- exit 1;;
- * ) echo invalid response;
- exit 1;;
- esac
-}
-
-
-function with_exponential_backoff {
- # Run with exponential backoff - assume containers take a while to pull
- local max_attempts=100
- local timeout=1
- local attempt=0
- local exitcode=0
-
- while [[ $attempt < $max_attempts ]]; do
- "$@"
- exitcode=$?
-
- if [[ $exitcode == 0 ]]; then
- break
- fi
-
- echo "Failure! Retrying in $timeout.." 1>&2
- sleep $timeout
- attempt=$(( attempt + 1 ))
- timeout=$(( timeout * 2 ))
- done
-
- if [[ $exitCode != 0 ]]; then
- echo "You've failed me for the last time! ($@)" 1>&2
- fi
- return $exitcode
-}
-
-# Defaults - these are in the config but left here for information
-CLUSTER_NAME="flux-cluster"
-ZONE="us-central1-a"
-CLUSTER_VERSION="1.23"
-MACHINE_TYPE="n1-standard-1"
-FORCE_CLUSTER="false"
-SIZE=4
-TAGS="flux-cluster"
-REPOSITORY="flux-framework/flux-operator"
-BRANCH="main"
-GOOGLE_PROJECT="dinodev"
-SCRIPT_DIR="/home/vanessa/Desktop/Code/flux/flux-cloud/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts"
-
-# Required arguments
-if [ -z ${GOOGLE_PROJECT+x} ]; then
- echo "Missing Google Project template variable as GOOGLE_PROJECT";
- exit 1
-fi
-
-if [ -z ${ZONE+x} ]; then
- echo "Missing Google Cloud zone template variable as ZONE";
- exit 1
-fi
-
-if [ -z ${MACHINE_TYPE+x} ]; then
- echo "Missing Google Cloud machine type template variable as MACHINE_TYPE";
- exit 1
-fi
-
-print_magenta " cluster : ${CLUSTER_NAME}"
-print_magenta " version : ${CLUSTER_VERSION}"
-print_magenta " project : ${GOOGLE_PROJECT}"
-print_magenta " machine : ${MACHINE_TYPE}"
-print_magenta " zone : ${ZONE}"
-print_magenta " tags : ${TAGS}"
-print_magenta " size : ${SIZE}"
-print_magenta "repository : ${REPOSITORY}"
-print_magenta " branch : ${BRANCH}"
-
-is_installed kubectl
-is_installed gcloud
-is_installed wget
-
-# Check if it already exists
-gcloud container clusters list --zone ${ZONE} | grep ${CLUSTER_NAME}
-retval=$?
-if [[ "${retval}" == "0" ]]; then
- print_blue "${CLUSTER_NAME} in ${ZONE} already exists."
- echo
- exit 0
-fi
-
-if [[ "${FORCE_CLUSTER}" != "true" ]]; then
- prompt "Do you want to create this cluster?"
-fi
-
-# Create the cluster
-run_echo gcloud container clusters create ${CLUSTER_NAME} --project $GOOGLE_PROJECT \
- --zone ${ZONE} --cluster-version ${CLUSTER_VERSION} --machine-type ${MACHINE_TYPE} \
- --num-nodes=${SIZE} --enable-network-policy --tags=${TAGS} --enable-intra-node-visibility
-
-# Get credentials so kubectl will work
-run_echo gcloud container clusters get-credentials ${CLUSTER_NAME} --zone ${ZONE} --project $GOOGLE_PROJECT
-run_echo kubectl create clusterrolebinding cluster-admin-binding --clusterrole cluster-admin --user $(gcloud config get-value core/account)
-
-# Show nodes
-run_echo kubectl get nodes
-
-# Deploy the operator
-mkdir -p ${SCRIPT_DIR}
-install_operator ${SCRIPT_DIR} ${REPOSITORY} ${BRANCH}
-
-run_echo kubectl get namespace
-run_echo kubectl describe namespace operator-system
diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-destroy.sh b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-destroy.sh
deleted file mode 100755
index de8988e..0000000
--- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-destroy.sh
+++ /dev/null
@@ -1,161 +0,0 @@
-#!/bin/bash
-
-# Source shared helper scripts
-# Colors
-red='\033[0;31m'
-green='\033[0;32m'
-yellow='\033[0;33m'
-blue='\033[0;34m'
-magenta='\033[0;35m'
-cyan='\033[0;36m'
-clear='\033[0m'
-
-function print_red() {
- echo -e "${red}$@${clear}"
-}
-function print_yellow() {
- echo -e "${yellow}$@${clear}"
-}
-function print_green() {
- echo -e "${green}$@${clear}"
-}
-function print_blue() {
- echo -e "${blue}$@${clear}"
-}
-function print_magenta() {
- echo -e "${magenta}$@${clear}"
-}
-function print_cyan() {
- echo -e "${cyan}$@${clear}"
-}
-
-function is_installed () {
- # Determine if a command is available for use!
- cmd="${1}"
- if command -v $cmd >/dev/null; then
- echo "$cmd is installed"
- else
- echo "$cmd could not be found"
- exit 1
- fi
-}
-
-function install_operator() {
- # Shared function to install the operator from a specific repository branch and cleanup
- script_dir=${1}
- repository=${2}
- branch=${3}
- tmpfile="${script_dir}/flux-operator.yaml"
- run_echo wget -O $tmpfile https://raw.githubusercontent.com/${repository}/${branch}/examples/dist/flux-operator.yaml
- kubectl apply -f $tmpfile
-}
-
-
-function run_echo() {
- # Show the user the command then run it
- echo
- print_green "$@"
- retry $@
-}
-
-function run_echo_allow_fail() {
- echo
- print_green "$@"
- $@ || true
-}
-
-function retry() {
- # Retry an unsuccessful user command, per request
- while true
- do
- $@
- retval=$?
- if [[ "${retval}" == "0" ]]; then
- return
- fi
- print_blue "That command was not successful. Do you want to try again? 🤔️"
- read -p " (yes/no) " answer
- # Exit with non-zero response so we know to stop in script.
- case ${answer} in
- yes ) continue;;
- no ) echo exiting...;
- exit 1;;
- * ) echo invalid response;
- exit 1;;
- esac
- done
-}
-
-
-function prompt() {
- # Prompt the user with a yes/no command to continue or exit
- print_blue "$@ 🤔️"
- read -p " (yes/no) " answer
- case ${answer} in
- yes ) echo ok, we will proceed;;
- no ) echo exiting...;
- exit 1;;
- * ) echo invalid response;
- exit 1;;
- esac
-}
-
-
-function with_exponential_backoff {
- # Run with exponential backoff - assume containers take a while to pull
- local max_attempts=100
- local timeout=1
- local attempt=0
- local exitcode=0
-
- while [[ $attempt < $max_attempts ]]; do
- "$@"
- exitcode=$?
-
- if [[ $exitcode == 0 ]]; then
- break
- fi
-
- echo "Failure! Retrying in $timeout.." 1>&2
- sleep $timeout
- attempt=$(( attempt + 1 ))
- timeout=$(( timeout * 2 ))
- done
-
- if [[ $exitCode != 0 ]]; then
- echo "You've failed me for the last time! ($@)" 1>&2
- fi
- return $exitcode
-}
-
-# Defaults - these are in the config but left here for information
-CLUSTER_NAME="flux-cluster"
-FORCE_CLUSTER="false"
-ZONE="us-central1-a"
-
-if [ -z ${ZONE+x} ]; then
- echo "Google Cloud zone template missing as ZONE";
- exit 1
-fi
-
-echo " cluster : ${CLUSTER_NAME}"
-echo " zone : ${ZONE}"
-
-is_installed gcloud
-is_installed yes || FORCE_CLUSTER="false"
-
-# Check if it already exists
-gcloud container clusters list --zone ${ZONE} | grep ${CLUSTER_NAME}
-retval=$?
-if [[ "${retval}" != "0" ]]; then
- print_blue "${CLUSTER_NAME} in ${ZONE} does not exist."
- echo
- exit 0
-fi
-
-# This command has a confirmation already
-if [[ "${FORCE_CLUSTER}" == "true" ]]; then
- yes | gcloud container clusters delete --zone ${ZONE} ${CLUSTER_NAME}
-else
- run_echo gcloud container clusters delete --zone ${ZONE} ${CLUSTER_NAME}
-fi
diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/flux-operator.yaml b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/flux-operator.yaml
deleted file mode 100644
index b4bc03e..0000000
--- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/flux-operator.yaml
+++ /dev/null
@@ -1,848 +0,0 @@
-apiVersion: v1
-kind: Namespace
-metadata:
- labels:
- control-plane: controller-manager
- name: operator-system
----
-apiVersion: apiextensions.k8s.io/v1
-kind: CustomResourceDefinition
-metadata:
- annotations:
- controller-gen.kubebuilder.io/version: v0.9.0
- creationTimestamp: null
- name: miniclusters.flux-framework.org
-spec:
- group: flux-framework.org
- names:
- kind: MiniCluster
- listKind: MiniClusterList
- plural: miniclusters
- singular: minicluster
- scope: Namespaced
- versions:
- - name: v1alpha1
- schema:
- openAPIV3Schema:
- description: MiniCluster is the Schema for a Flux job launcher on K8s
- properties:
- apiVersion:
- description: 'APIVersion defines the versioned schema of this representation
- of an object. Servers should convert recognized schemas to the latest
- internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
- type: string
- kind:
- description: 'Kind is a string value representing the REST resource this
- object represents. Servers may infer this from the endpoint the client
- submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
- type: string
- metadata:
- type: object
- spec:
- description: MiniCluster defines the desired state of a Flux MiniCluster
- "I am a Flux user and I want to launch a MiniCluster for my job!" A
- MiniCluster corresponds to a Batch Job -> StatefulSet + ConfigMaps A
- "task" within that cluster is flux running something.
- properties:
- containers:
- description: Containers is one or more containers to be created in
- a pod. There should only be one container to run flux with runFlux
- items:
- properties:
- command:
- description: 'Single user executable to provide to flux start
- IMPORTANT: This is left here, but not used in favor of exposing
- Flux via a Restful API. We Can remove this when that is finalized.'
- type: string
- cores:
- description: Cores the container should use
- format: int32
- type: integer
- diagnostics:
- description: Run flux diagnostics on start instead of command
- type: boolean
- environment:
- additionalProperties:
- type: string
- description: Key/value pairs for the environment
- type: object
- fluxLogLevel:
- default: 6
- description: Log level to use for flux logging (only in non
- TestMode)
- format: int32
- type: integer
- fluxOptionFlags:
- description: Flux option flags, usually provided with -o optional
- - if needed, default option flags for the server These can
- also be set in the user interface to override here. This is
- only valid for a FluxRunner
- type: string
- image:
- default: fluxrm/flux-sched:focal
- description: Container image must contain flux and flux-sched
- install
- type: string
- imagePullSecret:
- description: Allow the user to pull authenticated images By
- default no secret is selected. Setting this with the name
- of an already existing imagePullSecret will specify that secret
- in the pod spec.
- type: string
- name:
- description: Container name is only required for non flux runners
- type: string
- ports:
- description: Ports to be exposed to other containers in the
- cluster We take a single list of integers and map to the same
- items:
- format: int32
- type: integer
- type: array
- postStartExec:
- description: Lifecycle can handle post start commands, etc.
- type: string
- preCommand:
- description: Special command to run at beginning of script,
- directly after asFlux is defined as sudo -u flux -E (so you
- can change that if desired.) This is only valid if FluxRunner
- is set (that writes a wait.sh script)
- type: string
- pullAlways:
- default: false
- description: Allow the user to dictate pulling By default we
- pull if not present. Setting this to true will indicate to
- pull always
- type: boolean
- resources:
- description: Resources include limits and requests
- properties:
- limits:
- additionalProperties:
- anyOf:
- - type: integer
- - type: string
- x-kubernetes-int-or-string: true
- type: object
- requests:
- additionalProperties:
- anyOf:
- - type: integer
- - type: string
- x-kubernetes-int-or-string: true
- type: object
- type: object
- runFlux:
- description: Main container to run flux (only should be one)
- type: boolean
- volumes:
- additionalProperties:
- description: A Container volume must reference one defined
- for the MiniCluster The path here is in the container
- properties:
- path:
- type: string
- readOnly:
- default: true
- type: boolean
- required:
- - path
- type: object
- description: Volumes that can be mounted (must be defined in
- volumes)
- type: object
- workingDir:
- description: Working directory to run command from
- type: string
- required:
- - image
- type: object
- type: array
- deadlineSeconds:
- default: 31500000
- description: Should the job be limited to a particular number of seconds?
- Approximately one year. This cannot be zero or job won't start
- format: int64
- type: integer
- fluxRestful:
- description: Customization to Flux Restful API There should only be
- one container to run flux with runFlux
- properties:
- branch:
- default: main
- description: Branch to clone Flux Restful API from
- type: string
- port:
- default: 5000
- description: Port to run Flux Restful Server On
- format: int32
- type: integer
- token:
- description: Token to use for RestFul API
- type: string
- username:
- description: These two should not actually be set by a user, but
- rather generated by tools and provided Username to use for RestFul
- API
- type: string
- type: object
- jobLabels:
- additionalProperties:
- type: string
- description: Labels for the job
- type: object
- localDeploy:
- default: false
- description: localDeploy should be true for development, or deploying
- in the case that there isn't an actual kubernetes cluster (e.g.,
- you are not using make deploy. It uses a persistent volume instead
- of a claim
- type: boolean
- logging:
- description: Logging modes determine the output you see in the job
- log
- properties:
- quiet:
- default: false
- description: Quiet mode silences all output so the job only shows
- the test running
- type: boolean
- timed:
- default: false
- description: Timed mode adds timing to Flux commands
- type: boolean
- type: object
- pod:
- description: Pod spec details
- properties:
- resources:
- additionalProperties:
- anyOf:
- - type: integer
- - type: string
- x-kubernetes-int-or-string: true
- description: Resources include limits and requests
- type: object
- type: object
- podLabels:
- additionalProperties:
- type: string
- description: Labels for each pod
- type: object
- size:
- default: 1
- description: Size (number of job pods to run, size of minicluster
- in pods)
- format: int32
- type: integer
- tasks:
- default: 1
- description: Total number of CPUs being run across entire cluster
- format: int32
- type: integer
- volumes:
- additionalProperties:
- description: Mini Cluster local volumes available to mount (these
- are on the host)
- properties:
- path:
- type: string
- required:
- - path
- type: object
- description: Volumes on the host (named) accessible to containers
- type: object
- required:
- - containers
- type: object
- status:
- description: MiniClusterStatus defines the observed state of Flux
- properties:
- conditions:
- description: conditions hold the latest Flux Job and MiniCluster states
- items:
- description: "Condition contains details for one aspect of the current
- state of this API Resource. --- This struct is intended for direct
- use as an array at the field path .status.conditions. For example,
- \n type FooStatus struct{ // Represents the observations of a
- foo's current state. // Known .status.conditions.type are: \"Available\",
- \"Progressing\", and \"Degraded\" // +patchMergeKey=type // +patchStrategy=merge
- // +listType=map // +listMapKey=type Conditions []metav1.Condition
- `json:\"conditions,omitempty\" patchStrategy:\"merge\" patchMergeKey:\"type\"
- protobuf:\"bytes,1,rep,name=conditions\"` \n // other fields }"
- properties:
- lastTransitionTime:
- description: lastTransitionTime is the last time the condition
- transitioned from one status to another. This should be when
- the underlying condition changed. If that is not known, then
- using the time when the API field changed is acceptable.
- format: date-time
- type: string
- message:
- description: message is a human readable message indicating
- details about the transition. This may be an empty string.
- maxLength: 32768
- type: string
- observedGeneration:
- description: observedGeneration represents the .metadata.generation
- that the condition was set based upon. For instance, if .metadata.generation
- is currently 12, but the .status.conditions[x].observedGeneration
- is 9, the condition is out of date with respect to the current
- state of the instance.
- format: int64
- minimum: 0
- type: integer
- reason:
- description: reason contains a programmatic identifier indicating
- the reason for the condition's last transition. Producers
- of specific condition types may define expected values and
- meanings for this field, and whether the values are considered
- a guaranteed API. The value should be a CamelCase string.
- This field may not be empty.
- maxLength: 1024
- minLength: 1
- pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
- type: string
- status:
- description: status of the condition, one of True, False, Unknown.
- enum:
- - "True"
- - "False"
- - Unknown
- type: string
- type:
- description: type of condition in CamelCase or in foo.example.com/CamelCase.
- --- Many .condition.type values are consistent across resources
- like Available, but because arbitrary conditions can be useful
- (see .node.status.conditions), the ability to deconflict is
- important. The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt)
- maxLength: 316
- pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
- type: string
- required:
- - lastTransitionTime
- - message
- - reason
- - status
- - type
- type: object
- type: array
- jobid:
- description: The JobUid is set internally to associate to a miniCluster
- type: string
- required:
- - jobid
- type: object
- type: object
- served: true
- storage: true
- subresources:
- status: {}
----
-apiVersion: v1
-kind: ServiceAccount
-metadata:
- name: operator-controller-manager
- namespace: operator-system
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: Role
-metadata:
- name: operator-leader-election-role
- namespace: operator-system
-rules:
-- apiGroups:
- - ""
- resources:
- - configmaps
- verbs:
- - get
- - list
- - watch
- - create
- - update
- - patch
- - delete
-- apiGroups:
- - coordination.k8s.io
- resources:
- - leases
- verbs:
- - get
- - list
- - watch
- - create
- - update
- - patch
- - delete
-- apiGroups:
- - ""
- resources:
- - events
- verbs:
- - create
- - patch
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRole
-metadata:
- creationTimestamp: null
- name: operator-manager-role
-rules:
-- apiGroups:
- - ""
- resources:
- - events
- verbs:
- - create
- - update
- - watch
-- apiGroups:
- - ""
- resources:
- - events
- - nodes
- verbs:
- - create
- - delete
- - get
- - list
- - patch
- - update
- - watch
-- apiGroups:
- - batch
- resources:
- - jobs
- verbs:
- - create
- - delete
- - exec
- - get
- - list
- - patch
- - update
- - watch
-- apiGroups:
- - batch
- resources:
- - jobs/status
- verbs:
- - create
- - delete
- - exec
- - get
- - list
- - patch
- - update
- - watch
-- apiGroups:
- - ""
- resources:
- - ""
- verbs:
- - create
- - delete
- - get
- - list
- - patch
- - update
- - watch
-- apiGroups:
- - ""
- resources:
- - batch
- verbs:
- - create
- - delete
- - get
- - list
- - patch
- - update
- - watch
-- apiGroups:
- - ""
- resources:
- - configmaps
- verbs:
- - create
- - delete
- - get
- - list
- - patch
- - update
- - watch
-- apiGroups:
- - ""
- resources:
- - events
- verbs:
- - create
- - patch
-- apiGroups:
- - ""
- resources:
- - jobs
- verbs:
- - create
- - delete
- - get
- - list
- - patch
- - update
- - watch
-- apiGroups:
- - ""
- resources:
- - networks
- verbs:
- - create
- - patch
-- apiGroups:
- - ""
- resources:
- - persistentvolumeclaims
- verbs:
- - create
- - delete
- - get
- - list
- - patch
- - update
- - watch
-- apiGroups:
- - ""
- resources:
- - persistentvolumes
- verbs:
- - create
- - delete
- - get
- - list
- - patch
- - update
- - watch
-- apiGroups:
- - ""
- resources:
- - pods
- verbs:
- - create
- - delete
- - get
- - list
- - patch
- - update
- - watch
-- apiGroups:
- - ""
- resources:
- - pods/exec
- verbs:
- - create
- - delete
- - get
- - list
- - patch
- - update
- - watch
-- apiGroups:
- - ""
- resources:
- - pods/log
- verbs:
- - create
- - delete
- - get
- - list
- - patch
- - update
- - watch
-- apiGroups:
- - ""
- resources:
- - secrets
- verbs:
- - create
- - delete
- - get
- - list
- - patch
- - update
- - watch
-- apiGroups:
- - ""
- resources:
- - services
- verbs:
- - create
- - delete
- - get
- - list
- - patch
- - update
- - watch
-- apiGroups:
- - ""
- resources:
- - statefulsets
- verbs:
- - create
- - delete
- - get
- - list
- - patch
- - update
- - watch
-- apiGroups:
- - flux-framework.org
- resources:
- - clusters
- - clusters/status
- verbs:
- - get
- - list
- - watch
-- apiGroups:
- - flux-framework.org
- resources:
- - machineclasses
- - machinedeployments
- - machinedeployments/status
- - machines
- - machines/status
- - machinesets
- - machinesets/status
- verbs:
- - create
- - delete
- - get
- - list
- - patch
- - update
- - watch
-- apiGroups:
- - flux-framework.org
- resources:
- - miniclusters
- verbs:
- - create
- - delete
- - get
- - list
- - patch
- - update
- - watch
-- apiGroups:
- - flux-framework.org
- resources:
- - miniclusters/finalizers
- verbs:
- - create
- - delete
- - get
- - list
- - patch
- - update
- - watch
-- apiGroups:
- - flux-framework.org
- resources:
- - miniclusters/status
- verbs:
- - create
- - delete
- - get
- - list
- - patch
- - update
- - watch
-- apiGroups:
- - networking.k8s.io
- resources:
- - ingresses
- verbs:
- - create
- - delete
- - get
- - list
- - patch
- - update
- - watch
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRole
-metadata:
- name: operator-metrics-reader
-rules:
-- nonResourceURLs:
- - /metrics
- verbs:
- - get
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRole
-metadata:
- name: operator-proxy-role
-rules:
-- apiGroups:
- - authentication.k8s.io
- resources:
- - tokenreviews
- verbs:
- - create
-- apiGroups:
- - authorization.k8s.io
- resources:
- - subjectaccessreviews
- verbs:
- - create
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: RoleBinding
-metadata:
- name: operator-leader-election-rolebinding
- namespace: operator-system
-roleRef:
- apiGroup: rbac.authorization.k8s.io
- kind: Role
- name: operator-leader-election-role
-subjects:
-- kind: ServiceAccount
- name: operator-controller-manager
- namespace: operator-system
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRoleBinding
-metadata:
- name: operator-manager-rolebinding
-roleRef:
- apiGroup: rbac.authorization.k8s.io
- kind: ClusterRole
- name: operator-manager-role
-subjects:
-- kind: ServiceAccount
- name: operator-controller-manager
- namespace: operator-system
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRoleBinding
-metadata:
- name: operator-proxy-rolebinding
-roleRef:
- apiGroup: rbac.authorization.k8s.io
- kind: ClusterRole
- name: operator-proxy-role
-subjects:
-- kind: ServiceAccount
- name: operator-controller-manager
- namespace: operator-system
----
-apiVersion: v1
-data:
- controller_manager_config.yaml: |
- apiVersion: controller-runtime.sigs.k8s.io/v1alpha1
- kind: ControllerManagerConfig
- health:
- healthProbeBindAddress: :8081
- metrics:
- bindAddress: 127.0.0.1:8080
- webhook:
- port: 9443
- leaderElection:
- leaderElect: true
- resourceName: 14dde902.flux-framework.org
-kind: ConfigMap
-metadata:
- name: operator-manager-config
- namespace: operator-system
----
-apiVersion: v1
-kind: Service
-metadata:
- labels:
- control-plane: controller-manager
- name: operator-controller-manager-metrics-service
- namespace: operator-system
-spec:
- ports:
- - name: https
- port: 8443
- protocol: TCP
- targetPort: https
- selector:
- control-plane: controller-manager
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
- labels:
- control-plane: controller-manager
- name: operator-controller-manager
- namespace: operator-system
-spec:
- replicas: 1
- selector:
- matchLabels:
- control-plane: controller-manager
- template:
- metadata:
- annotations:
- kubectl.kubernetes.io/default-container: manager
- labels:
- control-plane: controller-manager
- spec:
- containers:
- - args:
- - --secure-listen-address=0.0.0.0:8443
- - --upstream=http://127.0.0.1:8080/
- - --logtostderr=true
- - --v=0
- image: gcr.io/kubebuilder/kube-rbac-proxy:v0.11.0
- name: kube-rbac-proxy
- ports:
- - containerPort: 8443
- name: https
- protocol: TCP
- resources:
- limits:
- cpu: 500m
- memory: 128Mi
- requests:
- cpu: 5m
- memory: 64Mi
- securityContext:
- allowPrivilegeEscalation: false
- - args:
- - --health-probe-bind-address=:8081
- - --metrics-bind-address=127.0.0.1:8080
- - --leader-elect
- command:
- - /manager
- image: ghcr.io/flux-framework/flux-operator:latest
- livenessProbe:
- httpGet:
- path: /healthz
- port: 8081
- initialDelaySeconds: 15
- periodSeconds: 20
- name: manager
- readinessProbe:
- httpGet:
- path: /readyz
- port: 8081
- initialDelaySeconds: 5
- periodSeconds: 10
- resources:
- limits:
- cpu: 500m
- memory: 128Mi
- requests:
- cpu: 10m
- memory: 64Mi
- securityContext:
- allowPrivilegeEscalation: false
- securityContext:
- runAsNonRoot: true
- serviceAccountName: operator-controller-manager
- terminationGracePeriodSeconds: 10
diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-size-2.yaml b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-size-2.yaml
deleted file mode 100644
index b3b2e17..0000000
--- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-size-2.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-apiVersion: flux-framework.org/v1alpha1
-kind: MiniCluster
-
-metadata:
- name: lammps-job
- namespace: flux-operator
-spec:
- # localDeploy needs to be false
- localDeploy: false
-
- # Number of pods to create for MiniCluster
- size: 2
- tasks: 1
-
- # Disable verbose output
-
-
- # Optional credentials if running the flux restful api
- fluxRestful:
- token: "6b8a7393-129b-4e2d-83a7-795a5a7c9d9b"
- username: "fluxuser"
-
- # TODO add pod resources, if needed
- containers:
- - image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
-
-
-
- cores: 1
diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-submit-size-2.sh b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-submit-size-2.sh
deleted file mode 100755
index 0bd72c3..0000000
--- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-submit-size-2.sh
+++ /dev/null
@@ -1,219 +0,0 @@
-#!/bin/bash
-
-# This is a template that will be populated with variables by Flux-Cloud
-# We only run it to check if a MiniCluster is running. An apply is only
-# needed if the MiniCluster is not created yet.
-
-# Include shared helper scripts
-# Colors
-red='\033[0;31m'
-green='\033[0;32m'
-yellow='\033[0;33m'
-blue='\033[0;34m'
-magenta='\033[0;35m'
-cyan='\033[0;36m'
-clear='\033[0m'
-
-function print_red() {
- echo -e "${red}$@${clear}"
-}
-function print_yellow() {
- echo -e "${yellow}$@${clear}"
-}
-function print_green() {
- echo -e "${green}$@${clear}"
-}
-function print_blue() {
- echo -e "${blue}$@${clear}"
-}
-function print_magenta() {
- echo -e "${magenta}$@${clear}"
-}
-function print_cyan() {
- echo -e "${cyan}$@${clear}"
-}
-
-function is_installed () {
- # Determine if a command is available for use!
- cmd="${1}"
- if command -v $cmd >/dev/null; then
- echo "$cmd is installed"
- else
- echo "$cmd could not be found"
- exit 1
- fi
-}
-
-function install_operator() {
- # Shared function to install the operator from a specific repository branch and cleanup
- script_dir=${1}
- repository=${2}
- branch=${3}
- tmpfile="${script_dir}/flux-operator.yaml"
- run_echo wget -O $tmpfile https://raw.githubusercontent.com/${repository}/${branch}/examples/dist/flux-operator.yaml
- kubectl apply -f $tmpfile
-}
-
-
-function run_echo() {
- # Show the user the command then run it
- echo
- print_green "$@"
- retry $@
-}
-
-function run_echo_allow_fail() {
- echo
- print_green "$@"
- $@ || true
-}
-
-function retry() {
- # Retry an unsuccessful user command, per request
- while true
- do
- $@
- retval=$?
- if [[ "${retval}" == "0" ]]; then
- return
- fi
- print_blue "That command was not successful. Do you want to try again? 🤔️"
- read -p " (yes/no) " answer
- # Exit with non-zero response so we know to stop in script.
- case ${answer} in
- yes ) continue;;
- no ) echo exiting...;
- exit 1;;
- * ) echo invalid response;
- exit 1;;
- esac
- done
-}
-
-
-function prompt() {
- # Prompt the user with a yes/no command to continue or exit
- print_blue "$@ 🤔️"
- read -p " (yes/no) " answer
- case ${answer} in
- yes ) echo ok, we will proceed;;
- no ) echo exiting...;
- exit 1;;
- * ) echo invalid response;
- exit 1;;
- esac
-}
-
-
-function with_exponential_backoff {
- # Run with exponential backoff - assume containers take a while to pull
- local max_attempts=100
- local timeout=1
- local attempt=0
- local exitcode=0
-
- while [[ $attempt < $max_attempts ]]; do
- "$@"
- exitcode=$?
-
- if [[ $exitcode == 0 ]]; then
- break
- fi
-
- echo "Failure! Retrying in $timeout.." 1>&2
- sleep $timeout
- attempt=$(( attempt + 1 ))
- timeout=$(( timeout * 2 ))
- done
-
- if [[ $exitCode != 0 ]]; then
- echo "You've failed me for the last time! ($@)" 1>&2
- fi
- return $exitcode
-}
-
-NAMESPACE="flux-operator"
-CRD="/home/vanessa/Desktop/Code/flux/flux-cloud/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-size-2.yaml"
-JOB="lammps-job"
-
-# Size -1 to account for certificate generator
-SIZE=2
-
-print_magenta " apply : ${CRD}"
-print_magenta " job : ${JOB}"
-
-is_installed kubectl
-
-# Create the namespace (ok if already exists)
-run_echo_allow_fail kubectl create namespace ${NAMESPACE}
-
-# Always cleanup a previous one so tokens don't get stale
-run_echo_allow_fail kubectl delete -f ${CRD}
-echo
-podsCleaned="false"
-print_blue "Waiting for previous MiniCluster to be cleaned up..."
-while [[ "${podsCleaned}" == "false" ]]; do
- echo -n "."
- sleep 2
- state=$(kubectl get pods --namespace ${NAMESPACE} 2>&1)
- lines=$(echo $state | wc -l)
- if [[ "${lines}" == "1" ]] && [[ "${state}" == *"No resources found in"* ]]; then
- echo
- print_green "🌀️ Previous pods are cleaned up."
- podsCleaned="true"
- break
- fi
-done
-
-# Ensure we have a MiniCluster of the right namespace running
-echo
-print_green "🌀️ Creating MiniCluster in ${NAMESPACE}"
-# Apply the job, get pods
-run_echo kubectl apply -f ${CRD}
-run_echo kubectl get -n ${NAMESPACE} pods
-
-# continue until we find the index-0 pod
-podsReady="false"
-
-echo
-print_blue "Waiting for MiniCluster of size ${SIZE} to be ready..."
-while [[ "${podsReady}" == "false" ]]; do
- echo -n "."
- sleep 2
- pods=$(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=name | wc -l)
- if [[ "${pods}" == "${SIZE}" ]]; then
- echo
- print_green "🌀️ All pods are running."
- podsReady="true"
- break
- fi
-done
-
-echo
-brokerPod=""
-brokerPrefix="${JOB}-0"
-while [[ "${brokerPod}" == "" ]]; do
- for pod in $(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=jsonpath='{.items[*].metadata.name}'); do
- if [[ "${pod}" == ${brokerPrefix}* ]]; then
- echo
- brokerPod=${pod}
- break
- fi
- done
-done
-
-echo
-serverReady="false"
-print_blue "Waiting for Flux Restful API Server to be ready..."
-while [[ "${serverReady}" == "false" ]]; do
- echo -n "."
- sleep 2
- logs=$(kubectl logs --namespace ${NAMESPACE} ${brokerPod} | grep "Uvicorn running")
- retval=$?
- if [[ "${retval}" == "0" ]]; then
- echo
- serverReady="true"
- print_green "🌀️ Flux RestFul API Server is Ready."
- break
- fi
-done
diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-1-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-1-minicluster-size-2/log.out
deleted file mode 100644
index 3b18e51..0000000
--- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-1-minicluster-size-2/log.out
+++ /dev/null
@@ -1 +0,0 @@
-hello world
diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-2-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-2-minicluster-size-2/log.out
deleted file mode 100644
index 3b18e51..0000000
--- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-2-minicluster-size-2/log.out
+++ /dev/null
@@ -1 +0,0 @@
-hello world
diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-3-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-3-minicluster-size-2/log.out
deleted file mode 100644
index 3b18e51..0000000
--- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-3-minicluster-size-2/log.out
+++ /dev/null
@@ -1 +0,0 @@
-hello world
diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-4-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-4-minicluster-size-2/log.out
deleted file mode 100644
index 3b18e51..0000000
--- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-4-minicluster-size-2/log.out
+++ /dev/null
@@ -1 +0,0 @@
-hello world
diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-5-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-5-minicluster-size-2/log.out
deleted file mode 100644
index 3b18e51..0000000
--- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-5-minicluster-size-2/log.out
+++ /dev/null
@@ -1 +0,0 @@
-hello world
diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/meta.json b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/meta.json
deleted file mode 100644
index b7b654b..0000000
--- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/meta.json
+++ /dev/null
@@ -1,698 +0,0 @@
-{
- "times": {
- "destroy-cluster": 324.709,
- "create-cluster": 86.521,
- "minicluster-submit-size-2": 183.626,
- "reaxc-hns-1-minicluster-size-2": 32.1847505569458,
- "reaxc-hns-2-minicluster-size-2": 33.41048860549927,
- "reaxc-hns-3-minicluster-size-2": 30.96457529067993,
- "reaxc-hns-4-minicluster-size-2": 30.777089595794678,
- "reaxc-hns-5-minicluster-size-2": 31.048890829086304,
- "sleep-1-minicluster-size-2": 5.028888463973999,
- "sleep-2-minicluster-size-2": 5.045725584030151,
- "sleep-3-minicluster-size-2": 5.072444677352905,
- "sleep-4-minicluster-size-2": 5.034207582473755,
- "sleep-5-minicluster-size-2": 5.025948762893677,
- "hello-world-1-minicluster-size-2": 0.07241106033325195,
- "hello-world-2-minicluster-size-2": 0.052734375,
- "hello-world-3-minicluster-size-2": 0.04248523712158203,
- "hello-world-4-minicluster-size-2": 0.045003652572631836,
- "hello-world-5-minicluster-size-2": 0.05110311508178711,
- "minicluster-destroy-size-2": 0.277,
- "minicluster-create-persistent-size-2": 42.606,
- "minicluster-persistent-destroy-size-2": 0.164
- },
- "size": 4,
- "machine": "n1-standard-1",
- "minicluster": {
- "name": "lammps-job",
- "namespace": "flux-operator",
- "size": [
- 2
- ]
- },
- "jobs": {
- "reaxc-hns-1": {
- "command": "lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite",
- "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0",
- "repeats": 5,
- "workdir": "/home/flux/examples/reaxff/HNS"
- },
- "reaxc-hns-2": {
- "command": "lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite",
- "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0",
- "repeats": 5,
- "workdir": "/home/flux/examples/reaxff/HNS"
- },
- "reaxc-hns-3": {
- "command": "lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite",
- "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0",
- "repeats": 5,
- "workdir": "/home/flux/examples/reaxff/HNS"
- },
- "reaxc-hns-4": {
- "command": "lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite",
- "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0",
- "repeats": 5,
- "workdir": "/home/flux/examples/reaxff/HNS"
- },
- "reaxc-hns-5": {
- "command": "lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite",
- "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0",
- "repeats": 5,
- "workdir": "/home/flux/examples/reaxff/HNS"
- },
- "sleep-1": {
- "command": "sleep 5",
- "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0",
- "repeats": 5,
- "workdir": "/home/flux/examples/reaxff/HNS"
- },
- "sleep-2": {
- "command": "sleep 5",
- "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0",
- "repeats": 5,
- "workdir": "/home/flux/examples/reaxff/HNS"
- },
- "sleep-3": {
- "command": "sleep 5",
- "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0",
- "repeats": 5,
- "workdir": "/home/flux/examples/reaxff/HNS"
- },
- "sleep-4": {
- "command": "sleep 5",
- "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0",
- "repeats": 5,
- "workdir": "/home/flux/examples/reaxff/HNS"
- },
- "sleep-5": {
- "command": "sleep 5",
- "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0",
- "repeats": 5,
- "workdir": "/home/flux/examples/reaxff/HNS"
- },
- "hello-world-1": {
- "command": "echo hello world",
- "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0",
- "repeats": 5,
- "workdir": "/home/flux/examples/reaxff/HNS"
- },
- "hello-world-2": {
- "command": "echo hello world",
- "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0",
- "repeats": 5,
- "workdir": "/home/flux/examples/reaxff/HNS"
- },
- "hello-world-3": {
- "command": "echo hello world",
- "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0",
- "repeats": 5,
- "workdir": "/home/flux/examples/reaxff/HNS"
- },
- "hello-world-4": {
- "command": "echo hello world",
- "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0",
- "repeats": 5,
- "workdir": "/home/flux/examples/reaxff/HNS"
- },
- "hello-world-5": {
- "command": "echo hello world",
- "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0",
- "repeats": 5,
- "workdir": "/home/flux/examples/reaxff/HNS"
- }
- },
- "info": {
- "reaxc-hns-1-minicluster-size-2": {
- "id": 130073755648,
- "userid": 1234,
- "urgency": 16,
- "priority": 16,
- "t_submit": 1674444768.0517902,
- "t_depend": 1674444768.0517902,
- "t_run": 1674444768.100832,
- "t_cleanup": 1674444800.2855825,
- "t_inactive": 1674444800.290403,
- "state": "INACTIVE",
- "name": "lmp",
- "ntasks": 1,
- "nnodes": 1,
- "ranks": "1",
- "nodelist": "lammps-job-1",
- "success": true,
- "exception_occurred": false,
- "result": "COMPLETED",
- "expiration": 1675049568.0,
- "annotations": {
- "sched": {
- "queue": "default"
- }
- },
- "waitstatus": 0,
- "returncode": 0,
- "runtime": 32.1847505569458,
- "exception": {
- "occurred": false,
- "severity": "",
- "type": "",
- "note": ""
- },
- "duration": "",
- "start_to_info_seconds": 40.13091278076172,
- "start_to_output_seconds": 43.215059757232666
- },
- "reaxc-hns-2-minicluster-size-2": {
- "id": 816932978688,
- "userid": 1234,
- "urgency": 16,
- "priority": 16,
- "t_submit": 1674444808.9904723,
- "t_depend": 1674444808.9904723,
- "t_run": 1674444809.0098114,
- "t_cleanup": 1674444842.4203,
- "t_inactive": 1674444842.4249685,
- "state": "INACTIVE",
- "name": "lmp",
- "ntasks": 1,
- "nnodes": 1,
- "ranks": "0",
- "nodelist": "lammps-job-0",
- "success": true,
- "exception_occurred": false,
- "result": "COMPLETED",
- "expiration": 1675049609.0,
- "annotations": {
- "sched": {
- "queue": "default"
- }
- },
- "waitstatus": 0,
- "returncode": 0,
- "runtime": 33.41048860549927,
- "exception": {
- "occurred": false,
- "severity": "",
- "type": "",
- "note": ""
- },
- "duration": "",
- "start_to_info_seconds": 97.17731666564941,
- "start_to_output_seconds": 97.31685972213745
- },
- "reaxc-hns-3-minicluster-size-2": {
- "id": 2450245287936,
- "userid": 1234,
- "urgency": 16,
- "priority": 16,
- "t_submit": 1674444906.3438601,
- "t_depend": 1674444906.3438601,
- "t_run": 1674444906.3633585,
- "t_cleanup": 1674444937.3279338,
- "t_inactive": 1674444937.33689,
- "state": "INACTIVE",
- "name": "lmp",
- "ntasks": 1,
- "nnodes": 1,
- "ranks": "1",
- "nodelist": "lammps-job-1",
- "success": true,
- "exception_occurred": false,
- "result": "COMPLETED",
- "expiration": 1675049706.0,
- "annotations": {
- "sched": {
- "queue": "default"
- }
- },
- "waitstatus": 0,
- "returncode": 0,
- "runtime": 30.96457529067993,
- "exception": {
- "occurred": false,
- "severity": "",
- "type": "",
- "note": ""
- },
- "duration": "",
- "start_to_info_seconds": 67.29511857032776,
- "start_to_output_seconds": 67.40737009048462
- },
- "reaxc-hns-4-minicluster-size-2": {
- "id": 3581969170432,
- "userid": 1234,
- "urgency": 16,
- "priority": 16,
- "t_submit": 1674444973.8004916,
- "t_depend": 1674444973.8004916,
- "t_run": 1674444973.8231413,
- "t_cleanup": 1674445004.600231,
- "t_inactive": 1674445004.6049078,
- "state": "INACTIVE",
- "name": "lmp",
- "ntasks": 1,
- "nnodes": 1,
- "ranks": "1",
- "nodelist": "lammps-job-1",
- "success": true,
- "exception_occurred": false,
- "result": "COMPLETED",
- "expiration": 1675049773.0,
- "annotations": {
- "sched": {
- "queue": "default"
- }
- },
- "waitstatus": 0,
- "returncode": 0,
- "runtime": 30.777089595794678,
- "exception": {
- "occurred": false,
- "severity": "",
- "type": "",
- "note": ""
- },
- "duration": "",
- "start_to_info_seconds": 62.43251633644104,
- "start_to_output_seconds": 62.51574635505676
- },
- "reaxc-hns-5-minicluster-size-2": {
- "id": 4631065264128,
- "userid": 1234,
- "urgency": 16,
- "priority": 16,
- "t_submit": 1674445036.3308098,
- "t_depend": 1674445036.3308098,
- "t_run": 1674445036.3509514,
- "t_cleanup": 1674445067.3998423,
- "t_inactive": 1674445067.4045572,
- "state": "INACTIVE",
- "name": "lmp",
- "ntasks": 1,
- "nnodes": 1,
- "ranks": "1",
- "nodelist": "lammps-job-1",
- "success": true,
- "exception_occurred": false,
- "result": "COMPLETED",
- "expiration": 1675049836.0,
- "annotations": {
- "sched": {
- "queue": "default"
- }
- },
- "waitstatus": 0,
- "returncode": 0,
- "runtime": 31.048890829086304,
- "exception": {
- "occurred": false,
- "severity": "",
- "type": "",
- "note": ""
- },
- "duration": "",
- "start_to_info_seconds": 92.83428883552551,
- "start_to_output_seconds": 92.92412114143372
- },
- "sleep-1-minicluster-size-2": {
- "id": 461004341248,
- "userid": 1234,
- "urgency": 16,
- "priority": 16,
- "t_submit": 1674677415.8718548,
- "t_depend": 1674677415.8718548,
- "t_run": 1674677415.8845603,
- "t_cleanup": 1674677420.9134488,
- "t_inactive": 1674677420.9152129,
- "state": "INACTIVE",
- "name": "sleep",
- "ntasks": 1,
- "nnodes": 1,
- "ranks": "1",
- "nodelist": "lammps-job-1",
- "success": true,
- "exception_occurred": false,
- "result": "COMPLETED",
- "expiration": 1675282215.0,
- "annotations": {
- "sched": {
- "queue": "default"
- }
- },
- "waitstatus": 0,
- "returncode": 0,
- "runtime": 5.028888463973999,
- "exception": {
- "occurred": false,
- "severity": "",
- "type": "",
- "note": ""
- },
- "duration": "",
- "start_to_info_seconds": 14.840466737747192,
- "start_to_output_seconds": 17.383413314819336
- },
- "sleep-2-minicluster-size-2": {
- "id": 717628637184,
- "userid": 1234,
- "urgency": 16,
- "priority": 16,
- "t_submit": 1674677431.16695,
- "t_depend": 1674677431.16695,
- "t_run": 1674677431.1903481,
- "t_cleanup": 1674677436.2360737,
- "t_inactive": 1674677436.2395134,
- "state": "INACTIVE",
- "name": "sleep",
- "ntasks": 1,
- "nnodes": 1,
- "ranks": "0",
- "nodelist": "lammps-job-0",
- "success": true,
- "exception_occurred": false,
- "result": "COMPLETED",
- "expiration": 1675282231.0,
- "annotations": {
- "sched": {
- "queue": "default"
- }
- },
- "waitstatus": 0,
- "returncode": 0,
- "runtime": 5.045725584030151,
- "exception": {
- "occurred": false,
- "severity": "",
- "type": "",
- "note": ""
- },
- "duration": "",
- "start_to_info_seconds": 12.824117660522461,
- "start_to_output_seconds": 15.347451210021973
- },
- "sleep-3-minicluster-size-2": {
- "id": 975108571136,
- "userid": 1234,
- "urgency": 16,
- "priority": 16,
- "t_submit": 1674677446.5178363,
- "t_depend": 1674677446.5178363,
- "t_run": 1674677446.534995,
- "t_cleanup": 1674677451.6074398,
- "t_inactive": 1674677451.613382,
- "state": "INACTIVE",
- "name": "sleep",
- "ntasks": 1,
- "nnodes": 1,
- "ranks": "1",
- "nodelist": "lammps-job-1",
- "success": true,
- "exception_occurred": false,
- "result": "COMPLETED",
- "expiration": 1675282246.0,
- "annotations": {
- "sched": {
- "queue": "default"
- }
- },
- "waitstatus": 0,
- "returncode": 0,
- "runtime": 5.072444677352905,
- "exception": {
- "occurred": false,
- "severity": "",
- "type": "",
- "note": ""
- },
- "duration": "",
- "start_to_info_seconds": 12.840857744216919,
- "start_to_output_seconds": 15.384143352508545
- },
- "sleep-4-minicluster-size-2": {
- "id": 1234333335552,
- "userid": 1234,
- "urgency": 16,
- "priority": 16,
- "t_submit": 1674677461.9656863,
- "t_depend": 1674677461.9656863,
- "t_run": 1674677461.9789429,
- "t_cleanup": 1674677467.0131505,
- "t_inactive": 1674677467.0233643,
- "state": "INACTIVE",
- "name": "sleep",
- "ntasks": 1,
- "nnodes": 1,
- "ranks": "1",
- "nodelist": "lammps-job-1",
- "success": true,
- "exception_occurred": false,
- "result": "COMPLETED",
- "expiration": 1675282261.0,
- "annotations": {
- "sched": {
- "queue": "default"
- }
- },
- "waitstatus": 0,
- "returncode": 0,
- "runtime": 5.034207582473755,
- "exception": {
- "occurred": false,
- "severity": "",
- "type": "",
- "note": ""
- },
- "duration": "",
- "start_to_info_seconds": 12.951504468917847,
- "start_to_output_seconds": 15.509077787399292
- },
- "sleep-5-minicluster-size-2": {
- "id": 1495168712704,
- "userid": 1234,
- "urgency": 16,
- "priority": 16,
- "t_submit": 1674677477.5129235,
- "t_depend": 1674677477.5129235,
- "t_run": 1674677477.5259533,
- "t_cleanup": 1674677482.551902,
- "t_inactive": 1674677482.555279,
- "state": "INACTIVE",
- "name": "sleep",
- "ntasks": 1,
- "nnodes": 1,
- "ranks": "1",
- "nodelist": "lammps-job-1",
- "success": true,
- "exception_occurred": false,
- "result": "COMPLETED",
- "expiration": 1675282277.0,
- "annotations": {
- "sched": {
- "queue": "default"
- }
- },
- "waitstatus": 0,
- "returncode": 0,
- "runtime": 5.025948762893677,
- "exception": {
- "occurred": false,
- "severity": "",
- "type": "",
- "note": ""
- },
- "duration": "",
- "start_to_info_seconds": 12.880193948745728,
- "start_to_output_seconds": 15.410512447357178
- },
- "hello-world-1-minicluster-size-2": {
- "id": 8356177641472,
- "userid": 1234,
- "urgency": 16,
- "priority": 16,
- "t_submit": 1674445258.3653252,
- "t_depend": 1674445258.3653252,
- "t_run": 1674445258.3868065,
- "t_cleanup": 1674445258.4592175,
- "t_inactive": 1674445258.46398,
- "state": "INACTIVE",
- "name": "echo",
- "ntasks": 1,
- "nnodes": 1,
- "ranks": "1",
- "nodelist": "lammps-job-1",
- "success": true,
- "exception_occurred": false,
- "result": "COMPLETED",
- "expiration": 1675050058.0,
- "annotations": {
- "sched": {
- "queue": "default"
- }
- },
- "waitstatus": 0,
- "returncode": 0,
- "runtime": 0.07241106033325195,
- "exception": {
- "occurred": false,
- "severity": "",
- "type": "",
- "note": ""
- },
- "duration": "",
- "start_to_info_seconds": 13.482953310012817,
- "start_to_output_seconds": 16.53845739364624
- },
- "hello-world-2-minicluster-size-2": {
- "id": 8635753168896,
- "userid": 1234,
- "urgency": 16,
- "priority": 16,
- "t_submit": 1674445275.028449,
- "t_depend": 1674445275.028449,
- "t_run": 1674445275.0489655,
- "t_cleanup": 1674445275.1016998,
- "t_inactive": 1674445275.1059186,
- "state": "INACTIVE",
- "name": "echo",
- "ntasks": 1,
- "nnodes": 1,
- "ranks": "1",
- "nodelist": "lammps-job-1",
- "success": true,
- "exception_occurred": false,
- "result": "COMPLETED",
- "expiration": 1675050075.0,
- "annotations": {
- "sched": {
- "queue": "default"
- }
- },
- "waitstatus": 0,
- "returncode": 0,
- "runtime": 0.052734375,
- "exception": {
- "occurred": false,
- "severity": "",
- "type": "",
- "note": ""
- },
- "duration": "",
- "start_to_info_seconds": 0.5918288230895996,
- "start_to_output_seconds": 0.6222965717315674
- },
- "hello-world-3-minicluster-size-2": {
- "id": 8641507753984,
- "userid": 1234,
- "urgency": 16,
- "priority": 16,
- "t_submit": 1674445275.3710968,
- "t_depend": 1674445275.3710968,
- "t_run": 1674445275.3893383,
- "t_cleanup": 1674445275.4318235,
- "t_inactive": 1674445275.4359808,
- "state": "INACTIVE",
- "name": "echo",
- "ntasks": 1,
- "nnodes": 1,
- "ranks": "1",
- "nodelist": "lammps-job-1",
- "success": true,
- "exception_occurred": false,
- "result": "COMPLETED",
- "expiration": 1675050075.0,
- "annotations": {
- "sched": {
- "queue": "default"
- }
- },
- "waitstatus": 0,
- "returncode": 0,
- "runtime": 0.04248523712158203,
- "exception": {
- "occurred": false,
- "severity": "",
- "type": "",
- "note": ""
- },
- "duration": "",
- "start_to_info_seconds": 0.17513155937194824,
- "start_to_output_seconds": 0.21306657791137695
- },
- "hello-world-4-minicluster-size-2": {
- "id": 8646121488384,
- "userid": 1234,
- "urgency": 16,
- "priority": 16,
- "t_submit": 1674445275.6465385,
- "t_depend": 1674445275.6465385,
- "t_run": 1674445275.6643715,
- "t_cleanup": 1674445275.7093751,
- "t_inactive": 1674445275.7134967,
- "state": "INACTIVE",
- "name": "echo",
- "ntasks": 1,
- "nnodes": 1,
- "ranks": "1",
- "nodelist": "lammps-job-1",
- "success": true,
- "exception_occurred": false,
- "result": "COMPLETED",
- "expiration": 1675050075.0,
- "annotations": {
- "sched": {
- "queue": "default"
- }
- },
- "waitstatus": 0,
- "returncode": 0,
- "runtime": 0.045003652572631836,
- "exception": {
- "occurred": false,
- "severity": "",
- "type": "",
- "note": ""
- },
- "duration": "",
- "start_to_info_seconds": 0.19276666641235352,
- "start_to_output_seconds": 0.2307295799255371
- },
- "hello-world-5-minicluster-size-2": {
- "id": 8649946693632,
- "userid": 1234,
- "urgency": 16,
- "priority": 16,
- "t_submit": 1674445275.8740122,
- "t_depend": 1674445275.8740122,
- "t_run": 1674445275.8942568,
- "t_cleanup": 1674445275.94536,
- "t_inactive": 1674445275.95746,
- "state": "INACTIVE",
- "name": "echo",
- "ntasks": 1,
- "nnodes": 1,
- "ranks": "1",
- "nodelist": "lammps-job-1",
- "success": true,
- "exception_occurred": false,
- "result": "COMPLETED",
- "expiration": 1675050075.0,
- "annotations": {
- "sched": {
- "queue": "default"
- }
- },
- "waitstatus": 0,
- "returncode": 0,
- "runtime": 0.05110311508178711,
- "exception": {
- "occurred": false,
- "severity": "",
- "type": "",
- "note": ""
- },
- "duration": "",
- "start_to_info_seconds": 0.17215561866760254,
- "start_to_output_seconds": 0.19998478889465332
- }
- }
-}
diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-1-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-1-minicluster-size-2/log.out
deleted file mode 100644
index 647c484..0000000
--- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-1-minicluster-size-2/log.out
+++ /dev/null
@@ -1,80 +0,0 @@
-LAMMPS (29 Sep 2021 - Update 2)
-OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98)
- using 1 OpenMP thread(s) per MPI task
-Reading data file ...
- triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000)
- 1 by 1 by 1 MPI processor grid
- reading atoms ...
- 304 atoms
- reading velocities ...
- 304 velocities
- read_data CPU = 0.005 seconds
-Replicating atoms ...
- triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000)
- 1 by 1 by 1 MPI processor grid
- bounding box image = (0 -1 -1) to (0 1 1)
- bounding box extra memory = 0.03 MB
- average # of replicas added to proc = 8.00 out of 8 (100.00%)
- 2432 atoms
- replicate CPU = 0.001 seconds
-Neighbor list info ...
- update every 20 steps, delay 0 steps, check no
- max neighbors/atom: 2000, page size: 100000
- master list distance cutoff = 11
- ghost atom cutoff = 11
- binsize = 5.5, bins = 10 5 6
- 2 neighbor lists, perpetual/occasional/extra = 2 0 0
- (1) pair reax/c, perpetual
- attributes: half, newton off, ghost
- pair build: half/bin/newtoff/ghost
- stencil: full/ghost/bin/3d
- bin: standard
- (2) fix qeq/reax, perpetual, copy from (1)
- attributes: half, newton off, ghost
- pair build: copy
- stencil: none
- bin: none
-Setting up Verlet run ...
- Unit style : real
- Current step : 0
- Time step : 0.1
-Per MPI rank memory allocation (min/avg/max) = 215.0 | 215.0 | 215.0 Mbytes
-Step Temp PotEng Press E_vdwl E_coul Volume
- 0 300 -113.27833 437.52122 -111.57687 -1.7014647 27418.867
- 10 299.38517 -113.27631 1439.2857 -111.57492 -1.7013813 27418.867
- 20 300.27107 -113.27884 3764.3739 -111.57762 -1.7012246 27418.867
- 30 302.21063 -113.28428 7007.6914 -111.58335 -1.7009363 27418.867
- 40 303.52265 -113.28799 9844.84 -111.58747 -1.7005186 27418.867
- 50 301.87059 -113.28324 9663.0443 -111.58318 -1.7000524 27418.867
- 60 296.67807 -113.26777 7273.7928 -111.56815 -1.6996137 27418.867
- 70 292.19999 -113.25435 5533.6428 -111.55514 -1.6992157 27418.867
- 80 293.58677 -113.25831 5993.4151 -111.55946 -1.6988533 27418.867
- 90 300.62636 -113.27925 7202.8651 -111.58069 -1.6985591 27418.867
- 100 305.38276 -113.29357 10085.748 -111.59518 -1.6983875 27418.867
-Loop time of 29.8322 on 1 procs for 100 steps with 2432 atoms
-
-Performance: 0.029 ns/day, 828.671 hours/ns, 3.352 timesteps/s
-94.2% CPU use with 1 MPI tasks x 1 OpenMP threads
-
-MPI task timing breakdown:
-Section | min time | avg time | max time |%varavg| %total
----------------------------------------------------------------
-Pair | 22.21 | 22.21 | 22.21 | 0.0 | 74.45
-Neigh | 0.61723 | 0.61723 | 0.61723 | 0.0 | 2.07
-Comm | 0.010007 | 0.010007 | 0.010007 | 0.0 | 0.03
-Output | 0.0004328 | 0.0004328 | 0.0004328 | 0.0 | 0.00
-Modify | 6.9933 | 6.9933 | 6.9933 | 0.0 | 23.44
-Other | | 0.00162 | | | 0.01
-
-Nlocal: 2432.00 ave 2432 max 2432 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Nghost: 10685.0 ave 10685 max 10685 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Neighs: 823958.0 ave 823958 max 823958 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-
-Total # of neighbors = 823958
-Ave neighs/atom = 338.79852
-Neighbor list builds = 5
-Dangerous builds not checked
-Total wall time: 0:00:30
diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-2-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-2-minicluster-size-2/log.out
deleted file mode 100644
index 0b9df79..0000000
--- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-2-minicluster-size-2/log.out
+++ /dev/null
@@ -1,80 +0,0 @@
-LAMMPS (29 Sep 2021 - Update 2)
-OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98)
- using 1 OpenMP thread(s) per MPI task
-Reading data file ...
- triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000)
- 1 by 1 by 1 MPI processor grid
- reading atoms ...
- 304 atoms
- reading velocities ...
- 304 velocities
- read_data CPU = 0.010 seconds
-Replicating atoms ...
- triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000)
- 1 by 1 by 1 MPI processor grid
- bounding box image = (0 -1 -1) to (0 1 1)
- bounding box extra memory = 0.03 MB
- average # of replicas added to proc = 8.00 out of 8 (100.00%)
- 2432 atoms
- replicate CPU = 0.001 seconds
-Neighbor list info ...
- update every 20 steps, delay 0 steps, check no
- max neighbors/atom: 2000, page size: 100000
- master list distance cutoff = 11
- ghost atom cutoff = 11
- binsize = 5.5, bins = 10 5 6
- 2 neighbor lists, perpetual/occasional/extra = 2 0 0
- (1) pair reax/c, perpetual
- attributes: half, newton off, ghost
- pair build: half/bin/newtoff/ghost
- stencil: full/ghost/bin/3d
- bin: standard
- (2) fix qeq/reax, perpetual, copy from (1)
- attributes: half, newton off, ghost
- pair build: copy
- stencil: none
- bin: none
-Setting up Verlet run ...
- Unit style : real
- Current step : 0
- Time step : 0.1
-Per MPI rank memory allocation (min/avg/max) = 215.0 | 215.0 | 215.0 Mbytes
-Step Temp PotEng Press E_vdwl E_coul Volume
- 0 300 -113.27833 437.52122 -111.57687 -1.7014647 27418.867
- 10 299.38517 -113.27631 1439.2857 -111.57492 -1.7013813 27418.867
- 20 300.27107 -113.27884 3764.3739 -111.57762 -1.7012246 27418.867
- 30 302.21063 -113.28428 7007.6914 -111.58335 -1.7009363 27418.867
- 40 303.52265 -113.28799 9844.84 -111.58747 -1.7005186 27418.867
- 50 301.87059 -113.28324 9663.0443 -111.58318 -1.7000524 27418.867
- 60 296.67807 -113.26777 7273.7928 -111.56815 -1.6996137 27418.867
- 70 292.19999 -113.25435 5533.6428 -111.55514 -1.6992157 27418.867
- 80 293.58677 -113.25831 5993.4151 -111.55946 -1.6988533 27418.867
- 90 300.62636 -113.27925 7202.8651 -111.58069 -1.6985591 27418.867
- 100 305.38276 -113.29357 10085.748 -111.59518 -1.6983875 27418.867
-Loop time of 31.2338 on 1 procs for 100 steps with 2432 atoms
-
-Performance: 0.028 ns/day, 867.606 hours/ns, 3.202 timesteps/s
-91.3% CPU use with 1 MPI tasks x 1 OpenMP threads
-
-MPI task timing breakdown:
-Section | min time | avg time | max time |%varavg| %total
----------------------------------------------------------------
-Pair | 23.353 | 23.353 | 23.353 | 0.0 | 74.77
-Neigh | 0.62616 | 0.62616 | 0.62616 | 0.0 | 2.00
-Comm | 0.0096617 | 0.0096617 | 0.0096617 | 0.0 | 0.03
-Output | 0.00044694 | 0.00044694 | 0.00044694 | 0.0 | 0.00
-Modify | 7.2429 | 7.2429 | 7.2429 | 0.0 | 23.19
-Other | | 0.001518 | | | 0.00
-
-Nlocal: 2432.00 ave 2432 max 2432 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Nghost: 10685.0 ave 10685 max 10685 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Neighs: 823958.0 ave 823958 max 823958 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-
-Total # of neighbors = 823958
-Ave neighs/atom = 338.79852
-Neighbor list builds = 5
-Dangerous builds not checked
-Total wall time: 0:00:32
diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-3-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-3-minicluster-size-2/log.out
deleted file mode 100644
index b6380b6..0000000
--- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-3-minicluster-size-2/log.out
+++ /dev/null
@@ -1,80 +0,0 @@
-LAMMPS (29 Sep 2021 - Update 2)
-OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98)
- using 1 OpenMP thread(s) per MPI task
-Reading data file ...
- triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000)
- 1 by 1 by 1 MPI processor grid
- reading atoms ...
- 304 atoms
- reading velocities ...
- 304 velocities
- read_data CPU = 0.002 seconds
-Replicating atoms ...
- triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000)
- 1 by 1 by 1 MPI processor grid
- bounding box image = (0 -1 -1) to (0 1 1)
- bounding box extra memory = 0.03 MB
- average # of replicas added to proc = 8.00 out of 8 (100.00%)
- 2432 atoms
- replicate CPU = 0.001 seconds
-Neighbor list info ...
- update every 20 steps, delay 0 steps, check no
- max neighbors/atom: 2000, page size: 100000
- master list distance cutoff = 11
- ghost atom cutoff = 11
- binsize = 5.5, bins = 10 5 6
- 2 neighbor lists, perpetual/occasional/extra = 2 0 0
- (1) pair reax/c, perpetual
- attributes: half, newton off, ghost
- pair build: half/bin/newtoff/ghost
- stencil: full/ghost/bin/3d
- bin: standard
- (2) fix qeq/reax, perpetual, copy from (1)
- attributes: half, newton off, ghost
- pair build: copy
- stencil: none
- bin: none
-Setting up Verlet run ...
- Unit style : real
- Current step : 0
- Time step : 0.1
-Per MPI rank memory allocation (min/avg/max) = 215.0 | 215.0 | 215.0 Mbytes
-Step Temp PotEng Press E_vdwl E_coul Volume
- 0 300 -113.27833 437.52122 -111.57687 -1.7014647 27418.867
- 10 299.38517 -113.27631 1439.2857 -111.57492 -1.7013813 27418.867
- 20 300.27107 -113.27884 3764.3739 -111.57762 -1.7012246 27418.867
- 30 302.21063 -113.28428 7007.6914 -111.58335 -1.7009363 27418.867
- 40 303.52265 -113.28799 9844.84 -111.58747 -1.7005186 27418.867
- 50 301.87059 -113.28324 9663.0443 -111.58318 -1.7000524 27418.867
- 60 296.67807 -113.26777 7273.7928 -111.56815 -1.6996137 27418.867
- 70 292.19999 -113.25435 5533.6428 -111.55514 -1.6992157 27418.867
- 80 293.58677 -113.25831 5993.4151 -111.55946 -1.6988533 27418.867
- 90 300.62636 -113.27925 7202.8651 -111.58069 -1.6985591 27418.867
- 100 305.38276 -113.29357 10085.748 -111.59518 -1.6983875 27418.867
-Loop time of 29.6229 on 1 procs for 100 steps with 2432 atoms
-
-Performance: 0.029 ns/day, 822.859 hours/ns, 3.376 timesteps/s
-94.4% CPU use with 1 MPI tasks x 1 OpenMP threads
-
-MPI task timing breakdown:
-Section | min time | avg time | max time |%varavg| %total
----------------------------------------------------------------
-Pair | 22.175 | 22.175 | 22.175 | 0.0 | 74.86
-Neigh | 0.63724 | 0.63724 | 0.63724 | 0.0 | 2.15
-Comm | 0.0097153 | 0.0097153 | 0.0097153 | 0.0 | 0.03
-Output | 0.00041342 | 0.00041342 | 0.00041342 | 0.0 | 0.00
-Modify | 6.799 | 6.799 | 6.799 | 0.0 | 22.95
-Other | | 0.001424 | | | 0.00
-
-Nlocal: 2432.00 ave 2432 max 2432 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Nghost: 10685.0 ave 10685 max 10685 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Neighs: 823958.0 ave 823958 max 823958 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-
-Total # of neighbors = 823958
-Ave neighs/atom = 338.79852
-Neighbor list builds = 5
-Dangerous builds not checked
-Total wall time: 0:00:30
diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-4-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-4-minicluster-size-2/log.out
deleted file mode 100644
index 6c889f5..0000000
--- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-4-minicluster-size-2/log.out
+++ /dev/null
@@ -1,80 +0,0 @@
-LAMMPS (29 Sep 2021 - Update 2)
-OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98)
- using 1 OpenMP thread(s) per MPI task
-Reading data file ...
- triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000)
- 1 by 1 by 1 MPI processor grid
- reading atoms ...
- 304 atoms
- reading velocities ...
- 304 velocities
- read_data CPU = 0.002 seconds
-Replicating atoms ...
- triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000)
- 1 by 1 by 1 MPI processor grid
- bounding box image = (0 -1 -1) to (0 1 1)
- bounding box extra memory = 0.03 MB
- average # of replicas added to proc = 8.00 out of 8 (100.00%)
- 2432 atoms
- replicate CPU = 0.001 seconds
-Neighbor list info ...
- update every 20 steps, delay 0 steps, check no
- max neighbors/atom: 2000, page size: 100000
- master list distance cutoff = 11
- ghost atom cutoff = 11
- binsize = 5.5, bins = 10 5 6
- 2 neighbor lists, perpetual/occasional/extra = 2 0 0
- (1) pair reax/c, perpetual
- attributes: half, newton off, ghost
- pair build: half/bin/newtoff/ghost
- stencil: full/ghost/bin/3d
- bin: standard
- (2) fix qeq/reax, perpetual, copy from (1)
- attributes: half, newton off, ghost
- pair build: copy
- stencil: none
- bin: none
-Setting up Verlet run ...
- Unit style : real
- Current step : 0
- Time step : 0.1
-Per MPI rank memory allocation (min/avg/max) = 215.0 | 215.0 | 215.0 Mbytes
-Step Temp PotEng Press E_vdwl E_coul Volume
- 0 300 -113.27833 437.52122 -111.57687 -1.7014647 27418.867
- 10 299.38517 -113.27631 1439.2857 -111.57492 -1.7013813 27418.867
- 20 300.27107 -113.27884 3764.3739 -111.57762 -1.7012246 27418.867
- 30 302.21063 -113.28428 7007.6914 -111.58335 -1.7009363 27418.867
- 40 303.52265 -113.28799 9844.84 -111.58747 -1.7005186 27418.867
- 50 301.87059 -113.28324 9663.0443 -111.58318 -1.7000524 27418.867
- 60 296.67807 -113.26777 7273.7928 -111.56815 -1.6996137 27418.867
- 70 292.19999 -113.25435 5533.6428 -111.55514 -1.6992157 27418.867
- 80 293.58677 -113.25831 5993.4151 -111.55946 -1.6988533 27418.867
- 90 300.62636 -113.27925 7202.8651 -111.58069 -1.6985591 27418.867
- 100 305.38276 -113.29357 10085.748 -111.59518 -1.6983875 27418.867
-Loop time of 29.7805 on 1 procs for 100 steps with 2432 atoms
-
-Performance: 0.029 ns/day, 827.235 hours/ns, 3.358 timesteps/s
-94.2% CPU use with 1 MPI tasks x 1 OpenMP threads
-
-MPI task timing breakdown:
-Section | min time | avg time | max time |%varavg| %total
----------------------------------------------------------------
-Pair | 22.214 | 22.214 | 22.214 | 0.0 | 74.59
-Neigh | 0.62414 | 0.62414 | 0.62414 | 0.0 | 2.10
-Comm | 0.01756 | 0.01756 | 0.01756 | 0.0 | 0.06
-Output | 0.00041921 | 0.00041921 | 0.00041921 | 0.0 | 0.00
-Modify | 6.9226 | 6.9226 | 6.9226 | 0.0 | 23.25
-Other | | 0.00152 | | | 0.01
-
-Nlocal: 2432.00 ave 2432 max 2432 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Nghost: 10685.0 ave 10685 max 10685 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Neighs: 823958.0 ave 823958 max 823958 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-
-Total # of neighbors = 823958
-Ave neighs/atom = 338.79852
-Neighbor list builds = 5
-Dangerous builds not checked
-Total wall time: 0:00:30
diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-5-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-5-minicluster-size-2/log.out
deleted file mode 100644
index 9c9d4df..0000000
--- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-5-minicluster-size-2/log.out
+++ /dev/null
@@ -1,80 +0,0 @@
-LAMMPS (29 Sep 2021 - Update 2)
-OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98)
- using 1 OpenMP thread(s) per MPI task
-Reading data file ...
- triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000)
- 1 by 1 by 1 MPI processor grid
- reading atoms ...
- 304 atoms
- reading velocities ...
- 304 velocities
- read_data CPU = 0.002 seconds
-Replicating atoms ...
- triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000)
- 1 by 1 by 1 MPI processor grid
- bounding box image = (0 -1 -1) to (0 1 1)
- bounding box extra memory = 0.03 MB
- average # of replicas added to proc = 8.00 out of 8 (100.00%)
- 2432 atoms
- replicate CPU = 0.001 seconds
-Neighbor list info ...
- update every 20 steps, delay 0 steps, check no
- max neighbors/atom: 2000, page size: 100000
- master list distance cutoff = 11
- ghost atom cutoff = 11
- binsize = 5.5, bins = 10 5 6
- 2 neighbor lists, perpetual/occasional/extra = 2 0 0
- (1) pair reax/c, perpetual
- attributes: half, newton off, ghost
- pair build: half/bin/newtoff/ghost
- stencil: full/ghost/bin/3d
- bin: standard
- (2) fix qeq/reax, perpetual, copy from (1)
- attributes: half, newton off, ghost
- pair build: copy
- stencil: none
- bin: none
-Setting up Verlet run ...
- Unit style : real
- Current step : 0
- Time step : 0.1
-Per MPI rank memory allocation (min/avg/max) = 215.0 | 215.0 | 215.0 Mbytes
-Step Temp PotEng Press E_vdwl E_coul Volume
- 0 300 -113.27833 437.52122 -111.57687 -1.7014647 27418.867
- 10 299.38517 -113.27631 1439.2857 -111.57492 -1.7013813 27418.867
- 20 300.27107 -113.27884 3764.3739 -111.57762 -1.7012246 27418.867
- 30 302.21063 -113.28428 7007.6914 -111.58335 -1.7009363 27418.867
- 40 303.52265 -113.28799 9844.84 -111.58747 -1.7005186 27418.867
- 50 301.87059 -113.28324 9663.0443 -111.58318 -1.7000524 27418.867
- 60 296.67807 -113.26777 7273.7928 -111.56815 -1.6996137 27418.867
- 70 292.19999 -113.25435 5533.6428 -111.55514 -1.6992157 27418.867
- 80 293.58677 -113.25831 5993.4151 -111.55946 -1.6988533 27418.867
- 90 300.62636 -113.27925 7202.8651 -111.58069 -1.6985591 27418.867
- 100 305.38276 -113.29357 10085.748 -111.59518 -1.6983875 27418.867
-Loop time of 30.0677 on 1 procs for 100 steps with 2432 atoms
-
-Performance: 0.029 ns/day, 835.214 hours/ns, 3.326 timesteps/s
-93.3% CPU use with 1 MPI tasks x 1 OpenMP threads
-
-MPI task timing breakdown:
-Section | min time | avg time | max time |%varavg| %total
----------------------------------------------------------------
-Pair | 22.337 | 22.337 | 22.337 | 0.0 | 74.29
-Neigh | 0.73472 | 0.73472 | 0.73472 | 0.0 | 2.44
-Comm | 0.009731 | 0.009731 | 0.009731 | 0.0 | 0.03
-Output | 0.00041722 | 0.00041722 | 0.00041722 | 0.0 | 0.00
-Modify | 6.9844 | 6.9844 | 6.9844 | 0.0 | 23.23
-Other | | 0.001495 | | | 0.00
-
-Nlocal: 2432.00 ave 2432 max 2432 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Nghost: 10685.0 ave 10685 max 10685 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Neighs: 823958.0 ave 823958 max 823958 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-
-Total # of neighbors = 823958
-Ave neighs/atom = 338.79852
-Neighbor list builds = 5
-Dangerous builds not checked
-Total wall time: 0:00:30
diff --git a/examples/up-submit-down/plot_results.py b/examples/up-submit-down/plot_results.py
deleted file mode 100644
index 6395f83..0000000
--- a/examples/up-submit-down/plot_results.py
+++ /dev/null
@@ -1,145 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import json
-import os
-import sys
-
-import matplotlib.pyplot as plt
-import pandas
-import seaborn as sns
-
-
-def read_json(filename):
- """
- Read a file into a text blob.
- """
- with open(filename, "r") as fd:
- content = json.loads(fd.read())
- return content
-
-
-def plot_outputs(raw, plotname, ext="pdf"):
- """
- Parse results.json into dataframe and plots to save.
- """
- # Let's save the following, with runid as index
- columns = ["minicluster_size", "job_type", "time_seconds", "time_type"]
-
- # Let's first organize distributions of times
- data = []
- index = []
- for jobname, item in raw["info"].items():
- index += [jobname, jobname, jobname]
- jobtype = jobname.split("-minicluster-size")[0].rsplit("-", 1)[0]
-
- # This is how flux-cloud organized the output
- minicluster_size = int(jobname.rsplit("size-", 1)[-1])
-
- # Manual melt :)
- data.append([minicluster_size, jobtype, item["runtime"], "runtime"])
- data.append(
- [
- minicluster_size,
- jobtype,
- item["start_to_output_seconds"],
- "output_seconds",
- ]
- )
- data.append(
- [minicluster_size, jobtype, item["start_to_info_seconds"], "info_seconds"]
- )
-
- # Assemble the data frame, index is the runids
- df = pandas.DataFrame(data, columns=columns)
- df.index = index
-
- # Save raw data
- df.to_csv("results-df.csv")
-
- # We need colors!
- colors = sns.color_palette("hls", 8)
- hexcolors = colors.as_hex()
-
- palette = {}
- for size in df.time_type.unique():
- palette[size] = hexcolors.pop(0)
-
- # Sort by size
- palette = dict(sorted(palette.items()))
-
- # Let's make a plot that shows distributions of the times by the cluster size, across all
- make_plot(
- df,
- title="Flux MiniCluster Time Variation",
- tag="minicluster_times",
- ydimension="time_seconds",
- palette=palette,
- ext=ext,
- plotname=plotname,
- )
-
-
-def make_plot(df, title, tag, ydimension, palette, ext="pdf", plotname="lammps"):
- """
- Helper function to make common plots.
- """
- ext = ext.strip(".")
- plt.figure(figsize=(12, 12))
- sns.set_style("dark")
- ax = sns.boxplot(
- x="job_type",
- y=ydimension,
- hue="time_type",
- data=df,
- whis=[5, 95],
- palette=palette,
- )
- plt.title(title)
- plt.legend([], [], frameon=False)
- ax.set_xlabel("Job Type", fontsize=16)
- ax.set_ylabel("Time (seconds)", fontsize=16)
- ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize=14)
- ax.set_yticklabels(ax.get_yticks(), fontsize=14)
- handles, _ = ax.get_legend_handles_labels()
- ax.legend(handles, list(palette))
- plt.savefig(f"{tag}_{plotname}.{ext}")
- plt.clf()
-
-
-def get_parser():
- """
- Process results file into plots.
- """
- parser = argparse.ArgumentParser(description="Plot LAMMPS outputs")
- parser.add_argument("results_json", help="results json file", nargs="?")
- parser.add_argument(
- "-p",
- "--plotname",
- default="lammps",
- help="base name for plot output files",
- )
- parser.add_argument(
- "-e",
- "--extension",
- dest="extension",
- default="pdf",
- help="image extension to use (defaults to pdf)",
- )
- return parser
-
-
-def main():
- """
- Read in results json, and make plots.
- """
- parser = get_parser()
- args = parser.parse_args()
- if not os.path.exists(args.results_json):
- sys.exit(f"{args.results_json} does not exist.")
- data = read_json(args.results_json)
- plot_outputs(data, args.plotname, ext=args.extension)
-
-
-if __name__ == "__main__":
- main()
diff --git a/fluxcloud/client/__init__.py b/fluxcloud/client/__init__.py
index 673a585..1007f64 100644
--- a/fluxcloud/client/__init__.py
+++ b/fluxcloud/client/__init__.py
@@ -130,11 +130,6 @@ def get_parser():
description="Bring the cluster up, run experiments via applying CRDs, and bring it down.",
formatter_class=argparse.RawTextHelpFormatter,
)
- ui = subparsers.add_parser(
- "ui",
- description="Once the cluster is up, create/open the user interface.",
- formatter_class=argparse.RawTextHelpFormatter,
- )
batch = subparsers.add_parser(
"batch",
description="Bring the cluster up, run experiments via a Flux Restful API submit, and bring it down.",
@@ -167,13 +162,38 @@ def get_parser():
help="Bring down all experiment clusters",
dest="down_all",
)
+ for command in submit, apply:
+ command.add_argument(
+ "--non-interactive",
+ "--ni",
+ default=False,
+ action="store_true",
+ help="Don't ask before bringing miniclusters down or re-creating.",
+ dest="non_interactive",
+ )
+
+ experiment = subparsers.add_parser(
+ "experiment",
+ description="Experiment controller.",
+ formatter_class=argparse.RawTextHelpFormatter,
+ )
+ experiment.add_argument(
+ "experiment_command",
+ help="Command for experiment (defaults to init)",
+ )
+ experiment.add_argument(
+ "-c",
+ "--cloud",
+ help="cloud to use",
+ choices=clouds.cloud_names,
+ )
listing = subparsers.add_parser(
"list",
description="List experiment ids available.",
formatter_class=argparse.RawTextHelpFormatter,
)
- for command in run, up, down, apply, listing, batch, submit, ui:
+ for command in run, up, down, apply, listing, batch, submit:
command.add_argument(
"experiments",
default="experiments.yaml",
@@ -188,7 +208,7 @@ def get_parser():
choices=clouds.cloud_names,
)
- for command in apply, up, down, run, batch, submit, ui:
+ for command in apply, up, down, run, batch, submit:
command.add_argument(
"--force-cluster",
dest="force_cluster",
@@ -228,11 +248,6 @@ def get_parser():
default=False,
action="store_true",
)
- command.add_argument(
- "--template",
- help="minicluster yaml template to populate for experiments (defaults to minicluster-template.yaml",
- default="minicluster-template.yaml",
- )
command.add_argument(
"--force",
help="force re-run if experiment already exists.",
@@ -287,22 +302,22 @@ def help(return_code=0):
# Does the user want a shell?
if args.command == "apply":
from .apply import main
- elif args.command == "submit":
- from .apply import submit as main
- elif args.command == "list":
- from .listing import main
- elif args.command == "run":
- from .run import main
elif args.command == "batch":
from .run import batch as main
elif args.command == "config":
from .config import main
- elif args.command == "ui":
- from .ui import main
- elif args.command == "up":
- from .up import main
elif args.command == "down":
from .down import main
+ elif args.command == "experiment":
+ from .experiment import main
+ elif args.command == "list":
+ from .listing import main
+ elif args.command == "run":
+ from .run import main
+ elif args.command == "submit":
+ from .apply import submit as main
+ elif args.command == "up":
+ from .up import main
# Pass on to the correct parser
return_code = 0
diff --git a/fluxcloud/client/apply.py b/fluxcloud/client/apply.py
index 13d1369..db6d0ee 100644
--- a/fluxcloud/client/apply.py
+++ b/fluxcloud/client/apply.py
@@ -11,7 +11,7 @@ def main(args, parser, extra, subparser):
apply parser submits via separate CRDs.
"""
cli, setup, experiment = prepare_client(args, extra)
- cli.apply(setup, experiment=experiment)
+ cli.apply(setup, experiment=experiment, interactive=not args.non_interactive)
setup.cleanup(setup.matrices)
@@ -20,5 +20,5 @@ def submit(args, parser, extra, subparser):
submit parser submits via the Flux Restful API to one cluster
"""
cli, setup, experiment = prepare_client(args, extra)
- cli.submit(setup, experiment=experiment)
+ cli.submit(setup, experiment=experiment, interactive=not args.non_interactive)
setup.cleanup(setup.matrices)
diff --git a/fluxcloud/client/experiment.py b/fluxcloud/client/experiment.py
new file mode 100644
index 0000000..0229135
--- /dev/null
+++ b/fluxcloud/client/experiment.py
@@ -0,0 +1,27 @@
+# Copyright 2023 Lawrence Livermore National Security, LLC and other
+# This is part of Flux Framework. See the COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import fluxcloud.main.template as templates
+from fluxcloud.logger import logger
+from fluxcloud.main import get_experiment_client
+
+
+def main(args, parser, extra, subparser):
+ """
+ apply parser submits via separate CRDs.
+ """
+ cli = get_experiment_client(args.cloud)
+ if args.experiment_command == "init":
+ if cli.name == "aws":
+ print(templates.aws_experiment_template)
+ elif cli.name in ["google", "gcp"]:
+ print(templates.google_experiment_template)
+ elif cli.name == "minikube":
+ print(templates.minikube_experiment_template)
+ else:
+ logger.error(f"Client {cli.name} is not a recognized cloud")
+
+ else:
+ logger.exit(f'{args.experiment_command} is not recognized. Try "init"')
diff --git a/fluxcloud/client/helpers.py b/fluxcloud/client/helpers.py
index d9973d4..1aba57c 100644
--- a/fluxcloud/client/helpers.py
+++ b/fluxcloud/client/helpers.py
@@ -17,11 +17,10 @@ def prepare_client(args, extra):
"""
utils.ensure_no_extra(extra)
- cli = get_experiment_client(args.cloud)
+ cli = get_experiment_client(args.cloud, debug=args.debug)
setup = ExperimentSetup(
args.experiments,
force_cluster=args.force_cluster,
- template=args.template,
cleanup=args.cleanup,
# Ensure the output directory is namespaced by the cloud name
outdir=os.path.join(args.output_dir, cli.name),
diff --git a/fluxcloud/client/ui.py b/fluxcloud/client/ui.py
deleted file mode 100644
index 40ae1d1..0000000
--- a/fluxcloud/client/ui.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright 2022 Lawrence Livermore National Security, LLC and other
-# This is part of Flux Framework. See the COPYRIGHT file for details.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-from fluxcloud.logger import logger
-
-from .helpers import prepare_client
-
-
-def main(args, parser, extra, subparser):
- """
- open the ui by starting flux
- """
- cli, setup, experiment = prepare_client(args, extra)
- size = args.size
- if not size and len(experiment.minicluster.get("size")) != 1:
- logger.exit(
- "Your MiniCluster has more than one size - please define the targer size with --size."
- )
- elif not size:
- size = experiment.minicluster["size"][0]
- logger.info(f"Selected size {size} MiniCluster to open user interface.")
- cli.open_ui(setup, experiment=experiment, size=size, persistent=True)
diff --git a/fluxcloud/defaults.py b/fluxcloud/defaults.py
index fb64369..d321073 100644
--- a/fluxcloud/defaults.py
+++ b/fluxcloud/defaults.py
@@ -13,9 +13,6 @@
# The default settings file in the install root
default_settings_file = os.path.join(reps["$install_dir"], "settings.yml")
-# Default template if one is not provided
-default_minicluster_template = os.path.join(install_dir, "minicluster-template.yaml")
-
# User home
userhome = os.path.expanduser("~/.fluxcloud")
diff --git a/fluxcloud/main/__init__.py b/fluxcloud/main/__init__.py
index 836c786..ce6c478 100644
--- a/fluxcloud/main/__init__.py
+++ b/fluxcloud/main/__init__.py
@@ -1,10 +1,10 @@
-# Copyright 2022 Lawrence Livermore National Security, LLC and other
+# Copyright 2022-2023 Lawrence Livermore National Security, LLC and other
# This is part of Flux Framework. See the COPYRIGHT file for details.
#
# SPDX-License-Identifier: Apache-2.0
-def get_experiment_client(cloud=None, force_cluster=False):
+def get_experiment_client(cloud=None, **kwargs):
"""
Create the cloud experiment client.
"""
@@ -19,4 +19,4 @@ def get_experiment_client(cloud=None, force_cluster=False):
cloud = clouds.get_cloud(cloud)
else:
cloud = clients.ExperimentClient
- return cloud(force_cluster=force_cluster)
+ return cloud(**kwargs)
diff --git a/fluxcloud/main/api.py b/fluxcloud/main/api.py
index 24c693e..860217d 100644
--- a/fluxcloud/main/api.py
+++ b/fluxcloud/main/api.py
@@ -3,167 +3,319 @@
#
# SPDX-License-Identifier: Apache-2.0
-import atexit
-import logging
import os
+import re
import shutil
-import subprocess
-import threading
import time
import uuid
from flux_restful_client.main import get_client
+from fluxoperator.client import FluxOperator
import fluxcloud.utils as utils
from fluxcloud.logger import logger
here = os.path.dirname(os.path.abspath(__file__))
-exit_event = threading.Event()
-
class APIClient:
- def __init__(self, token=None, user=None):
+ def __init__(self, token=None, user=None, secret_key=None):
"""
API client wrapper.
"""
- self.user = token or os.environ.get("FLUX_USER") or "fluxuser"
+ self.user = token or os.environ.get("FLUX_USER") or user or "fluxuser"
self.token = token or os.environ.get("FLUX_TOKEN") or str(uuid.uuid4())
- self.cli = get_client(user=self.user, token=self.token)
+ self.secret_key = (
+ secret_key or os.environ.get("FLUX_SECRET_KEY") or str(uuid.uuid4())
+ )
self.proc = None
self.broker_pod = None
- def check(self, experiment):
+ def show_credentials(self):
"""
- Set the basic auth for username and password and check it works
+ Show the token and user, if requested.
"""
- minicluster = experiment.minicluster
- get_broker_pod = experiment.get_shared_script(
- "broker-id", {"minicluster": minicluster}
- )
-
- logger.info("Waiting for id of running broker pod...")
-
- # We've already waited for them to be running
- broker_pod = None
- while not broker_pod:
- result = utils.run_capture(["/bin/bash", get_broker_pod], stream=True)
-
- # Save the broker pod, or exit on failure.
- if result["message"]:
- broker_pod = result["message"].strip()
-
- self.broker_pod = broker_pod
- self.port_forward(minicluster["namespace"], self.broker_pod)
+ logger.info("MiniCluster created with credentials:")
+ logger.info(f" FLUX_USER={self.user}")
+ logger.info(f" FLUX_TOKEN={self.token}")
- def port_forward(self, namespace, broker_pod):
+ def _set_minicluster_credentials(self, minicluster):
"""
- Ask user to open port to forward
+ If the user provided credentials, use
"""
- command = ["kubectl", "port-forward", "-n", namespace, broker_pod, "5000:5000"]
+ if "flux_restful" not in minicluster:
+ minicluster["flux_restful"] = {}
- # This is detached - we can kill but not interact
- logger.info(" ".join(command))
- self.proc = proc = subprocess.Popen(
- command,
- stdout=subprocess.DEVNULL if logger.level >= logging.DEBUG else None,
- )
+ if "username" not in minicluster["flux_restful"]:
+ minicluster["flux_restful"]["username"] = self.user
+
+ if "token" not in minicluster["flux_restful"]:
+ minicluster["flux_restful"]["token"] = self.token
- def cleanup():
- proc.kill()
+ if "secret_key" not in minicluster["flux_restful"]:
+ minicluster["flux_restful"]["secret_key"] = self.secret_key
- # Ensure we cleanup if anything goes wrong
- atexit.register(cleanup)
+ # Update credentials
+ self.user = minicluster["flux_restful"]["username"]
+ self.token = minicluster["flux_restful"]["token"]
+ self.secret_key = minicluster["flux_restful"]["secret_key"]
+ return minicluster
- def submit(self, setup, experiment, size):
+ def _create_minicluster(
+ self, operator, minicluster, experiment, job, interactive=True
+ ):
"""
- Use the client to submit the jobs programatically.
+ Shared function to take an operator handle and create the minicluster.
+
+ This can be used for apply or submit! We separate minicluster (gets
+ piped into the MiniClusterSpec) from job (gets piped into a
+ MiniClusterContainer spec).
"""
- # Submit jobs!
-
- # Sleep time will be time of last job, assuming they are similar
- sleep_time = 5
- for jobname, job in experiment.jobs.items():
- # Do we want to run this job for this size and machine?
- if not experiment.check_job_run(job, size):
- logger.debug(
- f"Skipping job {jobname} as does not match inclusion criteria."
- )
- continue
+ namespace = minicluster["namespace"]
+ image = job["image"]
+ name = minicluster["name"]
+ size = minicluster["size"]
+
+ self._set_minicluster_credentials(minicluster)
+
+ try:
+ # The operator will time creation through pods being ready
+ result = operator.create_minicluster(**minicluster, container=job)
+ except Exception as e:
+ # Give the user the option to delete and recreate or just exit
+ logger.error(f"There was an issue creating the MiniCluster: {e}")
+ if interactive and not utils.confirm_action(
+ "Would you like to submit jobs to the current cluster? You will need to have provided the same username as password."
+ ):
+ if utils.confirm_action(
+ "Would you like to delete this mini cluster and re-create?"
+ ):
+ logger.info("Cleaning up MiniCluster...")
+ operator.delete_minicluster(name=name, namespace=namespace)
+ return self._create_minicluster(
+ operator, minicluster, experiment, job, interactive=interactive
+ )
+ else:
+ logger.exit(
+ f"Try: 'kubectl delete -n {namespace} minicluster {name}'"
+ )
+ elif not interactive:
+ logger.exit(f"Try: 'kubectl delete -n {namespace} minicluster {name}'")
+ return
+
+ # Wait for pods to be ready to include in minicluster up time
+ self.show_credentials()
+
+ # Save MiniCluster metadata
+ image_slug = re.sub("(:|/)", "-", image)
+ uid = f"{size}-{name}-{image_slug}"
+ experiment.save_json(result, f"minicluster-size-{uid}.json")
+
+ # This is a good point to also save nodes metadata
+ nodes = operator.get_nodes()
+ operator.wait_pods(quiet=True)
+ pods = operator.get_pods()
+
+ experiment.save_file(nodes.to_str(), f"nodes-{uid}.json")
+ experiment.save_file(pods.to_str(), f"pods-size-{uid}.json")
+ return result
+
+ def apply(
+ self,
+ experiment,
+ minicluster,
+ job=None,
+ outfile=None,
+ stdout=True,
+ interactive=True,
+ ):
+ """
+ Use the client to apply (1:1 job,minicluster) the jobs programatically.
+ """
+ namespace = minicluster["namespace"]
+ name = minicluster["name"]
- if "command" not in job:
- logger.debug(f"Skipping job {jobname} as does not have a command.")
- continue
+ # Interact with the Flux Operator Python SDK
+ operator = FluxOperator(namespace)
- # The experiment is defined by the machine type and size
- experiment_dir = experiment.root_dir
+ self._create_minicluster(
+ operator, minicluster, experiment, job, interactive=interactive
+ )
- # Add the size
- jobname = f"{jobname}-minicluster-size-{size}"
- job_output = os.path.join(experiment_dir, jobname)
- logfile = os.path.join(job_output, "log.out")
+ # Get the broker pod (this would also wait for all pods to be ready)
+ broker = operator.get_broker_pod()
- # Do we have output?
- if os.path.exists(logfile) and not setup.force:
- relpath = os.path.relpath(logfile, experiment_dir)
- logger.warning(
- f"{relpath} already exists and force is False, skipping."
- )
- continue
-
- elif os.path.exists(logfile) and setup.force:
- logger.warning(f"Cleaning up previous run in {job_output}.")
- shutil.rmtree(job_output)
-
- # Create job directory anew
- utils.mkdir_p(job_output)
-
- kwargs = dict(job)
- del kwargs["command"]
-
- # Assume the task gets all nodes, unless specified in job
- # Also assume the flux restful server is using one node
- if "nodes" not in kwargs:
- kwargs["nodes"] = size - 1
- if "tasks" not in kwargs:
- kwargs["tasks"] = size - 1
-
- # Ensure we convert - map between job params and the flux restful api
- for convert in (
- ["num_tasks", "tasks"],
- ["cores_per_task", "cores"],
- ["gpus_per_task", "gpus"],
- ["num_nodes", "nodes"],
- ):
- if convert[1] in kwargs:
- kwargs[convert[0]] = kwargs[convert[1]]
+ # Time from when broker pod (and all pods are ready)
+ start = time.time()
- # Let's also keep track of actual time to get logs, info, etc.
- start = time.time()
+ # Get the pod to stream output from directly
+ if outfile is not None:
+ operator.stream_output(outfile, pod=broker, stdout=stdout)
- # Run and block output until job is done
- res = self.cli.submit(command=job["command"], **kwargs)
+ # When output done streaming, job is done
+ end = time.time()
+ logger.info(f"Job {name} is complete! Cleaning up MiniCluster...")
- logger.info(f"Submitting {jobname}: {job['command']}")
- info = self.cli.jobs(res["id"])
+ # This also waits for termination (and pods to be gone) and times it
+ operator.delete_minicluster(name=name, namespace=namespace)
- while info["returncode"] == "":
- info = self.cli.jobs(res["id"])
- time.sleep(sleep_time)
+ # TODO likely need to separate minicluster up/down times.
+ results = {"times": operator.times}
+ results["times"][name] = end - start
+ return results
- end1 = time.time()
- output = self.cli.output(res["id"]).get("Output")
- if output:
- utils.write_file("".join(output), logfile)
- end2 = time.time()
+ def submit(
+ self, setup, experiment, minicluster, job, poll_seconds=20, interactive=True
+ ):
+ """
+ Use the client to submit the jobs programatically.
+ """
+ namespace = minicluster["namespace"]
+ image = job["image"]
+ name = minicluster["name"]
+ size = minicluster["size"]
+
+ # Interact with the Flux Operator Python SDK
+ operator = FluxOperator(namespace)
- # Get the full job info, and add some wrapper times
- info = self.cli.jobs(res["id"])
- info["start_to_info_seconds"] = end1 - start
- info["start_to_output_seconds"] = end2 - start
+ self._create_minicluster(
+ operator, minicluster, experiment, job, interactive=interactive
+ )
- yield jobname, info
- sleep_time = info["runtime"]
+ # Get the broker pod (this would also wait for all pods to be ready)
+ broker = operator.get_broker_pod()
+
+ # Return results (and times) to calling client
+ results = {}
+
+ # Submit jobs via port forward - this waits until the server is ready
+ with operator.port_forward(broker) as forward_url:
+ print(f"Port forward opened to {forward_url}")
+
+ # See https://flux-framework.org/flux-restful-api/getting_started/api.html
+ cli = get_client(
+ host=forward_url,
+ user=self.user,
+ token=self.token,
+ secret_key=self.secret_key,
+ )
+ cli.set_basic_auth(self.user, self.token)
+
+ # Keep a lookup of jobid and output files.
+ # We will try waiting for all jobs to finish and then save output
+ jobs = []
+ for jobname, job in experiment.jobs.items():
+ # Do we want to run this job for this size, image?
+ if not experiment.check_job_run(job, size=size, image=image):
+ logger.debug(
+ f"Skipping job {jobname} as does not match inclusion criteria."
+ )
+ continue
+
+ if "command" not in job:
+ logger.debug(f"Skipping job {jobname} as does not have a command.")
+ continue
+
+ # Here we submit all jobs to the scheduler. Let the scheduler handle it!
+ submit_job = self.submit_job(
+ cli, experiment, setup, minicluster, job, jobname
+ )
+ if not submit_job:
+ continue
+ jobs.append(submit_job)
+
+ logger.info(f"Submit {len(jobs)} jobs! Waiting for completion...")
+
+ # Poll once every 30 seconds
+ # This could be improved with some kind of notification / pubsub thing
+ completed = []
+ while jobs:
+ logger.info(f"{len(jobs)} are active.")
+ time.sleep(poll_seconds)
+ unfinished = []
+ for job in jobs:
+ if "id" not in job:
+ logger.warning(
+ f"Job {job} is missing an id or name, likely an issue or not ready, skipping."
+ )
+ continue
+
+ info = cli.jobs(job["id"])
+
+ # If we don't have a name yet, it's still pending
+ if "name" not in info:
+ unfinished.append(job)
+ continue
+
+ jobname = info["name"].rjust(15)
+ if info["state"] == "INACTIVE":
+ finish_time = round(info["runtime"], 2)
+ logger.debug(
+ f"{jobname} is finished {info['result']} in {finish_time} seconds."
+ )
+ job["info"] = info
+ job["output"] = cli.output(job["id"]).get("Output")
+ completed.append(job)
+ else:
+ logger.debug(f"{jobname} is in state {info['state']}")
+ unfinished.append(job)
+ jobs = unfinished
+
+ logger.info("All jobs are complete!")
+
+ # This also waits for termination (and pods to be gone) and times it
+ if not interactive or utils.confirm_action(
+ "Would you like to delete this mini cluster?"
+ ):
+ logger.info("Cleaning up MiniCluster...")
+ operator.delete_minicluster(name=name, namespace=namespace)
+
+ # Get times recorded by FluxOperator Python SDK
+ results["jobs"] = completed
+ results["times"] = operator.times
+ return results
+
+ def submit_job(self, cli, experiment, setup, minicluster, job, jobname):
+ """
+ Submit the job (if appropriate for the minicluster)
- # Kill the connection to the service
- self.proc.kill()
+ Return an appended Flux Restful API job result with the expected
+ output file.
+ """
+ # The experiment is defined by the machine type and size
+ experiment_dir = experiment.root_dir
+
+ jobname = f"{jobname}-minicluster-size-{minicluster['size']}"
+ job_output = os.path.join(experiment_dir, jobname)
+ logfile = os.path.join(job_output, "log.out")
+
+ # Do we have output?
+ if os.path.exists(logfile) and not setup.force:
+ relpath = os.path.relpath(logfile, experiment_dir)
+ logger.warning(f"{relpath} already exists and force is False, skipping.")
+ return
+
+ if os.path.exists(logfile) and setup.force:
+ logger.warning(f"Cleaning up previous run in {job_output}.")
+ shutil.rmtree(job_output)
+
+ kwargs = dict(job)
+ del kwargs["command"]
+
+ # Ensure we convert - map between job params and the flux restful api
+ for convert in (
+ ["num_tasks", "tasks"],
+ ["cores_per_task", "cores"],
+ ["gpus_per_task", "gpus"],
+ ["num_nodes", "nodes"],
+ ["workdir", "working_dir"],
+ ):
+ if convert[1] in kwargs:
+ kwargs[convert[0]] = kwargs[convert[1]]
+ del kwargs[convert[1]]
+
+ # Submit the job, add the expected output file, and return
+ logger.info(f"Submitting {jobname}: {job['command']}")
+ res = cli.submit(command=job["command"], **kwargs)
+ res["job_output"] = logfile
+ return res
diff --git a/fluxcloud/main/client.py b/fluxcloud/main/client.py
index e5a8db5..49cdbb8 100644
--- a/fluxcloud/main/client.py
+++ b/fluxcloud/main/client.py
@@ -3,13 +3,13 @@
#
# SPDX-License-Identifier: Apache-2.0
+import copy
import os
import shutil
-import time
+import fluxcloud.main.api as api
import fluxcloud.utils as utils
from fluxcloud.logger import logger
-from fluxcloud.main.api import APIClient
from fluxcloud.main.decorator import save_meta, timed
here = os.path.dirname(os.path.abspath(__file__))
@@ -26,9 +26,10 @@ def __init__(self, *args, **kwargs):
self.settings = settings.Settings
self.info = {}
self.times = {}
+ self.debug = kwargs.get("debug", False)
# Job prefix is used for organizing time entries
- self.job_prefix = "minicluster-run"
+ self.job_prefix = "job_"
def __repr__(self):
return str(self)
@@ -67,7 +68,7 @@ def run(self, setup):
# Each experiment has its own cluster size and machine type
for experiment in setup.iter_experiments():
self.up(setup, experiment=experiment)
- self.apply(setup, experiment=experiment)
+ self.apply(setup, experiment=experiment, interactive=False)
self.down(setup, experiment=experiment)
@save_meta
@@ -82,7 +83,7 @@ def batch(self, setup):
# Each experiment has its own cluster size and machine type
for experiment in setup.iter_experiments():
self.up(setup, experiment=experiment)
- self.submit(setup, experiment=experiment)
+ self.submit(setup, experiment=experiment, interactive=False)
self.down(setup, experiment=experiment)
@save_meta
@@ -93,81 +94,7 @@ def down(self, *args, **kwargs):
raise NotImplementedError
@save_meta
- def open_ui(self, setup, experiment, size, api=None, persistent=False):
- """
- Launch a CRD that opens the UI only.
- """
- # The MiniCluster can vary on size
- minicluster = experiment.minicluster
-
- # Create a FluxRestful API to submit to
- created = False
- if api is None:
- api = APIClient()
- created = True
-
- logger.info(f"\n🌀 Bringing up MiniCluster of size {size}")
-
- # Get persistent variables for this job size, image is required
- job = experiment.get_persistent_variables(size, required=["image"])
- job.update({"token": api.token, "user": api.user})
-
- # We can't have a command
- if "command" in job:
- del job["command"]
-
- # Pre-pull containers, etc.
- if hasattr(self, "pre_apply"):
- self.pre_apply(experiment, "global-job", job=job)
-
- # Create the minicluster via a CRD without a command
- crd = experiment.generate_crd(job, size)
-
- # Create one MiniCluster CRD (without a command) to run the Flux Restful API
- kwargs = {
- "minicluster": minicluster,
- "crd": crd,
- "token": api.token,
- "user": api.user,
- "size": size,
- }
- submit_script = experiment.get_shared_script(
- "minicluster-create-persistent", kwargs, suffix=f"-size-{size}"
- )
- # Start the MiniCluster! This should probably be done better...
- self.run_timed(
- f"minicluster-create-persistent-size-{size}", ["/bin/bash", submit_script]
- )
-
- # Ensure our credentials still work, and open port forward
- api.check(experiment)
- logger.info(f"\n🌀 MiniCluster of size {size} is up.\n")
-
- # If created for the first time, show credentials
- if created:
- logger.info(
- "Save these if you want to log into the Flux RESTFul interface, there are specific to the MiniCluster"
- )
- logger.info(f"export FLUX_USER={api.user}")
- logger.info(f"export FLUX_TOKEN={api.token}")
-
- # If we exit, the port forward will close.
- if persistent:
- try:
- logger.info("Press Control+c to Disconnect.")
- while True:
- time.sleep(10)
- except KeyboardInterrupt:
- logger.info("🧽️ Cleaning up!")
- self.run_timed(
- f"minicluster-persistent-destroy-size-{size}",
- ["kubectl", "delete", "-f", crd],
- )
-
- return api, kwargs
-
- @save_meta
- def submit(self, setup, experiment):
+ def submit(self, setup, experiment, interactive=True):
"""
Submit a Job via the Restful API
"""
@@ -177,8 +104,6 @@ def submit(self, setup, experiment):
)
return
- api = None
-
# Iterate through all the cluster sizes
for size in experiment.minicluster["size"]:
# We can't run if the minicluster > the experiment size
@@ -188,24 +113,49 @@ def submit(self, setup, experiment):
)
continue
- # Open the api for the size
- api, uiattrs = self.open_ui(setup, experiment, size, api)
- logger.info(f"\n🌀 Bringing up MiniCluster of size {size}")
+ # Launch a unique Minicluster per container image. E.g.,
+ # if the user provides 2 images for size 4, we create two MiniClusters
+ # This will provide all shared volumes across the jobs
+ for minicluster, job in experiment.get_submit_miniclusters(size):
+ logger.info(
+ f"\n🌀 Bringing up MiniCluster of size {size} with image {job['image']}"
+ )
+
+ # Create the API client (creates the user and token for the cluster)
+ cli = api.APIClient()
- # Save times (and logs in submit) as we go
- for jobid, info in api.submit(setup, experiment, size):
- logger.info(f"{jobid} took {info['runtime']} seconds.")
- self.times[jobid] = info["runtime"]
- self.info[jobid] = info
+ # Pre-pull containers, etc.
+ if hasattr(self, "pre_apply"):
+ self.pre_apply(experiment, minicluster["name"], job=job)
- logger.info(f"\n🌀 MiniCluster of size {size} is finished")
- self.run_timed(
- f"minicluster-persistent-destroy-size-{size}",
- ["kubectl", "delete", "-f", uiattrs["crd"]],
- )
+ # Get back results with times (for minicluster assets) and jobs
+ results = cli.submit(
+ setup, experiment, minicluster, job=job, interactive=interactive
+ )
+
+ # Save times and output files for jobs
+ for job in results.get("jobs", []):
+ self.save_job(job)
+
+ def save_job(self, job):
+ """
+ Save the job and add times to our times listing.
+ """
+ jobid = f"{self.job_prefix}{job['id']}"
+ self.times[jobid] = job["info"]["runtime"]
+
+ # Do we have an output file and output?
+ if job["output"]:
+ # Save to our output directory!
+ logfile = job["job_output"]
+ utils.mkdir_p(os.path.dirname(logfile))
+ utils.write_file(job["output"], logfile)
+
+ del job["output"]
+ self.info[jobid] = job
@save_meta
- def apply(self, setup, experiment):
+ def apply(self, setup, experiment, interactive=True):
"""
Apply a CRD to run the experiment and wait for output.
@@ -246,22 +196,24 @@ def apply(self, setup, experiment):
# Create job directory anew
utils.mkdir_p(job_output)
- # Generate the populated crd from the template
- crd = experiment.generate_crd(job, size)
-
- # Prepare specific .crd for template
- # Note the output directory is already specific to the job index
- kwargs = {
- "minicluster": experiment.minicluster,
- "logfile": logfile,
- "crd": crd,
- }
- apply_script = experiment.get_shared_script(
- "minicluster-run", kwargs, suffix=f"-{jobname}"
+ # Prepare the client for one minicluster
+ cli = api.APIClient()
+
+ # Prepare a specific MiniCluster for this size
+ minicluster = copy.deepcopy(experiment.minicluster)
+ minicluster["size"] = size
+
+ # Get back results with times (for minicluster assets) and jobs
+ # If debug level, print job output to terminal too :)
+ results = cli.apply(
+ experiment=experiment,
+ minicluster=minicluster,
+ outfile=logfile,
+ stdout=self.debug,
+ job=job,
+ interactive=interactive,
)
-
- # Apply the job, and save to output directory
- self.run_timed(f"{self.job_prefix}-{jobname}", ["/bin/bash", apply_script])
+ self.times[jobname] = results["times"]
# Save times between experiment runs
experiment.save_metadata(self.times, self.info)
diff --git a/fluxcloud/main/clouds/aws/scripts/cluster-create b/fluxcloud/main/clouds/aws/scripts/cluster-create
index 3a3cd8e..14855d5 100755
--- a/fluxcloud/main/clouds/aws/scripts/cluster-create
+++ b/fluxcloud/main/clouds/aws/scripts/cluster-create
@@ -5,6 +5,7 @@
# Defaults - these are in the config but left here for information
CLUSTER_NAME="{% if experiment.cluster_name %}{{ experiment.cluster_name }}{% else %}flux-cluster{% endif %}"
+NAMESPACE="{% if experiment.minicluster_namespace %}{{ experiment.minicluster_namespace }}{% else %}flux-operator{% endif %}"
REGION="{% if region %}{{ region }}{% else %}us-east-1{% endif %}"
CLUSTER_VERSION="{% if experiment.kubernetes_version %}{{ experiment.kubernetes_version }}{% else %}1.23{% endif %}"
MACHINE_TYPE="{% if experiment.machine %}{{ experiment.machine }}{% else %}m5.large{% endif %}"
@@ -33,6 +34,7 @@ if [ -z ${MACHINE_TYPE+x} ]; then
exit 1
fi
+print_magenta " namespace: ${NAMESPACE}"
print_magenta " cluster : ${CLUSTER_NAME}"
print_magenta " version : ${CLUSTER_VERSION}"
print_magenta " machine : ${MACHINE_TYPE}"
@@ -64,6 +66,8 @@ run_echo eksctl create cluster -f ${CONFIG_FILE}
# Deploy the operator
install_operator ${SCRIPT_DIR} ${REPOSITORY} ${BRANCH}
+
+run_echo kubectl create namespace ${NAMESPACE} || true
run_echo kubectl get namespace
run_echo kubectl describe namespace operator-system
diff --git a/fluxcloud/main/clouds/google/scripts/cluster-create b/fluxcloud/main/clouds/google/scripts/cluster-create
index c33b78f..51c2562 100755
--- a/fluxcloud/main/clouds/google/scripts/cluster-create
+++ b/fluxcloud/main/clouds/google/scripts/cluster-create
@@ -5,6 +5,7 @@
# Defaults - these are in the config but left here for information
CLUSTER_NAME="{% if experiment.cluster_name %}{{ experiment.cluster_name }}{% else %}flux{% endif %}"
+NAMESPACE="{% if experiment.minicluster_namespace %}{{ experiment.minicluster_namespace }}{% else %}flux-operator{% endif %}"
ZONE="{% if zone %}{{ zone }}{% else %}us-central1-a{% endif %}"
CLUSTER_VERSION="{% if experiment.kubernetes_version %}{{ experiment.kubernetes_version }}{% else %}1.23{% endif %}"
MACHINE_TYPE="{% if experiment.machine %}{{ experiment.machine }}{% else %}n1-standard-1{% endif %}"
@@ -32,6 +33,7 @@ if [ -z ${MACHINE_TYPE+x} ]; then
exit 1
fi
+print_magenta " namespace: ${NAMESPACE}"
print_magenta " cluster : ${CLUSTER_NAME}"
print_magenta " version : ${CLUSTER_VERSION}"
print_magenta " project : ${GOOGLE_PROJECT}"
@@ -74,7 +76,7 @@ run_echo kubectl get nodes
# Deploy the operator
mkdir -p ${SCRIPT_DIR}
install_operator ${SCRIPT_DIR} ${REPOSITORY} ${BRANCH}
-
+run_echo kubectl create namespace ${NAMESPACE} || true
run_echo kubectl get namespace
run_echo kubectl describe namespace operator-system
diff --git a/fluxcloud/main/clouds/local/scripts/cluster-create-minikube b/fluxcloud/main/clouds/local/scripts/cluster-create-minikube
index 35fb21c..116047d 100755
--- a/fluxcloud/main/clouds/local/scripts/cluster-create-minikube
+++ b/fluxcloud/main/clouds/local/scripts/cluster-create-minikube
@@ -5,6 +5,7 @@
# Defaults - these are in the config but left here for information
CLUSTER_NAME="{% if experiment.cluster_name %}{{ experiment.cluster_name }}{% else %}flux-cluster{% endif %}"
+NAMESPACE="{% if experiment.minicluster_namespace %}{{ experiment.minicluster_namespace }}{% else %}flux-operator{% endif %}"
CLUSTER_VERSION="{% if experiment.kubernetes_version %}{{ experiment.kubernetes_version }}{% else %}1.23{% endif %}"
FORCE_CLUSTER="{% if setup.force_cluster %}true{% else %}false{% endif %}"
SIZE={% if experiment.size %}{{ experiment.size }}{% else %}4{% endif %}
@@ -12,6 +13,7 @@ REPOSITORY="{% if experiment.operator_repository %}{{ experiment.operator_reposi
BRANCH="{% if experiment.operator_branch %}{{ experiment.operator_branch }}{% else %}main{% endif %}"
SCRIPT_DIR="{{ experiment.script_dir }}"
+print_magenta " namespace: ${NAMESPACE}"
print_magenta " cluster : ${CLUSTER_NAME}"
print_magenta " version : ${CLUSTER_VERSION}"
print_magenta " size : ${SIZE}"
@@ -51,7 +53,7 @@ install_operator ${SCRIPT_DIR} ${REPOSITORY} ${BRANCH}
# Show nodes
run_echo kubectl get nodes
-
+run_echo kubectl create namespace ${NAMESPACE} || true
run_echo kubectl get namespace
run_echo kubectl describe namespace operator-system
save_versions ${SCRIPT_DIR} ${SIZE}
diff --git a/fluxcloud/main/clouds/shared/scripts/broker-id b/fluxcloud/main/clouds/shared/scripts/broker-id
deleted file mode 100755
index a45ba8c..0000000
--- a/fluxcloud/main/clouds/shared/scripts/broker-id
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-NAMESPACE="{% if minicluster.namespace %}{{ minicluster.namespace }}{% else %}flux-operator{% endif %}"
-JOB="{{ minicluster.name }}"
-brokerPrefix="${JOB}-0"
-
-for pod in $(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=jsonpath='{.items[*].metadata.name}'); do
- if [[ "${pod}" == ${brokerPrefix}* ]]; then
- echo ${pod}
- break
- fi
-done
diff --git a/fluxcloud/main/clouds/shared/scripts/minicluster-create-persistent b/fluxcloud/main/clouds/shared/scripts/minicluster-create-persistent
deleted file mode 100755
index 3b2db0a..0000000
--- a/fluxcloud/main/clouds/shared/scripts/minicluster-create-persistent
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-# This is a template that will be populated with variables by Flux-Cloud
-# We only run it to check if a MiniCluster is running. An apply is only
-# needed if the MiniCluster is not created yet.
-
-# Include shared helper scripts
-{% include "helpers.sh" %}
-
-NAMESPACE="{% if minicluster.namespace %}{{ minicluster.namespace }}{% else %}flux-operator{% endif %}"
-CRD="{{ crd }}"
-JOB="{{ minicluster.name }}"
-
-# Size -1 to account for certificate generator
-SIZE={{ size }}
-
-print_magenta " apply : ${CRD}"
-print_magenta " job : ${JOB}"
-
-is_installed kubectl
-
-# Create the namespace (ok if already exists)
-run_echo_allow_fail kubectl create namespace ${NAMESPACE}
-
-# Always cleanup a previous one so tokens don't get stale
-run_echo_allow_fail kubectl delete -f ${CRD}
-{% include "wait_for_cleanup.sh" %}
-
-# Ensure we have a MiniCluster of the right namespace running
-echo
-print_green "🌀️ Creating MiniCluster in ${NAMESPACE}"
-{% include "wait_for_all.sh" %}
-{% include "wait_for_flux_restful.sh" %}
diff --git a/fluxcloud/main/clouds/shared/scripts/minicluster-run b/fluxcloud/main/clouds/shared/scripts/minicluster-run
deleted file mode 100755
index b7f14ce..0000000
--- a/fluxcloud/main/clouds/shared/scripts/minicluster-run
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-
-# This is a template that will be populated with variables by Flux-Cloud
-# It used to be a script proper with getopt, but in practice this was
-# erroneous on different operating systems.
-
-# Include shared helper scripts
-{% include "helpers.sh" %}
-
-NAMESPACE="{% if minicluster.namespace %}{{ minicluster.namespace }}{% else %}flux-operator{% endif %}"
-CRD="{{ crd }}"
-JOB="{{ minicluster.name }}"
-LOGFILE="{{ logfile }}"
-
-print_magenta " apply : ${CRD}"
-print_magenta " job : ${JOB}"
-print_magenta "logfile : ${LOGFILE}"
-
-is_installed kubectl
-
-# Ensure we wait for the space to be cleaned up
-{% include "wait_for_cleanup.sh" %}
-
-# Create the namespace (ok if already exists)
-run_echo_allow_fail kubectl create namespace ${NAMESPACE}
-
-{% include "wait_for_broker.sh" %}
-
-# Get the name of the pods
-pods=($(kubectl get pods --selector=job-name=${JOB} --namespace ${NAMESPACE} --output=jsonpath='{.items[*].metadata.name}'))
-brokerpod=${pods[0]}
-
-# This will hang like this until the job finishes running
-echo
-print_green "kubectl -n ${NAMESPACE} logs ${brokerpod} -f > ${LOGFILE}"
-kubectl -n ${NAMESPACE} logs ${brokerpod} -f > ${LOGFILE}
-
-for exitcode in $(kubectl get -n ${NAMESPACE} pod --selector=job-name=${JOB} --output=jsonpath={.items...containerStatuses..state.terminated.exitCode}); do
- if [[ ${exitcode} -ne 0 ]]; then
- echo "Container in ${JOB} had nonzero exit code"
- fi
-done
-
-run_echo kubectl delete -f ${CRD}
diff --git a/fluxcloud/main/clouds/shared/scripts/wait_for_all.sh b/fluxcloud/main/clouds/shared/scripts/wait_for_all.sh
deleted file mode 100644
index ddf5cc7..0000000
--- a/fluxcloud/main/clouds/shared/scripts/wait_for_all.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-# Apply the job, get pods
-run_echo kubectl apply -f ${CRD}
-run_echo kubectl get -n ${NAMESPACE} pods
-
-# continue until we find the index-0 pod
-podsReady="false"
-
-echo
-print_blue "Waiting for MiniCluster of size ${SIZE} to be ready..."
-while [[ "${podsReady}" == "false" ]]; do
- echo -n "."
- sleep 2
- pods=$(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=name | wc -l)
- if [[ ${pods} -eq ${SIZE} ]]; then
- echo
- print_green "🌀️ All pods are running."
- podsReady="true"
- break
- fi
-done
diff --git a/fluxcloud/main/clouds/shared/scripts/wait_for_broker.sh b/fluxcloud/main/clouds/shared/scripts/wait_for_broker.sh
deleted file mode 100644
index 9335313..0000000
--- a/fluxcloud/main/clouds/shared/scripts/wait_for_broker.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-# Apply the job, get pods
-run_echo kubectl apply -f ${CRD}
-run_echo kubectl get -n ${NAMESPACE} pods
-
-# continue until we find the index-0 pod
-brokerPrefix="${JOB}-0"
-brokerReady="false"
-
-echo
-print_blue "Waiting for broker pod with prefix ${brokerPrefix} to be created..."
-while [[ "${brokerReady}" == "false" ]]; do
- echo -n "."
- sleep 2
- for pod in $(kubectl get pods --selector=job-name=${JOB} --namespace ${NAMESPACE} --output=jsonpath='{.items[*].metadata.name}'); do
- if [[ "${pod}" == ${brokerPrefix}* ]]; then
- echo
- print_green "🌀️ Broker pod is created."
- brokerReady="true"
- break
- fi
- done
-done
-
-# Now broker pod needs to be running
-echo
-print_blue "Waiting for broker pod with prefix ${brokerPrefix} to be running..."
-brokerReady="false"
-while [[ "${brokerReady}" == "false" ]]; do
- echo -n "."
-
- # TODO - we likely want to check for running OR completed, it's rare but sometimes they can complete too fast.
- for pod in $(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=jsonpath='{.items[*].metadata.name}'); do
- if [[ "${pod}" == ${brokerPrefix}* ]]; then
- echo
- print_green "🌀️ Broker pod is running."
- brokerReady="true"
- break
- fi
- done
-done
diff --git a/fluxcloud/main/clouds/shared/scripts/wait_for_cleanup.sh b/fluxcloud/main/clouds/shared/scripts/wait_for_cleanup.sh
deleted file mode 100644
index 466482f..0000000
--- a/fluxcloud/main/clouds/shared/scripts/wait_for_cleanup.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-echo
-podsCleaned="false"
-print_blue "Waiting for previous MiniCluster to be cleaned up..."
-while [[ "${podsCleaned}" == "false" ]]; do
- echo -n "."
- sleep 2
- state=$(kubectl get pods --namespace ${NAMESPACE} 2>&1)
- lines=$(echo $state | wc -l)
- if [[ ${lines} -eq 1 ]] && [[ "${state}" == *"No resources found in"* ]]; then
- echo
- print_green "🌀️ Previous pods are cleaned up."
- podsCleaned="true"
- break
- fi
-done
diff --git a/fluxcloud/main/clouds/shared/scripts/wait_for_flux_restful.sh b/fluxcloud/main/clouds/shared/scripts/wait_for_flux_restful.sh
deleted file mode 100644
index 6c27ba7..0000000
--- a/fluxcloud/main/clouds/shared/scripts/wait_for_flux_restful.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-
-echo
-brokerPod=""
-brokerPrefix="${JOB}-0"
-while [[ "${brokerPod}" == "" ]]; do
- for pod in $(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=jsonpath='{.items[*].metadata.name}'); do
- if [[ "${pod}" == ${brokerPrefix}* ]]; then
- echo
- brokerPod=${pod}
- break
- fi
- done
-done
-
-echo
-serverReady="false"
-print_blue "Waiting for Flux Restful API Server to be ready..."
-while [[ "${serverReady}" == "false" ]]; do
- echo -n "."
- sleep 2
- logs=$(kubectl logs --namespace ${NAMESPACE} ${brokerPod} | grep "Uvicorn running")
- retval=$?
- if [[ ${retval} -eq 0 ]]; then
- echo
- serverReady="true"
- print_green "🌀️ Flux RestFul API Server is Ready."
- break
- fi
-done
diff --git a/fluxcloud/main/experiment.py b/fluxcloud/main/experiment.py
index c50d8b7..44a65c8 100644
--- a/fluxcloud/main/experiment.py
+++ b/fluxcloud/main/experiment.py
@@ -8,7 +8,6 @@
import os
import shutil
-import jinja2
import jsonschema
import fluxcloud.defaults as defaults
@@ -22,7 +21,6 @@ class ExperimentSetup:
def __init__(
self,
experiments,
- template=None,
outdir=None,
validate=True,
cleanup=True,
@@ -34,21 +32,12 @@ def __init__(
An experiment setup is a light wrapper around a group of experiments.
"""
self.experiment_file = os.path.abspath(experiments)
- self.template = (
- os.path.abspath(template)
- if template is not None and os.path.exists(template)
- else None
- )
self.outdir = outdir
self.test = test
self.settings = settings.Settings
self.quiet = quiet
self.run_cleanup = cleanup
- # Show the user the template file
- if template:
- logger.debug(f"Using template {self.template}")
-
# Rewrite existing outputs
self.force = kwargs.get("force") or False
# Don't ask for confirmation to create/destroy
@@ -99,7 +88,7 @@ def prepare_matrices(self):
validate_experiments(self.spec)
# Sploot out into matrices
- matrices = expand_experiments(self.spec, self.outdir, self.template)
+ matrices = expand_experiments(self.spec, self.outdir)
if not matrices:
raise ValueError(
"No matrices generated. Did you include any empty variables in your matrix?"
@@ -134,11 +123,10 @@ class Experiment:
An experiment wrapper to make it easy to get variables in templates.
"""
- def __init__(self, experiment, outdir=None, template=None):
+ def __init__(self, experiment, outdir=None):
self.experiment = experiment
self.settings = settings.Settings
self._outdir = outdir
- self.template = template or defaults.default_minicluster_template
@property
def outdir(self):
@@ -191,31 +179,60 @@ def iter_jobs(self):
yield size, jobname, job
- def get_persistent_variables(self, size, required=None):
+ def get_submit_miniclusters(self, size):
"""
- Get persistent variables that should be used across the MiniCluster
+ Return Miniclusters organized by unique sizes and containers
+
+ For each, we return a faux job that includes (potentially) the job volumes.
"""
- jobvars = {}
- for _, job in self.jobs.items():
- # Skip jobs targeted for a different size
+ # A faux job is provided that includes all volumes
+ images = {}
+ for name, job in self.jobs.items():
if "size" in job and job["size"] != size:
continue
-
- for key, value in job.items():
- if key not in jobvars or (key in jobvars and jobvars[key] == value):
- jobvars[key] = value
- continue
- logger.warning(
- f'Inconsistent job variable between MiniCluster jobs: {value} vs. {jobvars["value"]}'
- )
-
- # If we get here and we don't have an image
- for req in required or []:
- if req not in jobvars:
- raise ValueError(
- f'Submit requires a "{req}" field under at least one job spec to create the MiniCluster.'
- )
- return jobvars
+ if "image" not in job:
+ logger.warning(f"Job {name} is missing an image and cannot be run.")
+
+ # Add the image if we don't know about it already
+ # This is where we can define shared minicluster container attributes (the job)
+ if job["image"] not in images:
+ images[job["image"]] = copy.deepcopy(job)
+
+ # Update the job and warn the user for differences
+ else:
+ for k, v in job.items():
+ # Skip the command
+ if k == "command":
+ continue
+
+ # This shared job for the image doesn't have the attribute defined yet
+ if k not in images[job["image"]]:
+ images[job["image"]][k] = v
+ continue
+ current = images[job["image"]][k]
+
+ # If it's a dictionary, just update
+ if isinstance(current, dict) and isinstance(v, dict):
+ images[job["image"]][k].update(v)
+
+ # Otherwise give a warning we won't be updating
+ elif current != v:
+ logger.warning(
+ f"Found different definition of {k}, {v}. Using first discovered {current}"
+ )
+
+ logger.debug(f"Job experiments file generated {len(images)} MiniCluster(s).")
+
+ # Prepare a MiniCluster and job for each image
+ for image in images:
+ minicluster = copy.deepcopy(self.minicluster)
+ minicluster["size"] = size
+ job = images[image]
+
+ # A shared MiniCluster starts with no command to start flux restful
+ if "command" in job:
+ del job["command"]
+ yield minicluster, job
@property
def script_dir(self):
@@ -238,15 +255,6 @@ def get_script(self, name, cloud, render_kwargs=None, ext="sh", suffix=""):
utils.mkdir_p(outdir)
return script.render(outfile=outfile, **render_kwargs)
- def get_shared_script(self, name, render_kwargs=None, suffix="", ext="sh"):
- """
- Get a named shared script
- """
- render_kwargs = render_kwargs or {}
- return self.get_script(
- name, cloud="shared", render_kwargs=render_kwargs, suffix=suffix, ext=ext
- )
-
def cleanup(self):
"""
Cleanup the scripts directory for the experiment!
@@ -255,36 +263,6 @@ def cleanup(self):
logger.debug(f"Cleaning up {self.script_dir}")
shutil.rmtree(self.script_dir)
- def generate_crd(self, job, minicluster_size):
- """
- Generate a custom resource definition for the experiment
- """
- template = jinja2.Template(utils.read_file(self.template))
- experiment = copy.deepcopy(self.experiment)
-
- # If the experiment doesn't define a minicluster, add our default
- if "minicluster" not in experiment:
- experiment["minicluster"] = self.settings.minicluster
-
- # Update minicluster size to the one we want
- experiment["minicluster"]["size"] = minicluster_size
-
- if "jobs" in experiment:
- del experiment["jobs"]
- experiment["job"] = job
- result = template.render(**experiment).strip(" ")
- logger.debug(result)
-
- # Write to output directory
- outfile = os.path.join(
- self.script_dir, f"minicluster-size-{minicluster_size}.yaml"
- )
- outdir = os.path.dirname(outfile)
- if not os.path.exists(outdir):
- logger.info(f"Creating output directory for scripts {outdir}")
- utils.mkdir_p(outdir)
- return utils.write_file(result, outfile)
-
@property
def jobs(self):
return self.experiment.get("jobs", {})
@@ -325,10 +303,12 @@ def is_run(self):
return False
return True
- def check_job_run(self, job, size):
+ def check_job_run(self, job, size, image=None):
"""
Determine if a job is marked for a MiniCluster size.
"""
+ if "image" in job and image is not None and job["image"] != image:
+ return False
if "sizes" in job and size not in job["sizes"]:
return False
if "size" in job and job["size"] != size:
@@ -339,6 +319,27 @@ def check_job_run(self, job, size):
return False
return True
+ def save_file(self, obj, filename, is_json=False):
+ """
+ Save a json dump of something to a filename in the experiment directory.
+ """
+ experiment_dir = self.root_dir
+ save_file = os.path.join(experiment_dir, ".scripts", filename)
+ save_dir = os.path.dirname(save_file)
+ if not os.path.exists(save_dir):
+ utils.mkdir_p(save_dir)
+ if is_json:
+ utils.write_json(obj, save_file)
+ else:
+ utils.write_file(obj, save_file)
+ return save_file
+
+ def save_json(self, obj, filename):
+ """
+ Save a json dump of something to a filename in the experiment directory.
+ """
+ return self.save_file(obj, filename, is_json=True)
+
def save_metadata(self, times, info=None):
"""
Save experiment metadata, loading an existing meta.json, if present.
@@ -421,8 +422,17 @@ def minicluster(self):
minicluster = self.experiment.get("minicluster") or self.settings.minicluster
if "namespace" not in minicluster or not minicluster["namespace"]:
minicluster["namespace"] = defaults.default_namespace
+ if "size" not in minicluster:
+ minicluster["size"] = [self.experiment.get("size")]
return minicluster
+ @property
+ def minicluster_namespace(self):
+ """
+ Get mini cluster namespace
+ """
+ return self.minicluster["namespace"]
+
@property
def machine(self):
return self.experiment.get("machine") or self.settings.google["machine"]
@@ -455,7 +465,7 @@ def kubernetes_version(self):
)
-def expand_experiments(experiments, outdir, template=None):
+def expand_experiments(experiments, outdir):
"""
Given a valid experiments.yaml, expand out into experiments
"""
@@ -484,7 +494,7 @@ def expand_experiments(experiments, outdir, template=None):
# Put in final matrix form
final = []
for entry in matrix:
- final.append(Experiment(entry, outdir, template))
+ final.append(Experiment(entry, outdir))
return final
diff --git a/fluxcloud/main/schemas.py b/fluxcloud/main/schemas.py
index 8556347..5902448 100644
--- a/fluxcloud/main/schemas.py
+++ b/fluxcloud/main/schemas.py
@@ -24,14 +24,14 @@
"properties": {
"command": {"type": "string"},
"repeats": {"type": "number"},
- "workdir": {"type": "string"},
+ "working_dir": {"type": "string"},
"image": {"type": "string"},
"machine": {"type": "string"},
"machines": {"type": "array", "items": {"type": "string"}},
"size": {"type": "number"},
"sizes": {"type": "array", "items": {"type": "number"}},
},
- "required": ["command"],
+ "required": ["command", "image"],
}
jobs_properties = {
@@ -187,6 +187,9 @@
"required": ["size"],
},
},
+ "patternProperties": {
+ "x-*": {"type": "object"},
+ },
"additionalProperties": False,
}
diff --git a/fluxcloud/main/template.py b/fluxcloud/main/template.py
new file mode 100644
index 0000000..e176500
--- /dev/null
+++ b/fluxcloud/main/template.py
@@ -0,0 +1,91 @@
+# Copyright 2023 Lawrence Livermore National Security, LLC and other
+# This is part of Flux Framework. See the COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+experiment_base = """
+# Flux MiniCluster experiment attributes
+minicluster:
+ name: my-job
+ namespace: flux-operator
+ # Each of these sizes will be brought up and have commands run across it
+ # They must be smaller than the Kubernetes cluster size or not possible to run!
+ size: [2, 4]
+
+# Under jobs should be named jobs (output organized by name) where
+# each is required to have a command and image. Repeats is the number
+# of times to run each job
+jobs:
+ reaxc-hns:
+ command: 'lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite'
+ image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
+ repeats: 5
+ working_dir: /home/flux/examples/reaxff/HNS
+ sleep:
+ command: 'sleep 5'
+ image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
+ repeats: 5
+ working_dir: /home/flux/examples/reaxff/HNS
+ hello-world:
+ command: 'echo hello world'
+ image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0
+ repeats: 5
+ working_dir: /home/flux/examples/reaxff/HNS
+"""
+
+google_experiment_template = f"""
+matrix:
+ size: [4]
+
+ # This is a Google Cloud machine
+ machine: [n1-standard-1]
+
+variables:
+ # Customize zone just for this experiment
+ # otherwise defaults to your settings.yml
+ zone: us-central1-a
+
+{experiment_base}
+"""
+
+minikube_experiment_template = f"""
+# This is intended for MiniKube, so no machine needed
+matrix:
+
+ # This is the size of the MiniKube cluster (aka Kubernetes cluster) to bring up
+ size: [4]
+
+{experiment_base}
+"""
+
+aws_experiment_template = f"""
+matrix:
+
+ # This is the size of the EKS cluster (aka Kubernetes cluster) to bring up
+ size: [4]
+
+ # This is an EC2 machine
+ machine: [m5.large]
+
+variables:
+ # Enable private networking
+ private_networking: false
+
+ # Enable efa (requires efa also set under the container limits)
+ efa_enabled: false
+
+ # Add a custom placement group name to your workers managed node group
+ placement_group: eks-efa-testing
+
+ # Customize region just for this experiment
+ region: us-east-2
+
+ # Customize availability zones for this experiment
+ availability_zones: [us-east-1a, us-east-1b]
+
+ # Important for instance types only in one zone (hpc instances)
+ # Select your node group availability zone:
+ node_group_availability_zone: us-east-2b
+
+{experiment_base}
+"""
diff --git a/fluxcloud/minicluster-template.yaml b/fluxcloud/minicluster-template.yaml
deleted file mode 100644
index ede959d..0000000
--- a/fluxcloud/minicluster-template.yaml
+++ /dev/null
@@ -1,42 +0,0 @@
-apiVersion: flux-framework.org/v1alpha1
-kind: MiniCluster
-
-metadata:
- name: {{ minicluster.name }}
- namespace: {{ minicluster.namespace }}
-spec:
- # localDeploy needs to be false
- localDeploy: {% if minicluster.local_deploy %}true{% else %}false{% endif %}
-
- # Number of pods to create for MiniCluster
- size: {{ minicluster.size }}
- tasks: {% if job.tasks %}{{ job.tasks }}{% else %}1{% endif %}
-
- # Disable verbose output
- {% if job.quiet or job.timed %}logging:
- {% if job.quiet %}quiet: true{% endif %}
- {% if job.timed %}timed: true{% endif %}{% endif %}
-
- # Optional credentials if running the flux restful api
- {% if job.token or job.user %}fluxRestful:
- {% if job.token %}token: "{{ job.token }}"{% endif %}
- {% if job.user %}username: "{{ job.user }}"{% endif %}{% endif %}
-
- # TODO add pod resources, if needed
- containers:
- - image: {{ job.image }}
- {% if job.workdir %}workingDir: {{ job.workdir }}{% endif %}
- {% if job.command %}command: {{ job.command }}{% endif %}
- {% if job.flux_option_flags %}fluxOptionFlags: "-ompi=openmpi@5"{% endif %}
- cores: {% if job.cores %}{{ job.cores }}{% else %}1{% endif %}
- {% if job.limits or job.resources %}resources:{% endif %}
- {% if job.limits %}limits:
- {% for limit in job.limits %}
- {{ limit[0] }}: {{ limit[1] }}
- {% endfor %}{% endif %}
- {% if job.requests %}requests:
- {% for limit in job.requests %}
- {{ limit[0] }}: {{ limit[1] }}
- {% endfor %}{% endif %}
- {% if job.pre_command %}preCommand: |
- {{ job.pre_command }}{% endif %}
diff --git a/fluxcloud/tests/__init__.py b/fluxcloud/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/fluxcloud/tests/helpers.py b/fluxcloud/tests/helpers.py
new file mode 100644
index 0000000..b9f8330
--- /dev/null
+++ b/fluxcloud/tests/helpers.py
@@ -0,0 +1,47 @@
+#!/usr/bin/python
+
+# Copyright (C) 2022 Vanessa Sochat.
+
+# This Source Code Form is subject to the terms of the
+# Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed
+# with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import os
+import shlex
+import shutil
+
+from fluxcloud.client import get_parser
+from fluxcloud.main.client import ExperimentClient
+from fluxcloud.main import get_experiment_client
+
+here = os.path.dirname(os.path.abspath(__file__))
+root = os.path.dirname(here)
+
+
+def parse_args(argstr):
+ """
+ Given an argument string for a test, parse it.
+ """
+ parser = get_parser()
+ parser.prog = "fluxcloud"
+ args = parser.parse_args(shlex.split(argstr))
+ args.debug = True
+ return args
+
+
+def get_settings(tmpdir):
+ """
+ Create a temporary settings file
+ """
+ settings_file = os.path.join(root, "settings.yml")
+ new_settings = os.path.join(tmpdir, "settings.yml")
+ shutil.copyfile(settings_file, new_settings)
+ return new_settings
+
+
+def init_client(tmpdir, cloud=None):
+ """
+ Get a common client for some container technology and module system
+ """
+ new_settings = get_settings(tmpdir)
+ return get_experiment_client(cloud, debug=True, settings_file=new_settings)
\ No newline at end of file
diff --git a/fluxcloud/tests/test_examples.py b/fluxcloud/tests/test_examples.py
new file mode 100644
index 0000000..b5d2e17
--- /dev/null
+++ b/fluxcloud/tests/test_examples.py
@@ -0,0 +1,181 @@
+#!/usr/bin/python
+
+# Copyright 2022-2023 Lawrence Livermore National Security, LLC and other
+# This is part of Flux Framework. See the COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from glob import glob
+import os
+
+import fluxcloud.utils as utils
+from fluxcloud.main.experiment import ExperimentSetup
+
+from .helpers import init_client
+
+here = os.path.abspath(os.path.dirname(__file__))
+root = os.path.dirname(os.path.dirname(here))
+
+def check_lammps(minicluster_file):
+ """
+ Checks for examples that run lammps.
+ """
+ expected_outdir = os.path.dirname(os.path.dirname(minicluster_file))
+ for out in utils.recursive_find(expected_outdir, "log.out"):
+ content = utils.read_file(out)
+ assert "Total wall time" in content
+ assert "LAMMPS" in content
+
+
+def _test_example(dirname, tmp_path, check, test_apply=True):
+ """
+ Shared function to test an example in a dirname, with a check function
+ """
+ client = init_client(str(tmp_path), cloud="minikube")
+ experiment_file = os.path.join(
+ root, "examples", "minikube", dirname, "experiments.yaml"
+ )
+
+ # Create a new experiment directory to work from
+ experiment_dir = os.path.join(tmp_path, "experiment")
+ outdir = os.path.join(experiment_dir, "data")
+ utils.mkdir_p(experiment_dir)
+ setup = ExperimentSetup(experiment_file, outdir=outdir, force_cluster=True, quiet=False)
+
+ # Select the first (only) experiment!
+ experiment = setup.matrices[0]
+ client.up(setup, experiment=experiment)
+
+ # Expected output directory
+ expected_outdir = os.path.join(outdir, f"k8s-size-{experiment.size}-local")
+ expected_scripts = os.path.join(expected_outdir, ".scripts")
+
+ def shared_checks(info=True):
+ assert os.path.exists(expected_outdir)
+ assert "meta.json" in os.listdir(expected_outdir)
+ meta = utils.read_json(os.path.join(expected_outdir, "meta.json"))
+ assert meta["times"]
+ assert meta["minicluster"]
+ assert meta["jobs"]
+
+ # Info is only present for submit
+ if info:
+ assert meta["info"]
+
+ # Run the experiment in the working directory
+ with utils.working_dir(experiment_dir):
+ # This won't work in the CI it seems
+ client.submit(setup, experiment, interactive=False)
+ shared_checks()
+
+ files = glob(os.path.join(expected_scripts, "minicluster-size*.json"))
+ minicluster_file = files[0]
+ print(f'Found minicluster metadata file {minicluster_file}')
+
+ check(minicluster_file, experiment)
+
+ # Now do the same for apply
+ # shutil.rmtree(expected_outdir)
+ if test_apply:
+ client.apply(setup, experiment, interactive=False)
+ shared_checks(info=False)
+ check(minicluster_file, experiment)
+
+ client.down(setup, experiment=experiment)
+
+
+def test_minicluster_logging(tmp_path):
+ """
+ Ensure that the logging example returns expected logging params set
+ in the minicluster output.
+ """
+
+ def check(minicluster_file, experiment):
+ assert os.path.exists(minicluster_file)
+
+ # Assert that the logging spec matches
+ minicluster = utils.read_json(minicluster_file)
+ for level, value in experiment.minicluster["logging"].items():
+ assert level in minicluster["spec"]["logging"]
+ assert minicluster["spec"]["logging"][level] == value
+
+ check_lammps(minicluster_file)
+
+ # Run the example for submit and apply, with check
+ _test_example("logging", tmp_path, check)
+
+
+def test_minicluster_volumes(tmp_path):
+ """
+ Ensure that the volumes example produces the expected Minicluster spec
+ """
+
+ def check(minicluster_file, experiment):
+ assert os.path.exists(minicluster_file)
+
+ # Assert that the logging spec matches
+ minicluster = utils.read_json(minicluster_file)
+ assert "volumes" in minicluster["spec"]
+
+ check_lammps(minicluster_file)
+
+ # And container level volumes
+ assert "volumes" in minicluster["spec"]["containers"][0]
+ container_volumes = minicluster["spec"]["containers"][0]["volumes"]
+
+ # This checks the cluster level volumes
+ for name, volume in experiment.minicluster["volumes"].items():
+ assert name in minicluster["spec"]["volumes"]
+ generated_volume = minicluster["spec"]["volumes"][name]
+
+ for attr, value in volume.items():
+ if attr in generated_volume:
+ assert value == generated_volume[attr]
+
+ assert name in container_volumes
+
+ for vname, containervol in experiment.jobs["reaxc-hns-1"][
+ "volumes"
+ ].items():
+ assert vname in container_volumes
+ for attr, val in containervol.items():
+ assert attr in container_volumes[vname]
+ assert container_volumes[vname][attr] == val
+
+ # Run the example for submit and apply, with check
+ _test_example("volumes", tmp_path, check)
+
+
+def test_osu_benchmarks(tmp_path):
+ """
+ Ensure we can explicitly specify resources
+ """
+ def check(minicluster_file, experiment):
+ assert os.path.exists(minicluster_file)
+
+
+ # Run the example for submit and apply, with check
+ _test_example("osu-benchmarks", tmp_path, check, test_apply=False)
+
+
+def test_minicluster_resources(tmp_path):
+ """
+ Ensure that the resources example works as expected.
+ """
+
+ def check(minicluster_file, experiment):
+ assert os.path.exists(minicluster_file)
+
+ # Assert that the logging spec matches
+ minicluster = utils.read_json(minicluster_file)
+ check_lammps(minicluster_file)
+
+ assert "resources" in minicluster["spec"]["containers"][0]
+ resources = minicluster["spec"]["containers"][0]["resources"]
+
+ for rtype, rvalue in experiment.jobs["reaxc-hns-1"]["resources"].items():
+ assert rtype in resources
+ assert resources[rtype] == rvalue
+
+ # Run the example for submit and apply, with check
+ _test_example("resources", tmp_path, check)
diff --git a/fluxcloud/tests/test_settings.py b/fluxcloud/tests/test_settings.py
new file mode 100644
index 0000000..9d0b162
--- /dev/null
+++ b/fluxcloud/tests/test_settings.py
@@ -0,0 +1,50 @@
+#!/usr/bin/python
+
+# Copyright 2022 Lawrence Livermore National Security, LLC and other
+# This is part of Flux Framework. See the COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+
+import pytest
+
+from fluxcloud.main.settings import UserSettings
+
+here = os.path.dirname(os.path.abspath(__file__))
+root = os.path.dirname(here)
+
+from .helpers import get_settings # noqa
+
+
+def test_invalid_properties(tmp_path):
+ """
+ Test invalid setting property
+ """
+ settings = UserSettings(get_settings(tmp_path))
+ assert settings.config_editor == "vim"
+ settings.set("config_editor", "code")
+ with pytest.raises(SystemExit):
+ settings.set("invalid_key", "invalid_value")
+ assert settings.config_editor == "code"
+
+
+def test_set_get(tmp_path):
+ """
+ Test variable set/get
+ """
+ settings = UserSettings(get_settings(tmp_path))
+
+ zone = "us-central1-a"
+ assert settings.google["zone"] == zone
+
+ # Cannot add invalid parameter
+ with pytest.raises(SystemExit):
+ settings.set("cache_only", True)
+
+ found_zone = settings.get("google:zone")
+ assert isinstance(found_zone, str)
+ assert zone == found_zone
+
+ # Just check the first in the list
+ assert settings.google["zone"] == zone
diff --git a/fluxcloud/tests/test_utils.py b/fluxcloud/tests/test_utils.py
new file mode 100644
index 0000000..b10c97d
--- /dev/null
+++ b/fluxcloud/tests/test_utils.py
@@ -0,0 +1,133 @@
+#!/usr/bin/python
+
+# Copyright (C) 2021-2022 Vanessa Sochat.
+
+# This Source Code Form is subject to the terms of the
+# Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed
+# with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import json
+import os
+import shutil
+
+import pytest
+
+import fluxcloud.utils as utils
+
+
+def test_write_read_files(tmp_path):
+ """
+ test_write_read_files will test the functions write_file and read_file
+ """
+ print("Testing utils.write_file...")
+
+ tmpfile = str(tmp_path / "written_file.txt")
+ assert not os.path.exists(tmpfile)
+ utils.write_file("hello!", tmpfile)
+ assert os.path.exists(tmpfile)
+
+ print("Testing utils.read_file...")
+ content = utils.read_file(tmpfile)
+ assert content == "hello!"
+
+
+def test_write_bad_json(tmp_path):
+ bad_json = {"Wakkawakkawakka'}": [{True}, "2", 3]}
+ tmpfile = str(tmp_path / "json_file.txt")
+ assert not os.path.exists(tmpfile)
+ with pytest.raises(TypeError):
+ utils.write_json(bad_json, tmpfile)
+
+
+def test_write_json(tmp_path):
+ good_json = {"Wakkawakkawakka": [True, "2", 3]}
+ tmpfile = str(tmp_path / "good_json_file.txt")
+
+ assert not os.path.exists(tmpfile)
+ utils.write_json(good_json, tmpfile)
+ with open(tmpfile, "r") as f:
+ content = json.loads(f.read())
+ assert isinstance(content, dict)
+ assert "Wakkawakkawakka" in content
+ content = utils.read_json(tmpfile)
+ assert "Wakkawakkawakka" in content
+
+
+def test_check_install():
+ """
+ check install is used to check if a particular software is installed.
+ If no command is provided, singularity is assumed to be the test case
+ """
+ print("Testing utils.check_install")
+
+ is_installed = utils.check_install("echo")
+ assert is_installed
+ is_not_installed = utils.check_install("fakesoftwarename")
+ assert not is_not_installed
+
+
+def test_get_installdir():
+ """
+ Get install directory should return the base of where fluxcloud
+ is installed
+ """
+ print("Testing utils.get_installdir")
+
+ whereami = utils.get_installdir()
+ print(whereami)
+ assert whereami.endswith("fluxcloud")
+
+
+def test_get_file_hash():
+ print("Testing utils.get_file_hash")
+ here = os.path.dirname(os.path.abspath(__file__))
+ testdata = os.path.join(here, "testdata", "hashtest.txt")
+ assert (
+ utils.get_file_hash(testdata)
+ == "6bb92117bded3da774363713657a629a9f38eac2e57cd47e1dcda21d3445c67d"
+ )
+ assert utils.get_file_hash(testdata, "md5") == "e5d376ca96081dd561ff303c3a631fd5"
+
+
+def test_copyfile(tmp_path):
+ print("Testing utils.copyfile")
+ original = str(tmp_path / "location1.txt")
+ dest = str(tmp_path / "location2.txt")
+ print(original)
+ print(dest)
+ utils.write_file("CONTENT IN FILE", original)
+ utils.copyfile(original, dest)
+ assert os.path.exists(original)
+ assert os.path.exists(dest)
+
+
+def test_get_tmpdir_tmpfile():
+ print("Testing utils.get_tmpdir, get_tmpfile")
+ tmpdir = utils.get_tmpdir()
+ assert os.path.exists(tmpdir)
+ assert os.path.basename(tmpdir).startswith("fluxcloud")
+ shutil.rmtree(tmpdir)
+ tmpdir = utils.get_tmpdir(prefix="name")
+ assert os.path.basename(tmpdir).startswith("name")
+ shutil.rmtree(tmpdir)
+ tmpfile = utils.get_tmpfile()
+ assert "fluxcloud" in tmpfile
+ os.remove(tmpfile)
+ tmpfile = utils.get_tmpfile(prefix="pancakes")
+ assert "pancakes" in tmpfile
+ os.remove(tmpfile)
+
+
+def test_mkdir_p(tmp_path):
+ print("Testing utils.mkdir_p")
+ dirname = str(tmp_path / "input")
+ result = os.path.join(dirname, "level1", "level2", "level3")
+ utils.mkdir_p(result)
+ utils.mkdirp([result])
+ assert os.path.exists(result)
+
+
+def test_print_json():
+ print("Testing utils.print_json")
+ result = utils.print_json({1: 1})
+ assert result == '{\n "1": 1\n}'
diff --git a/fluxcloud/tests/testdata/hashtest.txt b/fluxcloud/tests/testdata/hashtest.txt
new file mode 100644
index 0000000..e85812c
--- /dev/null
+++ b/fluxcloud/tests/testdata/hashtest.txt
@@ -0,0 +1,2 @@
+This is a file that exists purely to test the functions to generate
+hashes. Please don't modify, thank you!
diff --git a/fluxcloud/utils/__init__.py b/fluxcloud/utils/__init__.py
index b079912..10c9291 100644
--- a/fluxcloud/utils/__init__.py
+++ b/fluxcloud/utils/__init__.py
@@ -18,7 +18,7 @@
write_json,
write_yaml,
)
-from .misc import chunks, get_hash, mb_to_bytes, print_bytes, slugify
+from .misc import chunks, get_hash, mb_to_bytes, print_bytes, slugify, working_dir
from .terminal import (
check_install,
confirm_action,
diff --git a/fluxcloud/utils/misc.py b/fluxcloud/utils/misc.py
index acfcc9c..0bee595 100644
--- a/fluxcloud/utils/misc.py
+++ b/fluxcloud/utils/misc.py
@@ -4,6 +4,21 @@
# SPDX-License-Identifier: Apache-2.0
import copy
+import os
+from contextlib import contextmanager
+
+
+@contextmanager
+def working_dir(path):
+ """
+ Sets the cwd within the context
+ """
+ here = os.getcwd()
+ try:
+ os.chdir(path)
+ yield
+ finally:
+ os.chdir(here)
def chunks(listing, chunk_size):
diff --git a/fluxcloud/version.py b/fluxcloud/version.py
index c3655ca..409163d 100644
--- a/fluxcloud/version.py
+++ b/fluxcloud/version.py
@@ -1,7 +1,7 @@
# Copyright 2022-2023 Lawrence Livermore National Security, LLC
# SPDX-License-Identifier: Apache-2.0
-__version__ = "0.1.19"
+__version__ = "0.2.0"
AUTHOR = "Vanessa Sochat"
EMAIL = "vsoch@users.noreply.github.com"
NAME = "flux-cloud"
@@ -14,6 +14,8 @@
# Global requirements
INSTALL_REQUIRES = (
+ ("kubernetes", {"min_version": None}),
+ ("fluxoperator", {"min_version": "0.0.12"}),
("ruamel.yaml", {"min_version": None}),
("jsonschema", {"min_version": None}),
("requests", {"min_version": None}),
diff --git a/tests/test.sh b/tests/test.sh
index ca485cb..7c1ac3c 100755
--- a/tests/test.sh
+++ b/tests/test.sh
@@ -37,39 +37,10 @@ echo "flux-cloud run --cloud minikube --output ${output} --force-cluster"
flux-cloud run --cloud minikube --output ${output} --force-cluster
retval=$?
-if [[ "${retval}" != "0" ]]; then
+if [[ ${retval} -ne 0 ]]; then
echo "Issue running Flux Cloud, return value ${retval}"
exit ${retval}
fi
-# Check output
-for filename in $(find ./data -type f -print); do
- echo "Checking $filename";
- filebase=$(basename $filename)
-
- # Don't check these files, likely to change
- if [[ "${filebase}" == "flux-operator.yaml" ]]; then
- continue
- fi
- if [[ "${filebase}" == "nodes-size"* ]]; then
- continue
- fi
- suffix=$(echo ${filename:7})
- outfile="$output/$suffix"
- if [[ ! -e "${outfile}" ]]; then
- echo "Expected output $outfile does not exist."
- exit 1
- fi
- # Check the length
- actual=$(cat $filename | wc -l)
- found=$(cat $outfile | wc -l)
-
- if [[ "${actual}" != "${found}" ]]; then
- echo "Incorrect output length found for ${filename}: expected ${actual} vs found ${found}"
- cat ${outfile}
- exit 1
- fi
-done
-
echo ${output}
rm -rf ${output}