diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3739a40..04aa71f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -24,13 +24,74 @@ jobs: pip install -r .github/dev-requirements.txt pre-commit run --all-files - test-runs: + test-python: + runs-on: ubuntu-latest + steps: + - name: Clone the code + uses: actions/checkout@v3 + + - name: Install flux-cloud + run: | + conda create --quiet --name fc jinja2 + export PATH="/usr/share/miniconda/bin:$PATH" + source activate fc + pip install .[all] + + - name: Test Python + run: | + export PATH="/usr/share/miniconda/bin:$PATH" + source activate fc + export SHELL=/bin/bash + # This will bring MiniKube up/down + pytest -xs ./fluxcloud/tests/test_settings.py + pytest -xs ./fluxcloud/tests/test_utils.py + + test-examples: runs-on: ubuntu-latest strategy: fail-fast: false matrix: - test: ["lammps"] + test: ["test_minicluster_logging", "test_minicluster_volumes", + "test_minicluster_resources"] + steps: + - name: Clone the code + uses: actions/checkout@v3 + + - name: Setup Go + uses: actions/setup-go@v3 + with: + go-version: ^1.18 + + - name: Install flux-cloud + run: | + conda create --quiet --name fc jinja2 + export PATH="/usr/share/miniconda/bin:$PATH" + source activate fc + pip install .[all] + pip install kubernetes + - name: Start minikube + uses: medyagh/setup-minikube@697f2b7aaed5f70bf2a94ee21a4ec3dde7b12f92 # v0.0.9 + + - name: Test Example + env: + test: ${{ matrix.test }} + run: | + export PATH="/usr/share/miniconda/bin:$PATH" + source activate fc + export SHELL=/bin/bash + eval $(minikube -p minikube docker-env) + # We need to delete the minikube cluster to bring it up again + minikube delete + # This will bring MiniKube up/down + pytest -xs ./fluxcloud/tests/test_examples.py::${test} + + test-runs: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + command: [["apply", "lammps"], ["submit", "./examples/minikube/basic"]] steps: - name: Clone the code 
uses: actions/checkout@v3 @@ -50,9 +111,10 @@ jobs: - name: Start minikube uses: medyagh/setup-minikube@697f2b7aaed5f70bf2a94ee21a4ec3dde7b12f92 # v0.0.9 - - name: Test ${{ matrix.test }} + - name: Test ${{ matrix.command }} env: - name: ${{ matrix.test }} + name: ${{ matrix.command[1] }} + if: (matrix.command[0] == 'apply') run: | export PATH="/usr/share/miniconda/bin:$PATH" source activate fc @@ -61,3 +123,18 @@ jobs: # We need to delete the minikube cluster to bring it up again minikube delete /bin/bash ./tests/test.sh ${name} + + - name: Test ${{ matrix.command }} + env: + workdir: ${{ matrix.command[1] }} + if: (matrix.command[0] == 'submit') + run: | + export PATH="/usr/share/miniconda/bin:$PATH" + source activate fc + export SHELL=/bin/bash + eval $(minikube -p minikube docker-env) + minikube delete + cd ${workdir} + flux-cloud up --cloud minikube --force-cluster + flux-cloud --debug submit --non-interactive + flux-cloud down --cloud minikube diff --git a/.gitignore b/.gitignore index 50fc771..50592df 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ env .env dist __pycache__ +examples/**/data +examples/**/_data diff --git a/CHANGELOG.md b/CHANGELOG.md index aa1b003..b231b2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,10 @@ and **Merged pull requests**. Critical items to know are: The versions coincide with releases on pip. Only major versions will be released as tags on Github. ## [0.0.x](https://github.com/converged-computing/flux-cloud/tree/main) (0.0.x) + - refactor flux submit and apply to use fluxoperator Python SDK (0.2.0) + - This reduces scripts in output folder, but is a good tradeoff for fewer errors + - remove "ui" command, flux-cloud is intended mostly for automation + - command and image will always be required. - fix bash script bugs (0.1.19) - support for node group level aws avail. 
zones, save times on each experiment apply (0.1.18) - data should be namespaced by cloud type (so multiple experiments can be run alongside) (0.1.17) diff --git a/README.md b/README.md index 4e9ae0c..8cd28ea 100644 --- a/README.md +++ b/README.md @@ -23,11 +23,6 @@ It will be expanded as needed. Code is under development and likely to change! In the meantime, for early documentation, see our ⭐️ [Documentation](https://converged-computing.github.io/flux-cloud/) ⭐️ -## TODO - - - test for list of experiments - - cloud-select could estimate the cost? - - run and add more cluster examples ## 😁️ Contributors 😁️ diff --git a/docs/getting_started/commands.md b/docs/getting_started/commands.md index 863e9ae..0c7f846 100644 --- a/docs/getting_started/commands.md +++ b/docs/getting_started/commands.md @@ -1,11 +1,104 @@ # Commands -The following commands are provided by Flux Cloud. For running jobs, you can either do: +Welcome to the commands section! You can learn the details of each command below, or +check out an [example](examples.md) or [cloud tutorial](../tutorials/index.md). +The general steps you want to take are: -- **apply**/**run**: A single/multi job submission intended for different containers to re-create pods each time. -- **batch**/**submit**: A single/multi job submission intended for a common container base where we use the same set of pods. +1. Generate or find an `experiments.yaml` configuration. +2. Decide if you want to use `submit` or `apply` +3. Create the cluster, run experiments, and clean up. -Both are described in the following sections. +If you don't want to use an existing example, see [experiment init](#init) for how to create an `experiments.yaml` from scratch. + +> What's the difference between submit and apply? + +For `apply`, we are running one job per Minicluster (the Flux Operator custom resource definition). 
This means +we bring up an entire set of pods for each container (each entry under "jobs" in your experiments.yaml), +run the single job directly with `flux start -> flux submit` to provide the command to the broker, and then +when it finishes the container will exit and the job will clean up. This approach likely is suited to fewer jobs +that are longer running, and if you want to see output appear as it's available (we stream the log from the broker pod). +For `apply` we also skip creating the [Flux RESTFul API](https://github.com/flux-framework/flux-restful-api) server, +so it's one less dependency to worry about, and you also don't need to think about exposing an API or users. + +For `submit`, we take advantage of Flux as a scheduler, bringing up the fewest number of MiniClusters we can +derive based on the unique containers and sizes in your `experiments.yaml`. This means that, for each unique +set, we bring up one MiniCluster, and then submit all your jobs at once, allowing Flux to act as a scheduler. +We poll the server every 30 seconds to get an update on running jobs, and when they are all complete, jobs +output and results are saved. This approach is more ideal for many smaller jobs, as the MiniClusters are +only brought up once (and you don't need to wait for pods to go up and down for each job). The cons of this +approach are getting logs at the end, unless you decide to interact with the Flux RESTFul API on your own +earlier. + +Next, read about how to use these commands in detail. + +## experiment + +### init + +When you want to create a new experiment, do: + +```bash +$ mkdir -p my-experiment +$ cd my-experiment + +# Create a new experiment for minikube +$ flux-cloud experiment init --cloud minikube +$ flux-cloud experiment init --cloud aws +$ flux-cloud experiment init --cloud google +``` + +This will create an `experiments.yaml` template with custom variables for your +cloud of choice, and robustly commented. + +
+ +View Example Output of flux-cloud experiment init + +```bash +$ flux-cloud experiment init --cloud google > experiments.yaml +``` +```yaml +matrix: + size: [4] + + # This is a Google Cloud machine + machine: [n1-standard-1] + +variables: + # Customize zone just for this experiment + # otherwise defaults to your settings.yml + zone: us-central1-a + +# Flux MiniCluster experiment attributes +minicluster: + name: my-job + namespace: flux-operator + # Each of these sizes will be brought up and have commands run across it + # They must be smaller than the Kubernetes cluster size or not possible to run! + size: [2, 4] + +# Under jobs should be named jobs (output orgainzed by name) where +# each is required to have a command and image. Repeats is the number +# of times to run each job +jobs: + reaxc-hns: + command: 'lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + working_dir: /home/flux/examples/reaxff/HNS + sleep: + command: 'sleep 5' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + working_dir: /home/flux/examples/reaxff/HNS + hello-world: + command: 'echo hello world' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + working_dir: /home/flux/examples/reaxff/HNS +``` + +
## list @@ -46,9 +139,86 @@ And this will run across sizes. To ask for a specific size: $ flux-cloud apply -e k8s-size-8-m5.large --size 2 ``` -## run +### up + +Here is how to bring up a cluster (with the operator installed). For this command, +we will either select the first in the matrix (default): + +```bash +$ flux-cloud up +``` +```console +No experiment ID provided, assuming first experiment n1-standard-1-2. +``` + +or if you want to specify an experiment identifier based on the machine and size, you can do that: + +```bash +$ flux-cloud up -e n1-standard-1-2 +``` +```console +Selected experiment n1-standard-1-2. +``` + +And to force up without a prompt: + +```bash +$ flux-cloud up -e n1-standard-1-2 --force-cluster +``` + +## Ways to run jobs + +The following commands are provided by Flux Cloud. For running jobs, you can either do: + +- **apply**/**run**: A single/multi job submission intended for different containers to re-create pods each time. +- **batch**/**submit**: A batch mode, where we submit / schedule many jobs on the fewest MiniClusters + +Both are described in the following sections. + +### apply / run + +> Ideal for running multiple jobs with different containers. + +An apply assumes that you want to create a separate MiniCluster each time, meaning +bringing up an entire set of pods, running a single command, and then bringing everything +down. This is ideal for long running experiments, but note that it does not take advantage +of using Flux as a scheduler. Flux is basically running one job and going away. + +#### apply + +After "up" you can choose to run experiments (as you feel) with "apply." + +```bash +$ flux-cloud apply +``` + +The same convention applies - not providing the identifier runs the +first entry, otherwise we use the identifier you provide.
+ +```bash +$ flux-cloud apply -e n1-standard-1-2 +``` + +To force overwrite of existing results (by default they are skipped) + +```bash +$ flux-cloud apply -e n1-standard-1-2 --force +``` + +Apply is going to be creating one CRD per job, so that's a lot of +pod creation and deletion. This is in comparison to "submit" that +brings up a MiniCluster once, and then executes commands to it, allowing +Flux to serve as the scheduler. Note that by default, we always wait for a previous run to be cleaned up +before continuing. If you don't want apply to be interactive (e.g., it will +ask you before cleaning up) you can do: + +```bash +$ flux-cloud apply --non-interactive +``` + +By default, apply via a "run" is non-interactive. -> Up, apply, down in one command, ideal for completely headless runs and jobs with different containers. +#### run The main command is a "run" that is going to, for each cluster: @@ -112,67 +282,18 @@ $ flux-cloud apply -e n1-standard-1-2 $ flux-cloud down -e n1-standard-1-2 ``` -These commands are discussed in more next. - -### up - -Here is how to bring up a cluster (with the operator installed). For this command, -we will either select the first in the matrix (default): - -```bash -$ flux-cloud up -``` -```console -No experiment ID provided, assuming first experiment n1-standard-1-2. -``` - -or if you want to specify an experiment identifier based on the machine and size, you can do that: - -```bash -$ flux-cloud up -e n1-standard-1-2 -``` -```console -Selected experiment n1-standard-1-2. -``` - -And to force up without a prompt: - -```bash -$ flux-cloud up -e n1-standard-1-2 --force-cluster -``` - -## apply +### submit / batch -> Ideal for running multiple jobs with different containers. +> Ideal for one or more commands and/or containers across persistent MiniClusters. -After "up" you can choose to run experiments (as you feel) with "apply." +These commands submit multiple jobs to the same MiniCluster and actually use Flux +as a scheduler!
This means we get the unique set of images and MiniCluster sizes for +your experiments, and then bring up each one, submitting the matching jobs to it. +We submit all jobs at once, and then poll Flux until they are completed to get output. -```bash -$ flux-cloud apply -``` +#### submit -The same convention applies - not providing the identifier runs the -first entry, otherwise we use the identifier you provide. - -```bash -$ flux-cloud apply -e n1-standard-1-2 -``` - -To force overwrite of existing results (by default they are skipped) - -```bash -$ flux-cloud apply -e n1-standard-1-2 --force -``` - -Apply is going to be creating on CRD per job, so that's a lot of -pod creation and deletion. This is in comparison to "submit" that -brings up a MiniCluster once, and then executes commands to it, allowing -Flux to serve as the scheduler. Note that by default, we always wait for a previous run to be cleaned up -before continuing. - -## submit - -> Ideal for one or more commands across the same container(s) and MiniCluster size. +The entire flow might look like: ```bash $ flux-cloud up --cloud minikube @@ -185,27 +306,31 @@ to submit jobs. For submit (and the equivalent to bring it up and down with batc your commands aren't provided in the CRD, but rather to the Flux Restful API. Submit / batch will also generate one CRD per MiniCluster size, but use the same MiniCluster across jobs. This is different -from apply, which generates one CRD per job to run. +from apply, which generates one CRD per job to run. If you don't want submit to be interactive +(e.g., it will ask you before cleaning up) you can do: -## batch +```bash +$ flux-cloud submit --non-interactive +``` -> Up, submit, down in one command, ideal for jobs with the same container(s) +By default, submit run with batch is non-interactive. + +#### batch + +This is the equivalent of "submit" but includes the up and down for the larger +Kubernetes cluster. 
+ +```bash +$ flux-cloud batch --cloud aws +``` -The "batch" command is comparable to "run" except we are running commands -across the same set of containers. We don't need to bring pods up/down each time, -and we are using Flux in our cluster to handle scheduling. This command is going to: 1. Create the cluster 2. Run each of the experiments, saving output and timing, on the same pods 3. Bring down the cluster -The output is organized in the same way, and as before, you can choose to run a single -command with "submit" - -```bash -$ flux-cloud batch --cloud aws -``` +The output is organized in the same way, Note that since we are communicating with the FluxRestful API, you are required to provide a `FLUX_USER` and `FLUX_TOKEN` for the API. If you are running this programmatically, @@ -219,32 +344,6 @@ $ flux-cloud submit $ flux-cloud down ``` -## ui - -If you are interested in interactive submission on your own, either in the user interface -or via one of our client SDKs, you can bring up the MiniCluster and it's interface with -the Flux Restful API with `ui`: - -```bash -$ flux-cloud ui --cloud minikube -``` - -If you have many sizes of MiniClusters, you'll need to specify the one that you want: - -```bash -$ flux-cloud ui --cloud minikube --size 4 -``` - -By default, it will use your single MiniCluster size. - - - -Which then looks like this in the browser, available for submission via the interface itself -or the restful API until the user presses control+c to close the port forward and delete -the MiniCluster. - -![img/ui.png](img/ui.png) - ## down And then bring down your first (or named) cluster: @@ -266,7 +365,6 @@ You can also use `--force-cluster` here: $ flux-cloud down --force-cluster ``` - ## debug For any command, you can add `--debug` as a main client argument to see additional information. 
E.g., @@ -297,11 +395,10 @@ managedNodeGroups: ## scripts -By default, flux cloud keeps all scripts that the job renders in the experiment output directory under `.scripts`. If you -want to cleanup instead, you can add the `--cleanup` flag. We do this so you can inspect a script to debug, or if you -just want to keep them for reproducibility. As an example, here is outfrom from a run with multiple repeats of the -same command, across two MiniCluster cluster sizes (2 and 4). As of version `0.1.17` the data is also organized -by the runner (e.g., minikube vs google) so you can run the experiments across multiple clouds without conflict. +Flux cloud (prior to version 0.2.0) ran each job with a script, and it would save each script. Since version 0.2.0, +we refactored to do everything with Python APIs/SDKs, so we no longer save submit scripts. However, we still save +scripts for bringing up and down each cluster, along with node and pod metadata (as json). We save this in the +hidden `.scripts` directory. ```console $ tree -a ./data/ @@ -314,17 +411,11 @@ $ tree -a ./data/ │ └── log.out ├── meta.json └── .scripts - ├── cluster-create-minikube.sh - ├── flux-operator.yaml - ├── kubectl-version.yaml - ├── minicluster-run-lmp-size-2-minicluster-size-2.sh - ├── minicluster-run-lmp-size-4-minicluster-size-4.sh - ├── minicluster-size-2.yaml - ├── minicluster-size-4.yaml - ├── minikube-version.json - ├── nodes-size-4.json - └── nodes-size-4.txt -``` - -And that's it! I think there might be a more elegant way to determine what cluster is running, -however if the user decides to launch more than one, it might be harder. More thinking / docs / examples coming soon. + ├── minicluster-size-2-lammps-job-ghcr.io-rse-ops-lammps-flux-sched-focal-v0.24.0.json + ├── nodes-2-lammps-job-ghcr.io-rse-ops-lammps-flux-sched-focal-v0.24.0.json + └── pods-size-2-lammps-job-ghcr.io-rse-ops-lammps-flux-sched-focal-v0.24.0.json +``` + +And that's it!
We recommend you look at [examples](examples.md) or [tutorials](../tutorials/index.md) for +getting started. If you are brave, just run `flux-cloud experiment init --cloud <cloud>` to create +your own experiment from scratch. diff --git a/docs/getting_started/debugging.md b/docs/getting_started/debugging.md new file mode 100644 index 0000000..53bea0a --- /dev/null +++ b/docs/getting_started/debugging.md @@ -0,0 +1,163 @@ +# Debugging + +> Oh no, my MiniCluster jobs aren't running! + +Kubernetes is a complex beast, so here are some debugging tips that might help you figure out what +is going on. We are generally going to be looking at objects owned by the Flux Operator - pods, +config maps, and (sometimes volumes or services). Note that the object deployed by the Flux Operator +custom resource definition is called a `minicluster`: + +```bash +$ kubectl get -n flux-operator minicluster +``` +```console +NAME AGE +osu-benchmarks 57s +``` + +## 0. kubectl pro tips + +These tips come from the amazing [Claudia](https://github.com/cmisale)! + +It's fairly arduous to copy paste or type complete pod names, especially for indexed jobs where there is a random +set of characters. You can enable kubectl to autocomplete by adding this to your bash profile (`~/.bashrc`): + +```bash +source <(kubectl completion bash) +``` + +Another shortcut that is nice to have is to make an alias for `kubectl` to just be `k`: + +```bash +alias k=kubectl +``` + +Another tip is how to get an interactive session to a pod: + +```bash +$ kubectl exec -n flux-operator -it <pod-name> -- bash +``` + +Yes, it's very docker-like! I've found I'm much faster having these tricks than before. + + +## 1.
Start with logs + +You can usually first look to pod logs to see what pods are there, and their various states: + +```bash +$ kubectl get -n flux-operator pods +``` + +Remember that if you use `flux-cloud` apply without debug, you won't see output after it finds the broker pod, +but you'll see it being printed to logs in your `data` folder. If you want to see output, either add `--debug` +after `flux-cloud` or look at the log and add `-f` to keep it hanging: + +```bash +# See a snapshot of the log at this instant +$ kubectl logs -n flux-operator osu-benchmarks-0-vxnfq + +# Stream to the terminal until the container is done +$ kubectl logs -n flux-operator osu-benchmarks-0-vxnfq -f +``` + +Here is looking at output for the certificate generator pod: + +```bash +$ kubectl logs -n flux-operator osu-benchmarks-cert-generator +``` + +For `flux-cloud apply` if you want to see output consistently, it's suggested to add `--debug`, +as the miniclusters are going to be created / deleted and you'd need to grab the pod logs +multiple times! + +### What should I expect to see? + +The certificate generator pod runs first. Its output should *only* be +the certificate: + +```console +# **** Generated on 2023-03-04 04:24:46 by CZMQ **** +# ZeroMQ CURVE **Secret** Certificate +# DO NOT PROVIDE THIS FILE TO OTHER USERS nor change its permissions. + +metadata + name = "osu-benchmarks-cert-generator" + time = "2023-03-04T04:24:46" + userid = "0" + hostname = "osu-benchmarks-cert-generator" +curve + public-key = "l12&OlN-DwF*6rhx##Y#ZQ^9w1zON039Vxh2&+8r" + secret-key = "o^(dM0R96q-d=2Jk-tEjgh=syRjW?q6%Kq{Q8Y4H" +``` + +If you see any error message about "invalid curve cert" this means that something was incorrectly +generated. As an example, you should use `preCommand` for any logic that is shared between +the certificate generator and worker/broker pods (e.g., sourcing an environment for Flux) and commands->pre +for anything else that is just for the worker/broker pods (printing to debug, etc.)
+ +For the broker pod, you should expect to see debugging output (if logging->debug is true) and then the +Flux Broker starting. The quorum should be reported to be full. E.g., + +```console +🌀 flux start -o --config /etc/flux/config -Scron.directory=/etc/flux/system/cron.d -Stbon.fanout=256 -Srundir=/run/flux -Sstatedir=/var/lib/flux -Slocal-uri=local:///run/flux/local -Slog-stderr-level=6 -Slog-stderr-mode=local +broker.info[1]: start: none->join 13.3684ms +broker.info[1]: parent-ready: join->init 1.14525s +broker.info[1]: configuration updated +broker.info[1]: rc1.0: running /etc/flux/rc1.d/01-sched-fluxion +broker.info[1]: rc1.0: running /etc/flux/rc1.d/02-cron +broker.info[1]: rc1.0: /etc/flux/rc1 Exited (rc=0) 0.2s +broker.info[1]: rc1-success: init->quorum 0.234173s +broker.info[1]: quorum-full: quorum->run 0.204937s +``` + +If you see any error messages from the broker, this should be looked into. +Warnings can sometimes be OK. Ask if you aren't sure. + +## 2. Use describe + +You can describe any object in Kubernetes space to debug. Describe is especially important when you are debugging +storage and want to figure out why something isn't mounting. Typically you might start by looking at pods in all +namespaces: + +```bash +$ kubectl get pods --all-namespaces -o wide +``` + +The wide format is useful because it will show you the node each pod is assigned to, which can be useful +for debugging resource limits and requests. You then might want to describe a particular pod, +maybe to look at annotations or volume mounts: + +```bash +$ kubectl describe pod -n flux-operator osu-benchmarks-1-tj6bt +``` + +You can get json output with a get for the pod (or object): + +```bash +$ kubectl get pod -n flux-operator osu-benchmarks-1-tj6bt -o json +``` + +And pipe that into `jq` to look for specific attributes! So let's say you see that a volume +failed for your pod. 
You likely want to next check your persistent volumes "pv" and claims "pvc": + +```bash +$ kubectl describe -n flux-operator pv +$ kubectl describe -n flux-operator pvc +``` + +For volumes, if you are using a container storage interface (CSI) you likely are using a daemon set that +deploys pods. Try looking at the logs for the pods, and/or the daemonset for issues: + +```bash +$ kubectl describe daemonset --all-namespaces +``` + +Finally, services (svc) can be useful if you suspect a permission or credential is wonky. + +## 3. Advanced + +Often when I'm debugging something complex I try to create the object I'm interested in so it is in a +continuously running state. As an example, to test a pod for a daemonset, I will get the raw YAML +for the daemonset and change the entrypoint to `sleep infinity`. I can then shell in and manually run +commands to see their output. diff --git a/docs/getting_started/examples.md b/docs/getting_started/examples.md index b2e0cdd..7ddd9be 100644 --- a/docs/getting_started/examples.md +++ b/docs/getting_started/examples.md @@ -3,24 +3,24 @@ The easiest thing to do is arguably to start with an example, and then customize it. Here we will add examples as we create them. -- [up-apply-down](https://github.com/converged-computing/flux-cloud/tree/main/examples/up-apply-down): shows using `flux-cloud apply` for individual CRD submission. -- [osu-benchmarks](https://github.com/converged-computing/flux-cloud/tree/main/examples/osu-benchmarks) -- [up-submit-down](https://github.com/converged-computing/flux-cloud/tree/main/examples/up-submit-down): shows using `flux-cloud submit` for batch submission. -- [aws-lammps](https://github.com/converged-computing/flux-cloud/tree/main/examples/aws-lammps): a simple lammps run on AWS. - -The above example runs a single command in a single Kubernetes cluster and MiniCluster, -and it's lammps! 
- -## Demo - -Here is a quick demo from the [up-apply-down](https://github.com/converged-computing/flux-cloud/tree/main/examples/up-apply-down) in the repository. - - - -which was actually run as: +- [minikube](https://github.com/converged-computing/flux-cloud/tree/main/examples/minikube) + - [basic](https://github.com/converged-computing/flux-cloud/tree/main/examples/minikube/basic) + - [volumes](https://github.com/converged-computing/flux-cloud/tree/main/examples/minikube/volumes) + - [resources](https://github.com/converged-computing/flux-cloud/tree/main/examples/minikube/resources) + - [osu-benchmarks](https://github.com/converged-computing/flux-cloud/tree/main/examples/minikube/osu-benchmarks) + - [persistent](https://github.com/converged-computing/flux-cloud/tree/main/examples/minikube/persistent) +- [google](https://github.com/converged-computing/flux-cloud/tree/main/examples/google) + - [osu-benchmarks](https://github.com/converged-computing/flux-cloud/tree/main/examples/google/osu-benchmarks) + +All of the examples above (for MiniKube) are tested, and can be adopted for another cloud typically by adding +the "machines" directive under "matrix" and then any custom variables. As a reminder, you can generate +a blank template for any cloud (including variables) via: ```bash -$ flux-cloud run +$ flux-cloud experiment init --cloud minikube +$ flux-cloud experiment init --cloud aws +$ flux-cloud experiment init --cloud google ``` -for the purposes of the demo, and runs a lammps job on two tiny nodes! + +New examples for AWS will be coming soon - I didn't have credits to test when I wrote these. diff --git a/docs/getting_started/experiments.md b/docs/getting_started/experiments.md index 2736cfb..fa31abf 100644 --- a/docs/getting_started/experiments.md +++ b/docs/getting_started/experiments.md @@ -3,12 +3,21 @@ Welcome to the Flux Cloud experiments user guide! 
If you come here, we are assuming you want to run jobs with the Flux Operator on GKE, and that you have [installed](install.md) flux-cloud. Note this project is early in development so this could change or bugs could be introduced. -Let's get started with talking about experiments. Your experiments will typically be defined by two files: +Let's get started with talking about experiments. As of version 0.2.0, your experiments will be defined by one file: - - experiments.yaml: a yaml file that describes sizes, machines, and jobs to run - - minicluster-template.yaml: a completely or partially filled template custom resource definition. + - experiments.yaml: a yaml file that describes sizes, machines, miniclusters, and jobs to run We will walk through example experiment files here, along with a full set of fields you can use. +Note that to get an example experiments.yaml template for any cloud, you can always do: + +```bash +$ flux-cloud experiment init --cloud minikube +$ flux-cloud experiment init --cloud aws +$ flux-cloud experiment init --cloud google +``` + +The documentation here outlines the sections in detail; however, the above is the best +means to get an entire, holistic file. ## Experiment Definition @@ -29,6 +38,7 @@ matrix: size: [2, 4] machine: ["n1-standard-1", "n1-standard-2"] ``` + Note that the sizes at this level indicate *the size of the Kubernetes cluster*. We will expand on this idea later. This would run each size across each machine, for a total of 4 Kubernetes clusters created. The number of custom resource (CRD) definitions applied to each one would vary based on the number of jobs. @@ -167,78 +177,85 @@ jobs: osu_get_latency: command: './osu_get_latency' image: ghcr.io/awesome/science:latest - workdir: /path/to/science + working_dir: /path/to/science repeats: 3 ``` For repeats, we add another level to the output directory, and represent the result data as -subdirectories of the machine and size from 1..N.
Note also that likely in the future we -can provide a default template and require all these variables -defined. For now we require you to provide the template. - - -## Custom Resource Definition - -> minicluster-template.yaml +subdirectories of the machine and size from 1..N. -The custom resource definition template "CRD" is currently suggested so you can customize exactly to your liking, -but it's not required. It is used by flux-cloud to populate your job metadata and then submit one or more jobs to your Kubernetes cluster. +#### Flux Options -### Use Your Own - -Here is an example that uses a shared working directory (so it's hard coded) and a variable -for the command: +How do job parameters map to Flux, in the case of using `flux-cloud submit`? Good question! Here is the mapping: ```yaml -apiVersion: flux-framework.org/v1alpha1 -kind: MiniCluster -metadata: - name: {{ minicluster.name }} - namespace: {{ minicluster.namespace }} -spec: - # Number of pods to create for MiniCluster - size: {{ minicluster.size }} - - # Disable verbose output - logging: - quiet: true - - # This is a list because a pod can support multiple containers - containers: - # The container URI to pull (currently needs to be public) - - image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 - - # You can set the working directory if your container WORKDIR is not correct. - workingDir: /home/flux/examples/reaxff/HNS - command: {{ job.command }} +jobs: + example-job: + command: './osu_get_latency' + flux_option_flags: "-ompi=openmpi@5" + working_dir: /opt/osu-benchmark/build.openmpi/libexec/osu-micro-benchmarks/mpi/one-sided # workdir + image: ghcr.io/rse-ops/osu-microbench:test + + # osu benchmarks requires exactly 2 processes + tasks: 2 # num_tasks + cores: 1 # cores_per_task + gpus: 0 # gpus_per_task + nodes: 1 # num_nodes ``` -### Use The Default - -To use the default, you want to make sure that you provide all variables that are required. 
-The following are required (and have defaults or are otherwise generated by flux cloud -so you could leave them out of your experiments.yaml): - -- minicluster.name -- minicluster.namespace -- minicluster.local_deploy (defaults to false) -- minicluster.verbose (default to false to run in test mode) - -It's recommended to set your listing of sizes for miniclusters: +#### Yaml Tricks -- minicluster.size +For your jobs, you likely will want to re-use parameters. There is a trick with YAML +to define a named section, and then re-use it. Here is an example running the OSU +benchmarks. -The following are specific to the job and required: +```yaml +# matrix of experiments to run - machine types and sizes are required +# This can obviously be expanded to more sizes or machines, +matrix: + size: [2] + machine: ["n1-standard-1", "n1-standard-2"] -- job.image -- job.command +# An example of shared container options! +x-container-options: &options + fluxOptionFlags: "-ompi=openmpi@5" + working_dir: /opt/osu-benchmark/build.openmpi/libexec/osu-micro-benchmarks/mpi/one-sided + image: ghcr.io/rse-ops/osu-microbench:app-latest + # This MUST be run for the certificate generator and workers/broker + pre_command: source /etc/profile.d/z10_spack_environment.sh -The following are specific to the job but not required: +# Flux Mini Cluster experiment attributes +minicluster: + name: osu-benchmarks + namespace: flux-operator -- job.workdir -- job.tasks (recommended for better control of flux, as this would default to 1) -- job.flux_option_flags (e.g., "-ompi=openmpi@5") -- job.cores (defaults to 1 if not set, likely not ideal for your experiment) -- job.limits (key value pairs) -- job.requests (key value pairs) -- job.pre_command: the job pre-command (usually multiple lines) but not required. 
+# Each job can have a command and working directory +jobs: + osu_get_latency: + command: './osu_get_latency' + <<: *options + osu_acc_latency: + command: './osu_acc_latency' + <<: *options + osu_fop_latency: + command: './osu_fop_latency' + <<: *options + osu_get_bw: + command: './osu_get_bw' + <<: *options + osu_put_bibw: + command: './osu_put_bibw' + <<: *options + osu_put_latency: + command: './osu_put_latency' + <<: *options + osu_cas_latency: + command: './osu_cas_latency' + <<: *options + osu_get_acc_latency: + command: './osu_get_acc_latency' + <<: *options + osu_put_bw: + command: './osu_put_bw' + <<: *options +``` diff --git a/docs/getting_started/google.md b/docs/getting_started/google.md deleted file mode 100644 index 9f7c96c..0000000 --- a/docs/getting_started/google.md +++ /dev/null @@ -1,74 +0,0 @@ -# Google Cloud - -> Running on Google Kubernetes Engine, GKE - -The main functionality that flux-cloud provides are easy wrappers (and templates) to running -the Flux Operator on GKE. The main steps of running experiments are: - - - **up** to bring up a cluster - - **apply** to apply one or more experiments defined by an experiments.yaml - - **down** to destroy a cluster - -Each of these commands can be run in isolation, and we provide a single command **run** to -automate the entire thing. We emphasize the term "wrapper" as we are using scripts on your -machine to do the work (e.g., kubectl and gcloud) and importantly, for every step we show -you the command, and if it fails, give you a chance to bail out. We do this so if you -want to remove the abstraction at any point and run the commands on your own, you can. 
- -## Pre-requisites - -You should first [install gcloud](https://cloud.google.com/sdk/docs/quickstarts) -and ensure you are logged in and have kubectl installed: - -```bash -$ gcloud auth login -``` - -Depending on your install, you can either install with gcloud: - -```bash -$ gcloud components install kubectl -``` -or just [on your own](https://kubernetes.io/docs/tasks/tools/). - -## Cloud - -Finally, ensure that google is either your default cloud (the `default_cloud` in your settings.yml) -or you specify it with `--cloud` when you do run. - - -## Custom Variables - -The following custom variables are supported in the "variables" section (key value pairs) -for Google in an `experiments.yaml` - -```yaml -variables: - # Customize zone just for this experiment - zone: us-central1-a -``` - - -## Run Experiments - -Each experiment is defined by the matrix and variables in an `experiment.yaml` that is used to -populate a `minicluster-template.yaml` that you can either provide, or use a template provided by the -library. One of the goals of the Flux Cloud Experiment runner is not just to run things, but to -provide this library for you to easily edit and use! Take a look at the [examples](https://github.com/converged-computing/flux-cloud/tree/main/examples) -directory for a few that we provide. We will walk through a generic one here to launch -an experiment on a Kubernetes cluster. Note that before doing this step you should -have installed flux-cloud, along with kubectl and gcloud, and set your defaults (e.g., project zone) -in your settings. - -```bash -$ flux-cloud run experiments.yaml -``` - -Note that since the experiments file defaults to that name, you can also just do: - -```bash -$ flux-cloud run -``` - -Given an experiments.yaml in the present working directory. Take a look at an `experients.yaml` in an example directory. -Note that machines and size are required for the matrix, and variables get piped into all experiments (in full). 
diff --git a/docs/getting_started/index.md b/docs/getting_started/index.md index 30879f8..9fc263c 100644 --- a/docs/getting_started/index.md +++ b/docs/getting_started/index.md @@ -10,8 +10,6 @@ install commands examples experiments +debugging settings -google -aws -minikube ``` diff --git a/docs/getting_started/minikube.md b/docs/getting_started/minikube.md deleted file mode 100644 index f851b51..0000000 --- a/docs/getting_started/minikube.md +++ /dev/null @@ -1,134 +0,0 @@ -# MiniKube - -> Running on a local MiniKube cluster - -Flux Cloud (as of version 0.1.0) can run on MiniKube! The main steps of running experiments with -different container bases are: - - - **up** to bring up a cluster - - **apply** to apply one or more CRDs from experiments defined by an experiments.yaml - - **down** to destroy a cluster - -or one or more commands with the same container base(s): - - - **up** to bring up a cluster - - **submit** to submit one or more experiments to the same set of pods defined by an experiments.yaml - - **down** to destroy a cluster - -Each of these commands can be run in isolation, and we provide a single command **run** to -automate the entire thing. We emphasize the term "wrapper" as we are using scripts on your -machine to do the work (e.g., minikube and kubectl) and importantly, for every step we show -you the command, and if it fails, give you a chance to bail out. We do this so if you -want to remove the abstraction at any point and run the commands on your own, you can. - -## Pre-requisites - -You should first [install minikube](https://minikube.sigs.k8s.io/docs/start/) -and kubectl. - -## Run Experiments - -Each experiment is defined by the matrix and variables in an `experiment.yaml` that is used to -populate a `minicluster-template.yaml` that you can either provide, or use a template provided by the -library. One of the goals of the Flux Cloud Experiment runner is not just to run things, but to -provide this library for you to easily edit and use! 
Take a look at the [examples](https://github.com/converged-computing/flux-cloud/tree/main/examples) -directory for a few that we provide. We will walk through a generic one here to launch -an experiment on a MiniKube Kubernetes cluster. Note that before doing this step you should -have installed flux-cloud, along with kubectl and minikube. Note that if it's not the default, -you'll need to specify using MiniKube - -### Apply / Run - -> Ideal if you need to run multiple jobs on different containers - -```bash -$ flux-cloud run --cloud minikube experiments.yaml -``` - -Or set to the default: - -```bash -$ flux-cloud config set default_cloud:minikube -``` - -Given MiniKube is the default, since the experiments file defaults to that name, you can also just do: - -```bash -$ flux-cloud run -``` - -Given an experiments.yaml in the present working directory. Take a look at an `experients.yaml` in an example directory. -Note that only size is required for the matrix for MiniKube (there is currently no concept of a machine, -although there could be), and variables get piped into all experiments (in full). Under variables, -both "commands" and "ids" are required, and must be equal in length (each command is assigned to one id -for output). To just run the first entry in the matrix (test mode) do: - -```bash -$ flux-cloud run experiments.yaml --test -``` - -Note that you can also use the other commands in place of a single run, notably "up" "apply" and "down." -By default, results will be written to a temporary output directory, but you can customize this with `--outdir`. -Finally, since MiniKube often has trouble pulling images, we recommend you include the container image as a variable -in the experiment.yaml so it can be pulled before the experiment is run. 
E.g., this experiment: - -```yaml -matrix: - size: [4] - -# Flux Mini Cluster experiment attributes -minicluster: - name: lammps - namespace: flux-operator - size: [2, 4] - -# Each job can have a command and working directory -jobs: - lmp: - command: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite - repeats: 2 - image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 -``` - -And this config file: - -```yaml -apiVersion: flux-framework.org/v1alpha1 -kind: MiniCluster -metadata: - name: {{ minicluster.name }} - namespace: {{ minicluster.namespace }} -spec: - # Number of pods to create for MiniCluster - size: {{ minicluster.size }} - - # Disable verbose output - logging: - quiet: true - - # This is a list because a pod can support multiple containers - containers: - # The container URI to pull (currently needs to be public) - - image: {{ job.image }} - - # You can set the working directory if your container WORKDIR is not correct. - workingDir: /home/flux/examples/reaxff/HNS - command: {{ job.command }} -``` - -### Submit - -> Ideal for one or more commands across the same container(s) and MiniCluster size. - -```bash -$ flux-cloud up --cloud minikube -$ flux-cloud submit --cloud minikube -$ flux-cloud down --cloud minikube -``` - -The submit will always check if the MiniCluster is already created, and if not, create it -to submit jobs. For submit (and the equivalent to bring it up and down with batch) -your commands aren't provided in the CRD, -but rather to the Flux Restful API. Submit / batch will also generate one CRD -per MiniCluster size, but use the same MiniCluster across jobs. This is different -from apply, which generates one CRD per job to run. diff --git a/docs/index.rst b/docs/index.rst index efdb1a1..d9804fb 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -52,15 +52,15 @@ to unite the worlds and technologies typical of cloud computing and high performance computing. To get started, check out the links below! 
-Would you like to request a feature or contribute? -[Open an issue](https://github.com/flux-framework/flux-cloud/issues). +Would you like to request a feature or contribute? `Open an issue <https://github.com/flux-framework/flux-cloud/issues>`_. .. toctree:: :caption: Getting Started - :maxdepth: 1 + :maxdepth: 2 getting_started/index.md + tutorials/index.md .. toctree:: :caption: About diff --git a/docs/getting_started/aws.md b/docs/tutorials/aws.md similarity index 54% rename from docs/getting_started/aws.md rename to docs/tutorials/aws.md index 2ef4a6c..be1a0b3 100644 --- a/docs/getting_started/aws.md +++ b/docs/tutorials/aws.md @@ -1,19 +1,14 @@ # AWS > Running on Amazon Elastic Kubernetes Service EKS +Flux Cloud (as of version 0.1.0) can run on AWS! There are two primary use cases for using flux-cloud: -The flux-cloud software provides are easy wrappers (and templates) to running -the Flux Operator on Amazon. The main steps of running experiments are: + - **apply** is good for many larger experiments that require different container bases and / or take a longer time to run. + - **submit** is good for smaller experiments that might use the same container bases and / or take a shorter time to run. - - **up** to bring up a cluster - - **apply** to apply one or more experiments defined by an experiments.yaml - - **down** to destroy a cluster - -Each of these commands can be run in isolation, and we provide a single command **run** to -automate the entire thing. We emphasize the term "wrapper" as we are using scripts on your -machine to do the work (e.g., kubectl and gcloud) and importantly, for every step we show -you the command, and if it fails, give you a chance to bail out. We do this so if you -want to remove the abstraction at any point and run the commands on your own, you can. +For the latter (submit) we will bring up the minimum number of MiniClusters required (unique based on container image and size) +and launch all jobs across them, using Flux as a scheduler.
As of version 0.2.0 both commands use the fluxoperator Python +SDK, so we only use bash scripts to bring up and down cloud-specific clusters. ## Pre-requisites @@ -45,6 +40,52 @@ This is used so you can ssh (connect) to your workers! Finally, ensure that aws is either your default cloud (the `default_cloud` in your settings.yml) or you specify it with `--cloud` when you do run. +## Run Experiments + +**IMPORTANT** for any experiment when you choose an instance type, you absolutely +need to choose a size that has [IsTrunkingCompatible](https://github.com/aws/amazon-vpc-resource-controller-k8s/blob/master/pkg/aws/vpc/limits.go) +true. E.g., `m5.large` has it set to true so it would work. Each experiment is defined by the matrix and variables in an `experiments.yaml`. It's recommended you +start with a template populated for aws: + +```bash +$ flux-cloud experiment init --cloud aws +``` + +And see the [custom variables](#custom-variables) defined below to learn more about them, +or the [examples](https://github.com/converged-computing/flux-cloud/tree/main/examples) +directory for a few examples that we provide. We will walk through a generic one here to launch +an experiment on a Kubernetes cluster. Note that before doing this step you should +have installed flux-cloud, along with eksctl, and set your defaults (e.g., project zone) +in your settings.
+ +Given an experiments.yaml in the present working directory, you can do an apply, +meaning creating a separate MiniCluster per job: + +```bash +# Up / apply / down +$ flux-cloud run --cloud aws + +# Manual up / apply / down (recommended) +$ flux-cloud up --cloud aws +$ flux-cloud apply --cloud aws +$ flux-cloud down --cloud aws +``` + +Or submit, creating shared MiniClusters to submit multiple jobs to: + +```bash +# Up / submit / down +$ flux-cloud batch --cloud aws + +# Manual up / submit / down (recommended) +$ flux-cloud up --cloud aws +$ flux-cloud submit --cloud aws +$ flux-cloud down --cloud aws +``` + +Note that machines and size are required for the matrix. + + ## Custom Variables The following custom variables are supported in the "variables" section (key value pairs) @@ -74,39 +115,3 @@ variables: Note that we currently take a simple approach for boolean values - if it's present (e.g., the examples) above) it will be rendered as true. Don't put False in there, but rather just delete the key. - -## Run Experiments - -**IMPORTANT** for any experiment when you choose an instance type, you absolutely -need to choose a size that has [IsTrunkingCompatible](https://github.com/aws/amazon-vpc-resource-controller-k8s/blob/master/pkg/aws/vpc/limits.go) -true. E.g., `m5.large` has it set to true so it would work. - -Each experiment is defined by the matrix and variables in an `experiment.yaml` that is used to -populate a `minicluster-template.yaml` that you can either provide, or use a template provided by the -library. One of the goals of the Flux Cloud Experiment runner is not just to run things, but to -provide this library for you to easily edit and use! Take a look at the [examples](https://github.com/converged-computing/flux-cloud/tree/main/examples) -directory for a few that we provide. We will walk through a generic one here to launch -an experiment on a Kubernetes cluster. 
Note that before doing this step you should -have installed flux-cloud, along with kubectl and gcloud, and set your defaults (e.g., project zone) -in your settings. - -```bash -$ flux-cloud run experiments.yaml -``` - -Note that since the experiments file defaults to that name, you can also just do: - -```bash -$ flux-cloud run -``` - -Or for more control and/or verbosity: - -```bash -$ flux-cloud --debug up --cloud aws -$ flux-cloud --debug apply --cloud aws -$ flux-cloud --debug down --cloud aws -``` - -Given an experiments.yaml in the present working directory. Take a look at an `experients.yaml` in an example directory. -Note that machines and size are required for the matrix, and variables get piped into all experiments (in full). diff --git a/docs/tutorials/google.md b/docs/tutorials/google.md new file mode 100644 index 0000000..cf4616a --- /dev/null +++ b/docs/tutorials/google.md @@ -0,0 +1,91 @@ +# Google Cloud + +> Running on Google Kubernetes Engine, GKE + +The main functionality that flux-cloud provides are easy wrappers (and templates) to running +the Flux Operator on GKE. The main steps of running experiments are: + + - **up** to bring up a cluster + - **apply/submit** to apply or submit one or more experiments defined by an experiments.yaml + - **down** to destroy a cluster + +Each of these commands can be run in isolation, and we provide single commands **run/batch** to +automate the entire thing. For Google Cloud, you can see a small collection of [examples here](https://github.com/converged-computing/flux-cloud/tree/main/examples/google). + +## Pre-requisites + +You should first [install gcloud](https://cloud.google.com/sdk/docs/quickstarts) +and ensure you are logged in and have kubectl installed: + +```bash +$ gcloud auth login +``` + +Depending on your install, you can either install with gcloud: + +```bash +$ gcloud components install kubectl +``` +or just [on your own](https://kubernetes.io/docs/tasks/tools/). 
+ +## Cloud + +Finally, ensure that google is either your default cloud (the `default_cloud` in your settings.yml) +or you specify it with `--cloud` when you do run. + +## Custom Variables + +The following custom variables are supported in the "variables" section (key value pairs) +for Google in an `experiments.yaml`: + +```yaml +variables: + # Customize zone just for this experiment + zone: us-central1-a +``` + + +## Run Experiments + +You can create an empty experiment template as follows: + +```bash +$ flux-cloud experiment init --cloud google +``` + +Each experiment is defined by the matrix and variables in an `experiments.yaml`. +One of the goals of the Flux Cloud Experiment runner is not just to run things, but to +provide this library for you to easily edit and use! Take a look at the [examples](https://github.com/converged-computing/flux-cloud/tree/main/examples) +directory for a few that we provide. We will walk through a generic one here to launch +an experiment on a Kubernetes cluster. Note that before doing this step you should +have installed flux-cloud, along with gcloud, and set your defaults (e.g., project zone) +in your settings. + +Given an experiments.yaml in the present working directory, you can do an apply, +meaning creating a separate MiniCluster per job: + +```bash +# Up / apply / down +$ flux-cloud run --cloud google + +# Manual up / apply / down (recommended) +$ flux-cloud --debug up --cloud google +$ flux-cloud --debug apply --cloud google +$ flux-cloud --debug down --cloud google +``` + +For any of the commands here, add `--debug` after `flux-cloud` to see more verbosity. +Or submit, creating shared MiniClusters to submit multiple jobs to: + +```bash +# Up / submit / down +$ flux-cloud batch --cloud google + +# Manual up / submit / down (recommended) +$ flux-cloud --debug up --cloud google +$ flux-cloud --debug submit --cloud google +$ flux-cloud --debug down --cloud google +``` + +Note that machines and size are required for the matrix.
See our [debugging guide](../getting_started/debugging.md) +for the Flux Operator for interacting with Flux Operator containers or debugging. diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md new file mode 100644 index 0000000..b545d8f --- /dev/null +++ b/docs/tutorials/index.md @@ -0,0 +1,11 @@ +# Tutorials + +These tutorials will walk through common use cases for Flux Cloud! If you have +any questions or issues, please [let us know](https://github.com/flux-framework/flux-cloud/issues). + +```{toctree} +:maxdepth: 3 +minikube +google +aws +``` diff --git a/docs/tutorials/minikube.md b/docs/tutorials/minikube.md new file mode 100644 index 0000000..4e68e4e --- /dev/null +++ b/docs/tutorials/minikube.md @@ -0,0 +1,313 @@ +# MiniKube + +> Running on a local MiniKube cluster + +Flux Cloud (as of version 0.1.0) can run on MiniKube! There are two primary use cases for using flux-cloud: + + - **apply** is good for many larger experiments that require different container bases and / or take a longer time to run. + - **submit** is good for smaller experiments that might use the same container bases and / or take a shorter time to run. + +For the latter (submit) we will bring up the minimum number of MiniClusters required (unique based on container image and size) +and launch all jobs across them, using Flux as a scheduler. As of version 0.2.0 both commands use the fluxoperator Python +SDK, so we only use bash scripts to bring up and down cloud-specific clusters. + + +## Pre-requisites + +You should first [install minikube](https://minikube.sigs.k8s.io/docs/start/) +and kubectl. + +## Run Experiments + +Let's start with a simple `experiments.yaml` file, where we have defined a number of different +experiments to run on MiniKube.
`flux-cloud submit` relies entirely on this experiment file, +and programmatically generates the MiniCluster [custom resource definitions](https://flux-framework.org/flux-operator/getting_started/custom-resource-definition.html#workingdir) +for you, so you don't need to provide any kind of template. + +
+ +How does it work? + +A YAML file (such as the experiments.yaml) can be serialized to JSON, so each section under "jobs" is +also json, or actually (in Python) a dictionary of values. Since the values are passed to the +[Flux Operator Python SDK](https://github.com/flux-framework/flux-operator/tree/main/sdk/python/v1alpha1), +we can map them easily according to the following convention. Let's say we have a job in the experiments listing: + +```yaml +jobs: + # This is the start of the named job + reaxc-hns: + + # These are attributes for the MiniCluster (minus repeats) + command: 'lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + working_dir: /home/flux/examples/reaxff/HNS +``` + +The content under the job name "reaxc-hns" would be mapped to the MiniCluster container as follows: + +```python +from fluxoperator.models import MiniClusterContainer + +container = MiniClusterContainer( + image="ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", + working_dir="/home/flux/examples/reaxff/HNS", + command="lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite", + run_flux=True, +) +``` + +Note that in the above, since Go is in camel case and the Python SDK turns it into snake case, +`workingDir` is changed to `working_dir`. + +
+ + +Let's start with this set of experiments. Note that we've provided the same container +for all of them, meaning that we will only be creating one MiniCluster with that container. +If you provide jobs with separate containers, they will be brought up as separate clusters +to run (per each unique container, with all jobs matched to it). + +```yaml +# This is intended for MiniKube, so no machine needed. +# We will create a MiniKube cluster of size 2 +matrix: + size: [2] + +# Flux Mini Cluster experiment attributes +minicluster: + name: submit-jobs + namespace: flux-operator + # Each of these sizes will be brought up and have commands run across it + size: [2] + +# Each of command and image are required to do a submit! +jobs: + reaxc-hns: + command: 'lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + working_dir: /home/flux/examples/reaxff/HNS + sleep: + command: 'sleep 5' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + working_dir: /home/flux/examples/reaxff/HNS + hello-world: + command: 'echo hello world' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + working_dir: /home/flux/examples/reaxff/HNS +``` + +Each experiment is defined by the matrix and variables in an `experiment.yaml`, as shown above. +Note that the easiest way to get started is to use an existing example, or run: + +```bash +$ flux-cloud experiment init --cloud minikube +``` + +In the example above, we are targeting minikube. + + +### Apply / Run + +> Ideal if you need to run multiple jobs on different containers + +This apply/run workflow will create a new MiniCluster each time (pods up and down) +and not use Flux as a scheduler proper. 
A workflow might look like: + +```bash +$ flux-cloud up --cloud minikube +$ flux-cloud apply --cloud minikube +$ flux-cloud down --cloud minikube +``` +Or achieve all three with: + +```bash +$ flux-cloud run --cloud minikube +``` + +Let's run this with our `experiments.yaml` above in the present working directory, +and after having already run `up`: + +```bash +# Also print output to the terminal (so you can watch!) +$ flux-cloud --debug apply --cloud minikube + +# Only save output to output files +$ flux-cloud apply --cloud minikube +``` + +At the end of the run, you'll have an organized output directory with all of your +output logs, along with saved metadata about the minicluster, pods, and nodes. + +```bash + +``` + +### Submit + +> Ideal for one or more commands across one or more containers and MiniCluster sizes + +The idea behind a submit is that we are going to create the minimal number of MiniClusters you +need (across the set of unique sizes and images) and then submit all jobs to Flux within +the MiniCluster. The submit mode is actually using Flux as a scheduler and not just a +"one job" running machine. A basic submit workflow using the config above might look like this: + +```bash +$ flux-cloud up --cloud minikube +$ flux-cloud submit --cloud minikube +$ flux-cloud down --cloud minikube +``` + +Instead of running one job at a time and waiting for output (e.g., apply) we +submit all the jobs, and then poll every 30 seconds to get job statuses.
+ +View full output of submit command + +```bash +$ flux-cloud --debug submit --cloud minikube +``` +```console +No experiment ID provided, assuming first experiment k8s-size-4-n1-standard-1. +Job experiments file generated 1 MiniCluster(s). + +🌀 Bringing up MiniCluster of size 2 with image ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 +All pods are in states "Running" or "Completed" +💾 Creating output directory /home/vanessa/Desktop/Code/flux/flux-cloud/examples/up-submit-down/data/minikube +MiniCluster created with credentials: + FLUX_USER=fluxuser + FLUX_TOKEN=d467215d-d07d-4c32-b2b9-41643cda3d7d +All pods are in states "Running" or "Completed" +Found broker pod lammps-job-0-ng8pz + +Waiting for http://lammps-job-0-ng8pz.pod.flux-operator.kubernetes:5000 to be ready +🪅️ RestFUL API server is ready! +. +Port forward opened to http://lammps-job-0-ng8pz.pod.flux-operator.kubernetes:5000 +Submitting reaxc-hns-1-minicluster-size-2: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite +Submitting reaxc-hns-2-minicluster-size-2: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite +Submitting reaxc-hns-3-minicluster-size-2: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite +Submitting reaxc-hns-4-minicluster-size-2: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite +Submitting reaxc-hns-5-minicluster-size-2: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite +Submitting sleep-1-minicluster-size-2: sleep 5 +Submitting sleep-2-minicluster-size-2: sleep 5 +Submitting sleep-3-minicluster-size-2: sleep 5 +Submitting sleep-4-minicluster-size-2: sleep 5 +Submitting sleep-5-minicluster-size-2: sleep 5 +Submitting hello-world-1-minicluster-size-2: echo hello world +Submitting hello-world-2-minicluster-size-2: echo hello world +Submitting hello-world-3-minicluster-size-2: echo hello world +Submitting hello-world-4-minicluster-size-2: echo hello world +Submitting hello-world-5-minicluster-size-2: echo hello world +Submit 15 jobs! Waiting for completion... +15 are active. 
+ lmp is in state RUN + lmp is in state RUN + lmp is in state SCHED + lmp is in state SCHED + lmp is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED +15 are active. + lmp is finished COMPLETED in 28.64 seconds. + lmp is finished COMPLETED in 29.1 seconds. + lmp is in state RUN + lmp is in state RUN + lmp is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED +13 are active. + lmp is in state RUN + lmp is in state RUN + lmp is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + sleep is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED +13 are active. + lmp is finished COMPLETED in 36.56 seconds. + lmp is finished COMPLETED in 35.89 seconds. + lmp is in state RUN + sleep is finished COMPLETED in 5.02 seconds. + sleep is finished COMPLETED in 5.02 seconds. + sleep is finished COMPLETED in 5.02 seconds. + sleep is in state RUN + sleep is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED + echo is in state SCHED +8 are active. + lmp is finished COMPLETED in 24.6 seconds. + sleep is finished COMPLETED in 5.02 seconds. + sleep is finished COMPLETED in 5.02 seconds. + echo is finished COMPLETED in 0.01 seconds. + echo is finished COMPLETED in 0.02 seconds. + echo is finished COMPLETED in 0.02 seconds. + echo is finished COMPLETED in 0.01 seconds. + echo is finished COMPLETED in 0.01 seconds. +All jobs are complete! 
Cleaning up MiniCluster... +All pods are terminated. +``` + +
+ +After submit, you will still have an organized output directory with job output files +and metadata. + +```bash +$ tree -a data/minikube/ +data/minikube/ +└── k8s-size-4-n1-standard-1 + ├── hello-world-1-minicluster-size-2 + │ └── log.out + ├── hello-world-2-minicluster-size-2 + │ └── log.out + ├── hello-world-3-minicluster-size-2 + │ └── log.out + ├── hello-world-4-minicluster-size-2 + │ └── log.out + ├── hello-world-5-minicluster-size-2 + │ └── log.out + ├── meta.json + ├── reaxc-hns-1-minicluster-size-2 + │ └── log.out + ├── reaxc-hns-2-minicluster-size-2 + │ └── log.out + ├── reaxc-hns-3-minicluster-size-2 + │ └── log.out + ├── reaxc-hns-4-minicluster-size-2 + │ └── log.out + ├── reaxc-hns-5-minicluster-size-2 + │ └── log.out + └── .scripts + └── minicluster-size-2-lammps-job-ghcr.io-rse-ops-lammps-flux-sched-focal-v0.24.0.json +``` diff --git a/examples/aws-lammps/README.md b/examples/aws-lammps/README.md deleted file mode 100644 index 99b7d2c..0000000 --- a/examples/aws-lammps/README.md +++ /dev/null @@ -1,71 +0,0 @@ -# Lammps on Amazon Cloud - -In this set of experiments we will run the Flux Operator on Amazon Cloud at size N=2 -(the benchmarks require this) and multiple machine types. - -## Pre-requisites - -You should first [install eksctrl](https://github.com/weaveworks/eksctl) and make sure you have access to an AWS cloud (e.g., -with credentials or similar in your environment). E.g.,: - -```bash -export AWS_ACCESS_KEY_ID=xxxxxxxxxxxxxxxxxxx -export AWS_SECRET_ACCESS_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx -export AWS_SESSION_TOKEN=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx -``` - -The last session token may not be required depending on your setup. -We assume you also have [kubectl](https://kubernetes.io/docs/tasks/tools/). - -### Setup SSH - -You'll need an ssh key for EKS. Here is how to generate it: - -```bash -ssh-keygen -# Ensure you enter the path to ~/.ssh/id_eks -``` - -This is used so you can ssh (connect) to your workers! 
- -### Cloud - -we will be using [Flux Cloud](https://github.com/converged-computing/flux-cloud) -to run the Operator on Google Cloud Kubernetes engine. - -```bash -$ pip install flux-cloud -``` - -Note that these experiments were run with version 0.1.0. -Ensure that aws is either your default cloud (the `default_cloud` in your settings.yml) -or you specify it with `--cloud` when you do run. - - -## Run Experiments - -Each experiment here is defined by the matrix and variables in [experiments.yaml](experiment.yaml) that is used to -populate a [minicluster-template.yaml](minicluster-template.yaml) and launch a Kubernetes cluster. -You can read the documentation for flux-cloud to understand the variables available. -This tutorial assumes you have flux-cloud installed and configured. See all unique Kubernetes clusters -we will run the jobs on: - -```bash -$ flux-cloud list -``` - -Then you can either run all at once: - -```bash -$ flux-cloud run --force-cluster -``` - -Or (for testing) to bring up just the first cluster and then manually apply: - -```bash -$ flux-cloud --debug up --cloud aws -$ flux-cloud --debug apply --cloud aws -$ flux-cloud --debug down --cloud aws -``` - -By default, results will be written to a [./data](data) directory, but you can customize this with `--outdir`. 
diff --git a/examples/aws-lammps/experiments.yaml b/examples/aws-lammps/experiments.yaml deleted file mode 100644 index e694254..0000000 --- a/examples/aws-lammps/experiments.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# matrix of experiments to run - machine types and sizes are required - -# These are mini runs intended for testing -matrix: - size: [8] - machine: ["m5.large"] - -# Flux Mini Cluster experiment attributes -minicluster: - name: lammps - namespace: flux-operator - size: [2, 4, 6, 8] - -# Each job can have a command and working directory -jobs: - lmp: - command: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite - repeats: 3 diff --git a/examples/aws-lammps/minicluster-template.yaml b/examples/aws-lammps/minicluster-template.yaml deleted file mode 100644 index 7591645..0000000 --- a/examples/aws-lammps/minicluster-template.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: flux-framework.org/v1alpha1 -kind: MiniCluster - -metadata: - name: {{ minicluster.name }} - namespace: {{ minicluster.namespace }} -spec: - # Number of pods to create for MiniCluster - size: {{ minicluster.size }} - - # This is a list because a pod can support multiple containers - containers: - - image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 - workingDir: /home/flux/examples/reaxff/HNS - command: {{ job.command }} diff --git a/examples/google/osu-benchmarks/README.md b/examples/google/osu-benchmarks/README.md new file mode 100644 index 0000000..0bd718c --- /dev/null +++ b/examples/google/osu-benchmarks/README.md @@ -0,0 +1,5 @@ +# OSU Benchmarks + +This example demonstrates how to set up an [experiments.yaml](experiments.yaml) +to run on Google Cloud. See the [Google Cloud tutorials](https://converged-computing.github.io/flux-cloud/tutorials/google.html) +for how to run this tutorial. 
diff --git a/examples/google/osu-benchmarks/experiments.yaml b/examples/google/osu-benchmarks/experiments.yaml new file mode 100644 index 0000000..14e6806 --- /dev/null +++ b/examples/google/osu-benchmarks/experiments.yaml @@ -0,0 +1,59 @@ +# matrix of experiments to run - machine types and sizes are required +# This can obviously be expanded to more sizes or machines, +matrix: + size: [2] + machine: ["n1-standard-1", "n1-standard-2"] + +# An example of shared container options +x-container-options: &options + flux_option_flags: "-ompi=openmpi@5" + working_dir: /opt/osu-benchmark/build.openmpi/libexec/osu-micro-benchmarks/mpi/one-sided + image: ghcr.io/rse-ops/osu-microbench:test + + # This will get passed during a flux submit + tasks: 2 + +# res = cli.submit(command=job["command"], num_tasks=2, cores_per_task=1, workdir=kwargs['workdir']) + +# Flux Mini Cluster experiment attributes +minicluster: + name: osu-benchmarks + namespace: flux-operator + + # provide credentials if you want to re-use a minicluster + flux_restful: + username: fluxuser + token: "123456" + + # osu benchmarks requires exactly 2 processes + tasks: 2 + +# Each job can have a command and working directory +jobs: + osu_get_latency: + command: './osu_get_latency' + <<: *options + osu_acc_latency: + command: './osu_acc_latency' + <<: *options + osu_fop_latency: + command: './osu_fop_latency' + <<: *options + osu_get_bw: + command: './osu_get_bw' + <<: *options + osu_put_bibw: + command: './osu_put_bibw' + <<: *options + osu_put_latency: + command: './osu_put_latency' + <<: *options + osu_cas_latency: + command: './osu_cas_latency' + <<: *options + osu_get_acc_latency: + command: './osu_get_acc_latency' + <<: *options + osu_put_bw: + command: './osu_put_bw' + <<: *options diff --git a/examples/minikube/basic/README.md b/examples/minikube/basic/README.md new file mode 100644 index 0000000..0a2a6aa --- /dev/null +++ b/examples/minikube/basic/README.md @@ -0,0 +1,3 @@ +# Up, Submit, Down + +See the 
[minikube tutorials](https://converged-computing.github.io/flux-cloud/tutorials/minikube.html) for how to run this tutorial. diff --git a/examples/up-submit-down/experiments.yaml b/examples/minikube/basic/experiments.yaml similarity index 64% rename from examples/up-submit-down/experiments.yaml rename to examples/minikube/basic/experiments.yaml index 880e652..ec0ce6a 100644 --- a/examples/up-submit-down/experiments.yaml +++ b/examples/minikube/basic/experiments.yaml @@ -1,7 +1,6 @@ # This is intended for MiniKube, so no machine needed matrix: size: [4] - machine: [n1-standard-1] # Flux Mini Cluster experiment attributes minicluster: @@ -10,22 +9,19 @@ minicluster: # Each of these sizes will be brought up and have commands run across it size: [2] -# Since we are creating a minicluster here to submit commands across -# on the same container, the container is required here. If you specify -# a size here, the image must be the same across sizes jobs: reaxc-hns: command: 'lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite' image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 repeats: 5 - workdir: /home/flux/examples/reaxff/HNS + working_dir: /home/flux/examples/reaxff/HNS sleep: command: 'sleep 5' image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 repeats: 5 - workdir: /home/flux/examples/reaxff/HNS + working_dir: /home/flux/examples/reaxff/HNS hello-world: command: 'echo hello world' image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 repeats: 5 - workdir: /home/flux/examples/reaxff/HNS + working_dir: /home/flux/examples/reaxff/HNS diff --git a/examples/minikube/logging/README.md b/examples/minikube/logging/README.md new file mode 100644 index 0000000..779e469 --- /dev/null +++ b/examples/minikube/logging/README.md @@ -0,0 +1,3 @@ +# Logging + +This experiments.yaml shows how to customize the MiniCluster logging. 
diff --git a/examples/minikube/logging/experiments.yaml b/examples/minikube/logging/experiments.yaml new file mode 100644 index 0000000..d457541 --- /dev/null +++ b/examples/minikube/logging/experiments.yaml @@ -0,0 +1,24 @@ +# This is intended for MiniKube, so no machine needed +matrix: + size: [4] + +# Flux Mini Cluster experiment attributes +minicluster: + name: lammps-job + namespace: flux-operator + # Each of these sizes will be brought up and have commands run across it + size: [2] + + # How to set logging attributes + logging: + debug: False # defaults to False + quiet: True # defaults to False + strict: False # defaults to True + timed: False # defaults to False, requires time in containers + +jobs: + reaxc-hns: + command: 'lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 2 + working_dir: /home/flux/examples/reaxff/HNS diff --git a/examples/minikube/osu-benchmarks/README.md b/examples/minikube/osu-benchmarks/README.md new file mode 100644 index 0000000..8b8ec6b --- /dev/null +++ b/examples/minikube/osu-benchmarks/README.md @@ -0,0 +1,5 @@ +# OSU Benchmarks + +This example demonstrates how to set up an [experiments.yaml](experiments.yaml) +to run on MiniKube. See the [MiniKube tutorials](https://converged-computing.github.io/flux-cloud/tutorials/minikube.html) +for how to run this tutorial. 
diff --git a/examples/minikube/osu-benchmarks/experiments.yaml b/examples/minikube/osu-benchmarks/experiments.yaml new file mode 100644 index 0000000..d278aa5 --- /dev/null +++ b/examples/minikube/osu-benchmarks/experiments.yaml @@ -0,0 +1,65 @@ +# matrix of experiments to run - machine types and sizes are required + +# This can obviously be expanded to more sizes or machines, +matrix: + size: [2] + #machine: ["n1-standard-1", "n1-standard-2"] + +# TODO +# when we get this working, save to experiments-full.yaml, move to minikube, have shortened version run for test +# then test this on google cloud +# flux operator / python api still need to be released - maybe only allow pam for auth? + +# An example of shared container options +x-container-options: &options + flux_option_flags: "-ompi=openmpi@5" + working_dir: /opt/osu-benchmark/build.openmpi/libexec/osu-micro-benchmarks/mpi/one-sided + image: ghcr.io/rse-ops/osu-microbench:test + + # This will get passed during a flux submit + tasks: 2 + +# res = cli.submit(command=job["command"], num_tasks=2, cores_per_task=1, workdir=kwargs['workdir']) + +# Flux Mini Cluster experiment attributes +minicluster: + name: osu-benchmarks + namespace: flux-operator + + # provide credentials if you want to re-use a minicluster + flux_restful: + username: fluxuser + token: "123456" + + # osu benchmarks requires exactly 2 processes + tasks: 2 + +# Each job can have a command and working directory +jobs: + osu_get_latency: + command: './osu_get_latency' + <<: *options + osu_acc_latency: + command: './osu_acc_latency' + <<: *options + osu_fop_latency: + command: './osu_fop_latency' + <<: *options + osu_get_bw: + command: './osu_get_bw' + <<: *options + osu_put_bibw: + command: './osu_put_bibw' + <<: *options + osu_put_latency: + command: './osu_put_latency' + <<: *options + osu_cas_latency: + command: './osu_cas_latency' + <<: *options + osu_get_acc_latency: + command: './osu_get_acc_latency' + <<: *options + osu_put_bw: + command: 
'./osu_put_bw' + <<: *options diff --git a/examples/minikube/persistent/README.md b/examples/minikube/persistent/README.md new file mode 100644 index 0000000..5636c77 --- /dev/null +++ b/examples/minikube/persistent/README.md @@ -0,0 +1,49 @@ +# Persistent + +This is a trick to get a MiniCluster up and running (and have it stay running)! + + - For **submit** we run a job that will never complete + - For **apply** we do the same! + +I typically use this case to debug one or the other. E.g., (given MiniKube is running with the operator installed): + +```bash +$ flux-cloud --debug submit --cloud minikube +``` + +Then get the pod + +```bash +$ kubectl get -n flux-operator pods +NAME READY STATUS RESTARTS AGE +sleep-job-0-pm28c 1/1 Running 0 73s +sleep-job-1-h824z 1/1 Running 0 73s +sleep-job-cert-generator 0/1 Completed 0 73s +``` + +And ssh in! + +```bash +$ kubectl exec -it -n flux-operator sleep-job-0-pm28c -- bash +``` + +For either submit or apply, we can connect to the instance with the broker URI + +```bash +$ export FLUX_URI=local:///run/flux/local +$ sudo -u flux flux proxy $FLUX_URI +``` +and then see our infinite flux job! + +```bash +$ flux jobs -a + JOBID USER NAME ST NTASKS NNODES TIME INFO + ƒCvGx8CX flux sleep R 1 1 2.432m sleep-job-1 +``` + +The main difference is that submit is going to periodically ping the restful API to check +on the job. So you are probably better off with apply in that it's almost the same +thing (a flux start -> flux submit instead of starting the flux broker) without +the poll. + +See the [minikube tutorials](https://converged-computing.github.io/flux-cloud/tutorials/minikube.html) for how to run this tutorial. 
diff --git a/examples/minikube/persistent/experiments.yaml b/examples/minikube/persistent/experiments.yaml new file mode 100644 index 0000000..aa6b649 --- /dev/null +++ b/examples/minikube/persistent/experiments.yaml @@ -0,0 +1,16 @@ +# This is intended for MiniKube, so no machine needed +matrix: + size: [4] + +# Flux Mini Cluster experiment attributes +minicluster: + name: sleep-job + namespace: flux-operator + # Each of these sizes will be brought up and have commands run across it + size: [2] + +# This will bring up a cluster to stay online (until you kill it) as the job will never end +jobs: + sleep: + command: 'sleep infinity' + image: ghcr.io/flux-framework/flux-restful-api:latest diff --git a/examples/minikube/resources/README.md b/examples/minikube/resources/README.md new file mode 100644 index 0000000..54eab30 --- /dev/null +++ b/examples/minikube/resources/README.md @@ -0,0 +1,3 @@ +# Resources + +This experiments.yaml shows how to customize MiniCluster resources. diff --git a/examples/minikube/resources/experiments.yaml b/examples/minikube/resources/experiments.yaml new file mode 100644 index 0000000..846ef81 --- /dev/null +++ b/examples/minikube/resources/experiments.yaml @@ -0,0 +1,25 @@ +# This is intended for MiniKube, so no machine needed +matrix: + size: [4] + +# Flux Mini Cluster experiment attributes +minicluster: + name: lammps-job + namespace: flux-operator + # Each of these sizes will be brought up and have commands run across it + size: [2] + +jobs: + reaxc-hns: + command: 'lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 2 + working_dir: /home/flux/examples/reaxff/HNS + + # Resources for the specific container job + resources: + limits: + cpu: 1 + + requests: + cpu: 1 diff --git a/examples/minikube/volumes/README.md b/examples/minikube/volumes/README.md new file mode 100644 index 0000000..918f351 --- /dev/null +++ b/examples/minikube/volumes/README.md @@ -0,0 +1,3 @@ +# 
Volumes + +This experiments.yaml shows how to customize MiniCluster volumes. diff --git a/examples/minikube/volumes/experiments.yaml b/examples/minikube/volumes/experiments.yaml new file mode 100644 index 0000000..c2af90e --- /dev/null +++ b/examples/minikube/volumes/experiments.yaml @@ -0,0 +1,31 @@ +# This is intended for MiniKube, so no machine needed +matrix: + size: [4] + +# Flux Mini Cluster experiment attributes +minicluster: + name: lammps-job + namespace: flux-operator + # Each of these sizes will be brought up and have commands run across it + size: [2] + + # How to create MiniCluster volumes - this is a volume named "data" + volumes: + data: + storageClass: hostpath + path: /tmp/data + labels: + type: "local" + + +jobs: + reaxc-hns: + command: 'lmp -v x 1 -v y 1 -v z 1 -in in.reaxc.hns -nocite' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 2 + working_dir: /home/flux/examples/reaxff/HNS + + # The volume named "data" above should be bound to "/data" + volumes: + data: + path: /data diff --git a/examples/osu-benchmarks/README.md b/examples/osu-benchmarks/README.md deleted file mode 100644 index 8fbd44c..0000000 --- a/examples/osu-benchmarks/README.md +++ /dev/null @@ -1,60 +0,0 @@ -# OSU Benchmarks on Google Kubernetes Engine - -In this set of experiments we will run the Flux Operator on Google Cloud at size N=2 -(the benchmarks require this) and multiple machine types. - -## Pre-requisites - -You should first [install gcloud](https://cloud.google.com/sdk/docs/quickstarts) -and ensure you are logged in and have kubectl installed: - -```bash -$ gcloud auth login -``` - -Depending on your install, you can either install with gcloud: - -```bash -$ gcloud components install kubectl -``` -or just [on your own](https://kubernetes.io/docs/tasks/tools/). 
- -## Run Experiments - -Each experiment here is defined by the matrix and variables in [experiments.yaml](experiment.yaml) that is used to -populate a [minicluster-template.yaml](minicluster-template.yaml) and launch a Kubernetes cluster. -You can read the documentation for flux-cloud to understand the variables available. -This tutorial assumes you have flux-cloud installed and configured. See all unique Kubernetes clusters -we will run the jobs on: - -```bash -$ flux-cloud list -``` - -Then you can either run all at once: - -```bash -$ flux-cloud run --force-cluster -``` - -Or (for testing) to bring up just the first cluster and then manually apply: - -```bash -$ flux-cloud up -$ flux-cloud apply -$ flux-cloud down -``` - -or do the same for a targeted Kubernetes cluster: - -```bash -$ flux-cloud up -e n1-standard-2-2 -$ flux-cloud apply -e n1-standard-2-2 -$ flux-cloud down -e n1-standard-2-2 -``` - - -The latter will either use a single experiment you've defined under `experiment` in your experiments.yaml file, -or select the first in your matrix (as we have here). - -By default, results will be written to a [./data](data) directory, but you can customize this with `--outdir`. 
diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/meta.json b/examples/osu-benchmarks/data/n1-standard-1-2/meta.json deleted file mode 100644 index 994b8cd..0000000 --- a/examples/osu-benchmarks/data/n1-standard-1-2/meta.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "size": 2, - "machine": "n1-standard-1", - "minicluster": { - "name": "osu-benchmarks", - "namespace": "flux-operator" - }, - "jobs": { - "osu_get_latency": { - "command": "./osu_get_latency" - }, - "osu_acc_latency": { - "command": "./osu_acc_latency" - }, - "osu_fop_latency": { - "command": "./osu_fop_latency" - }, - "osu_get_bw": { - "command": "./osu_get_bw" - }, - "osu_put_bibw": { - "command": "./osu_put_bibw" - }, - "osu_put_latency": { - "command": "./osu_put_latency" - }, - "osu_cas_latency": { - "command": "./osu_cas_latency" - }, - "osu_get_acc_latency": { - "command": "./osu_get_acc_latency" - }, - "osu_put_bw": { - "command": "./osu_put_bw" - } - }, - "id": "n1-standard-1-2", - "times": { - "create-cluster": 356.4845640659332, - "minicluster-run-osu_get_latency": 538.4266033172607, - "minicluster-run-osu_acc_latency": 346.2248685359955, - "minicluster-run-osu_fop_latency": 30.376757621765137, - "minicluster-run-osu_get_bw": 69.91457080841064, - "minicluster-run-osu_put_bibw": 121.5233302116394, - "minicluster-run-osu_put_latency": 347.232608795166, - "minicluster-run-osu_cas_latency": 30.295669078826904, - "minicluster-run-osu_get_acc_latency": 675.3228597640991, - "minicluster-run-osu_put_bw": 65.65373682975769 - } -} diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_acc_latency/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_acc_latency/log.out deleted file mode 100644 index 33df75e..0000000 --- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_acc_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Accumulate latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -1 2026.56 -2 1971.25 -4 1969.97 -8 
2033.46 -16 1975.18 -32 2007.49 -64 1958.49 -128 2003.40 -256 2009.72 -512 1974.10 -1024 2027.20 -2048 2040.70 -4096 1958.00 -8192 2026.39 -16384 1962.29 -32768 2014.61 -65536 3992.00 -131072 4587.00 -262144 4074.00 -524288 4244.08 -1048576 4722.99 -2097152 9259.00 -4194304 18870.00 diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_cas_latency/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_cas_latency/log.out deleted file mode 100644 index 13c2d88..0000000 --- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_cas_latency/log.out +++ /dev/null @@ -1,5 +0,0 @@ -# OSU MPI_Compare_and_swap latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -8 2040.58 diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_fop_latency/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_fop_latency/log.out deleted file mode 100644 index fb575bb..0000000 --- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_fop_latency/log.out +++ /dev/null @@ -1,5 +0,0 @@ -# OSU MPI_Fetch_and_op latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -8 2025.38 diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_acc_latency/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_acc_latency/log.out deleted file mode 100644 index 89c06a4..0000000 --- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_acc_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Get_accumulate latency Test v5.8 -# Window creation: MPI_Win_create -# Synchronization: MPI_Win_lock/unlock -# Size Latency (us) -1 4028.65 -2 4036.95 -4 3977.30 -8 3959.60 -16 3999.67 -32 3974.93 -64 3965.61 -128 3921.54 -256 4020.49 -512 3987.41 -1024 3950.50 -2048 4023.82 -4096 4024.50 -8192 4032.61 -16384 4321.01 -32768 4077.98 -65536 6086.01 -131072 6358.00 -262144 6235.36 -524288 7140.15 -1048576 9408.58 -2097152 18535.45 -4194304 36929.51 diff --git 
a/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_bw/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_bw/log.out deleted file mode 100644 index 498e3af..0000000 --- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_bw/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Get Bandwidth Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Bandwidth (MB/s) -1 0.02 -2 0.04 -4 0.08 -8 0.17 -16 0.32 -32 0.66 -64 1.14 -128 2.58 -256 6.05 -512 9.96 -1024 19.80 -2048 35.15 -4096 64.85 -8192 126.64 -16384 174.69 -32768 205.94 -65536 220.74 -131072 220.53 -262144 173.80 -524288 227.82 -1048576 215.52 -2097152 226.24 -4194304 219.46 diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_latency/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_latency/log.out deleted file mode 100644 index 4bd281e..0000000 --- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_get_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Get latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -1 2129.50 -2 2057.45 -4 2013.89 -8 2015.55 -16 1979.00 -32 2024.10 -64 1983.17 -128 2008.34 -256 2023.70 -512 2008.37 -1024 2057.49 -2048 2030.20 -4096 2039.00 -8192 2027.52 -16384 1879.26 -32768 2086.65 -65536 3961.84 -131072 4195.01 -262144 4327.77 -524288 4295.87 -1048576 5230.83 -2097152 9040.55 -4194304 18364.76 diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_bibw/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_bibw/log.out deleted file mode 100644 index c6d659e..0000000 --- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_bibw/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Put Bi-directional Bandwidth Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_post/start/complete/wait -# Size Bandwidth (MB/s) -1 0.02 -2 0.13 -4 0.26 -8 0.43 -16 1.06 -32 2.27 -64 3.77 -128 9.69 -256 15.68 -512 28.37 -1024 
58.54 -2048 105.42 -4096 119.90 -8192 147.82 -16384 151.82 -32768 212.67 -65536 220.28 -131072 221.41 -262144 222.46 -524288 223.21 -1048576 207.12 -2097152 223.48 -4194304 223.16 diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_bw/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_bw/log.out deleted file mode 100644 index 5b8c58a..0000000 --- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_bw/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Put Bandwidth Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Bandwidth (MB/s) -1 0.04 -2 0.07 -4 0.12 -8 0.25 -16 0.53 -32 1.10 -64 2.32 -128 3.69 -256 9.53 -512 17.77 -1024 28.00 -2048 56.37 -4096 67.47 -8192 93.29 -16384 147.11 -32768 222.45 -65536 205.60 -131072 227.43 -262144 232.69 -524288 229.48 -1048576 216.91 -2097152 219.29 -4194304 223.36 diff --git a/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_latency/log.out b/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_latency/log.out deleted file mode 100644 index 4bd184c..0000000 --- a/examples/osu-benchmarks/data/n1-standard-1-2/osu_put_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Put Latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -1 1907.18 -2 2021.62 -4 1932.61 -8 1984.30 -16 2022.26 -32 1931.50 -64 2016.32 -128 2010.00 -256 1979.04 -512 1993.74 -1024 1990.06 -2048 1982.00 -4096 1983.60 -8192 2014.80 -16384 2079.00 -32768 1999.49 -65536 4068.88 -131072 3994.00 -262144 4146.00 -524288 4276.83 -1048576 5456.03 -2097152 9407.04 -4194304 19134.00 diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/meta.json b/examples/osu-benchmarks/data/n1-standard-2-2/meta.json deleted file mode 100644 index 431de1e..0000000 --- a/examples/osu-benchmarks/data/n1-standard-2-2/meta.json +++ /dev/null @@ -1,51 +0,0 @@ -{ - "size": 2, - "machine": "n1-standard-2", - "minicluster": { - "name": "osu-benchmarks", - 
"namespace": "flux-operator" - }, - "jobs": { - "osu_get_latency": { - "command": "./osu_get_latency" - }, - "osu_acc_latency": { - "command": "./osu_acc_latency" - }, - "osu_fop_latency": { - "command": "./osu_fop_latency" - }, - "osu_get_bw": { - "command": "./osu_get_bw" - }, - "osu_put_bibw": { - "command": "./osu_put_bibw" - }, - "osu_put_latency": { - "command": "./osu_put_latency" - }, - "osu_cas_latency": { - "command": "./osu_cas_latency" - }, - "osu_get_acc_latency": { - "command": "./osu_get_acc_latency" - }, - "osu_put_bw": { - "command": "./osu_put_bw" - } - }, - "id": "n1-standard-2-2", - "times": { - "create-cluster": 1367.3097712993622, - "destroy-cluster": 2073.518306493759, - "minicluster-run-osu_get_latency": 437.91792845726013, - "minicluster-run-osu_acc_latency": 38.31566119194031, - "minicluster-run-osu_fop_latency": 10.17687702178955, - "minicluster-run-osu_get_bw": 150.1252703666687, - "minicluster-run-osu_put_bibw": 38.277549743652344, - "minicluster-run-osu_put_latency": 36.958292961120605, - "minicluster-run-osu_cas_latency": 8.383898735046387, - "minicluster-run-osu_get_acc_latency": 64.05685710906982, - "minicluster-run-osu_put_bw": 19.466553211212158 - } -} diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_acc_latency/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_acc_latency/log.out deleted file mode 100644 index bfb53a4..0000000 --- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_acc_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Accumulate latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -1 141.11 -2 131.90 -4 123.68 -8 117.55 -16 121.55 -32 127.79 -64 114.15 -128 126.75 -256 131.81 -512 118.65 -1024 125.17 -2048 143.56 -4096 142.86 -8192 157.31 -16384 181.50 -32768 199.33 -65536 453.33 -131072 453.50 -262144 560.16 -524288 771.15 -1048576 1167.18 -2097152 1929.84 -4194304 4272.84 diff --git 
a/examples/osu-benchmarks/data/n1-standard-2-2/osu_cas_latency/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_cas_latency/log.out deleted file mode 100644 index ae44363..0000000 --- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_cas_latency/log.out +++ /dev/null @@ -1,5 +0,0 @@ -# OSU MPI_Compare_and_swap latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -8 163.29 diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_fop_latency/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_fop_latency/log.out deleted file mode 100644 index 6ec7d3c..0000000 --- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_fop_latency/log.out +++ /dev/null @@ -1,5 +0,0 @@ -# OSU MPI_Fetch_and_op latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -8 145.01 diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_acc_latency/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_acc_latency/log.out deleted file mode 100644 index 536ce73..0000000 --- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_acc_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Get_accumulate latency Test v5.8 -# Window creation: MPI_Win_create -# Synchronization: MPI_Win_lock/unlock -# Size Latency (us) -1 287.65 -2 286.34 -4 266.98 -8 297.64 -16 283.17 -32 282.60 -64 263.15 -128 282.87 -256 330.35 -512 295.81 -1024 292.13 -2048 311.83 -4096 323.84 -8192 329.55 -16384 319.65 -32768 341.19 -65536 589.78 -131072 712.14 -262144 887.74 -524288 1315.61 -1048576 2054.79 -2097152 3533.33 -4194304 5818.77 diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_bw/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_bw/log.out deleted file mode 100644 index 142ad29..0000000 --- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_bw/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Get Bandwidth Test v5.8 -# Window creation: 
MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Bandwidth (MB/s) -1 0.06 -2 0.13 -4 0.27 -8 0.54 -16 1.02 -32 1.95 -64 3.75 -128 8.50 -256 16.98 -512 30.28 -1024 64.48 -2048 115.08 -4096 245.00 -8192 464.01 -16384 585.10 -32768 754.99 -65536 828.35 -131072 890.66 -262144 1042.80 -524288 955.92 -1048576 1142.67 -2097152 1169.05 -4194304 1172.25 diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_latency/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_latency/log.out deleted file mode 100644 index d226e0d..0000000 --- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_get_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Get latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -1 156.10 -2 146.43 -4 135.73 -8 141.91 -16 151.82 -32 151.16 -64 154.75 -128 149.68 -256 149.83 -512 147.85 -1024 141.86 -2048 153.64 -4096 152.21 -8192 165.91 -16384 204.98 -32768 196.08 -65536 343.46 -131072 452.29 -262144 519.62 -524288 1094.88 -1048576 1724.75 -2097152 1880.69 -4194304 4034.85 diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_bibw/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_bibw/log.out deleted file mode 100644 index 8882672..0000000 --- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_bibw/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Put Bi-directional Bandwidth Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_post/start/complete/wait -# Size Bandwidth (MB/s) -1 0.85 -2 1.81 -4 3.81 -8 7.37 -16 14.24 -32 24.82 -64 47.65 -128 87.96 -256 161.55 -512 262.90 -1024 355.17 -2048 464.07 -4096 407.02 -8192 406.54 -16384 789.37 -32768 1210.71 -65536 915.45 -131072 835.05 -262144 600.15 -524288 762.38 -1048576 747.83 -2097152 1065.77 -4194304 873.67 diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_bw/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_bw/log.out deleted file mode 
100644 index a731da2..0000000 --- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_bw/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Put Bandwidth Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Bandwidth (MB/s) -1 0.42 -2 0.72 -4 1.49 -8 2.95 -16 5.55 -32 11.92 -64 22.03 -128 40.32 -256 79.76 -512 139.34 -1024 181.55 -2048 283.18 -4096 207.81 -8192 242.69 -16384 534.98 -32768 611.93 -65536 705.35 -131072 899.41 -262144 1065.81 -524288 1192.52 -1048576 989.94 -2097152 1189.93 -4194304 1089.70 diff --git a/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_latency/log.out b/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_latency/log.out deleted file mode 100644 index 7a0c2e5..0000000 --- a/examples/osu-benchmarks/data/n1-standard-2-2/osu_put_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Put Latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -1 121.19 -2 128.06 -4 130.09 -8 126.55 -16 121.53 -32 124.01 -64 124.88 -128 134.59 -256 131.45 -512 134.15 -1024 138.92 -2048 144.37 -4096 135.79 -8192 176.68 -16384 171.74 -32768 207.59 -65536 382.16 -131072 447.84 -262144 573.51 -524288 686.69 -1048576 972.05 -2097152 1942.87 -4194304 3791.28 diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/meta.json b/examples/osu-benchmarks/data/n1-standard-4-2/meta.json deleted file mode 100644 index 9fd47b7..0000000 --- a/examples/osu-benchmarks/data/n1-standard-4-2/meta.json +++ /dev/null @@ -1,49 +0,0 @@ -{ - "size": 2, - "machine": "n1-standard-4", - "minicluster": { - "name": "osu-benchmarks", - "namespace": "flux-operator" - }, - "jobs": { - "osu_get_latency": { - "command": "./osu_get_latency" - }, - "osu_acc_latency": { - "command": "./osu_acc_latency" - }, - "osu_fop_latency": { - "command": "./osu_fop_latency" - }, - "osu_get_bw": { - "command": "./osu_get_bw" - }, - "osu_put_bibw": { - "command": "./osu_put_bibw" - }, - "osu_put_latency": { - 
"command": "./osu_put_latency" - }, - "osu_cas_latency": { - "command": "./osu_cas_latency" - }, - "osu_get_acc_latency": { - "command": "./osu_get_acc_latency" - }, - "osu_put_bw": { - "command": "./osu_put_bw" - } - }, - "id": "n1-standard-4-2", - "times": { - "minicluster-run-osu_get_latency": 277.4993796348572, - "minicluster-run-osu_acc_latency": 32.00839829444885, - "minicluster-run-osu_fop_latency": 137.7638008594513, - "minicluster-run-osu_get_bw": 149.44539713859558, - "minicluster-run-osu_put_bibw": 33.21780180931091, - "minicluster-run-osu_put_latency": 31.217578411102295, - "minicluster-run-osu_cas_latency": 138.34734511375427, - "minicluster-run-osu_get_acc_latency": 175.93916821479797, - "minicluster-run-osu_put_bw": 17.256979942321777 - } -} diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_acc_latency/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_acc_latency/log.out deleted file mode 100644 index dc9e63d..0000000 --- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_acc_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Accumulate latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -1 114.54 -2 89.25 -4 109.61 -8 120.59 -16 115.39 -32 115.15 -64 115.98 -128 117.26 -256 112.32 -512 113.18 -1024 116.00 -2048 123.82 -4096 122.91 -8192 131.24 -16384 151.30 -32768 166.14 -65536 313.66 -131072 359.74 -262144 444.67 -524288 648.49 -1048576 976.66 -2097152 1724.47 -4194304 3490.93 diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_cas_latency/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_cas_latency/log.out deleted file mode 100644 index df10d69..0000000 --- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_cas_latency/log.out +++ /dev/null @@ -1,5 +0,0 @@ -# OSU MPI_Compare_and_swap latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -8 115.93 diff --git 
a/examples/osu-benchmarks/data/n1-standard-4-2/osu_fop_latency/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_fop_latency/log.out deleted file mode 100644 index 37a557b..0000000 --- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_fop_latency/log.out +++ /dev/null @@ -1,5 +0,0 @@ -# OSU MPI_Fetch_and_op latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -8 105.31 diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_acc_latency/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_acc_latency/log.out deleted file mode 100644 index b49b29b..0000000 --- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_acc_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Get_accumulate latency Test v5.8 -# Window creation: MPI_Win_create -# Synchronization: MPI_Win_lock/unlock -# Size Latency (us) -1 187.83 -2 185.87 -4 187.54 -8 187.00 -16 189.64 -32 187.64 -64 187.10 -128 189.27 -256 195.68 -512 190.57 -1024 194.59 -2048 205.99 -4096 216.30 -8192 205.38 -16384 220.19 -32768 250.27 -65536 472.44 -131072 552.09 -262144 698.95 -524288 872.33 -1048576 1280.24 -2097152 2214.98 -4194304 3719.21 diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_bw/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_bw/log.out deleted file mode 100644 index 66eaef6..0000000 --- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_bw/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Get Bandwidth Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Bandwidth (MB/s) -1 0.09 -2 0.20 -4 0.39 -8 0.79 -16 1.57 -32 3.01 -64 5.50 -128 12.36 -256 25.06 -512 48.87 -1024 96.36 -2048 187.58 -4096 364.00 -8192 657.03 -16384 1121.71 -32768 880.91 -65536 1266.43 -131072 1237.42 -262144 1222.58 -524288 1220.72 -1048576 1217.06 -2097152 1214.67 -4194304 1213.09 diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_latency/log.out 
b/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_latency/log.out deleted file mode 100644 index 54baa05..0000000 --- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_get_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Get latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -1 105.67 -2 106.76 -4 105.98 -8 104.79 -16 108.18 -32 104.17 -64 107.93 -128 104.04 -256 104.24 -512 100.87 -1024 106.00 -2048 106.41 -4096 107.10 -8192 116.07 -16384 121.87 -32768 153.42 -65536 287.48 -131072 304.28 -262144 394.81 -524288 542.64 -1048576 850.54 -2097152 1754.19 -4194304 4854.39 diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_bibw/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_bibw/log.out deleted file mode 100644 index 0e481f7..0000000 --- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_bibw/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Put Bi-directional Bandwidth Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_post/start/complete/wait -# Size Bandwidth (MB/s) -1 0.92 -2 1.89 -4 3.58 -8 7.28 -16 13.10 -32 26.02 -64 52.44 -128 96.39 -256 165.92 -512 295.72 -1024 426.83 -2048 511.34 -4096 424.07 -8192 457.66 -16384 881.99 -32768 1144.43 -65536 909.92 -131072 672.95 -262144 632.42 -524288 546.17 -1048576 683.42 -2097152 1031.23 -4194304 1128.50 diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_bw/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_bw/log.out deleted file mode 100644 index 14fccbf..0000000 --- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_bw/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Put Bandwidth Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Bandwidth (MB/s) -1 0.51 -2 1.02 -4 2.14 -8 4.00 -16 7.66 -32 16.11 -64 29.96 -128 53.30 -256 104.51 -512 164.96 -1024 213.10 -2048 271.31 -4096 257.33 -8192 271.38 -16384 387.78 -32768 684.16 
-65536 671.73 -131072 871.74 -262144 1241.50 -524288 1226.63 -1048576 1220.23 -2097152 1169.76 -4194304 1166.01 diff --git a/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_latency/log.out b/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_latency/log.out deleted file mode 100644 index 2ea21e2..0000000 --- a/examples/osu-benchmarks/data/n1-standard-4-2/osu_put_latency/log.out +++ /dev/null @@ -1,27 +0,0 @@ -# OSU MPI_Put Latency Test v5.8 -# Window creation: MPI_Win_allocate -# Synchronization: MPI_Win_flush -# Size Latency (us) -1 101.02 -2 109.73 -4 109.01 -8 108.02 -16 111.83 -32 108.72 -64 109.09 -128 115.13 -256 112.40 -512 114.17 -1024 115.96 -2048 119.93 -4096 123.87 -8192 139.41 -16384 152.70 -32768 183.49 -65536 328.75 -131072 387.64 -262144 452.87 -524288 594.75 -1048576 871.06 -2097152 1714.35 -4194304 3572.91 diff --git a/examples/osu-benchmarks/experiments.yaml b/examples/osu-benchmarks/experiments.yaml deleted file mode 100644 index 2bc569c..0000000 --- a/examples/osu-benchmarks/experiments.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# matrix of experiments to run - machine types and sizes are required - -# This can obviously be expanded to more sizes or machines, -matrix: - size: [2] - machine: ["n1-standard-1", "n1-standard-2", "n1-standard-4"] - -# Flux Mini Cluster experiment attributes -minicluster: - name: osu-benchmarks - namespace: flux-operator - -# Each job can have a command and working directory -jobs: - osu_get_latency: - command: './osu_get_latency' - osu_acc_latency: - command: './osu_acc_latency' - osu_fop_latency: - command: './osu_fop_latency' - osu_get_bw: - command: './osu_get_bw' - osu_put_bibw: - command: './osu_put_bibw' - osu_put_latency: - command: './osu_put_latency' - osu_cas_latency: - command: './osu_cas_latency' - osu_get_acc_latency: - command: './osu_get_acc_latency' - osu_put_bw: - command: './osu_put_bw' diff --git a/examples/osu-benchmarks/minicluster-template.yaml 
b/examples/osu-benchmarks/minicluster-template.yaml deleted file mode 100644 index 8004cc8..0000000 --- a/examples/osu-benchmarks/minicluster-template.yaml +++ /dev/null @@ -1,31 +0,0 @@ -apiVersion: flux-framework.org/v1alpha1 -kind: MiniCluster -metadata: - name: {{ minicluster.name }} - namespace: {{ minicluster.namespace }} -spec: - - # Number of pods to create for MiniCluster - size: {{ size }} - - # Disable verbose output - logging: - quiet: true - - # This is a list because a pod can support multiple containers - containers: - # The container URI to pull (currently needs to be public) - - image: ghcr.io/rse-ops/osu-microbench:app-latest - - # Option Flags for this flux runner wait.sh entrypoint - fluxOptionFlags: "-ompi=openmpi@5" - - # custom preCommand logic (run at start of script) - commands: - pre: | - source /etc/profile.d/z10_spack_environment.sh - asFlux="sudo -u flux -E PYTHONPATH=$PYTHONPATH" - - # All osu-benchmark experiments share the same working directory - workingDir: /opt/osu-benchmark/build.openmpi/libexec/osu-micro-benchmarks/mpi/one-sided - command: {{ job.command }} diff --git a/examples/up-apply-down/README.md b/examples/up-apply-down/README.md deleted file mode 100644 index e4cfc6c..0000000 --- a/examples/up-apply-down/README.md +++ /dev/null @@ -1,37 +0,0 @@ -# Up and Down - -This is an example of using flux cloud to bring up a cluster, install the Flux Operator -(and then you would use it as you please) and then bring it down. -You should have kubectl and gcloud installed for this demo. Note that -we use the [experiments.yaml](experiments.yaml) file as a default, -and we only provide basic metadata needed for a single experiment. - -## Up - -```bash -$ flux-cloud up -``` - -This will bring up your cluster, per the size and machine type defined -in your experiments file, and install the operator. - -## Apply - -An "apply" means running the single (or multiple) experiments defined in your -experiments.yaml. 
While these don't need to be in the same file, for simplicity -we have also defined our experiment metadata and template (provided at [minicluster-template.yaml](minicluster-template.yaml)) -in this directory. For this application we will run a simple llamps application. - -```bash -$ flux-cloud apply -``` - -Note that apply will work for a single experiment OR a matrix, so be careful! - -## Down - -To bring it down: - -```bash -$ flux-cloud down -``` diff --git a/examples/up-apply-down/data/meta.json b/examples/up-apply-down/data/meta.json deleted file mode 100644 index be7b873..0000000 --- a/examples/up-apply-down/data/meta.json +++ /dev/null @@ -1,19 +0,0 @@ -[ - { - "size": 2, - "machine": "n1-standard-1", - "minicluster": { - "name": "lammps-job", - "namespace": "flux-operator" - }, - "jobs": { - "reaxc-hns": { - "command": "lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite" - } - }, - "times": { - "minicluster-run-reaxc-hns": 198.465562582016, - "create-cluster": 367.33847880363464 - } - } -] diff --git a/examples/up-apply-down/data/n1-standard-1-2/reaxc-hns/log.out b/examples/up-apply-down/data/n1-standard-1-2/reaxc-hns/log.out deleted file mode 100644 index 71de171..0000000 --- a/examples/up-apply-down/data/n1-standard-1-2/reaxc-hns/log.out +++ /dev/null @@ -1,80 +0,0 @@ -LAMMPS (29 Sep 2021 - Update 2) -OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98) - using 1 OpenMP thread(s) per MPI task -Reading data file ... - triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000) - 2 by 1 by 1 MPI processor grid - reading atoms ... - 304 atoms - reading velocities ... - 304 velocities - read_data CPU = 0.029 seconds -Replicating atoms ... 
- triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000) - 2 by 1 by 1 MPI processor grid - bounding box image = (0 -1 -1) to (0 1 1) - bounding box extra memory = 0.03 MB - average # of replicas added to proc = 5.00 out of 8 (62.50%) - 2432 atoms - replicate CPU = 0.002 seconds -Neighbor list info ... - update every 20 steps, delay 0 steps, check no - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 11 - ghost atom cutoff = 11 - binsize = 5.5, bins = 10 5 6 - 2 neighbor lists, perpetual/occasional/extra = 2 0 0 - (1) pair reax/c, perpetual - attributes: half, newton off, ghost - pair build: half/bin/newtoff/ghost - stencil: full/ghost/bin/3d - bin: standard - (2) fix qeq/reax, perpetual, copy from (1) - attributes: half, newton off, ghost - pair build: copy - stencil: none - bin: none -Setting up Verlet run ... - Unit style : real - Current step : 0 - Time step : 0.1 -Per MPI rank memory allocation (min/avg/max) = 143.9 | 143.9 | 143.9 Mbytes -Step Temp PotEng Press E_vdwl E_coul Volume - 0 300 -113.27833 437.52118 -111.57687 -1.7014647 27418.867 - 10 299.38517 -113.27631 1439.2824 -111.57492 -1.7013813 27418.867 - 20 300.27107 -113.27884 3764.342 -111.57762 -1.7012247 27418.867 - 30 302.21063 -113.28428 7007.6629 -111.58335 -1.7009363 27418.867 - 40 303.52265 -113.28799 9844.8245 -111.58747 -1.7005186 27418.867 - 50 301.87059 -113.28324 9663.0973 -111.58318 -1.7000523 27418.867 - 60 296.67807 -113.26777 7273.8119 -111.56815 -1.6996137 27418.867 - 70 292.19999 -113.25435 5533.5522 -111.55514 -1.6992158 27418.867 - 80 293.58677 -113.25831 5993.4438 -111.55946 -1.6988533 27418.867 - 90 300.62635 -113.27925 7202.8369 -111.58069 -1.6985592 27418.867 - 100 305.38276 -113.29357 10085.805 -111.59518 -1.6983874 27418.867 -Loop time of 20.075 on 2 procs for 100 steps with 2432 atoms - -Performance: 0.043 ns/day, 557.640 hours/ns, 4.981 timesteps/s -84.6% CPU use with 2 MPI 
tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 12.399 | 13.154 | 13.91 | 20.8 | 65.53 -Neigh | 0.40351 | 0.40416 | 0.4048 | 0.1 | 2.01 -Comm | 0.33357 | 1.0872 | 1.8408 | 72.3 | 5.42 -Output | 0.004412 | 0.0045916 | 0.0047713 | 0.3 | 0.02 -Modify | 5.4218 | 5.4219 | 5.422 | 0.0 | 27.01 -Other | | 0.002887 | | | 0.01 - -Nlocal: 1216.00 ave 1216 max 1216 min -Histogram: 2 0 0 0 0 0 0 0 0 0 -Nghost: 7591.50 ave 7597 max 7586 min -Histogram: 1 0 0 0 0 0 0 0 0 1 -Neighs: 432912.0 ave 432942 max 432882 min -Histogram: 1 0 0 0 0 0 0 0 0 1 - -Total # of neighbors = 865824 -Ave neighs/atom = 356.01316 -Neighbor list builds = 5 -Dangerous builds not checked -Total wall time: 0:00:20 diff --git a/examples/up-apply-down/experiments.yaml b/examples/up-apply-down/experiments.yaml deleted file mode 100644 index ddcfdac..0000000 --- a/examples/up-apply-down/experiments.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Minimum required to bring up a cluster -experiment: - size: 2 - machine: n1-standard-1 - -# Flux Mini Cluster experiment attributes -minicluster: - name: lammps-job - namespace: flux-operator - -# If your jobs share the same variables you can just put them in the template directly! 
-jobs: - reaxc-hns: - command: 'lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite' diff --git a/examples/up-apply-down/minicluster-template.yaml b/examples/up-apply-down/minicluster-template.yaml deleted file mode 100644 index 6b34bdd..0000000 --- a/examples/up-apply-down/minicluster-template.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: flux-framework.org/v1alpha1 -kind: MiniCluster -metadata: - name: {{ minicluster.name }} - namespace: {{ minicluster.namespace }} -spec: - # Number of pods to create for MiniCluster - size: {{ size }} - - # Disable verbose output - logging: - quiet: true - - # This is a list because a pod can support multiple containers - containers: - # The container URI to pull (currently needs to be public) - - image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 - - # You can set the working directory if your container WORKDIR is not correct. - workingDir: /home/flux/examples/reaxff/HNS - command: {{ job.command }} - - # This only matters if test is false - fluxLogLevel: 7 diff --git a/examples/up-submit-down/README.md b/examples/up-submit-down/README.md deleted file mode 100644 index 818e95d..0000000 --- a/examples/up-submit-down/README.md +++ /dev/null @@ -1,72 +0,0 @@ -``# Up, Submit, Down - -This is an example of using flux cloud to bring up a cluster, install the Flux Operator -(and then you would use it as you please) and run jobs with submit (on the same -MiniCluster) and then bring it down. -You should have kubectl and gcloud OR minikube installed for this demo. Note that -we use the [experiments.yaml](experiments.yaml) file as a default, -and we only provide basic metadata needed for a single experiment. - -## Up - -```bash -$ flux-cloud up -``` - -This will bring up your cluster, per the size and machine type defined -in your experiments file, and install the operator. 
- -## Submit - -A "submit" means running the single (or multiple) experiments defined in your -experiments.yaml on the same MiniCluster, without bringing it down between jobs. -This means we are using Flux as the scheduler proper, and we don't need to bring pods -up and down unecessarily (and submit a gazillion YAML files). There is only the number -of YAML CRD needed to correspond to the sizes of MiniClusters you run across. - -```bash -$ flux-cloud submit --cloud minikube -$ flux-cloud submit --cloud google -``` - -## Down - -To bring it down: - -```bash -$ flux-cloud down -``` - -## Batch - -Run all three with one command: - -```bash -$ flux-cloud batch --cloud minikube -$ flux-cloud batch --cloud google -``` - -## UI - -If you want to just bring up the cluster and open the user interface to interact with: - -```bash -$ flux-cloud up --cloud minikube -$ flux-cloud ui --cloud minikube -$ flux-cloud down --cloud minikube -``` - - -## Plot - -I threw together a script to compare running times with info and output times, -where: - -running time < info < output - -```bash -$ pip install pandas matplotlib seaborn -``` -```bash -$ python plot_results.py data/k8s-size-4-n1-standard-1/meta.json -``` diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/broker-id.sh b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/broker-id.sh deleted file mode 100755 index c6fb8e0..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/broker-id.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -NAMESPACE="flux-operator" -JOB="lammps-job" -brokerPrefix="${JOB}-0" - -for pod in $(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=jsonpath='{.items[*].metadata.name}'); do - if [[ "${pod}" == ${brokerPrefix}* ]]; then - echo ${pod} - break - fi -done diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-create.sh 
b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-create.sh deleted file mode 100755 index bdace99..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-create.sh +++ /dev/null @@ -1,204 +0,0 @@ -#!/bin/bash - -# Source shared helper scripts -# Colors -red='\033[0;31m' -green='\033[0;32m' -yellow='\033[0;33m' -blue='\033[0;34m' -magenta='\033[0;35m' -cyan='\033[0;36m' -clear='\033[0m' - -function print_red() { - echo -e "${red}$@${clear}" -} -function print_yellow() { - echo -e "${yellow}$@${clear}" -} -function print_green() { - echo -e "${green}$@${clear}" -} -function print_blue() { - echo -e "${blue}$@${clear}" -} -function print_magenta() { - echo -e "${magenta}$@${clear}" -} -function print_cyan() { - echo -e "${cyan}$@${clear}" -} - -function is_installed () { - # Determine if a command is available for use! - cmd="${1}" - if command -v $cmd >/dev/null; then - echo "$cmd is installed" - else - echo "$cmd could not be found" - exit 1 - fi -} - -function install_operator() { - # Shared function to install the operator from a specific repository branch and cleanup - script_dir=${1} - repository=${2} - branch=${3} - tmpfile="${script_dir}/flux-operator.yaml" - run_echo wget -O $tmpfile https://raw.githubusercontent.com/${repository}/${branch}/examples/dist/flux-operator.yaml - kubectl apply -f $tmpfile -} - - -function run_echo() { - # Show the user the command then run it - echo - print_green "$@" - retry $@ -} - -function run_echo_allow_fail() { - echo - print_green "$@" - $@ || true -} - -function retry() { - # Retry an unsuccessful user command, per request - while true - do - $@ - retval=$? - if [[ "${retval}" == "0" ]]; then - return - fi - print_blue "That command was not successful. Do you want to try again? 🤔️" - read -p " (yes/no) " answer - # Exit with non-zero response so we know to stop in script. 
- case ${answer} in - yes ) continue;; - no ) echo exiting...; - exit 1;; - * ) echo invalid response; - exit 1;; - esac - done -} - - -function prompt() { - # Prompt the user with a yes/no command to continue or exit - print_blue "$@ 🤔️" - read -p " (yes/no) " answer - case ${answer} in - yes ) echo ok, we will proceed;; - no ) echo exiting...; - exit 1;; - * ) echo invalid response; - exit 1;; - esac -} - - -function with_exponential_backoff { - # Run with exponential backoff - assume containers take a while to pull - local max_attempts=100 - local timeout=1 - local attempt=0 - local exitcode=0 - - while [[ $attempt < $max_attempts ]]; do - "$@" - exitcode=$? - - if [[ $exitcode == 0 ]]; then - break - fi - - echo "Failure! Retrying in $timeout.." 1>&2 - sleep $timeout - attempt=$(( attempt + 1 )) - timeout=$(( timeout * 2 )) - done - - if [[ $exitCode != 0 ]]; then - echo "You've failed me for the last time! ($@)" 1>&2 - fi - return $exitcode -} - -# Defaults - these are in the config but left here for information -CLUSTER_NAME="flux-cluster" -ZONE="us-central1-a" -CLUSTER_VERSION="1.23" -MACHINE_TYPE="n1-standard-1" -FORCE_CLUSTER="false" -SIZE=4 -TAGS="flux-cluster" -REPOSITORY="flux-framework/flux-operator" -BRANCH="main" -GOOGLE_PROJECT="dinodev" -SCRIPT_DIR="/home/vanessa/Desktop/Code/flux/flux-cloud/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts" - -# Required arguments -if [ -z ${GOOGLE_PROJECT+x} ]; then - echo "Missing Google Project template variable as GOOGLE_PROJECT"; - exit 1 -fi - -if [ -z ${ZONE+x} ]; then - echo "Missing Google Cloud zone template variable as ZONE"; - exit 1 -fi - -if [ -z ${MACHINE_TYPE+x} ]; then - echo "Missing Google Cloud machine type template variable as MACHINE_TYPE"; - exit 1 -fi - -print_magenta " cluster : ${CLUSTER_NAME}" -print_magenta " version : ${CLUSTER_VERSION}" -print_magenta " project : ${GOOGLE_PROJECT}" -print_magenta " machine : ${MACHINE_TYPE}" -print_magenta " zone : ${ZONE}" -print_magenta 
" tags : ${TAGS}" -print_magenta " size : ${SIZE}" -print_magenta "repository : ${REPOSITORY}" -print_magenta " branch : ${BRANCH}" - -is_installed kubectl -is_installed gcloud -is_installed wget - -# Check if it already exists -gcloud container clusters list --zone ${ZONE} | grep ${CLUSTER_NAME} -retval=$? -if [[ "${retval}" == "0" ]]; then - print_blue "${CLUSTER_NAME} in ${ZONE} already exists." - echo - exit 0 -fi - -if [[ "${FORCE_CLUSTER}" != "true" ]]; then - prompt "Do you want to create this cluster?" -fi - -# Create the cluster -run_echo gcloud container clusters create ${CLUSTER_NAME} --project $GOOGLE_PROJECT \ - --zone ${ZONE} --cluster-version ${CLUSTER_VERSION} --machine-type ${MACHINE_TYPE} \ - --num-nodes=${SIZE} --enable-network-policy --tags=${TAGS} --enable-intra-node-visibility - -# Get credentials so kubectl will work -run_echo gcloud container clusters get-credentials ${CLUSTER_NAME} --zone ${ZONE} --project $GOOGLE_PROJECT -run_echo kubectl create clusterrolebinding cluster-admin-binding --clusterrole cluster-admin --user $(gcloud config get-value core/account) - -# Show nodes -run_echo kubectl get nodes - -# Deploy the operator -mkdir -p ${SCRIPT_DIR} -install_operator ${SCRIPT_DIR} ${REPOSITORY} ${BRANCH} - -run_echo kubectl get namespace -run_echo kubectl describe namespace operator-system diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-destroy.sh b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-destroy.sh deleted file mode 100755 index de8988e..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/cluster-destroy.sh +++ /dev/null @@ -1,161 +0,0 @@ -#!/bin/bash - -# Source shared helper scripts -# Colors -red='\033[0;31m' -green='\033[0;32m' -yellow='\033[0;33m' -blue='\033[0;34m' -magenta='\033[0;35m' -cyan='\033[0;36m' -clear='\033[0m' - -function print_red() { - echo -e "${red}$@${clear}" -} -function print_yellow() { - echo -e "${yellow}$@${clear}" -} 
-function print_green() { - echo -e "${green}$@${clear}" -} -function print_blue() { - echo -e "${blue}$@${clear}" -} -function print_magenta() { - echo -e "${magenta}$@${clear}" -} -function print_cyan() { - echo -e "${cyan}$@${clear}" -} - -function is_installed () { - # Determine if a command is available for use! - cmd="${1}" - if command -v $cmd >/dev/null; then - echo "$cmd is installed" - else - echo "$cmd could not be found" - exit 1 - fi -} - -function install_operator() { - # Shared function to install the operator from a specific repository branch and cleanup - script_dir=${1} - repository=${2} - branch=${3} - tmpfile="${script_dir}/flux-operator.yaml" - run_echo wget -O $tmpfile https://raw.githubusercontent.com/${repository}/${branch}/examples/dist/flux-operator.yaml - kubectl apply -f $tmpfile -} - - -function run_echo() { - # Show the user the command then run it - echo - print_green "$@" - retry $@ -} - -function run_echo_allow_fail() { - echo - print_green "$@" - $@ || true -} - -function retry() { - # Retry an unsuccessful user command, per request - while true - do - $@ - retval=$? - if [[ "${retval}" == "0" ]]; then - return - fi - print_blue "That command was not successful. Do you want to try again? 🤔️" - read -p " (yes/no) " answer - # Exit with non-zero response so we know to stop in script. 
- case ${answer} in - yes ) continue;; - no ) echo exiting...; - exit 1;; - * ) echo invalid response; - exit 1;; - esac - done -} - - -function prompt() { - # Prompt the user with a yes/no command to continue or exit - print_blue "$@ 🤔️" - read -p " (yes/no) " answer - case ${answer} in - yes ) echo ok, we will proceed;; - no ) echo exiting...; - exit 1;; - * ) echo invalid response; - exit 1;; - esac -} - - -function with_exponential_backoff { - # Run with exponential backoff - assume containers take a while to pull - local max_attempts=100 - local timeout=1 - local attempt=0 - local exitcode=0 - - while [[ $attempt < $max_attempts ]]; do - "$@" - exitcode=$? - - if [[ $exitcode == 0 ]]; then - break - fi - - echo "Failure! Retrying in $timeout.." 1>&2 - sleep $timeout - attempt=$(( attempt + 1 )) - timeout=$(( timeout * 2 )) - done - - if [[ $exitCode != 0 ]]; then - echo "You've failed me for the last time! ($@)" 1>&2 - fi - return $exitcode -} - -# Defaults - these are in the config but left here for information -CLUSTER_NAME="flux-cluster" -FORCE_CLUSTER="false" -ZONE="us-central1-a" - -if [ -z ${ZONE+x} ]; then - echo "Google Cloud zone template missing as ZONE"; - exit 1 -fi - -echo " cluster : ${CLUSTER_NAME}" -echo " zone : ${ZONE}" - -is_installed gcloud -is_installed yes || FORCE_CLUSTER="false" - -# Check if it already exists -gcloud container clusters list --zone ${ZONE} | grep ${CLUSTER_NAME} -retval=$? -if [[ "${retval}" != "0" ]]; then - print_blue "${CLUSTER_NAME} in ${ZONE} does not exist." 
- echo - exit 0 -fi - -# This command has a confirmation already -if [[ "${FORCE_CLUSTER}" == "true" ]]; then - yes | gcloud container clusters delete --zone ${ZONE} ${CLUSTER_NAME} -else - run_echo gcloud container clusters delete --zone ${ZONE} ${CLUSTER_NAME} -fi diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/flux-operator.yaml b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/flux-operator.yaml deleted file mode 100644 index b4bc03e..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/flux-operator.yaml +++ /dev/null @@ -1,848 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - labels: - control-plane: controller-manager - name: operator-system ---- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - annotations: - controller-gen.kubebuilder.io/version: v0.9.0 - creationTimestamp: null - name: miniclusters.flux-framework.org -spec: - group: flux-framework.org - names: - kind: MiniCluster - listKind: MiniClusterList - plural: miniclusters - singular: minicluster - scope: Namespaced - versions: - - name: v1alpha1 - schema: - openAPIV3Schema: - description: MiniCluster is the Schema for a Flux job launcher on K8s - properties: - apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' - type: string - kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' - type: string - metadata: - type: object - spec: - description: MiniCluster defines the desired state of a Flux MiniCluster - "I am a Flux user and I want to launch a MiniCluster for my job!" A - MiniCluster corresponds to a Batch Job -> StatefulSet + ConfigMaps A - "task" within that cluster is flux running something. - properties: - containers: - description: Containers is one or more containers to be created in - a pod. There should only be one container to run flux with runFlux - items: - properties: - command: - description: 'Single user executable to provide to flux start - IMPORTANT: This is left here, but not used in favor of exposing - Flux via a Restful API. We Can remove this when that is finalized.' - type: string - cores: - description: Cores the container should use - format: int32 - type: integer - diagnostics: - description: Run flux diagnostics on start instead of command - type: boolean - environment: - additionalProperties: - type: string - description: Key/value pairs for the environment - type: object - fluxLogLevel: - default: 6 - description: Log level to use for flux logging (only in non - TestMode) - format: int32 - type: integer - fluxOptionFlags: - description: Flux option flags, usually provided with -o optional - - if needed, default option flags for the server These can - also be set in the user interface to override here. This is - only valid for a FluxRunner - type: string - image: - default: fluxrm/flux-sched:focal - description: Container image must contain flux and flux-sched - install - type: string - imagePullSecret: - description: Allow the user to pull authenticated images By - default no secret is selected. Setting this with the name - of an already existing imagePullSecret will specify that secret - in the pod spec. 
- type: string - name: - description: Container name is only required for non flux runners - type: string - ports: - description: Ports to be exposed to other containers in the - cluster We take a single list of integers and map to the same - items: - format: int32 - type: integer - type: array - postStartExec: - description: Lifecycle can handle post start commands, etc. - type: string - preCommand: - description: Special command to run at beginning of script, - directly after asFlux is defined as sudo -u flux -E (so you - can change that if desired.) This is only valid if FluxRunner - is set (that writes a wait.sh script) - type: string - pullAlways: - default: false - description: Allow the user to dictate pulling By default we - pull if not present. Setting this to true will indicate to - pull always - type: boolean - resources: - description: Resources include limits and requests - properties: - limits: - additionalProperties: - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - type: object - requests: - additionalProperties: - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - type: object - type: object - runFlux: - description: Main container to run flux (only should be one) - type: boolean - volumes: - additionalProperties: - description: A Container volume must reference one defined - for the MiniCluster The path here is in the container - properties: - path: - type: string - readOnly: - default: true - type: boolean - required: - - path - type: object - description: Volumes that can be mounted (must be defined in - volumes) - type: object - workingDir: - description: Working directory to run command from - type: string - required: - - image - type: object - type: array - deadlineSeconds: - default: 31500000 - description: Should the job be limited to a particular number of seconds? - Approximately one year. 
This cannot be zero or job won't start - format: int64 - type: integer - fluxRestful: - description: Customization to Flux Restful API There should only be - one container to run flux with runFlux - properties: - branch: - default: main - description: Branch to clone Flux Restful API from - type: string - port: - default: 5000 - description: Port to run Flux Restful Server On - format: int32 - type: integer - token: - description: Token to use for RestFul API - type: string - username: - description: These two should not actually be set by a user, but - rather generated by tools and provided Username to use for RestFul - API - type: string - type: object - jobLabels: - additionalProperties: - type: string - description: Labels for the job - type: object - localDeploy: - default: false - description: localDeploy should be true for development, or deploying - in the case that there isn't an actual kubernetes cluster (e.g., - you are not using make deploy. It uses a persistent volume instead - of a claim - type: boolean - logging: - description: Logging modes determine the output you see in the job - log - properties: - quiet: - default: false - description: Quiet mode silences all output so the job only shows - the test running - type: boolean - timed: - default: false - description: Timed mode adds timing to Flux commands - type: boolean - type: object - pod: - description: Pod spec details - properties: - resources: - additionalProperties: - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - description: Resources include limits and requests - type: object - type: object - podLabels: - additionalProperties: - type: string - description: Labels for each pod - type: object - size: - default: 1 - description: Size (number of job pods to run, size of minicluster - in pods) - format: int32 - type: integer - tasks: - default: 1 - description: Total number of CPUs being run across entire cluster - format: int32 - type: integer - volumes: - 
additionalProperties: - description: Mini Cluster local volumes available to mount (these - are on the host) - properties: - path: - type: string - required: - - path - type: object - description: Volumes on the host (named) accessible to containers - type: object - required: - - containers - type: object - status: - description: MiniClusterStatus defines the observed state of Flux - properties: - conditions: - description: conditions hold the latest Flux Job and MiniCluster states - items: - description: "Condition contains details for one aspect of the current - state of this API Resource. --- This struct is intended for direct - use as an array at the field path .status.conditions. For example, - \n type FooStatus struct{ // Represents the observations of a - foo's current state. // Known .status.conditions.type are: \"Available\", - \"Progressing\", and \"Degraded\" // +patchMergeKey=type // +patchStrategy=merge - // +listType=map // +listMapKey=type Conditions []metav1.Condition - `json:\"conditions,omitempty\" patchStrategy:\"merge\" patchMergeKey:\"type\" - protobuf:\"bytes,1,rep,name=conditions\"` \n // other fields }" - properties: - lastTransitionTime: - description: lastTransitionTime is the last time the condition - transitioned from one status to another. This should be when - the underlying condition changed. If that is not known, then - using the time when the API field changed is acceptable. - format: date-time - type: string - message: - description: message is a human readable message indicating - details about the transition. This may be an empty string. - maxLength: 32768 - type: string - observedGeneration: - description: observedGeneration represents the .metadata.generation - that the condition was set based upon. For instance, if .metadata.generation - is currently 12, but the .status.conditions[x].observedGeneration - is 9, the condition is out of date with respect to the current - state of the instance. 
- format: int64 - minimum: 0 - type: integer - reason: - description: reason contains a programmatic identifier indicating - the reason for the condition's last transition. Producers - of specific condition types may define expected values and - meanings for this field, and whether the values are considered - a guaranteed API. The value should be a CamelCase string. - This field may not be empty. - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - description: status of the condition, one of True, False, Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. - --- Many .condition.type values are consistent across resources - like Available, but because arbitrary conditions can be useful - (see .node.status.conditions), the ability to deconflict is - important. The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - type: array - jobid: - description: The JobUid is set internally to associate to a miniCluster - type: string - required: - - jobid - type: object - type: object - served: true - storage: true - subresources: - status: {} ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: operator-controller-manager - namespace: operator-system ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: operator-leader-election-role - namespace: operator-system -rules: -- apiGroups: - - "" - resources: - - configmaps - verbs: - - get - - list - - watch - - create - - update - - patch - - delete -- apiGroups: - - coordination.k8s.io - resources: - - leases - verbs: - - get - - list - - watch - - create - - update - - patch - - 
delete -- apiGroups: - - "" - resources: - - events - verbs: - - create - - patch ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - creationTimestamp: null - name: operator-manager-role -rules: -- apiGroups: - - "" - resources: - - events - verbs: - - create - - update - - watch -- apiGroups: - - "" - resources: - - events - - nodes - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - batch - resources: - - jobs - verbs: - - create - - delete - - exec - - get - - list - - patch - - update - - watch -- apiGroups: - - batch - resources: - - jobs/status - verbs: - - create - - delete - - exec - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - "" - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - batch - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - configmaps - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - events - verbs: - - create - - patch -- apiGroups: - - "" - resources: - - jobs - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - networks - verbs: - - create - - patch -- apiGroups: - - "" - resources: - - persistentvolumeclaims - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - persistentvolumes - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - pods - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - pods/exec - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - pods/log - verbs: - - create - - delete - - get - - list - - patch - - 
update - - watch -- apiGroups: - - "" - resources: - - secrets - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - services - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - statefulsets - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - flux-framework.org - resources: - - clusters - - clusters/status - verbs: - - get - - list - - watch -- apiGroups: - - flux-framework.org - resources: - - machineclasses - - machinedeployments - - machinedeployments/status - - machines - - machines/status - - machinesets - - machinesets/status - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - flux-framework.org - resources: - - miniclusters - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - flux-framework.org - resources: - - miniclusters/finalizers - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - flux-framework.org - resources: - - miniclusters/status - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - networking.k8s.io - resources: - - ingresses - verbs: - - create - - delete - - get - - list - - patch - - update - - watch ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: operator-metrics-reader -rules: -- nonResourceURLs: - - /metrics - verbs: - - get ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: operator-proxy-role -rules: -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: operator-leader-election-rolebinding - namespace: 
operator-system -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: operator-leader-election-role -subjects: -- kind: ServiceAccount - name: operator-controller-manager - namespace: operator-system ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: operator-manager-rolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: operator-manager-role -subjects: -- kind: ServiceAccount - name: operator-controller-manager - namespace: operator-system ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: operator-proxy-rolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: operator-proxy-role -subjects: -- kind: ServiceAccount - name: operator-controller-manager - namespace: operator-system ---- -apiVersion: v1 -data: - controller_manager_config.yaml: | - apiVersion: controller-runtime.sigs.k8s.io/v1alpha1 - kind: ControllerManagerConfig - health: - healthProbeBindAddress: :8081 - metrics: - bindAddress: 127.0.0.1:8080 - webhook: - port: 9443 - leaderElection: - leaderElect: true - resourceName: 14dde902.flux-framework.org -kind: ConfigMap -metadata: - name: operator-manager-config - namespace: operator-system ---- -apiVersion: v1 -kind: Service -metadata: - labels: - control-plane: controller-manager - name: operator-controller-manager-metrics-service - namespace: operator-system -spec: - ports: - - name: https - port: 8443 - protocol: TCP - targetPort: https - selector: - control-plane: controller-manager ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - labels: - control-plane: controller-manager - name: operator-controller-manager - namespace: operator-system -spec: - replicas: 1 - selector: - matchLabels: - control-plane: controller-manager - template: - metadata: - annotations: - kubectl.kubernetes.io/default-container: manager - labels: - control-plane: controller-manager - spec: - containers: - - args: - 
- --secure-listen-address=0.0.0.0:8443 - - --upstream=http://127.0.0.1:8080/ - - --logtostderr=true - - --v=0 - image: gcr.io/kubebuilder/kube-rbac-proxy:v0.11.0 - name: kube-rbac-proxy - ports: - - containerPort: 8443 - name: https - protocol: TCP - resources: - limits: - cpu: 500m - memory: 128Mi - requests: - cpu: 5m - memory: 64Mi - securityContext: - allowPrivilegeEscalation: false - - args: - - --health-probe-bind-address=:8081 - - --metrics-bind-address=127.0.0.1:8080 - - --leader-elect - command: - - /manager - image: ghcr.io/flux-framework/flux-operator:latest - livenessProbe: - httpGet: - path: /healthz - port: 8081 - initialDelaySeconds: 15 - periodSeconds: 20 - name: manager - readinessProbe: - httpGet: - path: /readyz - port: 8081 - initialDelaySeconds: 5 - periodSeconds: 10 - resources: - limits: - cpu: 500m - memory: 128Mi - requests: - cpu: 10m - memory: 64Mi - securityContext: - allowPrivilegeEscalation: false - securityContext: - runAsNonRoot: true - serviceAccountName: operator-controller-manager - terminationGracePeriodSeconds: 10 diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-size-2.yaml b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-size-2.yaml deleted file mode 100644 index b3b2e17..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-size-2.yaml +++ /dev/null @@ -1,29 +0,0 @@ -apiVersion: flux-framework.org/v1alpha1 -kind: MiniCluster - -metadata: - name: lammps-job - namespace: flux-operator -spec: - # localDeploy needs to be false - localDeploy: false - - # Number of pods to create for MiniCluster - size: 2 - tasks: 1 - - # Disable verbose output - - - # Optional credentials if running the flux restful api - fluxRestful: - token: "6b8a7393-129b-4e2d-83a7-795a5a7c9d9b" - username: "fluxuser" - - # TODO add pod resources, if needed - containers: - - image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 - - - - cores: 1 diff --git 
a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-submit-size-2.sh b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-submit-size-2.sh deleted file mode 100755 index 0bd72c3..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-submit-size-2.sh +++ /dev/null @@ -1,219 +0,0 @@ -#!/bin/bash - -# This is a template that will be populated with variables by Flux-Cloud -# We only run it to check if a MiniCluster is running. An apply is only -# needed if the MiniCluster is not created yet. - -# Include shared helper scripts -# Colors -red='\033[0;31m' -green='\033[0;32m' -yellow='\033[0;33m' -blue='\033[0;34m' -magenta='\033[0;35m' -cyan='\033[0;36m' -clear='\033[0m' - -function print_red() { - echo -e "${red}$@${clear}" -} -function print_yellow() { - echo -e "${yellow}$@${clear}" -} -function print_green() { - echo -e "${green}$@${clear}" -} -function print_blue() { - echo -e "${blue}$@${clear}" -} -function print_magenta() { - echo -e "${magenta}$@${clear}" -} -function print_cyan() { - echo -e "${cyan}$@${clear}" -} - -function is_installed () { - # Determine if a command is available for use! - cmd="${1}" - if command -v $cmd >/dev/null; then - echo "$cmd is installed" - else - echo "$cmd could not be found" - exit 1 - fi -} - -function install_operator() { - # Shared function to install the operator from a specific repository branch and cleanup - script_dir=${1} - repository=${2} - branch=${3} - tmpfile="${script_dir}/flux-operator.yaml" - run_echo wget -O $tmpfile https://raw.githubusercontent.com/${repository}/${branch}/examples/dist/flux-operator.yaml - kubectl apply -f $tmpfile -} - - -function run_echo() { - # Show the user the command then run it - echo - print_green "$@" - retry $@ -} - -function run_echo_allow_fail() { - echo - print_green "$@" - $@ || true -} - -function retry() { - # Retry an unsuccessful user command, per request - while true - do - $@ - retval=$? 
- if [[ "${retval}" == "0" ]]; then - return - fi - print_blue "That command was not successful. Do you want to try again? 🤔️" - read -p " (yes/no) " answer - # Exit with non-zero response so we know to stop in script. - case ${answer} in - yes ) continue;; - no ) echo exiting...; - exit 1;; - * ) echo invalid response; - exit 1;; - esac - done -} - - -function prompt() { - # Prompt the user with a yes/no command to continue or exit - print_blue "$@ 🤔️" - read -p " (yes/no) " answer - case ${answer} in - yes ) echo ok, we will proceed;; - no ) echo exiting...; - exit 1;; - * ) echo invalid response; - exit 1;; - esac -} - - -function with_exponential_backoff { - # Run with exponential backoff - assume containers take a while to pull - local max_attempts=100 - local timeout=1 - local attempt=0 - local exitcode=0 - - while [[ $attempt < $max_attempts ]]; do - "$@" - exitcode=$? - - if [[ $exitcode == 0 ]]; then - break - fi - - echo "Failure! Retrying in $timeout.." 1>&2 - sleep $timeout - attempt=$(( attempt + 1 )) - timeout=$(( timeout * 2 )) - done - - if [[ $exitCode != 0 ]]; then - echo "You've failed me for the last time! ($@)" 1>&2 - fi - return $exitcode -} - -NAMESPACE="flux-operator" -CRD="/home/vanessa/Desktop/Code/flux/flux-cloud/examples/up-submit-down/data/k8s-size-4-n1-standard-1/.scripts/minicluster-size-2.yaml" -JOB="lammps-job" - -# Size -1 to account for certificate generator -SIZE=2 - -print_magenta " apply : ${CRD}" -print_magenta " job : ${JOB}" - -is_installed kubectl - -# Create the namespace (ok if already exists) -run_echo_allow_fail kubectl create namespace ${NAMESPACE} - -# Always cleanup a previous one so tokens don't get stale -run_echo_allow_fail kubectl delete -f ${CRD} -echo -podsCleaned="false" -print_blue "Waiting for previous MiniCluster to be cleaned up..." -while [[ "${podsCleaned}" == "false" ]]; do - echo -n "." 
- sleep 2 - state=$(kubectl get pods --namespace ${NAMESPACE} 2>&1) - lines=$(echo $state | wc -l) - if [[ "${lines}" == "1" ]] && [[ "${state}" == *"No resources found in"* ]]; then - echo - print_green "🌀️ Previous pods are cleaned up." - podsCleaned="true" - break - fi -done - -# Ensure we have a MiniCluster of the right namespace running -echo -print_green "🌀️ Creating MiniCluster in ${NAMESPACE}" -# Apply the job, get pods -run_echo kubectl apply -f ${CRD} -run_echo kubectl get -n ${NAMESPACE} pods - -# continue until we find the index-0 pod -podsReady="false" - -echo -print_blue "Waiting for MiniCluster of size ${SIZE} to be ready..." -while [[ "${podsReady}" == "false" ]]; do - echo -n "." - sleep 2 - pods=$(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=name | wc -l) - if [[ "${pods}" == "${SIZE}" ]]; then - echo - print_green "🌀️ All pods are running." - podsReady="true" - break - fi -done - -echo -brokerPod="" -brokerPrefix="${JOB}-0" -while [[ "${brokerPod}" == "" ]]; do - for pod in $(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=jsonpath='{.items[*].metadata.name}'); do - if [[ "${pod}" == ${brokerPrefix}* ]]; then - echo - brokerPod=${pod} - break - fi - done -done - -echo -serverReady="false" -print_blue "Waiting for Flux Restful API Server to be ready..." -while [[ "${serverReady}" == "false" ]]; do - echo -n "." - sleep 2 - logs=$(kubectl logs --namespace ${NAMESPACE} ${brokerPod} | grep "Uvicorn running") - retval=$? - if [[ "${retval}" == "0" ]]; then - echo - serverReady="true" - print_green "🌀️ Flux RestFul API Server is Ready." 
- break - fi -done diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-1-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-1-minicluster-size-2/log.out deleted file mode 100644 index 3b18e51..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-1-minicluster-size-2/log.out +++ /dev/null @@ -1 +0,0 @@ -hello world diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-2-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-2-minicluster-size-2/log.out deleted file mode 100644 index 3b18e51..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-2-minicluster-size-2/log.out +++ /dev/null @@ -1 +0,0 @@ -hello world diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-3-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-3-minicluster-size-2/log.out deleted file mode 100644 index 3b18e51..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-3-minicluster-size-2/log.out +++ /dev/null @@ -1 +0,0 @@ -hello world diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-4-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-4-minicluster-size-2/log.out deleted file mode 100644 index 3b18e51..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-4-minicluster-size-2/log.out +++ /dev/null @@ -1 +0,0 @@ -hello world diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-5-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-5-minicluster-size-2/log.out deleted file mode 100644 index 3b18e51..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/hello-world-5-minicluster-size-2/log.out +++ /dev/null @@ -1 +0,0 @@ -hello world diff 
--git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/meta.json b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/meta.json deleted file mode 100644 index b7b654b..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/meta.json +++ /dev/null @@ -1,698 +0,0 @@ -{ - "times": { - "destroy-cluster": 324.709, - "create-cluster": 86.521, - "minicluster-submit-size-2": 183.626, - "reaxc-hns-1-minicluster-size-2": 32.1847505569458, - "reaxc-hns-2-minicluster-size-2": 33.41048860549927, - "reaxc-hns-3-minicluster-size-2": 30.96457529067993, - "reaxc-hns-4-minicluster-size-2": 30.777089595794678, - "reaxc-hns-5-minicluster-size-2": 31.048890829086304, - "sleep-1-minicluster-size-2": 5.028888463973999, - "sleep-2-minicluster-size-2": 5.045725584030151, - "sleep-3-minicluster-size-2": 5.072444677352905, - "sleep-4-minicluster-size-2": 5.034207582473755, - "sleep-5-minicluster-size-2": 5.025948762893677, - "hello-world-1-minicluster-size-2": 0.07241106033325195, - "hello-world-2-minicluster-size-2": 0.052734375, - "hello-world-3-minicluster-size-2": 0.04248523712158203, - "hello-world-4-minicluster-size-2": 0.045003652572631836, - "hello-world-5-minicluster-size-2": 0.05110311508178711, - "minicluster-destroy-size-2": 0.277, - "minicluster-create-persistent-size-2": 42.606, - "minicluster-persistent-destroy-size-2": 0.164 - }, - "size": 4, - "machine": "n1-standard-1", - "minicluster": { - "name": "lammps-job", - "namespace": "flux-operator", - "size": [ - 2 - ] - }, - "jobs": { - "reaxc-hns-1": { - "command": "lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "reaxc-hns-2": { - "command": "lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "reaxc-hns-3": { - "command": "lmp -v x 2 -v y 2 
-v z 2 -in in.reaxc.hns -nocite", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "reaxc-hns-4": { - "command": "lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "reaxc-hns-5": { - "command": "lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "sleep-1": { - "command": "sleep 5", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "sleep-2": { - "command": "sleep 5", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "sleep-3": { - "command": "sleep 5", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "sleep-4": { - "command": "sleep 5", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "sleep-5": { - "command": "sleep 5", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "hello-world-1": { - "command": "echo hello world", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "hello-world-2": { - "command": "echo hello world", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "hello-world-3": { - "command": "echo hello world", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "hello-world-4": 
{ - "command": "echo hello world", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - }, - "hello-world-5": { - "command": "echo hello world", - "image": "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", - "repeats": 5, - "workdir": "/home/flux/examples/reaxff/HNS" - } - }, - "info": { - "reaxc-hns-1-minicluster-size-2": { - "id": 130073755648, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674444768.0517902, - "t_depend": 1674444768.0517902, - "t_run": 1674444768.100832, - "t_cleanup": 1674444800.2855825, - "t_inactive": 1674444800.290403, - "state": "INACTIVE", - "name": "lmp", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675049568.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 32.1847505569458, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 40.13091278076172, - "start_to_output_seconds": 43.215059757232666 - }, - "reaxc-hns-2-minicluster-size-2": { - "id": 816932978688, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674444808.9904723, - "t_depend": 1674444808.9904723, - "t_run": 1674444809.0098114, - "t_cleanup": 1674444842.4203, - "t_inactive": 1674444842.4249685, - "state": "INACTIVE", - "name": "lmp", - "ntasks": 1, - "nnodes": 1, - "ranks": "0", - "nodelist": "lammps-job-0", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675049609.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 33.41048860549927, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 97.17731666564941, - 
"start_to_output_seconds": 97.31685972213745 - }, - "reaxc-hns-3-minicluster-size-2": { - "id": 2450245287936, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674444906.3438601, - "t_depend": 1674444906.3438601, - "t_run": 1674444906.3633585, - "t_cleanup": 1674444937.3279338, - "t_inactive": 1674444937.33689, - "state": "INACTIVE", - "name": "lmp", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675049706.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 30.96457529067993, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 67.29511857032776, - "start_to_output_seconds": 67.40737009048462 - }, - "reaxc-hns-4-minicluster-size-2": { - "id": 3581969170432, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674444973.8004916, - "t_depend": 1674444973.8004916, - "t_run": 1674444973.8231413, - "t_cleanup": 1674445004.600231, - "t_inactive": 1674445004.6049078, - "state": "INACTIVE", - "name": "lmp", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675049773.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 30.777089595794678, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 62.43251633644104, - "start_to_output_seconds": 62.51574635505676 - }, - "reaxc-hns-5-minicluster-size-2": { - "id": 4631065264128, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674445036.3308098, - "t_depend": 1674445036.3308098, - "t_run": 1674445036.3509514, - "t_cleanup": 1674445067.3998423, - "t_inactive": 
1674445067.4045572, - "state": "INACTIVE", - "name": "lmp", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675049836.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 31.048890829086304, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 92.83428883552551, - "start_to_output_seconds": 92.92412114143372 - }, - "sleep-1-minicluster-size-2": { - "id": 461004341248, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674677415.8718548, - "t_depend": 1674677415.8718548, - "t_run": 1674677415.8845603, - "t_cleanup": 1674677420.9134488, - "t_inactive": 1674677420.9152129, - "state": "INACTIVE", - "name": "sleep", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675282215.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 5.028888463973999, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 14.840466737747192, - "start_to_output_seconds": 17.383413314819336 - }, - "sleep-2-minicluster-size-2": { - "id": 717628637184, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674677431.16695, - "t_depend": 1674677431.16695, - "t_run": 1674677431.1903481, - "t_cleanup": 1674677436.2360737, - "t_inactive": 1674677436.2395134, - "state": "INACTIVE", - "name": "sleep", - "ntasks": 1, - "nnodes": 1, - "ranks": "0", - "nodelist": "lammps-job-0", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675282231.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - 
"returncode": 0, - "runtime": 5.045725584030151, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 12.824117660522461, - "start_to_output_seconds": 15.347451210021973 - }, - "sleep-3-minicluster-size-2": { - "id": 975108571136, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674677446.5178363, - "t_depend": 1674677446.5178363, - "t_run": 1674677446.534995, - "t_cleanup": 1674677451.6074398, - "t_inactive": 1674677451.613382, - "state": "INACTIVE", - "name": "sleep", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675282246.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 5.072444677352905, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 12.840857744216919, - "start_to_output_seconds": 15.384143352508545 - }, - "sleep-4-minicluster-size-2": { - "id": 1234333335552, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674677461.9656863, - "t_depend": 1674677461.9656863, - "t_run": 1674677461.9789429, - "t_cleanup": 1674677467.0131505, - "t_inactive": 1674677467.0233643, - "state": "INACTIVE", - "name": "sleep", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675282261.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 5.034207582473755, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 12.951504468917847, - "start_to_output_seconds": 15.509077787399292 - }, - "sleep-5-minicluster-size-2": { - "id": 1495168712704, - 
"userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674677477.5129235, - "t_depend": 1674677477.5129235, - "t_run": 1674677477.5259533, - "t_cleanup": 1674677482.551902, - "t_inactive": 1674677482.555279, - "state": "INACTIVE", - "name": "sleep", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675282277.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 5.025948762893677, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 12.880193948745728, - "start_to_output_seconds": 15.410512447357178 - }, - "hello-world-1-minicluster-size-2": { - "id": 8356177641472, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674445258.3653252, - "t_depend": 1674445258.3653252, - "t_run": 1674445258.3868065, - "t_cleanup": 1674445258.4592175, - "t_inactive": 1674445258.46398, - "state": "INACTIVE", - "name": "echo", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675050058.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 0.07241106033325195, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 13.482953310012817, - "start_to_output_seconds": 16.53845739364624 - }, - "hello-world-2-minicluster-size-2": { - "id": 8635753168896, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674445275.028449, - "t_depend": 1674445275.028449, - "t_run": 1674445275.0489655, - "t_cleanup": 1674445275.1016998, - "t_inactive": 1674445275.1059186, - "state": "INACTIVE", - "name": "echo", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - 
"nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675050075.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 0.052734375, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 0.5918288230895996, - "start_to_output_seconds": 0.6222965717315674 - }, - "hello-world-3-minicluster-size-2": { - "id": 8641507753984, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674445275.3710968, - "t_depend": 1674445275.3710968, - "t_run": 1674445275.3893383, - "t_cleanup": 1674445275.4318235, - "t_inactive": 1674445275.4359808, - "state": "INACTIVE", - "name": "echo", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675050075.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 0.04248523712158203, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 0.17513155937194824, - "start_to_output_seconds": 0.21306657791137695 - }, - "hello-world-4-minicluster-size-2": { - "id": 8646121488384, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674445275.6465385, - "t_depend": 1674445275.6465385, - "t_run": 1674445275.6643715, - "t_cleanup": 1674445275.7093751, - "t_inactive": 1674445275.7134967, - "state": "INACTIVE", - "name": "echo", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675050075.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 0.045003652572631836, - "exception": { - "occurred": false, - 
"severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 0.19276666641235352, - "start_to_output_seconds": 0.2307295799255371 - }, - "hello-world-5-minicluster-size-2": { - "id": 8649946693632, - "userid": 1234, - "urgency": 16, - "priority": 16, - "t_submit": 1674445275.8740122, - "t_depend": 1674445275.8740122, - "t_run": 1674445275.8942568, - "t_cleanup": 1674445275.94536, - "t_inactive": 1674445275.95746, - "state": "INACTIVE", - "name": "echo", - "ntasks": 1, - "nnodes": 1, - "ranks": "1", - "nodelist": "lammps-job-1", - "success": true, - "exception_occurred": false, - "result": "COMPLETED", - "expiration": 1675050075.0, - "annotations": { - "sched": { - "queue": "default" - } - }, - "waitstatus": 0, - "returncode": 0, - "runtime": 0.05110311508178711, - "exception": { - "occurred": false, - "severity": "", - "type": "", - "note": "" - }, - "duration": "", - "start_to_info_seconds": 0.17215561866760254, - "start_to_output_seconds": 0.19998478889465332 - } - } -} diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-1-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-1-minicluster-size-2/log.out deleted file mode 100644 index 647c484..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-1-minicluster-size-2/log.out +++ /dev/null @@ -1,80 +0,0 @@ -LAMMPS (29 Sep 2021 - Update 2) -OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98) - using 1 OpenMP thread(s) per MPI task -Reading data file ... - triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000) - 1 by 1 by 1 MPI processor grid - reading atoms ... - 304 atoms - reading velocities ... - 304 velocities - read_data CPU = 0.005 seconds -Replicating atoms ... 
- triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000) - 1 by 1 by 1 MPI processor grid - bounding box image = (0 -1 -1) to (0 1 1) - bounding box extra memory = 0.03 MB - average # of replicas added to proc = 8.00 out of 8 (100.00%) - 2432 atoms - replicate CPU = 0.001 seconds -Neighbor list info ... - update every 20 steps, delay 0 steps, check no - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 11 - ghost atom cutoff = 11 - binsize = 5.5, bins = 10 5 6 - 2 neighbor lists, perpetual/occasional/extra = 2 0 0 - (1) pair reax/c, perpetual - attributes: half, newton off, ghost - pair build: half/bin/newtoff/ghost - stencil: full/ghost/bin/3d - bin: standard - (2) fix qeq/reax, perpetual, copy from (1) - attributes: half, newton off, ghost - pair build: copy - stencil: none - bin: none -Setting up Verlet run ... - Unit style : real - Current step : 0 - Time step : 0.1 -Per MPI rank memory allocation (min/avg/max) = 215.0 | 215.0 | 215.0 Mbytes -Step Temp PotEng Press E_vdwl E_coul Volume - 0 300 -113.27833 437.52122 -111.57687 -1.7014647 27418.867 - 10 299.38517 -113.27631 1439.2857 -111.57492 -1.7013813 27418.867 - 20 300.27107 -113.27884 3764.3739 -111.57762 -1.7012246 27418.867 - 30 302.21063 -113.28428 7007.6914 -111.58335 -1.7009363 27418.867 - 40 303.52265 -113.28799 9844.84 -111.58747 -1.7005186 27418.867 - 50 301.87059 -113.28324 9663.0443 -111.58318 -1.7000524 27418.867 - 60 296.67807 -113.26777 7273.7928 -111.56815 -1.6996137 27418.867 - 70 292.19999 -113.25435 5533.6428 -111.55514 -1.6992157 27418.867 - 80 293.58677 -113.25831 5993.4151 -111.55946 -1.6988533 27418.867 - 90 300.62636 -113.27925 7202.8651 -111.58069 -1.6985591 27418.867 - 100 305.38276 -113.29357 10085.748 -111.59518 -1.6983875 27418.867 -Loop time of 29.8322 on 1 procs for 100 steps with 2432 atoms - -Performance: 0.029 ns/day, 828.671 hours/ns, 3.352 timesteps/s -94.2% CPU use with 1 MPI 
tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 22.21 | 22.21 | 22.21 | 0.0 | 74.45 -Neigh | 0.61723 | 0.61723 | 0.61723 | 0.0 | 2.07 -Comm | 0.010007 | 0.010007 | 0.010007 | 0.0 | 0.03 -Output | 0.0004328 | 0.0004328 | 0.0004328 | 0.0 | 0.00 -Modify | 6.9933 | 6.9933 | 6.9933 | 0.0 | 23.44 -Other | | 0.00162 | | | 0.01 - -Nlocal: 2432.00 ave 2432 max 2432 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 10685.0 ave 10685 max 10685 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 823958.0 ave 823958 max 823958 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 823958 -Ave neighs/atom = 338.79852 -Neighbor list builds = 5 -Dangerous builds not checked -Total wall time: 0:00:30 diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-2-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-2-minicluster-size-2/log.out deleted file mode 100644 index 0b9df79..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-2-minicluster-size-2/log.out +++ /dev/null @@ -1,80 +0,0 @@ -LAMMPS (29 Sep 2021 - Update 2) -OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98) - using 1 OpenMP thread(s) per MPI task -Reading data file ... - triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000) - 1 by 1 by 1 MPI processor grid - reading atoms ... - 304 atoms - reading velocities ... - 304 velocities - read_data CPU = 0.010 seconds -Replicating atoms ... 
- triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000) - 1 by 1 by 1 MPI processor grid - bounding box image = (0 -1 -1) to (0 1 1) - bounding box extra memory = 0.03 MB - average # of replicas added to proc = 8.00 out of 8 (100.00%) - 2432 atoms - replicate CPU = 0.001 seconds -Neighbor list info ... - update every 20 steps, delay 0 steps, check no - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 11 - ghost atom cutoff = 11 - binsize = 5.5, bins = 10 5 6 - 2 neighbor lists, perpetual/occasional/extra = 2 0 0 - (1) pair reax/c, perpetual - attributes: half, newton off, ghost - pair build: half/bin/newtoff/ghost - stencil: full/ghost/bin/3d - bin: standard - (2) fix qeq/reax, perpetual, copy from (1) - attributes: half, newton off, ghost - pair build: copy - stencil: none - bin: none -Setting up Verlet run ... - Unit style : real - Current step : 0 - Time step : 0.1 -Per MPI rank memory allocation (min/avg/max) = 215.0 | 215.0 | 215.0 Mbytes -Step Temp PotEng Press E_vdwl E_coul Volume - 0 300 -113.27833 437.52122 -111.57687 -1.7014647 27418.867 - 10 299.38517 -113.27631 1439.2857 -111.57492 -1.7013813 27418.867 - 20 300.27107 -113.27884 3764.3739 -111.57762 -1.7012246 27418.867 - 30 302.21063 -113.28428 7007.6914 -111.58335 -1.7009363 27418.867 - 40 303.52265 -113.28799 9844.84 -111.58747 -1.7005186 27418.867 - 50 301.87059 -113.28324 9663.0443 -111.58318 -1.7000524 27418.867 - 60 296.67807 -113.26777 7273.7928 -111.56815 -1.6996137 27418.867 - 70 292.19999 -113.25435 5533.6428 -111.55514 -1.6992157 27418.867 - 80 293.58677 -113.25831 5993.4151 -111.55946 -1.6988533 27418.867 - 90 300.62636 -113.27925 7202.8651 -111.58069 -1.6985591 27418.867 - 100 305.38276 -113.29357 10085.748 -111.59518 -1.6983875 27418.867 -Loop time of 31.2338 on 1 procs for 100 steps with 2432 atoms - -Performance: 0.028 ns/day, 867.606 hours/ns, 3.202 timesteps/s -91.3% CPU use with 1 MPI 
tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 23.353 | 23.353 | 23.353 | 0.0 | 74.77 -Neigh | 0.62616 | 0.62616 | 0.62616 | 0.0 | 2.00 -Comm | 0.0096617 | 0.0096617 | 0.0096617 | 0.0 | 0.03 -Output | 0.00044694 | 0.00044694 | 0.00044694 | 0.0 | 0.00 -Modify | 7.2429 | 7.2429 | 7.2429 | 0.0 | 23.19 -Other | | 0.001518 | | | 0.00 - -Nlocal: 2432.00 ave 2432 max 2432 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 10685.0 ave 10685 max 10685 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 823958.0 ave 823958 max 823958 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 823958 -Ave neighs/atom = 338.79852 -Neighbor list builds = 5 -Dangerous builds not checked -Total wall time: 0:00:32 diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-3-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-3-minicluster-size-2/log.out deleted file mode 100644 index b6380b6..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-3-minicluster-size-2/log.out +++ /dev/null @@ -1,80 +0,0 @@ -LAMMPS (29 Sep 2021 - Update 2) -OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98) - using 1 OpenMP thread(s) per MPI task -Reading data file ... - triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000) - 1 by 1 by 1 MPI processor grid - reading atoms ... - 304 atoms - reading velocities ... - 304 velocities - read_data CPU = 0.002 seconds -Replicating atoms ... 
- triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000) - 1 by 1 by 1 MPI processor grid - bounding box image = (0 -1 -1) to (0 1 1) - bounding box extra memory = 0.03 MB - average # of replicas added to proc = 8.00 out of 8 (100.00%) - 2432 atoms - replicate CPU = 0.001 seconds -Neighbor list info ... - update every 20 steps, delay 0 steps, check no - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 11 - ghost atom cutoff = 11 - binsize = 5.5, bins = 10 5 6 - 2 neighbor lists, perpetual/occasional/extra = 2 0 0 - (1) pair reax/c, perpetual - attributes: half, newton off, ghost - pair build: half/bin/newtoff/ghost - stencil: full/ghost/bin/3d - bin: standard - (2) fix qeq/reax, perpetual, copy from (1) - attributes: half, newton off, ghost - pair build: copy - stencil: none - bin: none -Setting up Verlet run ... - Unit style : real - Current step : 0 - Time step : 0.1 -Per MPI rank memory allocation (min/avg/max) = 215.0 | 215.0 | 215.0 Mbytes -Step Temp PotEng Press E_vdwl E_coul Volume - 0 300 -113.27833 437.52122 -111.57687 -1.7014647 27418.867 - 10 299.38517 -113.27631 1439.2857 -111.57492 -1.7013813 27418.867 - 20 300.27107 -113.27884 3764.3739 -111.57762 -1.7012246 27418.867 - 30 302.21063 -113.28428 7007.6914 -111.58335 -1.7009363 27418.867 - 40 303.52265 -113.28799 9844.84 -111.58747 -1.7005186 27418.867 - 50 301.87059 -113.28324 9663.0443 -111.58318 -1.7000524 27418.867 - 60 296.67807 -113.26777 7273.7928 -111.56815 -1.6996137 27418.867 - 70 292.19999 -113.25435 5533.6428 -111.55514 -1.6992157 27418.867 - 80 293.58677 -113.25831 5993.4151 -111.55946 -1.6988533 27418.867 - 90 300.62636 -113.27925 7202.8651 -111.58069 -1.6985591 27418.867 - 100 305.38276 -113.29357 10085.748 -111.59518 -1.6983875 27418.867 -Loop time of 29.6229 on 1 procs for 100 steps with 2432 atoms - -Performance: 0.029 ns/day, 822.859 hours/ns, 3.376 timesteps/s -94.4% CPU use with 1 MPI 
tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 22.175 | 22.175 | 22.175 | 0.0 | 74.86 -Neigh | 0.63724 | 0.63724 | 0.63724 | 0.0 | 2.15 -Comm | 0.0097153 | 0.0097153 | 0.0097153 | 0.0 | 0.03 -Output | 0.00041342 | 0.00041342 | 0.00041342 | 0.0 | 0.00 -Modify | 6.799 | 6.799 | 6.799 | 0.0 | 22.95 -Other | | 0.001424 | | | 0.00 - -Nlocal: 2432.00 ave 2432 max 2432 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 10685.0 ave 10685 max 10685 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 823958.0 ave 823958 max 823958 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 823958 -Ave neighs/atom = 338.79852 -Neighbor list builds = 5 -Dangerous builds not checked -Total wall time: 0:00:30 diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-4-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-4-minicluster-size-2/log.out deleted file mode 100644 index 6c889f5..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-4-minicluster-size-2/log.out +++ /dev/null @@ -1,80 +0,0 @@ -LAMMPS (29 Sep 2021 - Update 2) -OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98) - using 1 OpenMP thread(s) per MPI task -Reading data file ... - triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000) - 1 by 1 by 1 MPI processor grid - reading atoms ... - 304 atoms - reading velocities ... - 304 velocities - read_data CPU = 0.002 seconds -Replicating atoms ... 
- triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000) - 1 by 1 by 1 MPI processor grid - bounding box image = (0 -1 -1) to (0 1 1) - bounding box extra memory = 0.03 MB - average # of replicas added to proc = 8.00 out of 8 (100.00%) - 2432 atoms - replicate CPU = 0.001 seconds -Neighbor list info ... - update every 20 steps, delay 0 steps, check no - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 11 - ghost atom cutoff = 11 - binsize = 5.5, bins = 10 5 6 - 2 neighbor lists, perpetual/occasional/extra = 2 0 0 - (1) pair reax/c, perpetual - attributes: half, newton off, ghost - pair build: half/bin/newtoff/ghost - stencil: full/ghost/bin/3d - bin: standard - (2) fix qeq/reax, perpetual, copy from (1) - attributes: half, newton off, ghost - pair build: copy - stencil: none - bin: none -Setting up Verlet run ... - Unit style : real - Current step : 0 - Time step : 0.1 -Per MPI rank memory allocation (min/avg/max) = 215.0 | 215.0 | 215.0 Mbytes -Step Temp PotEng Press E_vdwl E_coul Volume - 0 300 -113.27833 437.52122 -111.57687 -1.7014647 27418.867 - 10 299.38517 -113.27631 1439.2857 -111.57492 -1.7013813 27418.867 - 20 300.27107 -113.27884 3764.3739 -111.57762 -1.7012246 27418.867 - 30 302.21063 -113.28428 7007.6914 -111.58335 -1.7009363 27418.867 - 40 303.52265 -113.28799 9844.84 -111.58747 -1.7005186 27418.867 - 50 301.87059 -113.28324 9663.0443 -111.58318 -1.7000524 27418.867 - 60 296.67807 -113.26777 7273.7928 -111.56815 -1.6996137 27418.867 - 70 292.19999 -113.25435 5533.6428 -111.55514 -1.6992157 27418.867 - 80 293.58677 -113.25831 5993.4151 -111.55946 -1.6988533 27418.867 - 90 300.62636 -113.27925 7202.8651 -111.58069 -1.6985591 27418.867 - 100 305.38276 -113.29357 10085.748 -111.59518 -1.6983875 27418.867 -Loop time of 29.7805 on 1 procs for 100 steps with 2432 atoms - -Performance: 0.029 ns/day, 827.235 hours/ns, 3.358 timesteps/s -94.2% CPU use with 1 MPI 
tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 22.214 | 22.214 | 22.214 | 0.0 | 74.59 -Neigh | 0.62414 | 0.62414 | 0.62414 | 0.0 | 2.10 -Comm | 0.01756 | 0.01756 | 0.01756 | 0.0 | 0.06 -Output | 0.00041921 | 0.00041921 | 0.00041921 | 0.0 | 0.00 -Modify | 6.9226 | 6.9226 | 6.9226 | 0.0 | 23.25 -Other | | 0.00152 | | | 0.01 - -Nlocal: 2432.00 ave 2432 max 2432 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 10685.0 ave 10685 max 10685 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 823958.0 ave 823958 max 823958 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 823958 -Ave neighs/atom = 338.79852 -Neighbor list builds = 5 -Dangerous builds not checked -Total wall time: 0:00:30 diff --git a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-5-minicluster-size-2/log.out b/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-5-minicluster-size-2/log.out deleted file mode 100644 index 9c9d4df..0000000 --- a/examples/up-submit-down/data/k8s-size-4-n1-standard-1/reaxc-hns-5-minicluster-size-2/log.out +++ /dev/null @@ -1,80 +0,0 @@ -LAMMPS (29 Sep 2021 - Update 2) -OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98) - using 1 OpenMP thread(s) per MPI task -Reading data file ... - triclinic box = (0.0000000 0.0000000 0.0000000) to (22.326000 11.141200 13.778966) with tilt (0.0000000 -5.0260300 0.0000000) - 1 by 1 by 1 MPI processor grid - reading atoms ... - 304 atoms - reading velocities ... - 304 velocities - read_data CPU = 0.002 seconds -Replicating atoms ... 
- triclinic box = (0.0000000 0.0000000 0.0000000) to (44.652000 22.282400 27.557932) with tilt (0.0000000 -10.052060 0.0000000) - 1 by 1 by 1 MPI processor grid - bounding box image = (0 -1 -1) to (0 1 1) - bounding box extra memory = 0.03 MB - average # of replicas added to proc = 8.00 out of 8 (100.00%) - 2432 atoms - replicate CPU = 0.001 seconds -Neighbor list info ... - update every 20 steps, delay 0 steps, check no - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 11 - ghost atom cutoff = 11 - binsize = 5.5, bins = 10 5 6 - 2 neighbor lists, perpetual/occasional/extra = 2 0 0 - (1) pair reax/c, perpetual - attributes: half, newton off, ghost - pair build: half/bin/newtoff/ghost - stencil: full/ghost/bin/3d - bin: standard - (2) fix qeq/reax, perpetual, copy from (1) - attributes: half, newton off, ghost - pair build: copy - stencil: none - bin: none -Setting up Verlet run ... - Unit style : real - Current step : 0 - Time step : 0.1 -Per MPI rank memory allocation (min/avg/max) = 215.0 | 215.0 | 215.0 Mbytes -Step Temp PotEng Press E_vdwl E_coul Volume - 0 300 -113.27833 437.52122 -111.57687 -1.7014647 27418.867 - 10 299.38517 -113.27631 1439.2857 -111.57492 -1.7013813 27418.867 - 20 300.27107 -113.27884 3764.3739 -111.57762 -1.7012246 27418.867 - 30 302.21063 -113.28428 7007.6914 -111.58335 -1.7009363 27418.867 - 40 303.52265 -113.28799 9844.84 -111.58747 -1.7005186 27418.867 - 50 301.87059 -113.28324 9663.0443 -111.58318 -1.7000524 27418.867 - 60 296.67807 -113.26777 7273.7928 -111.56815 -1.6996137 27418.867 - 70 292.19999 -113.25435 5533.6428 -111.55514 -1.6992157 27418.867 - 80 293.58677 -113.25831 5993.4151 -111.55946 -1.6988533 27418.867 - 90 300.62636 -113.27925 7202.8651 -111.58069 -1.6985591 27418.867 - 100 305.38276 -113.29357 10085.748 -111.59518 -1.6983875 27418.867 -Loop time of 30.0677 on 1 procs for 100 steps with 2432 atoms - -Performance: 0.029 ns/day, 835.214 hours/ns, 3.326 timesteps/s -93.3% CPU use with 1 MPI 
tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 22.337 | 22.337 | 22.337 | 0.0 | 74.29 -Neigh | 0.73472 | 0.73472 | 0.73472 | 0.0 | 2.44 -Comm | 0.009731 | 0.009731 | 0.009731 | 0.0 | 0.03 -Output | 0.00041722 | 0.00041722 | 0.00041722 | 0.0 | 0.00 -Modify | 6.9844 | 6.9844 | 6.9844 | 0.0 | 23.23 -Other | | 0.001495 | | | 0.00 - -Nlocal: 2432.00 ave 2432 max 2432 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 10685.0 ave 10685 max 10685 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 823958.0 ave 823958 max 823958 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 823958 -Ave neighs/atom = 338.79852 -Neighbor list builds = 5 -Dangerous builds not checked -Total wall time: 0:00:30 diff --git a/examples/up-submit-down/plot_results.py b/examples/up-submit-down/plot_results.py deleted file mode 100644 index 6395f83..0000000 --- a/examples/up-submit-down/plot_results.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import json -import os -import sys - -import matplotlib.pyplot as plt -import pandas -import seaborn as sns - - -def read_json(filename): - """ - Read a file into a text blob. - """ - with open(filename, "r") as fd: - content = json.loads(fd.read()) - return content - - -def plot_outputs(raw, plotname, ext="pdf"): - """ - Parse results.json into dataframe and plots to save. 
- """ - # Let's save the following, with runid as index - columns = ["minicluster_size", "job_type", "time_seconds", "time_type"] - - # Let's first organize distributions of times - data = [] - index = [] - for jobname, item in raw["info"].items(): - index += [jobname, jobname, jobname] - jobtype = jobname.split("-minicluster-size")[0].rsplit("-", 1)[0] - - # This is how flux-cloud organized the output - minicluster_size = int(jobname.rsplit("size-", 1)[-1]) - - # Manual melt :) - data.append([minicluster_size, jobtype, item["runtime"], "runtime"]) - data.append( - [ - minicluster_size, - jobtype, - item["start_to_output_seconds"], - "output_seconds", - ] - ) - data.append( - [minicluster_size, jobtype, item["start_to_info_seconds"], "info_seconds"] - ) - - # Assemble the data frame, index is the runids - df = pandas.DataFrame(data, columns=columns) - df.index = index - - # Save raw data - df.to_csv("results-df.csv") - - # We need colors! - colors = sns.color_palette("hls", 8) - hexcolors = colors.as_hex() - - palette = {} - for size in df.time_type.unique(): - palette[size] = hexcolors.pop(0) - - # Sort by size - palette = dict(sorted(palette.items())) - - # Let's make a plot that shows distributions of the times by the cluster size, across all - make_plot( - df, - title="Flux MiniCluster Time Variation", - tag="minicluster_times", - ydimension="time_seconds", - palette=palette, - ext=ext, - plotname=plotname, - ) - - -def make_plot(df, title, tag, ydimension, palette, ext="pdf", plotname="lammps"): - """ - Helper function to make common plots. 
- """ - ext = ext.strip(".") - plt.figure(figsize=(12, 12)) - sns.set_style("dark") - ax = sns.boxplot( - x="job_type", - y=ydimension, - hue="time_type", - data=df, - whis=[5, 95], - palette=palette, - ) - plt.title(title) - plt.legend([], [], frameon=False) - ax.set_xlabel("Job Type", fontsize=16) - ax.set_ylabel("Time (seconds)", fontsize=16) - ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize=14) - ax.set_yticklabels(ax.get_yticks(), fontsize=14) - handles, _ = ax.get_legend_handles_labels() - ax.legend(handles, list(palette)) - plt.savefig(f"{tag}_{plotname}.{ext}") - plt.clf() - - -def get_parser(): - """ - Process results file into plots. - """ - parser = argparse.ArgumentParser(description="Plot LAMMPS outputs") - parser.add_argument("results_json", help="results json file", nargs="?") - parser.add_argument( - "-p", - "--plotname", - default="lammps", - help="base name for plot output files", - ) - parser.add_argument( - "-e", - "--extension", - dest="extension", - default="pdf", - help="image extension to use (defaults to pdf)", - ) - return parser - - -def main(): - """ - Read in results json, and make plots. 
- """ - parser = get_parser() - args = parser.parse_args() - if not os.path.exists(args.results_json): - sys.exit(f"{args.results_json} does not exist.") - data = read_json(args.results_json) - plot_outputs(data, args.plotname, ext=args.extension) - - -if __name__ == "__main__": - main() diff --git a/fluxcloud/client/__init__.py b/fluxcloud/client/__init__.py index 673a585..1007f64 100644 --- a/fluxcloud/client/__init__.py +++ b/fluxcloud/client/__init__.py @@ -130,11 +130,6 @@ def get_parser(): description="Bring the cluster up, run experiments via applying CRDs, and bring it down.", formatter_class=argparse.RawTextHelpFormatter, ) - ui = subparsers.add_parser( - "ui", - description="Once the cluster is up, create/open the user interface.", - formatter_class=argparse.RawTextHelpFormatter, - ) batch = subparsers.add_parser( "batch", description="Bring the cluster up, run experiments via a Flux Restful API submit, and bring it down.", @@ -167,13 +162,38 @@ def get_parser(): help="Bring down all experiment clusters", dest="down_all", ) + for command in submit, apply: + command.add_argument( + "--non-interactive", + "--ni", + default=False, + action="store_true", + help="Don't ask before bringing miniclusters down or re-creating.", + dest="non_interactive", + ) + + experiment = subparsers.add_parser( + "experiment", + description="Experiment controller.", + formatter_class=argparse.RawTextHelpFormatter, + ) + experiment.add_argument( + "experiment_command", + help="Command for experiment (defaults to init)", + ) + experiment.add_argument( + "-c", + "--cloud", + help="cloud to use", + choices=clouds.cloud_names, + ) listing = subparsers.add_parser( "list", description="List experiment ids available.", formatter_class=argparse.RawTextHelpFormatter, ) - for command in run, up, down, apply, listing, batch, submit, ui: + for command in run, up, down, apply, listing, batch, submit: command.add_argument( "experiments", default="experiments.yaml", @@ -188,7 +208,7 @@ def 
get_parser(): choices=clouds.cloud_names, ) - for command in apply, up, down, run, batch, submit, ui: + for command in apply, up, down, run, batch, submit: command.add_argument( "--force-cluster", dest="force_cluster", @@ -228,11 +248,6 @@ def get_parser(): default=False, action="store_true", ) - command.add_argument( - "--template", - help="minicluster yaml template to populate for experiments (defaults to minicluster-template.yaml", - default="minicluster-template.yaml", - ) command.add_argument( "--force", help="force re-run if experiment already exists.", @@ -287,22 +302,22 @@ def help(return_code=0): # Does the user want a shell? if args.command == "apply": from .apply import main - elif args.command == "submit": - from .apply import submit as main - elif args.command == "list": - from .listing import main - elif args.command == "run": - from .run import main elif args.command == "batch": from .run import batch as main elif args.command == "config": from .config import main - elif args.command == "ui": - from .ui import main - elif args.command == "up": - from .up import main elif args.command == "down": from .down import main + elif args.command == "experiment": + from .experiment import main + elif args.command == "list": + from .listing import main + elif args.command == "run": + from .run import main + elif args.command == "submit": + from .apply import submit as main + elif args.command == "up": + from .up import main # Pass on to the correct parser return_code = 0 diff --git a/fluxcloud/client/apply.py b/fluxcloud/client/apply.py index 13d1369..db6d0ee 100644 --- a/fluxcloud/client/apply.py +++ b/fluxcloud/client/apply.py @@ -11,7 +11,7 @@ def main(args, parser, extra, subparser): apply parser submits via separate CRDs. 
""" cli, setup, experiment = prepare_client(args, extra) - cli.apply(setup, experiment=experiment) + cli.apply(setup, experiment=experiment, interactive=not args.non_interactive) setup.cleanup(setup.matrices) @@ -20,5 +20,5 @@ def submit(args, parser, extra, subparser): submit parser submits via the Flux Restful API to one cluster """ cli, setup, experiment = prepare_client(args, extra) - cli.submit(setup, experiment=experiment) + cli.submit(setup, experiment=experiment, interactive=not args.non_interactive) setup.cleanup(setup.matrices) diff --git a/fluxcloud/client/experiment.py b/fluxcloud/client/experiment.py new file mode 100644 index 0000000..0229135 --- /dev/null +++ b/fluxcloud/client/experiment.py @@ -0,0 +1,27 @@ +# Copyright 2023 Lawrence Livermore National Security, LLC and other +# This is part of Flux Framework. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: Apache-2.0 + +import fluxcloud.main.template as templates +from fluxcloud.logger import logger +from fluxcloud.main import get_experiment_client + + +def main(args, parser, extra, subparser): + """ + apply parser submits via separate CRDs. + """ + cli = get_experiment_client(args.cloud) + if args.experiment_command == "init": + if cli.name == "aws": + print(templates.aws_experiment_template) + elif cli.name in ["google", "gcp"]: + print(templates.google_experiment_template) + elif cli.name == "minikube": + print(templates.minikube_experiment_template) + else: + logger.error(f"Client {cli.name} is not a recognized cloud") + + else: + logger.exit(f'{args.experiment_command} is not recognized. 
Try "init"') diff --git a/fluxcloud/client/helpers.py b/fluxcloud/client/helpers.py index d9973d4..1aba57c 100644 --- a/fluxcloud/client/helpers.py +++ b/fluxcloud/client/helpers.py @@ -17,11 +17,10 @@ def prepare_client(args, extra): """ utils.ensure_no_extra(extra) - cli = get_experiment_client(args.cloud) + cli = get_experiment_client(args.cloud, debug=args.debug) setup = ExperimentSetup( args.experiments, force_cluster=args.force_cluster, - template=args.template, cleanup=args.cleanup, # Ensure the output directory is namespaced by the cloud name outdir=os.path.join(args.output_dir, cli.name), diff --git a/fluxcloud/client/ui.py b/fluxcloud/client/ui.py deleted file mode 100644 index 40ae1d1..0000000 --- a/fluxcloud/client/ui.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2022 Lawrence Livermore National Security, LLC and other -# This is part of Flux Framework. See the COPYRIGHT file for details. -# -# SPDX-License-Identifier: Apache-2.0 - -from fluxcloud.logger import logger - -from .helpers import prepare_client - - -def main(args, parser, extra, subparser): - """ - open the ui by starting flux - """ - cli, setup, experiment = prepare_client(args, extra) - size = args.size - if not size and len(experiment.minicluster.get("size")) != 1: - logger.exit( - "Your MiniCluster has more than one size - please define the targer size with --size." 
- ) - elif not size: - size = experiment.minicluster["size"][0] - logger.info(f"Selected size {size} MiniCluster to open user interface.") - cli.open_ui(setup, experiment=experiment, size=size, persistent=True) diff --git a/fluxcloud/defaults.py b/fluxcloud/defaults.py index fb64369..d321073 100644 --- a/fluxcloud/defaults.py +++ b/fluxcloud/defaults.py @@ -13,9 +13,6 @@ # The default settings file in the install root default_settings_file = os.path.join(reps["$install_dir"], "settings.yml") -# Default template if one is not provided -default_minicluster_template = os.path.join(install_dir, "minicluster-template.yaml") - # User home userhome = os.path.expanduser("~/.fluxcloud") diff --git a/fluxcloud/main/__init__.py b/fluxcloud/main/__init__.py index 836c786..ce6c478 100644 --- a/fluxcloud/main/__init__.py +++ b/fluxcloud/main/__init__.py @@ -1,10 +1,10 @@ -# Copyright 2022 Lawrence Livermore National Security, LLC and other +# Copyright 2022-2023 Lawrence Livermore National Security, LLC and other # This is part of Flux Framework. See the COPYRIGHT file for details. # # SPDX-License-Identifier: Apache-2.0 -def get_experiment_client(cloud=None, force_cluster=False): +def get_experiment_client(cloud=None, **kwargs): """ Create the cloud experiment client. 
""" @@ -19,4 +19,4 @@ def get_experiment_client(cloud=None, force_cluster=False): cloud = clouds.get_cloud(cloud) else: cloud = clients.ExperimentClient - return cloud(force_cluster=force_cluster) + return cloud(**kwargs) diff --git a/fluxcloud/main/api.py b/fluxcloud/main/api.py index 24c693e..860217d 100644 --- a/fluxcloud/main/api.py +++ b/fluxcloud/main/api.py @@ -3,167 +3,319 @@ # # SPDX-License-Identifier: Apache-2.0 -import atexit -import logging import os +import re import shutil -import subprocess -import threading import time import uuid from flux_restful_client.main import get_client +from fluxoperator.client import FluxOperator import fluxcloud.utils as utils from fluxcloud.logger import logger here = os.path.dirname(os.path.abspath(__file__)) -exit_event = threading.Event() - class APIClient: - def __init__(self, token=None, user=None): + def __init__(self, token=None, user=None, secret_key=None): """ API client wrapper. """ - self.user = token or os.environ.get("FLUX_USER") or "fluxuser" + self.user = token or os.environ.get("FLUX_USER") or user or "fluxuser" self.token = token or os.environ.get("FLUX_TOKEN") or str(uuid.uuid4()) - self.cli = get_client(user=self.user, token=self.token) + self.secret_key = ( + secret_key or os.environ.get("FLUX_SECRET_KEY") or str(uuid.uuid4()) + ) self.proc = None self.broker_pod = None - def check(self, experiment): + def show_credentials(self): """ - Set the basic auth for username and password and check it works + Show the token and user, if requested. """ - minicluster = experiment.minicluster - get_broker_pod = experiment.get_shared_script( - "broker-id", {"minicluster": minicluster} - ) - - logger.info("Waiting for id of running broker pod...") - - # We've already waited for them to be running - broker_pod = None - while not broker_pod: - result = utils.run_capture(["/bin/bash", get_broker_pod], stream=True) - - # Save the broker pod, or exit on failure. 
- if result["message"]: - broker_pod = result["message"].strip() - - self.broker_pod = broker_pod - self.port_forward(minicluster["namespace"], self.broker_pod) + logger.info("MiniCluster created with credentials:") + logger.info(f" FLUX_USER={self.user}") + logger.info(f" FLUX_TOKEN={self.token}") - def port_forward(self, namespace, broker_pod): + def _set_minicluster_credentials(self, minicluster): """ - Ask user to open port to forward + If the user provided credentials, use """ - command = ["kubectl", "port-forward", "-n", namespace, broker_pod, "5000:5000"] + if "flux_restful" not in minicluster: + minicluster["flux_restful"] = {} - # This is detached - we can kill but not interact - logger.info(" ".join(command)) - self.proc = proc = subprocess.Popen( - command, - stdout=subprocess.DEVNULL if logger.level >= logging.DEBUG else None, - ) + if "username" not in minicluster["flux_restful"]: + minicluster["flux_restful"]["username"] = self.user + + if "token" not in minicluster["flux_restful"]: + minicluster["flux_restful"]["token"] = self.token - def cleanup(): - proc.kill() + if "secret_key" not in minicluster["flux_restful"]: + minicluster["flux_restful"]["secret_key"] = self.secret_key - # Ensure we cleanup if anything goes wrong - atexit.register(cleanup) + # Update credentials + self.user = minicluster["flux_restful"]["username"] + self.token = minicluster["flux_restful"]["token"] + self.secret_key = minicluster["flux_restful"]["secret_key"] + return minicluster - def submit(self, setup, experiment, size): + def _create_minicluster( + self, operator, minicluster, experiment, job, interactive=True + ): """ - Use the client to submit the jobs programatically. + Shared function to take an operator handle and create the minicluster. + + This can be used for apply or submit! We separate minicluster (gets + piped into the MiniClusterSpec) from job (gets piped into a + MiniClusterContainer spec). """ - # Submit jobs! 
- - # Sleep time will be time of last job, assuming they are similar - sleep_time = 5 - for jobname, job in experiment.jobs.items(): - # Do we want to run this job for this size and machine? - if not experiment.check_job_run(job, size): - logger.debug( - f"Skipping job {jobname} as does not match inclusion criteria." - ) - continue + namespace = minicluster["namespace"] + image = job["image"] + name = minicluster["name"] + size = minicluster["size"] + + self._set_minicluster_credentials(minicluster) + + try: + # The operator will time creation through pods being ready + result = operator.create_minicluster(**minicluster, container=job) + except Exception as e: + # Give the user the option to delete and recreate or just exit + logger.error(f"There was an issue creating the MiniCluster: {e}") + if interactive and not utils.confirm_action( + "Would you like to submit jobs to the current cluster? You will need to have provided the same username as password." + ): + if utils.confirm_action( + "Would you like to delete this mini cluster and re-create?" 
+ ): + logger.info("Cleaning up MiniCluster...") + operator.delete_minicluster(name=name, namespace=namespace) + return self._create_minicluster( + operator, minicluster, experiment, job, interactive=interactive + ) + else: + logger.exit( + f"Try: 'kubectl delete -n {namespace} minicluster {name}'" + ) + elif not interactive: + logger.exit(f"Try: 'kubectl delete -n {namespace} minicluster {name}'") + return + + # Wait for pods to be ready to include in minicluster up time + self.show_credentials() + + # Save MiniCluster metadata + image_slug = re.sub("(:|/)", "-", image) + uid = f"{size}-{name}-{image_slug}" + experiment.save_json(result, f"minicluster-size-{uid}.json") + + # This is a good point to also save nodes metadata + nodes = operator.get_nodes() + operator.wait_pods(quiet=True) + pods = operator.get_pods() + + experiment.save_file(nodes.to_str(), f"nodes-{uid}.json") + experiment.save_file(pods.to_str(), f"pods-size-{uid}.json") + return result + + def apply( + self, + experiment, + minicluster, + job=None, + outfile=None, + stdout=True, + interactive=True, + ): + """ + Use the client to apply (1:1 job,minicluster) the jobs programatically. + """ + namespace = minicluster["namespace"] + name = minicluster["name"] - if "command" not in job: - logger.debug(f"Skipping job {jobname} as does not have a command.") - continue + # Interact with the Flux Operator Python SDK + operator = FluxOperator(namespace) - # The experiment is defined by the machine type and size - experiment_dir = experiment.root_dir + self._create_minicluster( + operator, minicluster, experiment, job, interactive=interactive + ) - # Add the size - jobname = f"{jobname}-minicluster-size-{size}" - job_output = os.path.join(experiment_dir, jobname) - logfile = os.path.join(job_output, "log.out") + # Get the broker pod (this would also wait for all pods to be ready) + broker = operator.get_broker_pod() - # Do we have output? 
- if os.path.exists(logfile) and not setup.force: - relpath = os.path.relpath(logfile, experiment_dir) - logger.warning( - f"{relpath} already exists and force is False, skipping." - ) - continue - - elif os.path.exists(logfile) and setup.force: - logger.warning(f"Cleaning up previous run in {job_output}.") - shutil.rmtree(job_output) - - # Create job directory anew - utils.mkdir_p(job_output) - - kwargs = dict(job) - del kwargs["command"] - - # Assume the task gets all nodes, unless specified in job - # Also assume the flux restful server is using one node - if "nodes" not in kwargs: - kwargs["nodes"] = size - 1 - if "tasks" not in kwargs: - kwargs["tasks"] = size - 1 - - # Ensure we convert - map between job params and the flux restful api - for convert in ( - ["num_tasks", "tasks"], - ["cores_per_task", "cores"], - ["gpus_per_task", "gpus"], - ["num_nodes", "nodes"], - ): - if convert[1] in kwargs: - kwargs[convert[0]] = kwargs[convert[1]] + # Time from when broker pod (and all pods are ready) + start = time.time() - # Let's also keep track of actual time to get logs, info, etc. - start = time.time() + # Get the pod to stream output from directly + if outfile is not None: + operator.stream_output(outfile, pod=broker, stdout=stdout) - # Run and block output until job is done - res = self.cli.submit(command=job["command"], **kwargs) + # When output done streaming, job is done + end = time.time() + logger.info(f"Job {name} is complete! Cleaning up MiniCluster...") - logger.info(f"Submitting {jobname}: {job['command']}") - info = self.cli.jobs(res["id"]) + # This also waits for termination (and pods to be gone) and times it + operator.delete_minicluster(name=name, namespace=namespace) - while info["returncode"] == "": - info = self.cli.jobs(res["id"]) - time.sleep(sleep_time) + # TODO likely need to separate minicluster up/down times. 
+ results = {"times": operator.times} + results["times"][name] = end - start + return results - end1 = time.time() - output = self.cli.output(res["id"]).get("Output") - if output: - utils.write_file("".join(output), logfile) - end2 = time.time() + def submit( + self, setup, experiment, minicluster, job, poll_seconds=20, interactive=True + ): + """ + Use the client to submit the jobs programatically. + """ + namespace = minicluster["namespace"] + image = job["image"] + name = minicluster["name"] + size = minicluster["size"] + + # Interact with the Flux Operator Python SDK + operator = FluxOperator(namespace) - # Get the full job info, and add some wrapper times - info = self.cli.jobs(res["id"]) - info["start_to_info_seconds"] = end1 - start - info["start_to_output_seconds"] = end2 - start + self._create_minicluster( + operator, minicluster, experiment, job, interactive=interactive + ) - yield jobname, info - sleep_time = info["runtime"] + # Get the broker pod (this would also wait for all pods to be ready) + broker = operator.get_broker_pod() + + # Return results (and times) to calling client + results = {} + + # Submit jobs via port forward - this waits until the server is ready + with operator.port_forward(broker) as forward_url: + print(f"Port forward opened to {forward_url}") + + # See https://flux-framework.org/flux-restful-api/getting_started/api.html + cli = get_client( + host=forward_url, + user=self.user, + token=self.token, + secret_key=self.secret_key, + ) + cli.set_basic_auth(self.user, self.token) + + # Keep a lookup of jobid and output files. + # We will try waiting for all jobs to finish and then save output + jobs = [] + for jobname, job in experiment.jobs.items(): + # Do we want to run this job for this size, image? + if not experiment.check_job_run(job, size=size, image=image): + logger.debug( + f"Skipping job {jobname} as does not match inclusion criteria." 
+ ) + continue + + if "command" not in job: + logger.debug(f"Skipping job {jobname} as does not have a command.") + continue + + # Here we submit all jobs to the scheduler. Let the scheduler handle it! + submit_job = self.submit_job( + cli, experiment, setup, minicluster, job, jobname + ) + if not submit_job: + continue + jobs.append(submit_job) + + logger.info(f"Submit {len(jobs)} jobs! Waiting for completion...") + + # Poll once every 30 seconds + # This could be improved with some kind of notification / pubsub thing + completed = [] + while jobs: + logger.info(f"{len(jobs)} are active.") + time.sleep(poll_seconds) + unfinished = [] + for job in jobs: + if "id" not in job: + logger.warning( + f"Job {job} is missing an id or name, likely an issue or not ready, skipping." + ) + continue + + info = cli.jobs(job["id"]) + + # If we don't have a name yet, it's still pending + if "name" not in info: + unfinished.append(job) + continue + + jobname = info["name"].rjust(15) + if info["state"] == "INACTIVE": + finish_time = round(info["runtime"], 2) + logger.debug( + f"{jobname} is finished {info['result']} in {finish_time} seconds." + ) + job["info"] = info + job["output"] = cli.output(job["id"]).get("Output") + completed.append(job) + else: + logger.debug(f"{jobname} is in state {info['state']}") + unfinished.append(job) + jobs = unfinished + + logger.info("All jobs are complete!") + + # This also waits for termination (and pods to be gone) and times it + if not interactive or utils.confirm_action( + "Would you like to delete this mini cluster?" 
+ ): + logger.info("Cleaning up MiniCluster...") + operator.delete_minicluster(name=name, namespace=namespace) + + # Get times recorded by FluxOperator Python SDK + results["jobs"] = completed + results["times"] = operator.times + return results + + def submit_job(self, cli, experiment, setup, minicluster, job, jobname): + """ + Submit the job (if appropriate for the minicluster) - # Kill the connection to the service - self.proc.kill() + Return an appended Flux Restful API job result with the expected + output file. + """ + # The experiment is defined by the machine type and size + experiment_dir = experiment.root_dir + + jobname = f"{jobname}-minicluster-size-{minicluster['size']}" + job_output = os.path.join(experiment_dir, jobname) + logfile = os.path.join(job_output, "log.out") + + # Do we have output? + if os.path.exists(logfile) and not setup.force: + relpath = os.path.relpath(logfile, experiment_dir) + logger.warning(f"{relpath} already exists and force is False, skipping.") + return + + if os.path.exists(logfile) and setup.force: + logger.warning(f"Cleaning up previous run in {job_output}.") + shutil.rmtree(job_output) + + kwargs = dict(job) + del kwargs["command"] + + # Ensure we convert - map between job params and the flux restful api + for convert in ( + ["num_tasks", "tasks"], + ["cores_per_task", "cores"], + ["gpus_per_task", "gpus"], + ["num_nodes", "nodes"], + ["workdir", "working_dir"], + ): + if convert[1] in kwargs: + kwargs[convert[0]] = kwargs[convert[1]] + del kwargs[convert[1]] + + # Submit the job, add the expected output file, and return + logger.info(f"Submitting {jobname}: {job['command']}") + res = cli.submit(command=job["command"], **kwargs) + res["job_output"] = logfile + return res diff --git a/fluxcloud/main/client.py b/fluxcloud/main/client.py index e5a8db5..49cdbb8 100644 --- a/fluxcloud/main/client.py +++ b/fluxcloud/main/client.py @@ -3,13 +3,13 @@ # # SPDX-License-Identifier: Apache-2.0 +import copy import os import shutil 
-import time +import fluxcloud.main.api as api import fluxcloud.utils as utils from fluxcloud.logger import logger -from fluxcloud.main.api import APIClient from fluxcloud.main.decorator import save_meta, timed here = os.path.dirname(os.path.abspath(__file__)) @@ -26,9 +26,10 @@ def __init__(self, *args, **kwargs): self.settings = settings.Settings self.info = {} self.times = {} + self.debug = kwargs.get("debug", False) # Job prefix is used for organizing time entries - self.job_prefix = "minicluster-run" + self.job_prefix = "job_" def __repr__(self): return str(self) @@ -67,7 +68,7 @@ def run(self, setup): # Each experiment has its own cluster size and machine type for experiment in setup.iter_experiments(): self.up(setup, experiment=experiment) - self.apply(setup, experiment=experiment) + self.apply(setup, experiment=experiment, interactive=False) self.down(setup, experiment=experiment) @save_meta @@ -82,7 +83,7 @@ def batch(self, setup): # Each experiment has its own cluster size and machine type for experiment in setup.iter_experiments(): self.up(setup, experiment=experiment) - self.submit(setup, experiment=experiment) + self.submit(setup, experiment=experiment, interactive=False) self.down(setup, experiment=experiment) @save_meta @@ -93,81 +94,7 @@ def down(self, *args, **kwargs): raise NotImplementedError @save_meta - def open_ui(self, setup, experiment, size, api=None, persistent=False): - """ - Launch a CRD that opens the UI only. 
- """ - # The MiniCluster can vary on size - minicluster = experiment.minicluster - - # Create a FluxRestful API to submit to - created = False - if api is None: - api = APIClient() - created = True - - logger.info(f"\n🌀 Bringing up MiniCluster of size {size}") - - # Get persistent variables for this job size, image is required - job = experiment.get_persistent_variables(size, required=["image"]) - job.update({"token": api.token, "user": api.user}) - - # We can't have a command - if "command" in job: - del job["command"] - - # Pre-pull containers, etc. - if hasattr(self, "pre_apply"): - self.pre_apply(experiment, "global-job", job=job) - - # Create the minicluster via a CRD without a command - crd = experiment.generate_crd(job, size) - - # Create one MiniCluster CRD (without a command) to run the Flux Restful API - kwargs = { - "minicluster": minicluster, - "crd": crd, - "token": api.token, - "user": api.user, - "size": size, - } - submit_script = experiment.get_shared_script( - "minicluster-create-persistent", kwargs, suffix=f"-size-{size}" - ) - # Start the MiniCluster! This should probably be done better... - self.run_timed( - f"minicluster-create-persistent-size-{size}", ["/bin/bash", submit_script] - ) - - # Ensure our credentials still work, and open port forward - api.check(experiment) - logger.info(f"\n🌀 MiniCluster of size {size} is up.\n") - - # If created for the first time, show credentials - if created: - logger.info( - "Save these if you want to log into the Flux RESTFul interface, there are specific to the MiniCluster" - ) - logger.info(f"export FLUX_USER={api.user}") - logger.info(f"export FLUX_TOKEN={api.token}") - - # If we exit, the port forward will close. 
- if persistent: - try: - logger.info("Press Control+c to Disconnect.") - while True: - time.sleep(10) - except KeyboardInterrupt: - logger.info("🧽️ Cleaning up!") - self.run_timed( - f"minicluster-persistent-destroy-size-{size}", - ["kubectl", "delete", "-f", crd], - ) - - return api, kwargs - - @save_meta - def submit(self, setup, experiment): + def submit(self, setup, experiment, interactive=True): """ Submit a Job via the Restful API """ @@ -177,8 +104,6 @@ def submit(self, setup, experiment): ) return - api = None - # Iterate through all the cluster sizes for size in experiment.minicluster["size"]: # We can't run if the minicluster > the experiment size @@ -188,24 +113,49 @@ def submit(self, setup, experiment): ) continue - # Open the api for the size - api, uiattrs = self.open_ui(setup, experiment, size, api) - logger.info(f"\n🌀 Bringing up MiniCluster of size {size}") + # Launch a unique Minicluster per container image. E.g., + # if the user provides 2 images for size 4, we create two MiniClusters + # This will provide all shared volumes across the jobs + for minicluster, job in experiment.get_submit_miniclusters(size): + logger.info( + f"\n🌀 Bringing up MiniCluster of size {size} with image {job['image']}" + ) + + # Create the API client (creates the user and token for the cluster) + cli = api.APIClient() - # Save times (and logs in submit) as we go - for jobid, info in api.submit(setup, experiment, size): - logger.info(f"{jobid} took {info['runtime']} seconds.") - self.times[jobid] = info["runtime"] - self.info[jobid] = info + # Pre-pull containers, etc. 
+ if hasattr(self, "pre_apply"): + self.pre_apply(experiment, minicluster["name"], job=job) - logger.info(f"\n🌀 MiniCluster of size {size} is finished") - self.run_timed( - f"minicluster-persistent-destroy-size-{size}", - ["kubectl", "delete", "-f", uiattrs["crd"]], - ) + # Get back results with times (for minicluster assets) and jobs + results = cli.submit( + setup, experiment, minicluster, job=job, interactive=interactive + ) + + # Save times and output files for jobs + for job in results.get("jobs", []): + self.save_job(job) + + def save_job(self, job): + """ + Save the job and add times to our times listing. + """ + jobid = f"{self.job_prefix}{job['id']}" + self.times[jobid] = job["info"]["runtime"] + + # Do we have an output file and output? + if job["output"]: + # Save to our output directory! + logfile = job["job_output"] + utils.mkdir_p(os.path.dirname(logfile)) + utils.write_file(job["output"], logfile) + + del job["output"] + self.info[jobid] = job @save_meta - def apply(self, setup, experiment): + def apply(self, setup, experiment, interactive=True): """ Apply a CRD to run the experiment and wait for output. 
@@ -246,22 +196,24 @@ def apply(self, setup, experiment): # Create job directory anew utils.mkdir_p(job_output) - # Generate the populated crd from the template - crd = experiment.generate_crd(job, size) - - # Prepare specific .crd for template - # Note the output directory is already specific to the job index - kwargs = { - "minicluster": experiment.minicluster, - "logfile": logfile, - "crd": crd, - } - apply_script = experiment.get_shared_script( - "minicluster-run", kwargs, suffix=f"-{jobname}" + # Prepare the client for one minicluster + cli = api.APIClient() + + # Prepare a specific MiniCluster for this size + minicluster = copy.deepcopy(experiment.minicluster) + minicluster["size"] = size + + # Get back results with times (for minicluster assets) and jobs + # If debug level, print job output to terminal too :) + results = cli.apply( + experiment=experiment, + minicluster=minicluster, + outfile=logfile, + stdout=self.debug, + job=job, + interactive=interactive, ) - - # Apply the job, and save to output directory - self.run_timed(f"{self.job_prefix}-{jobname}", ["/bin/bash", apply_script]) + self.times[jobname] = results["times"] # Save times between experiment runs experiment.save_metadata(self.times, self.info) diff --git a/fluxcloud/main/clouds/aws/scripts/cluster-create b/fluxcloud/main/clouds/aws/scripts/cluster-create index 3a3cd8e..14855d5 100755 --- a/fluxcloud/main/clouds/aws/scripts/cluster-create +++ b/fluxcloud/main/clouds/aws/scripts/cluster-create @@ -5,6 +5,7 @@ # Defaults - these are in the config but left here for information CLUSTER_NAME="{% if experiment.cluster_name %}{{ experiment.cluster_name }}{% else %}flux-cluster{% endif %}" +NAMESPACE="{% if experiment.minicluster_namespace %}{{ experiment.minicluster_namespace }}{% else %}flux-operator{% endif %}" REGION="{% if region %}{{ region }}{% else %}us-east-1{% endif %}" CLUSTER_VERSION="{% if experiment.kubernetes_version %}{{ experiment.kubernetes_version }}{% else %}1.23{% endif %}" 
MACHINE_TYPE="{% if experiment.machine %}{{ experiment.machine }}{% else %}m5.large{% endif %}" @@ -33,6 +34,7 @@ if [ -z ${MACHINE_TYPE+x} ]; then exit 1 fi +print_magenta " namespace: ${NAMESPACE}" print_magenta " cluster : ${CLUSTER_NAME}" print_magenta " version : ${CLUSTER_VERSION}" print_magenta " machine : ${MACHINE_TYPE}" @@ -64,6 +66,8 @@ run_echo eksctl create cluster -f ${CONFIG_FILE} # Deploy the operator install_operator ${SCRIPT_DIR} ${REPOSITORY} ${BRANCH} + +run_echo kubectl create namespace ${NAMESPACE} || true run_echo kubectl get namespace run_echo kubectl describe namespace operator-system diff --git a/fluxcloud/main/clouds/google/scripts/cluster-create b/fluxcloud/main/clouds/google/scripts/cluster-create index c33b78f..51c2562 100755 --- a/fluxcloud/main/clouds/google/scripts/cluster-create +++ b/fluxcloud/main/clouds/google/scripts/cluster-create @@ -5,6 +5,7 @@ # Defaults - these are in the config but left here for information CLUSTER_NAME="{% if experiment.cluster_name %}{{ experiment.cluster_name }}{% else %}flux{% endif %}" +NAMESPACE="{% if experiment.minicluster_namespace %}{{ experiment.minicluster_namespace }}{% else %}flux-operator{% endif %}" ZONE="{% if zone %}{{ zone }}{% else %}us-central1-a{% endif %}" CLUSTER_VERSION="{% if experiment.kubernetes_version %}{{ experiment.kubernetes_version }}{% else %}1.23{% endif %}" MACHINE_TYPE="{% if experiment.machine %}{{ experiment.machine }}{% else %}n1-standard-1{% endif %}" @@ -32,6 +33,7 @@ if [ -z ${MACHINE_TYPE+x} ]; then exit 1 fi +print_magenta " namespace: ${NAMESPACE}" print_magenta " cluster : ${CLUSTER_NAME}" print_magenta " version : ${CLUSTER_VERSION}" print_magenta " project : ${GOOGLE_PROJECT}" @@ -74,7 +76,7 @@ run_echo kubectl get nodes # Deploy the operator mkdir -p ${SCRIPT_DIR} install_operator ${SCRIPT_DIR} ${REPOSITORY} ${BRANCH} - +run_echo kubectl create namespace ${NAMESPACE} || true run_echo kubectl get namespace run_echo kubectl describe namespace 
operator-system diff --git a/fluxcloud/main/clouds/local/scripts/cluster-create-minikube b/fluxcloud/main/clouds/local/scripts/cluster-create-minikube index 35fb21c..116047d 100755 --- a/fluxcloud/main/clouds/local/scripts/cluster-create-minikube +++ b/fluxcloud/main/clouds/local/scripts/cluster-create-minikube @@ -5,6 +5,7 @@ # Defaults - these are in the config but left here for information CLUSTER_NAME="{% if experiment.cluster_name %}{{ experiment.cluster_name }}{% else %}flux-cluster{% endif %}" +NAMESPACE="{% if experiment.minicluster_namespace %}{{ experiment.minicluster_namespace }}{% else %}flux-operator{% endif %}" CLUSTER_VERSION="{% if experiment.kubernetes_version %}{{ experiment.kubernetes_version }}{% else %}1.23{% endif %}" FORCE_CLUSTER="{% if setup.force_cluster %}true{% else %}false{% endif %}" SIZE={% if experiment.size %}{{ experiment.size }}{% else %}4{% endif %} @@ -12,6 +13,7 @@ REPOSITORY="{% if experiment.operator_repository %}{{ experiment.operator_reposi BRANCH="{% if experiment.operator_branch %}{{ experiment.operator_branch }}{% else %}main{% endif %}" SCRIPT_DIR="{{ experiment.script_dir }}" +print_magenta " namespace: ${NAMESPACE}" print_magenta " cluster : ${CLUSTER_NAME}" print_magenta " version : ${CLUSTER_VERSION}" print_magenta " size : ${SIZE}" @@ -51,7 +53,7 @@ install_operator ${SCRIPT_DIR} ${REPOSITORY} ${BRANCH} # Show nodes run_echo kubectl get nodes - +run_echo kubectl create namespace ${NAMESPACE} || true run_echo kubectl get namespace run_echo kubectl describe namespace operator-system save_versions ${SCRIPT_DIR} ${SIZE} diff --git a/fluxcloud/main/clouds/shared/scripts/broker-id b/fluxcloud/main/clouds/shared/scripts/broker-id deleted file mode 100755 index a45ba8c..0000000 --- a/fluxcloud/main/clouds/shared/scripts/broker-id +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -NAMESPACE="{% if minicluster.namespace %}{{ minicluster.namespace }}{% else %}flux-operator{% endif %}" -JOB="{{ minicluster.name }}" 
-brokerPrefix="${JOB}-0" - -for pod in $(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=jsonpath='{.items[*].metadata.name}'); do - if [[ "${pod}" == ${brokerPrefix}* ]]; then - echo ${pod} - break - fi -done diff --git a/fluxcloud/main/clouds/shared/scripts/minicluster-create-persistent b/fluxcloud/main/clouds/shared/scripts/minicluster-create-persistent deleted file mode 100755 index 3b2db0a..0000000 --- a/fluxcloud/main/clouds/shared/scripts/minicluster-create-persistent +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -# This is a template that will be populated with variables by Flux-Cloud -# We only run it to check if a MiniCluster is running. An apply is only -# needed if the MiniCluster is not created yet. - -# Include shared helper scripts -{% include "helpers.sh" %} - -NAMESPACE="{% if minicluster.namespace %}{{ minicluster.namespace }}{% else %}flux-operator{% endif %}" -CRD="{{ crd }}" -JOB="{{ minicluster.name }}" - -# Size -1 to account for certificate generator -SIZE={{ size }} - -print_magenta " apply : ${CRD}" -print_magenta " job : ${JOB}" - -is_installed kubectl - -# Create the namespace (ok if already exists) -run_echo_allow_fail kubectl create namespace ${NAMESPACE} - -# Always cleanup a previous one so tokens don't get stale -run_echo_allow_fail kubectl delete -f ${CRD} -{% include "wait_for_cleanup.sh" %} - -# Ensure we have a MiniCluster of the right namespace running -echo -print_green "🌀️ Creating MiniCluster in ${NAMESPACE}" -{% include "wait_for_all.sh" %} -{% include "wait_for_flux_restful.sh" %} diff --git a/fluxcloud/main/clouds/shared/scripts/minicluster-run b/fluxcloud/main/clouds/shared/scripts/minicluster-run deleted file mode 100755 index b7f14ce..0000000 --- a/fluxcloud/main/clouds/shared/scripts/minicluster-run +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -# This is a template that will be populated with variables by Flux-Cloud -# It used to be a script proper with getopt, but in practice 
this was -# erroneous on different operating systems. - -# Include shared helper scripts -{% include "helpers.sh" %} - -NAMESPACE="{% if minicluster.namespace %}{{ minicluster.namespace }}{% else %}flux-operator{% endif %}" -CRD="{{ crd }}" -JOB="{{ minicluster.name }}" -LOGFILE="{{ logfile }}" - -print_magenta " apply : ${CRD}" -print_magenta " job : ${JOB}" -print_magenta "logfile : ${LOGFILE}" - -is_installed kubectl - -# Ensure we wait for the space to be cleaned up -{% include "wait_for_cleanup.sh" %} - -# Create the namespace (ok if already exists) -run_echo_allow_fail kubectl create namespace ${NAMESPACE} - -{% include "wait_for_broker.sh" %} - -# Get the name of the pods -pods=($(kubectl get pods --selector=job-name=${JOB} --namespace ${NAMESPACE} --output=jsonpath='{.items[*].metadata.name}')) -brokerpod=${pods[0]} - -# This will hang like this until the job finishes running -echo -print_green "kubectl -n ${NAMESPACE} logs ${brokerpod} -f > ${LOGFILE}" -kubectl -n ${NAMESPACE} logs ${brokerpod} -f > ${LOGFILE} - -for exitcode in $(kubectl get -n ${NAMESPACE} pod --selector=job-name=${JOB} --output=jsonpath={.items...containerStatuses..state.terminated.exitCode}); do - if [[ ${exitcode} -ne 0 ]]; then - echo "Container in ${JOB} had nonzero exit code" - fi -done - -run_echo kubectl delete -f ${CRD} diff --git a/fluxcloud/main/clouds/shared/scripts/wait_for_all.sh b/fluxcloud/main/clouds/shared/scripts/wait_for_all.sh deleted file mode 100644 index ddf5cc7..0000000 --- a/fluxcloud/main/clouds/shared/scripts/wait_for_all.sh +++ /dev/null @@ -1,20 +0,0 @@ -# Apply the job, get pods -run_echo kubectl apply -f ${CRD} -run_echo kubectl get -n ${NAMESPACE} pods - -# continue until we find the index-0 pod -podsReady="false" - -echo -print_blue "Waiting for MiniCluster of size ${SIZE} to be ready..." -while [[ "${podsReady}" == "false" ]]; do - echo -n "." 
- sleep 2 - pods=$(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=name | wc -l) - if [[ ${pods} -eq ${SIZE} ]]; then - echo - print_green "🌀️ All pods are running." - podsReady="true" - break - fi -done diff --git a/fluxcloud/main/clouds/shared/scripts/wait_for_broker.sh b/fluxcloud/main/clouds/shared/scripts/wait_for_broker.sh deleted file mode 100644 index 9335313..0000000 --- a/fluxcloud/main/clouds/shared/scripts/wait_for_broker.sh +++ /dev/null @@ -1,40 +0,0 @@ -# Apply the job, get pods -run_echo kubectl apply -f ${CRD} -run_echo kubectl get -n ${NAMESPACE} pods - -# continue until we find the index-0 pod -brokerPrefix="${JOB}-0" -brokerReady="false" - -echo -print_blue "Waiting for broker pod with prefix ${brokerPrefix} to be created..." -while [[ "${brokerReady}" == "false" ]]; do - echo -n "." - sleep 2 - for pod in $(kubectl get pods --selector=job-name=${JOB} --namespace ${NAMESPACE} --output=jsonpath='{.items[*].metadata.name}'); do - if [[ "${pod}" == ${brokerPrefix}* ]]; then - echo - print_green "🌀️ Broker pod is created." - brokerReady="true" - break - fi - done -done - -# Now broker pod needs to be running -echo -print_blue "Waiting for broker pod with prefix ${brokerPrefix} to be running..." -brokerReady="false" -while [[ "${brokerReady}" == "false" ]]; do - echo -n "." - - # TODO - we likely want to check for running OR completed, it's rare but sometimes they can complete too fast. - for pod in $(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=jsonpath='{.items[*].metadata.name}'); do - if [[ "${pod}" == ${brokerPrefix}* ]]; then - echo - print_green "🌀️ Broker pod is running." 
- brokerReady="true" - break - fi - done -done diff --git a/fluxcloud/main/clouds/shared/scripts/wait_for_cleanup.sh b/fluxcloud/main/clouds/shared/scripts/wait_for_cleanup.sh deleted file mode 100644 index 466482f..0000000 --- a/fluxcloud/main/clouds/shared/scripts/wait_for_cleanup.sh +++ /dev/null @@ -1,15 +0,0 @@ -echo -podsCleaned="false" -print_blue "Waiting for previous MiniCluster to be cleaned up..." -while [[ "${podsCleaned}" == "false" ]]; do - echo -n "." - sleep 2 - state=$(kubectl get pods --namespace ${NAMESPACE} 2>&1) - lines=$(echo $state | wc -l) - if [[ ${lines} -eq 1 ]] && [[ "${state}" == *"No resources found in"* ]]; then - echo - print_green "🌀️ Previous pods are cleaned up." - podsCleaned="true" - break - fi -done diff --git a/fluxcloud/main/clouds/shared/scripts/wait_for_flux_restful.sh b/fluxcloud/main/clouds/shared/scripts/wait_for_flux_restful.sh deleted file mode 100644 index 6c27ba7..0000000 --- a/fluxcloud/main/clouds/shared/scripts/wait_for_flux_restful.sh +++ /dev/null @@ -1,29 +0,0 @@ - -echo -brokerPod="" -brokerPrefix="${JOB}-0" -while [[ "${brokerPod}" == "" ]]; do - for pod in $(kubectl get pods --namespace ${NAMESPACE} --field-selector=status.phase=Running --output=jsonpath='{.items[*].metadata.name}'); do - if [[ "${pod}" == ${brokerPrefix}* ]]; then - echo - brokerPod=${pod} - break - fi - done -done - -echo -serverReady="false" -print_blue "Waiting for Flux Restful API Server to be ready..." -while [[ "${serverReady}" == "false" ]]; do - echo -n "." - sleep 2 - logs=$(kubectl logs --namespace ${NAMESPACE} ${brokerPod} | grep "Uvicorn running") - retval=$? - if [[ ${retval} -eq 0 ]]; then - echo - serverReady="true" - print_green "🌀️ Flux RestFul API Server is Ready." 
- break - fi -done diff --git a/fluxcloud/main/experiment.py b/fluxcloud/main/experiment.py index c50d8b7..44a65c8 100644 --- a/fluxcloud/main/experiment.py +++ b/fluxcloud/main/experiment.py @@ -8,7 +8,6 @@ import os import shutil -import jinja2 import jsonschema import fluxcloud.defaults as defaults @@ -22,7 +21,6 @@ class ExperimentSetup: def __init__( self, experiments, - template=None, outdir=None, validate=True, cleanup=True, @@ -34,21 +32,12 @@ def __init__( An experiment setup is a light wrapper around a group of experiments. """ self.experiment_file = os.path.abspath(experiments) - self.template = ( - os.path.abspath(template) - if template is not None and os.path.exists(template) - else None - ) self.outdir = outdir self.test = test self.settings = settings.Settings self.quiet = quiet self.run_cleanup = cleanup - # Show the user the template file - if template: - logger.debug(f"Using template {self.template}") - # Rewrite existing outputs self.force = kwargs.get("force") or False # Don't ask for confirmation to create/destroy @@ -99,7 +88,7 @@ def prepare_matrices(self): validate_experiments(self.spec) # Sploot out into matrices - matrices = expand_experiments(self.spec, self.outdir, self.template) + matrices = expand_experiments(self.spec, self.outdir) if not matrices: raise ValueError( "No matrices generated. Did you include any empty variables in your matrix?" @@ -134,11 +123,10 @@ class Experiment: An experiment wrapper to make it easy to get variables in templates. 
""" - def __init__(self, experiment, outdir=None, template=None): + def __init__(self, experiment, outdir=None): self.experiment = experiment self.settings = settings.Settings self._outdir = outdir - self.template = template or defaults.default_minicluster_template @property def outdir(self): @@ -191,31 +179,60 @@ def iter_jobs(self): yield size, jobname, job - def get_persistent_variables(self, size, required=None): + def get_submit_miniclusters(self, size): """ - Get persistent variables that should be used across the MiniCluster + Return Miniclusters organized by unique sizes and containers + + For each, we return a faux job that includes (potentially) the job volumes. """ - jobvars = {} - for _, job in self.jobs.items(): - # Skip jobs targeted for a different size + # A faux job is provided that includes all volumes + images = {} + for name, job in self.jobs.items(): if "size" in job and job["size"] != size: continue - - for key, value in job.items(): - if key not in jobvars or (key in jobvars and jobvars[key] == value): - jobvars[key] = value - continue - logger.warning( - f'Inconsistent job variable between MiniCluster jobs: {value} vs. {jobvars["value"]}' - ) - - # If we get here and we don't have an image - for req in required or []: - if req not in jobvars: - raise ValueError( - f'Submit requires a "{req}" field under at least one job spec to create the MiniCluster.' 
- ) - return jobvars + if "image" not in job: + logger.warning(f"Job {name} is missing an image and cannot be run.") + + # Add the image if we don't know about it already + # This is where we can define shared minicluster container attributes (the job) + if job["image"] not in images: + images[job["image"]] = copy.deepcopy(job) + + # Update the job and warn the user for differences + else: + for k, v in job.items(): + # Skip the command + if k == "command": + continue + + # This shared job for the image doesn't have the attribute defined yet + if k not in images[job["image"]]: + images[job["image"]][k] = v + continue + current = images[job["image"]][k] + + # If it's a dictionary, just update + if isinstance(current, dict) and isinstance(v, dict): + images[job["image"]][k].update(v) + + # Otherwise give a warning we won't be updating + elif current != v: + logger.warning( + f"Found different definition of {k}, {v}. Using first discovered {current}" + ) + + logger.debug(f"Job experiments file generated {len(images)} MiniCluster(s).") + + # Prepare a MiniCluster and job for each image + for image in images: + minicluster = copy.deepcopy(self.minicluster) + minicluster["size"] = size + job = images[image] + + # A shared MiniCluster starts with no command to start flux restful + if "command" in job: + del job["command"] + yield minicluster, job @property def script_dir(self): @@ -238,15 +255,6 @@ def get_script(self, name, cloud, render_kwargs=None, ext="sh", suffix=""): utils.mkdir_p(outdir) return script.render(outfile=outfile, **render_kwargs) - def get_shared_script(self, name, render_kwargs=None, suffix="", ext="sh"): - """ - Get a named shared script - """ - render_kwargs = render_kwargs or {} - return self.get_script( - name, cloud="shared", render_kwargs=render_kwargs, suffix=suffix, ext=ext - ) - def cleanup(self): """ Cleanup the scripts directory for the experiment! 
@@ -255,36 +263,6 @@ def cleanup(self): logger.debug(f"Cleaning up {self.script_dir}") shutil.rmtree(self.script_dir) - def generate_crd(self, job, minicluster_size): - """ - Generate a custom resource definition for the experiment - """ - template = jinja2.Template(utils.read_file(self.template)) - experiment = copy.deepcopy(self.experiment) - - # If the experiment doesn't define a minicluster, add our default - if "minicluster" not in experiment: - experiment["minicluster"] = self.settings.minicluster - - # Update minicluster size to the one we want - experiment["minicluster"]["size"] = minicluster_size - - if "jobs" in experiment: - del experiment["jobs"] - experiment["job"] = job - result = template.render(**experiment).strip(" ") - logger.debug(result) - - # Write to output directory - outfile = os.path.join( - self.script_dir, f"minicluster-size-{minicluster_size}.yaml" - ) - outdir = os.path.dirname(outfile) - if not os.path.exists(outdir): - logger.info(f"Creating output directory for scripts {outdir}") - utils.mkdir_p(outdir) - return utils.write_file(result, outfile) - @property def jobs(self): return self.experiment.get("jobs", {}) @@ -325,10 +303,12 @@ def is_run(self): return False return True - def check_job_run(self, job, size): + def check_job_run(self, job, size, image=None): """ Determine if a job is marked for a MiniCluster size. """ + if "image" in job and image is not None and job["image"] != image: + return False if "sizes" in job and size not in job["sizes"]: return False if "size" in job and job["size"] != size: @@ -339,6 +319,27 @@ def check_job_run(self, job, size): return False return True + def save_file(self, obj, filename, is_json=False): + """ + Save a json dump of something to a filename in the experiment directory. 
+ """ + experiment_dir = self.root_dir + save_file = os.path.join(experiment_dir, ".scripts", filename) + save_dir = os.path.dirname(save_file) + if not os.path.exists(save_dir): + utils.mkdir_p(save_dir) + if is_json: + utils.write_json(obj, save_file) + else: + utils.write_file(obj, save_file) + return save_file + + def save_json(self, obj, filename): + """ + Save a json dump of something to a filename in the experiment directory. + """ + return self.save_file(obj, filename, is_json=True) + def save_metadata(self, times, info=None): """ Save experiment metadata, loading an existing meta.json, if present. @@ -421,8 +422,17 @@ def minicluster(self): minicluster = self.experiment.get("minicluster") or self.settings.minicluster if "namespace" not in minicluster or not minicluster["namespace"]: minicluster["namespace"] = defaults.default_namespace + if "size" not in minicluster: + minicluster["size"] = [self.experiment.get("size")] return minicluster + @property + def minicluster_namespace(self): + """ + Get mini cluster namespace + """ + return self.minicluster["namespace"] + @property def machine(self): return self.experiment.get("machine") or self.settings.google["machine"] @@ -455,7 +465,7 @@ def kubernetes_version(self): ) -def expand_experiments(experiments, outdir, template=None): +def expand_experiments(experiments, outdir): """ Given a valid experiments.yaml, expand out into experiments """ @@ -484,7 +494,7 @@ def expand_experiments(experiments, outdir, template=None): # Put in final matrix form final = [] for entry in matrix: - final.append(Experiment(entry, outdir, template)) + final.append(Experiment(entry, outdir)) return final diff --git a/fluxcloud/main/schemas.py b/fluxcloud/main/schemas.py index 8556347..5902448 100644 --- a/fluxcloud/main/schemas.py +++ b/fluxcloud/main/schemas.py @@ -24,14 +24,14 @@ "properties": { "command": {"type": "string"}, "repeats": {"type": "number"}, - "workdir": {"type": "string"}, + "working_dir": {"type": "string"}, 
"image": {"type": "string"}, "machine": {"type": "string"}, "machines": {"type": "array", "items": {"type": "string"}}, "size": {"type": "number"}, "sizes": {"type": "array", "items": {"type": "number"}}, }, - "required": ["command"], + "required": ["command", "image"], } jobs_properties = { @@ -187,6 +187,9 @@ "required": ["size"], }, }, + "patternProperties": { + "x-*": {"type": "object"}, + }, "additionalProperties": False, } diff --git a/fluxcloud/main/template.py b/fluxcloud/main/template.py new file mode 100644 index 0000000..e176500 --- /dev/null +++ b/fluxcloud/main/template.py @@ -0,0 +1,91 @@ +# Copyright 2023 Lawrence Livermore National Security, LLC and other +# This is part of Flux Framework. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: Apache-2.0 + +experiment_base = """ +# Flux MiniCluster experiment attributes +minicluster: + name: my-job + namespace: flux-operator + # Each of these sizes will be brought up and have commands run across it + # They must be smaller than the Kubernetes cluster size or not possible to run! + size: [2, 4] + +# Under jobs should be named jobs (output orgainzed by name) where +# each is required to have a command and image. 
Repeats is the number +# of times to run each job +jobs: + reaxc-hns: + command: 'lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + working_dir: /home/flux/examples/reaxff/HNS + sleep: + command: 'sleep 5' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + working_dir: /home/flux/examples/reaxff/HNS + hello-world: + command: 'echo hello world' + image: ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0 + repeats: 5 + working_dir: /home/flux/examples/reaxff/HNS +""" + +google_experiment_template = f""" +matrix: + size: [4] + + # This is a Google Cloud machine + machine: [n1-standard-1] + +variables: + # Customize zone just for this experiment + # otherwise defaults to your settings.yml + zone: us-central1-a + +{experiment_base} +""" + +minikube_experiment_template = f""" +# This is intended for MiniKube, so no machine needed +matrix: + + # This is the size of the MiniKube cluster (aka Kubernetes cluster) to bring up + size: [4] + +{experiment_base} +""" + +aws_experiment_template = f""" +matrix: + + # This is the size of the MiniKube cluster (aka Kubernetes cluster) to bring up + size: [4] + + # This is an EC2 machine + machine: [m5.large] + +variables: + # Enable private networking + private_networking: false + + # Enable efa (requires efa also set under the container limits) + efa_enabled: false + + # Add a custom placement group name to your workers managed node group + placement_group: eks-efa-testing + + # Customize region just for this experiment + region: us-east-2 + + # Customize availability zones for this experiment + availability_zones: [us-east-1a, us-east-1b] + + # Important for instance types only in one zone (hpc instances) + # Select your node group availability zone: + node_group_availability_zone: us-east-2b + +{experiment_base} +""" diff --git a/fluxcloud/minicluster-template.yaml b/fluxcloud/minicluster-template.yaml deleted file mode 100644 index 
ede959d..0000000 --- a/fluxcloud/minicluster-template.yaml +++ /dev/null @@ -1,42 +0,0 @@ -apiVersion: flux-framework.org/v1alpha1 -kind: MiniCluster - -metadata: - name: {{ minicluster.name }} - namespace: {{ minicluster.namespace }} -spec: - # localDeploy needs to be false - localDeploy: {% if minicluster.local_deploy %}true{% else %}false{% endif %} - - # Number of pods to create for MiniCluster - size: {{ minicluster.size }} - tasks: {% if job.tasks %}{{ job.tasks }}{% else %}1{% endif %} - - # Disable verbose output - {% if job.quiet or job.timed %}logging: - {% if job.quiet %}quiet: true{% endif %} - {% if job.timed %}timed: true{% endif %}{% endif %} - - # Optional credentials if running the flux restful api - {% if job.token or job.user %}fluxRestful: - {% if job.token %}token: "{{ job.token }}"{% endif %} - {% if job.user %}username: "{{ job.user }}"{% endif %}{% endif %} - - # TODO add pod resources, if needed - containers: - - image: {{ job.image }} - {% if job.workdir %}workingDir: {{ job.workdir }}{% endif %} - {% if job.command %}command: {{ job.command }}{% endif %} - {% if job.flux_option_flags %}fluxOptionFlags: "-ompi=openmpi@5"{% endif %} - cores: {% if job.cores %}{{ job.cores }}{% else %}1{% endif %} - {% if job.limits or job.resources %}resources:{% endif %} - {% if job.limits %}limits: - {% for limit in job.limits %} - {{ limit[0] }}: {{ limit[1] }} - {% endfor %}{% endif %} - {% if job.requests %}requests: - {% for limit in job.requests %} - {{ limit[0] }}: {{ limit[1] }} - {% endfor %}{% endif %} - {% if job.pre_command %}preCommand: | - {{ job.pre_command }}{% endif %} diff --git a/fluxcloud/tests/__init__.py b/fluxcloud/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/fluxcloud/tests/helpers.py b/fluxcloud/tests/helpers.py new file mode 100644 index 0000000..b9f8330 --- /dev/null +++ b/fluxcloud/tests/helpers.py @@ -0,0 +1,47 @@ +#!/usr/bin/python + +# Copyright (C) 2022 Vanessa Sochat. 
+ +# This Source Code Form is subject to the terms of the +# Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed +# with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import os +import shlex +import shutil + +from fluxcloud.client import get_parser +from fluxcloud.main.client import ExperimentClient +from fluxcloud.main import get_experiment_client + +here = os.path.dirname(os.path.abspath(__file__)) +root = os.path.dirname(here) + + +def parse_args(argstr): + """ + Given an argument string for a test, parse it. + """ + parser = get_parser() + parser.prog = "fluxcloud" + args = parser.parse_args(shlex.split(argstr)) + args.debug = True + return args + + +def get_settings(tmpdir): + """ + Create a temporary settings file + """ + settings_file = os.path.join(root, "settings.yml") + new_settings = os.path.join(tmpdir, "settings.yml") + shutil.copyfile(settings_file, new_settings) + return new_settings + + +def init_client(tmpdir, cloud=None): + """ + Get a common client for some container technology and module system + """ + new_settings = get_settings(tmpdir) + return get_experiment_client(cloud, debug=True, settings_file=new_settings) \ No newline at end of file diff --git a/fluxcloud/tests/test_examples.py b/fluxcloud/tests/test_examples.py new file mode 100644 index 0000000..b5d2e17 --- /dev/null +++ b/fluxcloud/tests/test_examples.py @@ -0,0 +1,181 @@ +#!/usr/bin/python + +# Copyright 2022-2023 Lawrence Livermore National Security, LLC and other +# This is part of Flux Framework. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: Apache-2.0 + +from glob import glob +import os + +import fluxcloud.utils as utils +from fluxcloud.main.experiment import ExperimentSetup + +from .helpers import here, init_client + +here = os.path.abspath(os.path.dirname(__file__)) +root = os.path.dirname(os.path.dirname(here)) + +def check_lammps(minicluster_file): + """ + Checks for examples that run lammps. 
+ """ + expected_outdir = os.path.dirname(os.path.dirname(minicluster_file)) + for out in utils.recursive_find(expected_outdir, "log.out"): + content = utils.read_file(out) + assert "Total wall time" in content + assert "LAMMPS" in content + + +def _test_example(dirname, tmp_path, check, test_apply=True): + """ + Shared function to test an example in a dirname, with a check function + """ + client = init_client(str(tmp_path), cloud="minikube") + experiment_file = os.path.join( + root, "examples", "minikube", dirname, "experiments.yaml" + ) + + # Create a new experiment directory to work from + experiment_dir = os.path.join(tmp_path, "experiment") + outdir = os.path.join(experiment_dir, "data") + utils.mkdir_p(experiment_dir) + setup = ExperimentSetup(experiment_file, outdir=outdir, force_cluster=True, quiet=False) + + # Select the first (only) experiment! + experiment = setup.matrices[0] + client.up(setup, experiment=experiment) + + # Expected output directory + expected_outdir = os.path.join(outdir, f"k8s-size-{experiment.size}-local") + expected_scripts = os.path.join(expected_outdir, ".scripts") + + def shared_checks(info=True): + assert os.path.exists(expected_outdir) + assert "meta.json" in os.listdir(expected_outdir) + meta = utils.read_json(os.path.join(expected_outdir, "meta.json")) + assert meta["times"] + assert meta["minicluster"] + assert meta["jobs"] + + # Info is only present for submit + if info: + assert meta["info"] + + # Run the experiment in the working directory + with utils.working_dir(experiment_dir): + # This won't work in the CI it seems + client.submit(setup, experiment, interactive=False) + shared_checks() + + files = glob(os.path.join(expected_scripts, "minicluster-size*.json")) + minicluster_file = files[0] + print(f'Found minicluster metadata file {minicluster_file}') + + check(minicluster_file, experiment) + + # Now do the same for apply + # shutil.rmtree(expected_outdir) + if test_apply: + client.apply(setup, experiment, 
interactive=False) + shared_checks(info=False) + check(minicluster_file, experiment) + + client.down(setup, experiment=experiment) + + +def test_minicluster_logging(tmp_path): + """ + Ensure that the logging example returns expected logging params set + in the minicluster output. + """ + + def check(minicluster_file, experiment): + assert os.path.exists(minicluster_file) + + # Assert that the logging spec matches + minicluster = utils.read_json(minicluster_file) + for level, value in experiment.minicluster["logging"].items(): + assert level in minicluster["spec"]["logging"] + assert minicluster["spec"]["logging"][level] == value + + check_lammps(minicluster_file) + + # Run the example for submit and apply, with check + _test_example("logging", tmp_path, check) + + +def test_minicluster_volumes(tmp_path): + """ + Ensure that the volumes example produces the expected Minicluster spec + """ + + def check(minicluster_file, experiment): + assert os.path.exists(minicluster_file) + + # Assert that the logging spec matches + minicluster = utils.read_json(minicluster_file) + assert "volumes" in minicluster["spec"] + + check_lammps(minicluster_file) + + # And container level volumes + assert "volumes" in minicluster["spec"]["containers"][0] + container_volumes = minicluster["spec"]["containers"][0]["volumes"] + + # This checks the cluster level volumes + for name, volume in experiment.minicluster["volumes"].items(): + assert name in minicluster["spec"]["volumes"] + generated_volume = minicluster["spec"]["volumes"][name] + + for attr, value in volume.items(): + if attr in generated_volume: + assert value == generated_volume[attr] + + assert name in container_volumes + + for vname, containervol in experiment.jobs["reaxc-hns-1"][ + "volumes" + ].items(): + assert vname in container_volumes + for attr, val in containervol.items(): + assert attr in container_volumes[vname] + assert container_volumes[vname][attr] == val + + # Run the example for submit and apply, with check + 
_test_example("volumes", tmp_path, check) + + +def test_osu_benchmarks(tmp_path): + """ + Ensure we can explicitly specify resources + """ + def check(minicluster_file, experiment): + assert os.path.exists(minicluster_file) + + + # Run the example for submit and apply, with check + _test_example("osu-benchmarks", tmp_path, check, test_apply=False) + + +def test_minicluster_resources(tmp_path): + """ + Ensure that the resources example works as expected. + """ + + def check(minicluster_file, experiment): + assert os.path.exists(minicluster_file) + + # Assert that the logging spec matches + minicluster = utils.read_json(minicluster_file) + check_lammps(minicluster_file) + + assert "resources" in minicluster["spec"]["containers"][0] + resources = minicluster["spec"]["containers"][0]["resources"] + + for rtype, rvalue in experiment.jobs["reaxc-hns-1"]["resources"].items(): + assert rtype in resources + assert resources[rtype] == rvalue + + # Run the example for submit and apply, with check + _test_example("resources", tmp_path, check) diff --git a/fluxcloud/tests/test_settings.py b/fluxcloud/tests/test_settings.py new file mode 100644 index 0000000..9d0b162 --- /dev/null +++ b/fluxcloud/tests/test_settings.py @@ -0,0 +1,50 @@ +#!/usr/bin/python + +# Copyright 2022 Lawrence Livermore National Security, LLC and other +# This is part of Flux Framework. See the COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import os + +import pytest + +from fluxcloud.main.settings import UserSettings + +here = os.path.dirname(os.path.abspath(__file__)) +root = os.path.dirname(here) + +from .helpers import get_settings # noqa + + +def test_invalid_properties(tmp_path): + """ + Test invalid setting property + """ + settings = UserSettings(get_settings(tmp_path)) + assert settings.config_editor == "vim" + settings.set("config_editor", "code") + with pytest.raises(SystemExit): + settings.set("invalid_key", "invalid_value") + assert settings.config_editor == "code" + + +def test_set_get(tmp_path): + """ + Test variable set/get + """ + settings = UserSettings(get_settings(tmp_path)) + + zone = "us-central1-a" + assert settings.google["zone"] == zone + + # Cannot add invalid parameter + with pytest.raises(SystemExit): + settings.set("cache_only", True) + + found_zone = settings.get("google:zone") + assert isinstance(zone, str) + assert zone == found_zone + + # Just check the first in the list + assert settings.google["zone"] == zone diff --git a/fluxcloud/tests/test_utils.py b/fluxcloud/tests/test_utils.py new file mode 100644 index 0000000..b10c97d --- /dev/null +++ b/fluxcloud/tests/test_utils.py @@ -0,0 +1,133 @@ +#!/usr/bin/python + +# Copyright (C) 2021-2022 Vanessa Sochat. + +# This Source Code Form is subject to the terms of the +# Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed +# with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +import json +import os +import shutil + +import pytest + +import fluxcloud.utils as utils + + +def test_write_read_files(tmp_path): + """ + test_write_read_files will test the functions write_file and read_file + """ + print("Testing utils.write_file...") + + tmpfile = str(tmp_path / "written_file.txt") + assert not os.path.exists(tmpfile) + utils.write_file("hello!", tmpfile) + assert os.path.exists(tmpfile) + + print("Testing utils.read_file...") + content = utils.read_file(tmpfile) + assert content == "hello!" + + +def test_write_bad_json(tmp_path): + bad_json = {"Wakkawakkawakka'}": [{True}, "2", 3]} + tmpfile = str(tmp_path / "json_file.txt") + assert not os.path.exists(tmpfile) + with pytest.raises(TypeError): + utils.write_json(bad_json, tmpfile) + + +def test_write_json(tmp_path): + good_json = {"Wakkawakkawakka": [True, "2", 3]} + tmpfile = str(tmp_path / "good_json_file.txt") + + assert not os.path.exists(tmpfile) + utils.write_json(good_json, tmpfile) + with open(tmpfile, "r") as f: + content = json.loads(f.read()) + assert isinstance(content, dict) + assert "Wakkawakkawakka" in content + content = utils.read_json(tmpfile) + assert "Wakkawakkawakka" in content + + +def test_check_install(): + """ + check install is used to check if a particular software is installed. 
+ If no command is provided, singularity is assumed to be the test case + """ + print("Testing utils.check_install") + + is_installed = utils.check_install("echo") + assert is_installed + is_not_installed = utils.check_install("fakesoftwarename") + assert not is_not_installed + + +def test_get_installdir(): + """ + Get install directory should return the base of where fluxcloud + is installed + """ + print("Testing utils.get_installdir") + + whereami = utils.get_installdir() + print(whereami) + assert whereami.endswith("fluxcloud") + + +def test_get_file_hash(): + print("Testing utils.get_file_hash") + here = os.path.dirname(os.path.abspath(__file__)) + testdata = os.path.join(here, "testdata", "hashtest.txt") + assert ( + utils.get_file_hash(testdata) + == "6bb92117bded3da774363713657a629a9f38eac2e57cd47e1dcda21d3445c67d" + ) + assert utils.get_file_hash(testdata, "md5") == "e5d376ca96081dd561ff303c3a631fd5" + + +def test_copyfile(tmp_path): + print("Testing utils.copyfile") + original = str(tmp_path / "location1.txt") + dest = str(tmp_path / "location2.txt") + print(original) + print(dest) + utils.write_file("CONTENT IN FILE", original) + utils.copyfile(original, dest) + assert os.path.exists(original) + assert os.path.exists(dest) + + +def test_get_tmpdir_tmpfile(): + print("Testing utils.get_tmpdir, get_tmpfile") + tmpdir = utils.get_tmpdir() + assert os.path.exists(tmpdir) + assert os.path.basename(tmpdir).startswith("fluxcloud") + shutil.rmtree(tmpdir) + tmpdir = utils.get_tmpdir(prefix="name") + assert os.path.basename(tmpdir).startswith("name") + shutil.rmtree(tmpdir) + tmpfile = utils.get_tmpfile() + assert "fluxcloud" in tmpfile + os.remove(tmpfile) + tmpfile = utils.get_tmpfile(prefix="pancakes") + assert "pancakes" in tmpfile + os.remove(tmpfile) + + +def test_mkdir_p(tmp_path): + print("Testing utils.mkdir_p") + dirname = str(tmp_path / "input") + result = os.path.join(dirname, "level1", "level2", "level3") + utils.mkdir_p(result) + 
utils.mkdirp([result]) + assert os.path.exists(result) + + +def test_print_json(): + print("Testing utils.print_json") + result = utils.print_json({1: 1}) + assert result == '{\n "1": 1\n}' diff --git a/fluxcloud/tests/testdata/hashtest.txt b/fluxcloud/tests/testdata/hashtest.txt new file mode 100644 index 0000000..e85812c --- /dev/null +++ b/fluxcloud/tests/testdata/hashtest.txt @@ -0,0 +1,2 @@ +This is a file that exists purely to test the functions to generate +hashes. Please don't modify, thank you! diff --git a/fluxcloud/utils/__init__.py b/fluxcloud/utils/__init__.py index b079912..10c9291 100644 --- a/fluxcloud/utils/__init__.py +++ b/fluxcloud/utils/__init__.py @@ -18,7 +18,7 @@ write_json, write_yaml, ) -from .misc import chunks, get_hash, mb_to_bytes, print_bytes, slugify +from .misc import chunks, get_hash, mb_to_bytes, print_bytes, slugify, working_dir from .terminal import ( check_install, confirm_action, diff --git a/fluxcloud/utils/misc.py b/fluxcloud/utils/misc.py index acfcc9c..0bee595 100644 --- a/fluxcloud/utils/misc.py +++ b/fluxcloud/utils/misc.py @@ -4,6 +4,21 @@ # SPDX-License-Identifier: Apache-2.0 import copy +import os +from contextlib import contextmanager + + +@contextmanager +def working_dir(path): + """ + Sets the cwd within the context + """ + here = os.getcwd() + try: + os.chdir(path) + yield + finally: + os.chdir(here) def chunks(listing, chunk_size): diff --git a/fluxcloud/version.py b/fluxcloud/version.py index c3655ca..409163d 100644 --- a/fluxcloud/version.py +++ b/fluxcloud/version.py @@ -1,7 +1,7 @@ # Copyright 2022-2023 Lawrence Livermore National Security, LLC # SPDX-License-Identifier: Apache-2.0 -__version__ = "0.1.19" +__version__ = "0.2.0" AUTHOR = "Vanessa Sochat" EMAIL = "vsoch@users.noreply.github.com" NAME = "flux-cloud" @@ -14,6 +14,8 @@ # Global requirements INSTALL_REQUIRES = ( + ("kubernetes", {"min_version": None}), + ("fluxoperator", {"min_version": "0.0.12"}), ("ruamel.yaml", {"min_version": None}), 
("jsonschema", {"min_version": None}), ("requests", {"min_version": None}), diff --git a/tests/test.sh b/tests/test.sh index ca485cb..7c1ac3c 100755 --- a/tests/test.sh +++ b/tests/test.sh @@ -37,39 +37,10 @@ echo "flux-cloud run --cloud minikube --output ${output} --force-cluster" flux-cloud run --cloud minikube --output ${output} --force-cluster retval=$? -if [[ "${retval}" != "0" ]]; then +if [[ ${retval} -ne 0 ]]; then echo "Issue running Flux Cloud, return value ${retval}" exit ${retval} fi -# Check output -for filename in $(find ./data -type f -print); do - echo "Checking $filename"; - filebase=$(basename ${filename}) - - # Don't check these files, likely to change - if [[ "${filebase}" == "flux-operator.yaml" ]]; then - continue - fi - if [[ "${filebase}" == "nodes-size"* ]]; then - continue - fi - suffix=$(echo ${filename:7}) - outfile="$output/$suffix" - if [[ ! -e "${outfile}" ]]; then - echo "Expected output $outfile does not exist." - exit 1 - fi - # Check the length - actual=$(cat $filename | wc -l) - found=$(cat $outfile | wc -l) - - if [[ "${actual}" != "${found}" ]]; then - echo "Incorrect output length found for ${filename}: expected ${actual} vs found ${found}" - cat ${outfile} - exit 1 - fi -done - echo ${output} rm -rf ${output}