add a small example with autoscaling #15

Merged 3 commits on Apr 24, 2024
3 changes: 2 additions & 1 deletion controllers/ensemble/update.go
@@ -83,11 +83,12 @@ func (r *EnsembleReconciler) updateMiniClusterEnsemble(
Action: algorithm.SubmitAction,
}
response, err := c.RequestAction(ctx, &in)
fmt.Println(response.Status)
if err != nil {
fmt.Printf(" Error with action request %s\n", err)
return ctrl.Result{}, err
}
fmt.Println(response.Status)

// Since we requeue anyway, we don't check error. But probably should.
return r.updateJobsMatrix(ctx, ensemble, decision.Jobs, i)
}
45 changes: 45 additions & 0 deletions examples/algorithms/workload/autoscale/README.md
@@ -0,0 +1,45 @@
# Workload Demand Autoscaling Experiment

You can read about the workload demand algorithm [here](https://github.com/converged-computing/ensemble-operator/blob/main/docs/algorithms.md#workoad-demand-of-consistent-sizes). Here we are testing this algorithm with autoscaling.

This (larger) experiment has been moved to the [converged-computing/ensemble-experiments](https://github.com/converged-computing/ensemble-experiments) repository. This directory is used for testing components.

## Create Cluster

First, create a GKE cluster with autoscaling enabled:

```bash
GOOGLE_PROJECT=myproject
gcloud container clusters create test-cluster \
--enable-autoscaling \
--threads-per-core=1 \
--placement-type=COMPACT \
--autoscaling-profile=optimize-utilization \
--region=us-central1-a \
--num-nodes 1 \
--total-min-nodes 1 \
--total-max-nodes 18 \
--project=${GOOGLE_PROJECT} \
--machine-type=c2d-standard-8
```
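
To sanity check the autoscaling bounds after creation, you can describe the cluster. This is a minimal sketch; the grep filter is just a convenience, and the exact output fields vary by gcloud version.

```bash
# Show the cluster's autoscaling configuration (min/max nodes, profile)
gcloud container clusters describe test-cluster \
    --region=us-central1-a \
    --project=${GOOGLE_PROJECT} | grep -A 5 autoscaling
```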

Install the development operator and the Flux Operator:

```bash
make test-deploy-recreate
kubectl apply -f https://raw.githubusercontent.com/flux-framework/flux-operator/main/examples/dist/flux-operator.yaml
```
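
Before applying the ensemble, it can help to confirm both operators came up. A minimal check that assumes nothing about the install namespaces:

```bash
# The exact namespaces depend on how the manifests were installed,
# so just look for operator pods across all namespaces
kubectl get pods --all-namespaces | grep operator
```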

## Run the Ensemble

Then apply the ensemble:

```bash
kubectl apply -f ensemble.yaml
```
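
From there you can watch the MiniCluster pods start and inspect the ensemble resource. This is a hedged sketch; the plural resource name assumes the default generated for the Ensemble CRD:

```bash
# Watch the MiniCluster pods come up (and scale as the algorithm requests nodes)
kubectl get pods --watch

# Inspect the ensemble itself (assumes the CRD plural is "ensembles")
kubectl get ensembles
kubectl describe ensemble ensemble
```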

When you are done, clean up.

```bash
gcloud container clusters delete test-cluster --region=us-central1-a
```
70 changes: 70 additions & 0 deletions examples/algorithms/workload/autoscale/ensemble.yaml
@@ -0,0 +1,70 @@
apiVersion: ensemble.flux-framework.org/v1alpha1
kind: Ensemble
metadata:
name: ensemble
spec:
members:

# This is how you change the sidecar image, if needed. This is the one
# that I push and use for development. pullAlways ensures we always pull the latest image
- sidecar:
pullAlways: true
image: ghcr.io/converged-computing/ensemble-operator-api:rockylinux9

# Algorithm and options:
# This is the algorithm run by the operator. The options are passed to
# the running queue to further alter the outcome.
# terminateChecks says to terminate after 2 consecutive inactive status checks
algorithm:
name: workload-demand
options:
terminateChecks: 2
scaleUpChecks: 1
order: "random"

# These are slightly different - Flux gets to use the full node capacity,
# so 4 cores per node, tasks == nodes * 4
jobs:
- name: lammps-2
command: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite
count: 10
nodes: 2
tasks: 8
- name: lammps-4
command: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite
count: 10
nodes: 4
tasks: 16
- name: lammps-6
command: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite
count: 10
nodes: 6
tasks: 24
- name: lammps-8
command: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite
count: 10
nodes: 8
tasks: 32

minicluster:
spec:
size: 1
minSize: 1
maxSize: 16

# The workers should not fail when they clean up
flux:
completeWorkers: true

# This is a list because a pod can support multiple containers
containers:
- image: ghcr.io/converged-computing/metric-lammps:latest

# You can set the working directory if your container WORKDIR is not correct.
workingDir: /opt/lammps/examples/reaxff/HNS
resources:
limits:
cpu: 3
requests:
cpu: 3

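To observe the workload-demand decisions the ensemble above triggers, one option is to tail the operator logs while watching nodes get added and removed. The namespace and deployment names here are assumptions (typical kubebuilder defaults), not something this PR defines:

```bash
# Assumed names; adjust to match your installation
kubectl logs -n ensemble-operator-system deploy/ensemble-operator-controller-manager -f

# In another terminal, watch the autoscaler add/remove nodes as jobs are submitted
kubectl get nodes --watch
```
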
1 change: 1 addition & 0 deletions python/ensemble_operator/server.py
@@ -196,6 +196,7 @@ def RequestStatus(self, request, context):
payload["counts"]["free_nodes"] = self.count_free_nodes_increasing_periods(payload["nodes"])

# Always update the last timestamp when we do a status
metrics.tick()
payload["metrics"] = metrics.to_dict()
print(json.dumps(payload))
