From fb5505b457c495503c6cf03b54c2ca79fda0b873 Mon Sep 17 00:00:00 2001 From: Zeid Adabel Date: Mon, 26 Feb 2024 13:11:49 +0100 Subject: [PATCH] Merge pull request #31 from Ortec-Finance/remove-job-deadline Remove job deadline --- CHANGELOG.md | 2 ++ docs/features/broker-scale-to-zero.md | 21 ++++++++++++ docs/the-job-paradigm.md | 32 +++++++++++++++++++ .../foundation/run-manager-autoscaler.yaml | 1 - .../base/foundation/runner-autoscaler.yaml | 1 - 5 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 docs/features/broker-scale-to-zero.md create mode 100644 docs/the-job-paradigm.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 510171c..5d01ecc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,6 @@ # Changelog +## v0.13.0 +Removed `activeDeadlineSeconds` from base configuration as it does not comply with the Job Paradigm as we intend it. Added Documentation that explains how the Job Paradigm is used in Sailfish. ## v0.12.0 Added kustomization.yaml in `k8s/observability` so it works with kustomize remote ref diff --git a/docs/features/broker-scale-to-zero.md b/docs/features/broker-scale-to-zero.md new file mode 100644 index 0000000..bb5d59b --- /dev/null +++ b/docs/features/broker-scale-to-zero.md @@ -0,0 +1,21 @@ +# Broker Scale to Zero +By enabling this component you will have all services of Sailfish scaled to zero when no messages are received. + +## Prerequisites +- `sailfish-gateway` component +- Do not use the `ephemeral-broker` component + +The ScaledObject enabled by the `broker-scale-to-zero` component triggers a scale-up of the broker when it detects the `sailfish-gateway` pod. + +Do not use the `ephemeral-broker` component as that might result in data loss. + + +## Configuring your workloads +### The Gateway +The Gateway workload must be configured to wait for the broker to be up and running. This can be done by simply pinging the broker in a loop until successful. 
+ +### Additional Queues +When you have additional queues, this must be considered when using this component. The `sailfish-amq-broker-autoscaler` `ScaledObject` triggers are designed to keep the broker up after the gateway is finished and scaled down. + +The ScaledJobs outside of the runner and run-manager must be added to the `triggers` of the `ScaledObject` as otherwise the broker might be scaled down when these queues still need to be accessed. + diff --git a/docs/the-job-paradigm.md b/docs/the-job-paradigm.md new file mode 100644 index 0000000..19708bf --- /dev/null +++ b/docs/the-job-paradigm.md @@ -0,0 +1,32 @@ +# The Job Paradigm +Sailfish uses ScaledJobs to scale compute based on a queue. +For your workloads to comply with this paradigm we need to consider a few symptoms. + +--- + +## The Problems + +### Overshoot +ScaledJobs tend to overshoot the number of jobs needed; this is due to delays between a job being picked up and the AMQ Broker signaling it via its Prometheus Metrics. This can sometimes result in more instances of Runners spawning per Task. Additionally, if your workloads are configured to not terminate after the completion of one Task, it can amplify this issue. + +### The Nature of a Job +A Kubernetes Job is not supposed to be terminated from the outside. It's meant to run to completion and Kubernetes respects that by never terminating it unless it is evicted. + +### Keeping Runners warm +For some workloads it can be beneficial to keep the Runners warm as the initialization can be time-consuming. + +--- + +## The Solution +To address these symptoms you have to design your workloads to have a stop condition, so that they can terminate gracefully. You can do this by triggering a self-destruct timer with a short grace period of ~30s after each computation. + +With this grace period, we can have a Runner capable of picking up multiple tasks, which prevents the initialization time penalty. 
+ + +### Python +TODO: Code Examples + +### C# +TODO: Code Examples + + diff --git a/k8s/sailfish/base/foundation/run-manager-autoscaler.yaml b/k8s/sailfish/base/foundation/run-manager-autoscaler.yaml index 3f95793..7b92b50 100644 --- a/k8s/sailfish/base/foundation/run-manager-autoscaler.yaml +++ b/k8s/sailfish/base/foundation/run-manager-autoscaler.yaml @@ -49,7 +49,6 @@ spec: restartPolicy: Never backoffLimit: 4 parallelism: 1 - activeDeadlineSeconds: 60 pollingInterval: 10 maxReplicaCount: 20 # Optional. Default: 100 successfulJobsHistoryLimit: 1 # Optional. Default: 100. How many completed jobs should be kept. diff --git a/k8s/sailfish/base/foundation/runner-autoscaler.yaml b/k8s/sailfish/base/foundation/runner-autoscaler.yaml index 61fdd44..79d36e9 100644 --- a/k8s/sailfish/base/foundation/runner-autoscaler.yaml +++ b/k8s/sailfish/base/foundation/runner-autoscaler.yaml @@ -56,7 +56,6 @@ spec: restartPolicy: Never backoffLimit: 4 parallelism: 1 - activeDeadlineSeconds: 130 successfulJobsHistoryLimit: 1 # Optional. Default: 100. How many completed jobs should be kept. pollingInterval: 2 maxReplicaCount: 100 # Optional. Default: 100