Skip to content

Commit

Permalink
Merge pull request #93 from ruivieira/rhoai-2.16.1
Browse files Browse the repository at this point in the history
feat(lmeval): Add default offline mode for LMEval
  • Loading branch information
ruivieira authored Dec 13, 2024
2 parents 67da0c4 + d250af3 commit 4a03d6f
Show file tree
Hide file tree
Showing 15 changed files with 1,306 additions and 65 deletions.
4 changes: 4 additions & 0 deletions Dockerfile.lmes-job
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ RUN git clone https://github.com/opendatahub-io/lm-evaluation-harness.git && \
cd lm-evaluation-harness && git checkout release-0.4.5 && \
pip install --no-cache-dir --user -e .[api,ibm_watsonx_ai]

RUN git clone --branch odh-2.22 https://github.com/opendatahub-io/hf-evaluate.git /tmp/evaluate && \
cp -R /tmp/evaluate/metrics/. . && \
rm -Rf /tmp/evaluate

RUN python -c 'from lm_eval.tasks.unitxt import task; import os.path; print("class: !function " + task.__file__.replace("task.py", "task.Unitxt"))' > ./my_tasks/unitxt

ENV PYTHONPATH=/opt/app-root/src/.local/lib/python3.11/site-packages:/opt/app-root/src/lm-evaluation-harness:/opt/app-root/src:/opt/app-root/src/server
Expand Down
11 changes: 10 additions & 1 deletion api/lmes/v1alpha1/lmevaljob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -300,8 +300,17 @@ type LMEvalJobSpec struct {
// Outputs specifies storage for evaluation results
// +optional
Outputs *Outputs `json:"outputs,omitempty"`
// Offline specifies settings for running LMEvalJobs in a offline mode
// Offline specifies settings for running LMEvalJobs in an offline mode
// +optional
Offline *OfflineSpec `json:"offline,omitempty"`
// AllowOnline specifies whether the LMEvalJob can directly download remote code, datasets and metrics. Default is false.
// +optional
// +kubebuilder:default:=false
AllowOnline *bool `json:"allowOnline,omitempty"`
// AllowCodeExecution specifies whether the LMEvalJob can execute remote code. Default is false.
// +optional
// +kubebuilder:default:=false
AllowCodeExecution *bool `json:"allowCodeExecution,omitempty"`
}

// IsOffline returns whether this LMEvalJob is configured to run offline
Expand Down
10 changes: 10 additions & 0 deletions api/lmes/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion config/base/params.env
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ lmes-image-pull-policy=Always
lmes-max-batch-size=24
lmes-default-batch-size=8
lmes-detect-device=true

lmes-allow-online=true
lmes-allow-code-execution=true
13 changes: 13 additions & 0 deletions config/component_metadata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
releases:
- name: TrustyAI operator
version: latest
repoUrl: https://github.com/trustyai-explainability/trustyai-service-operator
- name: TrustyAI service
version: latest
repoUrl: https://github.com/trustyai-explainability/trustyai-explainability
- name: TrustyAI LMEval driver
version: latest
repoUrl: https://github.com/trustyai-explainability/trustyai-service-operator
- name: TrustyAI LMEval job
version: latest
repoUrl: https://github.com/opendatahub-io/lm-evaluation-harness
12 changes: 11 additions & 1 deletion config/crd/bases/trustyai.opendatahub.io_lmevaljobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,16 @@ spec:
spec:
description: LMEvalJobSpec defines the desired state of LMEvalJob
properties:
allowCodeExecution:
default: false
description: AllowCodeExecution specifies whether the LMEvalJob can
execute remote code. Default is false.
type: boolean
allowOnline:
default: false
description: AllowOnline specifies whether the LMEvalJob can directly
download remote code, datasets and metrics. Default is false.
type: boolean
batchSize:
description: |-
Batch size for the evaluation. This is used by the models that run and are loaded
Expand Down Expand Up @@ -91,7 +101,7 @@ spec:
type: integer
offline:
description: Offline specifies settings for running LMEvalJobs in
a offline mode
an offline mode
properties:
storage:
description: OfflineStorageSpec defines the storage configuration
Expand Down
2 changes: 2 additions & 0 deletions config/overlays/odh-kueue/params.env
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,5 @@ lmes-image-pull-policy=Always
lmes-max-batch-size=24
lmes-default-batch-size=8
lmes-detect-device=true
lmes-allow-online=true
lmes-allow-code-execution=true
2 changes: 2 additions & 0 deletions config/overlays/odh/params.env
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,5 @@ lmes-image-pull-policy=Always
lmes-max-batch-size=24
lmes-default-batch-size=8
lmes-detect-device=true
lmes-allow-online=true
lmes-allow-code-execution=true
2 changes: 2 additions & 0 deletions config/overlays/rhoai/params.env
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,5 @@ lmes-image-pull-policy=Always
lmes-max-batch-size=24
lmes-default-batch-size=8
lmes-detect-device=true
lmes-allow-online=false
lmes-allow-code-execution=false
175 changes: 132 additions & 43 deletions controllers/job_mgr/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ When Job Manager is an enabled service LMevalJob requires `kueue.x-k8s.io/queue-
curl -L https://github.com/kubernetes-sigs/kueue/releases/download/v0.8.1/manifests.yaml | sed 's/# externalFrameworks/ externalFrameworks/; s/# - "Foo.v1.example.com"/ - "trustyai.opendatahub.io\/lmevaljob"/'|kubectl apply --server-side -f -
```
Create 2 sets of Kueue CRs.
After the kueue-controller-manager deployment is ready, create at least a ClusterQueue, a ResourceFlavor, and a namespaced LocalQueue.
```bash
cat <<EOF | kubectl apply -f -
apiVersion: kueue.x-k8s.io/v1beta1
Expand All @@ -35,22 +35,6 @@ When Job Manager is an enabled service LMevalJob requires `kueue.x-k8s.io/queue-
- coveredResources: ["cpu", "memory"]
flavors:
- name: "default-flavor"
resources:
- name: "cpu"
nominalQuota: 4
- name: "memory"
nominalQuota: 88888888Gi
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
name: "cluster-queue-2"
spec:
namespaceSelector: {} # match all.
resourceGroups:
- coveredResources: ["cpu", "memory"]
flavors:
- name: "default-flavor-2"
resources:
- name: "cpu"
nominalQuota: 4
Expand All @@ -61,17 +45,6 @@ When Job Manager is an enabled service LMevalJob requires `kueue.x-k8s.io/queue-
kind: ResourceFlavor
metadata:
name: "default-flavor"
spec:
nodeLabels:
kubernetes.io/hostname: kueue-worker
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
name: "default-flavor-2"
spec:
nodeLabels:
kubernetes.io/hostname: kueue-worker2
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
Expand All @@ -80,19 +53,9 @@ When Job Manager is an enabled service LMevalJob requires `kueue.x-k8s.io/queue-
name: "user-queue"
spec:
clusterQueue: "cluster-queue"
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
namespace: "default"
name: "user-queue-2"
spec:
clusterQueue: "cluster-queue-2"
EOF
```
1. Install TAS CRDs
```bash
make install
Expand All @@ -117,19 +80,20 @@ When Job Manager is an enabled service LMevalJob requires `kueue.x-k8s.io/queue-
data:
kServeServerless: disabled
lmes-default-batch-size: "8"
lmes-driver-image: quay.io/yhwang/ta-lmes-driver:latest
lmes-driver-image: quay.io/trustyai/ta-lmes-driver:latest
lmes-image-pull-policy: Always
lmes-max-batch-size: "24"
lmes-pod-checking-interval: 10s
lmes-pod-image: quay.io/tedchang/ta-lmes-job:latest
lmes-pod-image: quay.io/trustyai/ta-lmes-job:latest
oauthProxyImage: quay.io/openshift/origin-oauth-proxy:4.14.0
trustyaiOperatorImage: quay.io/tedchang/trustyai-service-operator:latest
trustyaiOperatorImage: quay.io/trustyai/trustyai-service-operator:latest
trustyaiServiceImage: quay.io/trustyai/trustyai-service:latest
EOF
```
Start the controller locally:
```bash
# This file is needed to run the controller outside of a container
mkdir -p /var/run/secrets/kubernetes.io/serviceaccount
echo -n "default">/var/run/secrets/kubernetes.io/serviceaccount/namespace
ENABLED_SERVICES=LMES,JOB_MGR make run
```
Expand All @@ -138,8 +102,129 @@ When Job Manager is an enabled service LMevalJob requires `kueue.x-k8s.io/queue-
INFO Starting workers {"controller": "lmevaljob", "controllerGroup": "trustyai.opendatahub.io", "controllerKind": "LMEvalJob", "worker count": 1}
INFO Starting workers {"controller": "LMEvalJobWorkload", "controllerGroup": "trustyai.opendatahub.io", "controllerKind": "LMEvalJob", "worker count": 1}
```
1. Quota and Node Affinity example. We will create 5 jobs.
1. To admit an lmevaljob to the `user-queue` LocalQueue, specify `kueue.x-k8s.io/queue-name: user-queue` in `metadata.labels`:
```bash
cat <<EOF | kubectl create -f -
apiVersion: trustyai.opendatahub.io/v1alpha1
kind: LMEvalJob
metadata:
labels:
app.kubernetes.io/name: fms-lm-eval-service
app.kubernetes.io/managed-by: kustomize
kueue.x-k8s.io/queue-name: user-queue
generateName: evaljob-sample-
namespace: default
spec:
pod:
container:
resources:
requests:
cpu: 2
suspend: true
model: hf
modelArgs:
- name: pretrained
value: EleutherAI/pythia-70m
taskList:
taskNames:
- unfair_tos
logSamples: true
limit: "5"
EOF
```
Run this command:
```bash
kubectl get lmevaljob,workloads,pod
```
Verify the output is similar to:
```
NAME STATE
lmevaljob.trustyai.opendatahub.io/evaljob-sample-5mdgj Scheduled
NAME QUEUE RESERVED IN ADMITTED FINISHED AGE
workload.kueue.x-k8s.io/lmevaljob-evaljob-sample-5mdgj-b0412 user-queue cluster-queue True 27s
NAME READY STATUS RESTARTS AGE
pod/evaljob-sample-5mdgj 0/1 PodInitializing 0 26s
```
Delete the job:
```bash
kubectl delete lmevaljob $(kubectl get lmevaljob|grep evaljob-sample-|cut -d" " -f1)
```
1. Quota and Node Affinity example. To run an lmevaljob Pod on a particular node, use `spec.nodeLabels` in the ResourceFlavor.
Create 2 sets of Kueue CRs.
```bash
cat <<EOF | kubectl apply -f -
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
name: "cluster-queue"
spec:
namespaceSelector: {} # match all.
resourceGroups:
- coveredResources: ["cpu", "memory"]
flavors:
- name: "default-flavor"
resources:
- name: "cpu"
nominalQuota: 4
- name: "memory"
nominalQuota: 50Gi
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
name: "cluster-queue-2"
spec:
namespaceSelector: {} # match all.
resourceGroups:
- coveredResources: ["cpu", "memory"]
flavors:
- name: "default-flavor-2"
resources:
- name: "cpu"
nominalQuota: 4
- name: "memory"
nominalQuota: 50Gi
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
name: "default-flavor"
spec:
nodeLabels:
kubernetes.io/hostname: kueue-worker
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
name: "default-flavor-2"
spec:
nodeLabels:
kubernetes.io/hostname: kueue-worker2
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
namespace: "default"
name: "user-queue"
spec:
clusterQueue: "cluster-queue"
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
namespace: "default"
name: "user-queue-2"
spec:
clusterQueue: "cluster-queue-2"
EOF
```
We will create 5 lmevaljobs.
Jobs labeled with `user-queue` will be run on `kueue-worker` node.
Job labeled with `user-queue-2` will be run on `kueue-worker2` node.
Job will be Suspended if there is not enough quota.
Expand Down Expand Up @@ -331,6 +416,10 @@ When Job Manager is an enabled service LMevalJob requires `kueue.x-k8s.io/queue-
```
Verify they are in running state:
```bash
watch -d -n5 kubectl get lmevaljob,workloads,pod -owide
```
```
NAME STATE
lmevaljob.trustyai.opendatahub.io/evaljob-sample-8cr8k Running
Expand Down
3 changes: 3 additions & 0 deletions controllers/job_mgr/job_mgr_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,9 @@ func (job *LMEvalJob) GVK() schema.GroupVersionKind {
func convertToAffinity(psi []podset.PodSetInfo) *corev1.Affinity {
if len(psi) > 0 {
nsl := psi[0].NodeSelector // Note there is only 1 element in podset array see PodSets method above.
if len(nsl) == 0 {
return nil
}
nsra := []corev1.NodeSelectorRequirement{}
for k, v := range nsl {
nsr := corev1.NodeSelectorRequirement{
Expand Down
4 changes: 4 additions & 0 deletions controllers/lmes/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ var Options *serviceOptions = &serviceOptions{
MaxBatchSize: DefaultMaxBatchSize,
DetectDevice: DefaultDetectDevice,
DefaultBatchSize: DefaultBatchSize,
AllowOnline: false,
AllowCodeExecution: false,
}

type serviceOptions struct {
Expand All @@ -47,6 +49,8 @@ type serviceOptions struct {
MaxBatchSize int
DefaultBatchSize string
DetectDevice bool
AllowOnline bool
AllowCodeExecution bool
}

func constructOptionsFromConfigMap(log *logr.Logger, configmap *corev1.ConfigMap) error {
Expand Down
2 changes: 2 additions & 0 deletions controllers/lmes/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ const (
MaxBatchSizeKey = "lmes-max-batch-size"
DefaultBatchSizeKey = "lmes-default-batch-size"
DetectDeviceKey = "lmes-detect-device"
AllowOnline = "lmes-allow-online"
AllowCodeExecution = "lmes-allow-code-execution"
DefaultPodImage = "quay.io/trustyai/ta-lmes-job:latest"
DefaultDriverImage = "quay.io/trustyai/ta-lmes-driver:latest"
DefaultPodCheckingInterval = time.Second * 10
Expand Down
Loading

0 comments on commit 4a03d6f

Please sign in to comment.