Skip to content

Commit

Permalink
Address comments
Browse files Browse the repository at this point in the history
Signed-off-by: ted chang <[email protected]>
  • Loading branch information
tedhtchang committed Oct 25, 2024
1 parent 2003c61 commit e34252f
Show file tree
Hide file tree
Showing 9 changed files with 64 additions and 43 deletions.
2 changes: 1 addition & 1 deletion Dockerfile.lmes-job
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,5 @@ ENV PYTHONPATH=/opt/app-root/src/.local/lib/python3.11/site-packages:/opt/app-ro
ENV HF_HOME=/opt/app-root/src/hf_home
ENV UNITXT_ARTIFACTORIES=/opt/app-root/src/my_catalogs

USER 1001030000:0
USER 65532:65532
CMD ["/opt/app-root/bin/python"]
5 changes: 5 additions & 0 deletions api/lmes/v1alpha1/lmevaljob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,11 @@ type LMEvalJob struct {
Status LMEvalJobStatus `json:"status,omitempty"`
}

// GetPodName returns the name used for the pod that belongs to this job.
// The pod name is currently identical to the LMEvalJob's own name.
func (j *LMEvalJob) GetPodName() string {
return j.Name
}

// +kubebuilder:object:root=true

// LMEvalJobList contains a list of LMEvalJob
Expand Down
2 changes: 1 addition & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ func main() {
var probeAddr string
var configMap string
var enabledServices controllers.EnabledServices
flag.StringVar(&metricsAddr, "metrics-bind-address", ":9443", "The address and port the metric endpoint binds to.")
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address and port the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
"Enable leader election for controller manager. "+
Expand Down
28 changes: 13 additions & 15 deletions controllers/job_mgr/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,28 +105,26 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: trustyai-service-operator-config
labels:
name: trustyai-service-operator-config
labels:
app.kubernetes.io/part-of: trustyai
annotations:
annotations:
internal.config.kubernetes.io/generatorBehavior: unspecified
internal.config.kubernetes.io/prefixes: trustyai-service-operator-
internal.config.kubernetes.io/previousKinds: ConfigMap,ConfigMap
internal.config.kubernetes.io/previousNames: config,trustyai-service-operator-config
internal.config.kubernetes.io/previousNamespaces: default,default
data:
kServeServerless: disabled
lmes-default-batch-size: "8"
lmes-driver-image: quay.io/yhwang/ta-lmes-driver:latest
lmes-grpc-port: "8082"
lmes-grpc-service: lmes-grpc
lmes-image-pull-policy: Always
lmes-max-batch-size: "24"
lmes-pod-checking-interval: 10s
lmes-pod-image: quay.io/tedchang/ta-lmes-job:latest
oauthProxyImage: quay.io/openshift/origin-oauth-proxy:4.14.0
trustyaiOperatorImage: quay.io/tedchang/trustyai-service-operator:latest
trustyaiServiceImage: quay.io/trustyai/trustyai-service:latest
kServeServerless: disabled
lmes-default-batch-size: "8"
lmes-driver-image: quay.io/yhwang/ta-lmes-driver:latest
lmes-image-pull-policy: Always
lmes-max-batch-size: "24"
lmes-pod-checking-interval: 10s
lmes-pod-image: quay.io/tedchang/ta-lmes-job:latest
oauthProxyImage: quay.io/openshift/origin-oauth-proxy:4.14.0
trustyaiOperatorImage: quay.io/tedchang/trustyai-service-operator:latest
trustyaiServiceImage: quay.io/trustyai/trustyai-service:latest
EOF
```
Start the controller locally:
Expand Down
16 changes: 16 additions & 0 deletions controllers/job_mgr/constants.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
/*
Copyright 2024 IBM Corporation.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package job_mgr

const (
Expand Down
43 changes: 23 additions & 20 deletions controllers/job_mgr/job_mgr_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ func ControllerSetUp(mgr manager.Manager, ns, configmap string, recorder record.
if err := jobframework.SetupWorkloadOwnerIndex(ctx, mgr.GetFieldIndexer(), workloadv1alpha1.GroupVersion.WithKind("LMEvalJob")); err != nil {
return fmt.Errorf("workload indexer: %w", err)
}
lmes.Job_mgr_enabled = true
lmes.JobMgrEnabled = true
return jobframework.NewGenericReconcilerFactory(
func() jobframework.GenericJob { return &LMEvalJob{} },
func(b *builder.Builder, c client.Client) *builder.Builder {
Expand Down Expand Up @@ -116,7 +116,7 @@ func (job *LMEvalJob) PodSets() []kueue.PodSet {
log := log.FromContext(context.TODO())
pod := lmes.CreatePod(lmes.Options, &job.LMEvalJob, log)
podSet := kueue.PodSet{
Name: job.Status.PodName,
Name: job.GetPodName(),
Count: 1,
Template: corev1.PodTemplateSpec{Spec: pod.Spec},
}
Expand Down Expand Up @@ -148,26 +148,29 @@ func (job *LMEvalJob) GVK() schema.GroupVersionKind {
return workloadv1alpha1.GroupVersion.WithKind("LMEvalJob")
}

// convertToAffinity converts the NodeSelector of the first PodSetInfo into a required node affinity for Pod.Spec.Affinity.
func convertToAffinity(psi []podset.PodSetInfo) *corev1.Affinity {
// Note there is only 1 element in podset array see PodSets method above.
nsl := psi[0].NodeSelector
nsra := []corev1.NodeSelectorRequirement{}
for k, v := range nsl {
nsr := corev1.NodeSelectorRequirement{
Key: k,
Operator: "In",
Values: []string{v},
if len(psi) > 0 {
nsl := psi[0].NodeSelector // Note there is only 1 element in podset array see PodSets method above.
nsra := []corev1.NodeSelectorRequirement{}
for k, v := range nsl {
nsr := corev1.NodeSelectorRequirement{
Key: k,
Operator: "In",
Values: []string{v},
}
nsra = append(nsra, nsr)
}
nsra = append(nsra, nsr)
}
nsta := []corev1.NodeSelectorTerm{}
nsta = append(nsta, corev1.NodeSelectorTerm{MatchExpressions: nsra})
affinity := corev1.Affinity{
NodeAffinity: &corev1.NodeAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{
NodeSelectorTerms: nsta,
nsta := []corev1.NodeSelectorTerm{}
nsta = append(nsta, corev1.NodeSelectorTerm{MatchExpressions: nsra})
affinity := corev1.Affinity{
NodeAffinity: &corev1.NodeAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{
NodeSelectorTerms: nsta,
},
},
},
}
return &affinity
}
return &affinity
return nil
}
2 changes: 1 addition & 1 deletion controllers/lmes/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import (
)

// JobMgrEnabled is set to true by the job_mgr ControllerSetUp function.
var Job_mgr_enabled bool
var JobMgrEnabled bool
var Options *serviceOptions = &serviceOptions{
DriverImage: DefaultDriverImage,
PodImage: DefaultPodImage,
Expand Down
1 change: 0 additions & 1 deletion controllers/lmes/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,4 @@ const (
DefaultBatchSize = 8
DefaultDetectDevice = true
ServiceName = "LMES"
EnableKueue = false
)
8 changes: 4 additions & 4 deletions controllers/lmes/lmevaljob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ func (r *LMEvalJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
job.Status.State = lmesv1alpha1.NewJobState
}

if job.Spec.Suspend && Job_mgr_enabled {
if job.Spec.Suspend && JobMgrEnabled {
return r.handleSuspend(ctx, log, job)
}

Expand Down Expand Up @@ -408,7 +408,7 @@ func (r *LMEvalJobReconciler) handleNewCR(ctx context.Context, log logr.Logger,

// Create the pod successfully. Wait for the driver to update the status
job.Status.State = lmesv1alpha1.ScheduledJobState
job.Status.PodName = pod.Name
job.Status.PodName = job.GetPodName()
job.Status.LastScheduleTime = &currentTime
if err := r.Status().Update(ctx, job); err != nil {
log.Error(err, "unable to update LMEvalJob status (pod creation done)")
Expand Down Expand Up @@ -453,7 +453,7 @@ func (r *LMEvalJobReconciler) checkScheduledPod(ctx context.Context, log logr.Lo
if err := r.Status().Update(ctx, job); err != nil {
log.Error(err, "unable to update LMEvalJob status for pod failure")
}
log.Info("detect an error on the job's pod. marked the job as done", "name", job.Name)
log.Info("detect an error on the job's pod. marked the job as done", "name", job.GetPodName())
return ctrl.Result{}, err
} else if pod.Status.ContainerStatuses[mainIdx].State.Running == nil {
return r.pullingJobs.addOrUpdate(string(job.GetUID()), Options.PodCheckingInterval), nil
Expand Down Expand Up @@ -679,7 +679,7 @@ func CreatePod(svcOpts *serviceOptions, job *lmesv1alpha1.LMEvalJob, log logr.Lo
APIVersion: "v1",
},
ObjectMeta: v1.ObjectMeta{
Name: job.Name,
Name: job.GetPodName(),
Namespace: job.Namespace,
OwnerReferences: []v1.OwnerReference{
{
Expand Down

0 comments on commit e34252f

Please sign in to comment.