From 107c5011a969bf4759b8f1fa193042c51942f12b Mon Sep 17 00:00:00 2001
From: ishaansehgal99
Date: Thu, 21 Mar 2024 15:17:26 -0400
Subject: [PATCH] fix: workspace condition

---
 api/v1alpha1/workspace_condition_types.go | 10 +++++++--
 api/v1alpha1/workspace_validation_test.go |  8 +++----
 pkg/controllers/workspace_controller.go   | 26 +++++++++++------------
 pkg/model/interface.go                    |  4 ++--
 pkg/utils/testModel.go                    |  8 +++----
 presets/models/falcon/model.go            | 16 +++++++-------
 presets/models/llama2/model.go            | 12 +++++------
 presets/models/llama2chat/model.go        | 12 +++++------
 presets/models/mistral/model.go           |  8 +++----
 presets/models/phi/model.go               |  4 ++--
 10 files changed, 57 insertions(+), 51 deletions(-)

diff --git a/api/v1alpha1/workspace_condition_types.go b/api/v1alpha1/workspace_condition_types.go
index 9845b8a0c..e14995f12 100644
--- a/api/v1alpha1/workspace_condition_types.go
+++ b/api/v1alpha1/workspace_condition_types.go
@@ -16,8 +16,14 @@ const (
     // WorkspaceConditionTypeInferenceStatus is the state when Inference has been created.
     WorkspaceConditionTypeInferenceStatus = ConditionType("InferenceReady")
 
-    // WorkspaceConditionTypeTuningStatus is the state when Tuning has been created.
-    WorkspaceConditionTypeTuningStatus = ConditionType("TuningReady")
+    // WorkspaceConditionTypeTuningStarted indicates that the tuning Job has been started.
+    WorkspaceConditionTypeTuningStarted = ConditionType("TuningStarted")
+
+    // WorkspaceConditionTypeTuningComplete indicates that the tuning Job has completed successfully.
+    WorkspaceConditionTypeTuningComplete = ConditionType("TuningComplete")
+
+    // WorkspaceConditionTypeTuningFailed indicates that the tuning Job has failed to complete.
+    WorkspaceConditionTypeTuningFailed = ConditionType("TuningFailed")
 
     //WorkspaceConditionTypeDeleting is the Workspace state when starts to get deleted.
     WorkspaceConditionTypeDeleting = ConditionType("WorkspaceDeleting")
diff --git a/api/v1alpha1/workspace_validation_test.go b/api/v1alpha1/workspace_validation_test.go
index 4fc193f52..d196beabc 100644
--- a/api/v1alpha1/workspace_validation_test.go
+++ b/api/v1alpha1/workspace_validation_test.go
@@ -28,7 +28,7 @@ func (*testModel) GetInferenceParameters() *model.PresetParam {
         PerGPUMemoryRequirement:   perGPUMemoryRequirement,
     }
 }
-func (*testModel) GetTrainingParameters() *model.PresetParam {
+func (*testModel) GetTuningParameters() *model.PresetParam {
     return &model.PresetParam{
         GPUCountRequirement:       gpuCountRequirement,
         TotalGPUMemoryRequirement: totalGPUMemoryRequirement,
@@ -38,7 +38,7 @@ func (*testModel) GetTrainingParameters() *model.PresetParam {
 func (*testModel) SupportDistributedInference() bool {
     return false
 }
-func (*testModel) SupportTraining() bool {
+func (*testModel) SupportTuning() bool {
     return true
 }
 
@@ -52,7 +52,7 @@ func (*testModelPrivate) GetInferenceParameters() *model.PresetParam {
         PerGPUMemoryRequirement:   perGPUMemoryRequirement,
     }
 }
-func (*testModelPrivate) GetTrainingParameters() *model.PresetParam {
+func (*testModelPrivate) GetTuningParameters() *model.PresetParam {
     return &model.PresetParam{
         ImageAccessMode:           "private",
         GPUCountRequirement:       gpuCountRequirement,
@@ -63,7 +63,7 @@ func (*testModelPrivate) GetTrainingParameters() *model.PresetParam {
 func (*testModelPrivate) SupportDistributedInference() bool {
     return false
 }
-func (*testModelPrivate) SupportTraining() bool {
+func (*testModelPrivate) SupportTuning() bool {
     return true
 }
 
diff --git a/pkg/controllers/workspace_controller.go b/pkg/controllers/workspace_controller.go
index 2ab2b4604..ef2e4c01e 100644
--- a/pkg/controllers/workspace_controller.go
+++ b/pkg/controllers/workspace_controller.go
@@ -5,13 +5,12 @@ package controllers
 import (
     "context"
     "fmt"
-    "github.com/azure/kaito/pkg/tuning"
     "sort"
     "strings"
     "time"
 
-    appsv1 "k8s.io/api/apps/v1"
-    "k8s.io/utils/clock"
+    "github.com/azure/kaito/pkg/tuning"
+    batchv1 "k8s.io/api/batch/v1"
 
     "github.com/aws/karpenter-core/pkg/apis/v1alpha5"
     kaitov1alpha1 "github.com/azure/kaito/api/v1alpha1"
@@ -22,6 +21,7 @@ import (
     "github.com/azure/kaito/pkg/utils/plugin"
     "github.com/go-logr/logr"
     "github.com/samber/lo"
+    appsv1 "k8s.io/api/apps/v1"
     corev1 "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/api/errors"
     apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -29,6 +29,7 @@ import (
     "k8s.io/apimachinery/pkg/runtime"
     "k8s.io/client-go/tools/record"
     "k8s.io/klog/v2"
+    "k8s.io/utils/clock"
     ctrl "sigs.k8s.io/controller-runtime"
     "sigs.k8s.io/controller-runtime/pkg/client"
     "sigs.k8s.io/controller-runtime/pkg/controller"
@@ -442,22 +443,21 @@ func (c *WorkspaceReconciler) applyTuning(ctx context.Context, wObj *kaitov1alph
         presetName := string(wObj.Tuning.Preset.Name)
         model := plugin.KaitoModelRegister.MustGet(presetName)
 
-        trainingParam := model.GetTrainingParameters()
-
-        existingObj := &appsv1.Deployment{}
+        tuningParam := model.GetTuningParameters()
+        existingObj := &batchv1.Job{}
         if err = resources.GetResource(ctx, wObj.Name, wObj.Namespace, c.Client, existingObj); err == nil {
-            klog.InfoS("A training workload already exists for workspace", "workspace", klog.KObj(wObj))
-            if err = resources.CheckResourceStatus(existingObj, c.Client, trainingParam.WorkloadTimeout); err != nil {
+            klog.InfoS("A tuning workload already exists for workspace", "workspace", klog.KObj(wObj))
+            if err = resources.CheckResourceStatus(existingObj, c.Client, tuningParam.WorkloadTimeout); err != nil {
                 return
             }
         } else if apierrors.IsNotFound(err) {
             var workloadObj client.Object
             // Need to create a new workload
-            workloadObj, err = tuning.CreatePresetTuning(ctx, wObj, trainingParam, c.Client)
+            workloadObj, err = tuning.CreatePresetTuning(ctx, wObj, tuningParam, c.Client)
             if err != nil {
                 return
             }
-            if err = resources.CheckResourceStatus(workloadObj, c.Client, trainingParam.WorkloadTimeout); err != nil {
+            if err = resources.CheckResourceStatus(workloadObj, c.Client, tuningParam.WorkloadTimeout); err != nil {
                 return
             }
         }
@@ -465,7 +465,7 @@
     }()
 
     if err != nil {
-        if updateErr := c.updateStatusConditionIfNotMatch(ctx, wObj, kaitov1alpha1.WorkspaceConditionTypeTuningStatus, metav1.ConditionFalse,
+        if updateErr := c.updateStatusConditionIfNotMatch(ctx, wObj, kaitov1alpha1.WorkspaceConditionTypeTuningFailed, metav1.ConditionFalse,
             "WorkspaceTuningStatusFailed", err.Error()); updateErr != nil {
             klog.ErrorS(updateErr, "failed to update workspace status", "workspace", klog.KObj(wObj))
             return updateErr
@@ -475,8 +475,8 @@
         }
     }
 
-    if err := c.updateStatusConditionIfNotMatch(ctx, wObj, kaitov1alpha1.WorkspaceConditionTypeTuningStatus, metav1.ConditionTrue,
-        "WorkspaceTuningStatusSuccess", "Tuning has been deployed successfully"); err != nil {
+    if err := c.updateStatusConditionIfNotMatch(ctx, wObj, kaitov1alpha1.WorkspaceConditionTypeTuningStarted, metav1.ConditionTrue,
+        "WorkspaceTuningStatusStarted", "Tuning has been deployed successfully"); err != nil {
         klog.ErrorS(err, "failed to update workspace status", "workspace", klog.KObj(wObj))
         return err
     }
diff --git a/pkg/model/interface.go b/pkg/model/interface.go
index 148d80fcf..eba2e0d0e 100644
--- a/pkg/model/interface.go
+++ b/pkg/model/interface.go
@@ -8,9 +8,9 @@ import (
 
 type Model interface {
     GetInferenceParameters() *PresetParam
-    GetTrainingParameters() *PresetParam
+    GetTuningParameters() *PresetParam
     SupportDistributedInference() bool //If true, the model workload will be a StatefulSet, using the torch elastic runtime framework.
-    SupportTraining() bool
+    SupportTuning() bool
 }
 
 // PresetParam defines the preset inference parameters for a model.
diff --git a/pkg/utils/testModel.go b/pkg/utils/testModel.go
index fdf9423c3..f03633d7c 100644
--- a/pkg/utils/testModel.go
+++ b/pkg/utils/testModel.go
@@ -18,7 +18,7 @@ func (*testModel) GetInferenceParameters() *model.PresetParam {
         WorkloadTimeout:     time.Duration(30) * time.Minute,
     }
 }
-func (*testModel) GetTrainingParameters() *model.PresetParam {
+func (*testModel) GetTuningParameters() *model.PresetParam {
     return &model.PresetParam{
         GPUCountRequirement: "1",
         WorkloadTimeout:     time.Duration(30) * time.Minute,
@@ -27,7 +27,7 @@ func (*testModel) GetTrainingParameters() *model.PresetParam {
 func (*testModel) SupportDistributedInference() bool {
     return false
 }
-func (*testModel) SupportTraining() bool {
+func (*testModel) SupportTuning() bool {
     return true
 }
 
@@ -39,7 +39,7 @@ func (*testDistributedModel) GetInferenceParameters() *model.PresetParam {
         WorkloadTimeout:     time.Duration(30) * time.Minute,
     }
 }
-func (*testDistributedModel) GetTrainingParameters() *model.PresetParam {
+func (*testDistributedModel) GetTuningParameters() *model.PresetParam {
     return &model.PresetParam{
         GPUCountRequirement: "1",
         WorkloadTimeout:     time.Duration(30) * time.Minute,
@@ -48,7 +48,7 @@ func (*testDistributedModel) GetTrainingParameters() *model.PresetParam {
 func (*testDistributedModel) SupportDistributedInference() bool {
     return true
 }
-func (*testDistributedModel) SupportTraining() bool {
+func (*testDistributedModel) SupportTuning() bool {
     return true
 }
 
diff --git a/presets/models/falcon/model.go b/presets/models/falcon/model.go
index f99cac208..e6774eed1 100644
--- a/presets/models/falcon/model.go
+++ b/presets/models/falcon/model.go
@@ -69,7 +69,7 @@ func (*falcon7b) GetInferenceParameters() *model.PresetParam {
         Tag:                       PresetFalconTagMap["Falcon7B"],
     }
 }
-func (*falcon7b) GetTrainingParameters() *model.PresetParam {
+func (*falcon7b) GetTuningParameters() *model.PresetParam {
     return &model.PresetParam{
         ModelFamilyName:           "Falcon",
         ImageAccessMode:           string(kaitov1alpha1.ModelImageAccessModePublic),
@@ -88,7 +88,7 @@ func (*falcon7b) GetTrainingParameters() *model.PresetParam {
 func (*falcon7b) SupportDistributedInference() bool {
     return false
 }
-func (*falcon7b) SupportTraining() bool {
+func (*falcon7b) SupportTuning() bool {
     return true
 }
 
@@ -112,13 +112,13 @@ func (*falcon7bInst) GetInferenceParameters() *model.PresetParam {
     }
 }
 
-func (*falcon7bInst) GetTrainingParameters() *model.PresetParam {
+func (*falcon7bInst) GetTuningParameters() *model.PresetParam {
     return nil // It is not recommended/ideal to further fine-tune instruct models - Already been fine-tuned
 }
 func (*falcon7bInst) SupportDistributedInference() bool {
     return false
 }
-func (*falcon7bInst) SupportTraining() bool {
+func (*falcon7bInst) SupportTuning() bool {
     return false
 }
 
@@ -142,7 +142,7 @@ func (*falcon40b) GetInferenceParameters() *model.PresetParam {
     }
 }
 
-func (*falcon40b) GetTrainingParameters() *model.PresetParam {
+func (*falcon40b) GetTuningParameters() *model.PresetParam {
     return &model.PresetParam{
         ModelFamilyName:           "Falcon",
         ImageAccessMode:           string(kaitov1alpha1.ModelImageAccessModePublic),
@@ -160,7 +160,7 @@ func (*falcon40b) GetTrainingParameters() *model.PresetParam {
 func (*falcon40b) SupportDistributedInference() bool {
     return false
 }
-func (*falcon40b) SupportTraining() bool {
+func (*falcon40b) SupportTuning() bool {
     return true
 }
 
@@ -183,12 +183,12 @@ func (*falcon40bInst) GetInferenceParameters() *model.PresetParam {
         Tag:                       PresetFalconTagMap["Falcon40BInstruct"],
     }
 }
-func (*falcon40bInst) GetTrainingParameters() *model.PresetParam {
+func (*falcon40bInst) GetTuningParameters() *model.PresetParam {
     return nil // It is not recommended/ideal to further fine-tune instruct models - Already been fine-tuned
 }
 func (*falcon40bInst) SupportDistributedInference() bool {
     return false
 }
-func (*falcon40bInst) SupportTraining() bool {
+func (*falcon40bInst) SupportTuning() bool {
     return false
 }
diff --git a/presets/models/llama2/model.go b/presets/models/llama2/model.go
index b1e1dc180..c38d9ef4d 100644
--- a/presets/models/llama2/model.go
+++ b/presets/models/llama2/model.go
@@ -56,13 +56,13 @@ func (*llama2Text7b) GetInferenceParameters() *model.PresetParam {
     }
 }
 
-func (*llama2Text7b) GetTrainingParameters() *model.PresetParam {
+func (*llama2Text7b) GetTuningParameters() *model.PresetParam {
     return nil // Currently doesn't support fine-tuning
 }
 func (*llama2Text7b) SupportDistributedInference() bool {
     return false
 }
-func (*llama2Text7b) SupportTraining() bool {
+func (*llama2Text7b) SupportTuning() bool {
     return false
 }
 
@@ -87,13 +87,13 @@ func (*llama2Text13b) GetInferenceParameters() *model.PresetParam {
         // Tag: llama has private image access mode. The image tag is determined by the user.
     }
 }
-func (*llama2Text13b) GetTrainingParameters() *model.PresetParam {
+func (*llama2Text13b) GetTuningParameters() *model.PresetParam {
     return nil // Currently doesn't support fine-tuning
 }
 func (*llama2Text13b) SupportDistributedInference() bool {
     return true
 }
-func (*llama2Text13b) SupportTraining() bool {
+func (*llama2Text13b) SupportTuning() bool {
     return false
 }
 
@@ -118,12 +118,12 @@ func (*llama2Text70b) GetInferenceParameters() *model.PresetParam {
         // Tag: llama has private image access mode. The image tag is determined by the user.
     }
 }
-func (*llama2Text70b) GetTrainingParameters() *model.PresetParam {
+func (*llama2Text70b) GetTuningParameters() *model.PresetParam {
     return nil // Currently doesn't support fine-tuning
 }
 func (*llama2Text70b) SupportDistributedInference() bool {
     return true
 }
-func (*llama2Text70b) SupportTraining() bool {
+func (*llama2Text70b) SupportTuning() bool {
     return false
 }
diff --git a/presets/models/llama2chat/model.go b/presets/models/llama2chat/model.go
index 9108a41d5..1afc17655 100644
--- a/presets/models/llama2chat/model.go
+++ b/presets/models/llama2chat/model.go
@@ -55,13 +55,13 @@ func (*llama2Chat7b) GetInferenceParameters() *model.PresetParam {
         // Tag: llama has private image access mode. The image tag is determined by the user.
     }
 }
-func (*llama2Chat7b) GetTrainingParameters() *model.PresetParam {
+func (*llama2Chat7b) GetTuningParameters() *model.PresetParam {
     return nil // Currently doesn't support fine-tuning
 }
 func (*llama2Chat7b) SupportDistributedInference() bool {
     return false
 }
-func (*llama2Chat7b) SupportTraining() bool {
+func (*llama2Chat7b) SupportTuning() bool {
     return false
 }
 
@@ -86,13 +86,13 @@ func (*llama2Chat13b) GetInferenceParameters() *model.PresetParam {
         // Tag: llama has private image access mode. The image tag is determined by the user.
     }
 }
-func (*llama2Chat13b) GetTrainingParameters() *model.PresetParam {
+func (*llama2Chat13b) GetTuningParameters() *model.PresetParam {
     return nil // Currently doesn't support fine-tuning
 }
 func (*llama2Chat13b) SupportDistributedInference() bool {
     return true
 }
-func (*llama2Chat13b) SupportTraining() bool {
+func (*llama2Chat13b) SupportTuning() bool {
     return false
 }
 
@@ -117,12 +117,12 @@ func (*llama2Chat70b) GetInferenceParameters() *model.PresetParam {
         // Tag: llama has private image access mode. The image tag is determined by the user.
     }
 }
-func (*llama2Chat70b) GetTrainingParameters() *model.PresetParam {
+func (*llama2Chat70b) GetTuningParameters() *model.PresetParam {
     return nil // Currently doesn't support fine-tuning
 }
 func (*llama2Chat70b) SupportDistributedInference() bool {
     return true
 }
-func (*llama2Chat70b) SupportTraining() bool {
+func (*llama2Chat70b) SupportTuning() bool {
     return false
 }
diff --git a/presets/models/mistral/model.go b/presets/models/mistral/model.go
index c4c518feb..9a3dc8217 100644
--- a/presets/models/mistral/model.go
+++ b/presets/models/mistral/model.go
@@ -58,7 +58,7 @@ func (*mistral7b) GetInferenceParameters() *model.PresetParam {
     }
 }
 
-func (*mistral7b) GetTrainingParameters() *model.PresetParam {
+func (*mistral7b) GetTuningParameters() *model.PresetParam {
     return &model.PresetParam{
         ModelFamilyName:           "Mistral",
         ImageAccessMode:           string(kaitov1alpha1.ModelImageAccessModePublic),
@@ -77,7 +77,7 @@ func (*mistral7b) GetTrainingParameters() *model.PresetParam {
 func (*mistral7b) SupportDistributedInference() bool {
     return false
 }
-func (*mistral7b) SupportTraining() bool {
+func (*mistral7b) SupportTuning() bool {
     return true
 }
 
@@ -101,12 +101,12 @@ func (*mistral7bInst) GetInferenceParameters() *model.PresetParam {
     }
 }
 
-func (*mistral7bInst) GetTrainingParameters() *model.PresetParam {
+func (*mistral7bInst) GetTuningParameters() *model.PresetParam {
     return nil // It is not recommended/ideal to further fine-tune instruct models - Already been fine-tuned
 }
 func (*mistral7bInst) SupportDistributedInference() bool {
     return false
 }
-func (*mistral7bInst) SupportTraining() bool {
+func (*mistral7bInst) SupportTuning() bool {
     return false
 }
diff --git a/presets/models/phi/model.go b/presets/models/phi/model.go
index 6a1bfd109..189b9d9ec 100644
--- a/presets/models/phi/model.go
+++ b/presets/models/phi/model.go
@@ -51,7 +51,7 @@ func (*phi2) GetInferenceParameters() *model.PresetParam {
         Tag:                 PresetPhiTagMap["Phi2"],
     }
 }
-func (*phi2) GetTrainingParameters() *model.PresetParam {
+func (*phi2) GetTuningParameters() *model.PresetParam {
     return &model.PresetParam{
         ModelFamilyName:     "Phi",
         ImageAccessMode:     string(kaitov1alpha1.ModelImageAccessModePublic),
@@ -69,6 +69,6 @@ func (*phi2) GetTrainingParameters() *model.PresetParam {
 func (*phi2) SupportDistributedInference() bool {
     return false
 }
-func (*phi2) SupportTraining() bool {
+func (*phi2) SupportTuning() bool {
     return true
 }
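Note (outside the patch itself): the single TuningReady condition is replaced above by three condition types that track the lifecycle of the batch/v1 tuning Job. As a rough illustration of how those conditions could be derived from a Job's status, here is a minimal Go sketch; the helper name tuningConditionFor is hypothetical, the condition strings come from the patch, and KAITO's actual reconciler wiring may differ.

package tuningsketch

import (
    batchv1 "k8s.io/api/batch/v1"
    corev1 "k8s.io/api/core/v1"
)

// tuningConditionFor maps the observed status of a tuning Job onto one of the
// three workspace condition types introduced by this patch. The returned
// strings mirror kaitov1alpha1.WorkspaceConditionTypeTuning{Started,Complete,Failed}.
func tuningConditionFor(job *batchv1.Job) string {
    for _, c := range job.Status.Conditions {
        if c.Status != corev1.ConditionTrue {
            continue
        }
        switch c.Type {
        case batchv1.JobComplete:
            return "TuningComplete"
        case batchv1.JobFailed:
            return "TuningFailed"
        }
    }
    // The Job exists but has neither completed nor failed yet.
    return "TuningStarted"
}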