Skip to content

Commit

Permalink
sync: sync up dev/lm-eval branch with main branch (#336)
Browse files Browse the repository at this point in the history
* [CI] Run tests from trustyai-tests (#279)

* Change Dockerfile to clone trustyai-tests

* Add PYTEST_MARKERS env and remove TESTS_REGEX

* RHOAIENG-12274: Update operator's overlays (#287)

* Update operator's overlays

* Update kustomization.yaml

* Add devflag printout to GH Action comment (#289)

* Add timeout loop to DSC install (#305)

* RHOAIENG-13625: Add DBAvailable status to CR (#304)

* Add DBAvailable status to CR

* Remove probes

* Add KServe destination rule for Inference Services in the ServiceMesh (#315)

* Add DestinationRule creation for KServe serverless

* Add permissions for destination rules

* Add role for destination rules

* Add missing role for creating destination rules

* Fix spacing in DestinationRule template

* Add check if DestinationRule CRD is present before creating it (#316)

* Add check for DestinationRule CRD

* Add API extensions to operator's scheme

* Add permission for CRD resource

* Fix operator metrics service target port (#320)

* Add readiness probes (#312)

* Enable KServe serverless in the rhoai overlay (#321)

* Update overlay images (#331)

* Add correct CA cert to JDBC (#324)

* Add correct CA cert to JDBC

* Add require SSL

* Support for VirtualServices for InferenceLogger traffic (#332)

* Generate KServe Inference Logger in conformance with DestinationRule and VirtualService

* Add VirtualService creation for models in the mesh

* Add permissions for VirtualServices

* Update manifests for VirtualServices

* Fix VirtualServiceName variable

* fix yaml linter after the sync

Signed-off-by: Yihong Wang <[email protected]>

* tidy the go.mod and go.sum as well

Signed-off-by: Yihong Wang <[email protected]>

---------

Signed-off-by: Yihong Wang <[email protected]>
Co-authored-by: Adolfo Aguirrezabal <[email protected]>
Co-authored-by: Rui Vieira <[email protected]>
Co-authored-by: Rob Geada <[email protected]>
Co-authored-by: Rui Vieira <[email protected]>
  • Loading branch information
5 people authored Oct 21, 2024
1 parent faf468b commit 471738b
Show file tree
Hide file tree
Showing 27 changed files with 601 additions and 68 deletions.
10 changes: 10 additions & 0 deletions .github/workflows/build-and-push.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ jobs:
if: env.BUILD_CONTEXT == 'ci'
run: |
sed -i "s#quay.io/trustyai/trustyai-service-operator:latest#${{ env.IMAGE_NAME }}:$TAG#" ./config/base/params.env
sed -i "s#quay.io/trustyai/trustyai-service-operator:latest#${{ env.IMAGE_NAME }}:$TAG#" ./config/overlays/odh/params.env
sed -i "s#quay.io/trustyai/trustyai-service-operator:latest#${{ env.IMAGE_NAME }}:$TAG#" ./config/overlays/rhoai/params.env
rm -Rf $(ls . | grep -v config)
rm -Rf .gitignore .dockerignore .github .git .yamllint.yaml
# push to ci-manifest repo
Expand Down Expand Up @@ -146,4 +148,12 @@ jobs:
📦 [LMES job image](https://quay.io/trustyai/ta-lmes-job:${{ github.event.pull_request.head.sha }}): `quay.io/trustyai/ta-lmes-job:${{ github.event.pull_request.head.sha }}`
🗂️ [CI manifests](https://github.com/trustyai-explainability/trustyai-service-operator-ci/tree/operator-${{ env.TAG }})
```
devFlags:
manifests:
- contextDir: config
sourcePath: ''
uri: https://api.github.com/repos/trustyai-explainability/trustyai-service-operator-ci/tarball/operator-${{ env.TAG }}
```
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,20 @@ through its `status` field. Below are the status types and reasons that are avai
| `PVCAvailable` | `PVCNotFound` | `PersistentVolumeClaim` not found. |
| `PVCAvailable` | `PVCFound` | `PersistentVolumeClaim` found. |

#### Database Status

| Status Type | Status Reason | Description |
|---------------|-------------------------|---------------------------------------------------|
| `DBAvailable` | `DBCredentialsNotFound` | Database credentials secret not found |
| `DBAvailable` | `DBCredentialsError` | Database credentials malformed (e.g. missing key) |
| `DBAvailable` | `DBConnectionError` | Service error connecting to the database |
| `DBAvailable` | `DBAvailable` | Successfully connected to the database |


#### Status Behavior

- If a PVC is not available, the `Ready` status of `TrustyAIService` will be set to `False`.
- If in database mode, any `DBAvailable` reason other than `DBAvailable` will set the `TrustyAIService` to `Not Ready`.
- However, if `InferenceServices` are not found, the `Ready` status of `TrustyAIService` will not be affected, _i.e._, if it is `Ready` by all other conditions, it will remain so.

## Contributing
Expand Down
2 changes: 2 additions & 0 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
kservev1beta1 "github.com/kserve/kserve/pkg/apis/serving/v1beta1"
routev1 "github.com/openshift/api/route/v1"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"

// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
// to ensure that exec-entrypoint and run can make use of them.
Expand Down Expand Up @@ -58,6 +59,7 @@ func init() {
utilruntime.Must(kservev1alpha1.AddToScheme(scheme))
utilruntime.Must(kservev1beta1.AddToScheme(scheme))
utilruntime.Must(routev1.AddToScheme(scheme))
utilruntime.Must(apiextensionsv1.AddToScheme(scheme))
//+kubebuilder:scaffold:scheme
}

Expand Down
1 change: 1 addition & 0 deletions config/base/params.env
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ lmes-image-pull-policy=Always
lmes-max-batch-size=24
lmes-default-batch-size=8
lmes-detect-device=true

2 changes: 1 addition & 1 deletion config/overlays/rhoai/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@ patchesStrategicMerge:
configMapGenerator:
- env: params.env
behavior: merge
name: config
name: config
2 changes: 1 addition & 1 deletion config/rbac/auth_proxy_service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,6 @@ spec:
- name: https
port: 8443
protocol: TCP
targetPort: 8080
targetPort: 8081
selector:
control-plane: controller-manager
21 changes: 21 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ rules:
- list
- update
- watch
- apiGroups:
- apiextensions.k8s.io
resources:
- customresourcedefinitions
verbs:
- get
- list
- watch
- apiGroups:
- apps
resources:
Expand Down Expand Up @@ -99,6 +107,19 @@ rules:
- create
- list
- watch
- apiGroups:
- networking.istio.io
resources:
- destinationrules
- virtualservices
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- rbac.authorization.k8s.io
resources:
Expand Down
15 changes: 15 additions & 0 deletions controllers/tas/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ const (
StatusTypePVCAvailable = "PVCAvailable"
StatusTypeRouteAvailable = "RouteAvailable"
StatusTypeAvailable = "Available"
StatusTypeDBAvailable = "DBAvailable"
)

// Status reasons
Expand All @@ -59,6 +60,10 @@ const (
StatusReasonRouteFound = "RouteFound"
StatusAvailable = "AllComponentsReady"
StatusNotAvailable = "NotAllComponentsReady"
StatusDBCredentialsNotFound = "DBCredentialsNotFound"
StatusDBCredentialsError = "DBCredentialsError"
StatusDBConnectionError = "DBConnectionError"
StatusDBAvailable = "DBAvailable"
)

// Event reasons
Expand All @@ -68,4 +73,14 @@ const (
EventReasonServiceMonitorCreated = "ServiceMonitorCreated"
)

// Container state reasons
const (
// StateReasonCrashLoopBackOff is the value compared against a container's
// State.Waiting.Reason when checking whether a pod is crash-looping.
StateReasonCrashLoopBackOff = "CrashLoopBackOff"
)

// Phases
const (
PhaseReady = "Ready"
PhaseNotReady = "Not Ready"
)

const migrationAnnotationKey = "trustyai.opendatahub.io/db-migration"
62 changes: 62 additions & 0 deletions controllers/tas/database.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package tas

import (
"context"
"strings"

trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/tas/v1alpha1"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// checkDatabaseAccessible reports whether the "trustyai-service" container of
// the instance's Deployment is currently running.
//
// The Deployment is looked up by the instance's name and namespace. A missing
// Deployment yields (false, nil); any other lookup error is returned as-is.
// Pods are only inspected when the Deployment has an Available=True condition;
// they are selected with the Deployment's selector match labels.
//
// For each "trustyai-service" container status, in order:
//   - a Running state returns (true, nil);
//   - a last termination with reason "Error" whose message contains
//     "Socket fail to connect to host:address" returns (false, nil);
//   - a Waiting state with reason CrashLoopBackOff returns (false, nil).
//
// If no container status triggers one of these, the result is (false, nil).
func (r *TrustyAIServiceReconciler) checkDatabaseAccessible(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (bool, error) {
	key := types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace}
	deployment := &appsv1.Deployment{}
	if getErr := r.Get(ctx, key, deployment); getErr != nil {
		if errors.IsNotFound(getErr) {
			return false, nil
		}
		return false, getErr
	}

	for _, condition := range deployment.Status.Conditions {
		// Only an Available=True condition warrants looking at the pods.
		if condition.Type != appsv1.DeploymentAvailable || condition.Status != corev1.ConditionTrue {
			continue
		}

		pods := &corev1.PodList{}
		listErr := r.List(ctx, pods,
			client.InNamespace(instance.Namespace),
			client.MatchingLabels(deployment.Spec.Selector.MatchLabels),
		)
		if listErr != nil {
			return false, listErr
		}

		for i := range pods.Items {
			for _, status := range pods.Items[i].Status.ContainerStatuses {
				if status.Name != "trustyai-service" {
					continue
				}

				lastExit := status.LastTerminationState.Terminated
				waiting := status.State.Waiting

				// The first matching case decides the outcome; a container that
				// matches none of them lets the scan move on to the next one.
				switch {
				case status.State.Running != nil:
					return true, nil
				case lastExit != nil && lastExit.Reason == "Error" &&
					strings.Contains(lastExit.Message, "Socket fail to connect to host:address"):
					return false, nil
				case waiting != nil && waiting.Reason == StateReasonCrashLoopBackOff:
					return false, nil
				}
			}
		}
	}

	return false, nil
}
31 changes: 26 additions & 5 deletions controllers/tas/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@ import (
"reflect"
"strconv"

trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/tas/v1alpha1"
"github.com/trustyai-explainability/trustyai-service-operator/controllers/constants"
templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/tas/templates"

trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/tas/v1alpha1"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
)

Expand Down Expand Up @@ -74,13 +74,13 @@ func (r *TrustyAIServiceReconciler) createDeploymentObject(ctx context.Context,
}

if instance.Spec.Storage.IsStorageDatabase() {
_, err := r.getSecret(ctx, instance.Name+"-db-tls", instance.Namespace)
_, err := r.getSecret(ctx, instance.Name+"-db-ca", instance.Namespace)
if err != nil {
deploymentConfig.UseDBTLSCerts = false
log.FromContext(ctx).Error(err, "Using insecure database connection. Certificates "+instance.Name+"-db-tls not found")
log.FromContext(ctx).Info("Using insecure database connection. Certificates " + instance.Name + "-db-ca not found")
} else {
deploymentConfig.UseDBTLSCerts = true
log.FromContext(ctx).Info("Using secure database connection with certificates " + instance.Name + "-db-tls")
log.FromContext(ctx).Info("Using secure database connection with certificates " + instance.Name + "-db-ca")
}
} else {
deploymentConfig.UseDBTLSCerts = false
Expand Down Expand Up @@ -203,6 +203,7 @@ func (r *TrustyAIServiceReconciler) ensureDeployment(ctx context.Context, instan
return nil
}

// checkDeploymentReady verifies that a TrustyAI service deployment is ready
func (r *TrustyAIServiceReconciler) checkDeploymentReady(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (bool, error) {
deployment := &appsv1.Deployment{}

Expand All @@ -217,6 +218,26 @@ func (r *TrustyAIServiceReconciler) checkDeploymentReady(ctx context.Context, in
for _, cond := range deployment.Status.Conditions {
if cond.Type == appsv1.DeploymentAvailable && cond.Status == corev1.ConditionTrue {
if deployment.Status.ReadyReplicas == *deployment.Spec.Replicas {
podList := &corev1.PodList{}
listOpts := []client.ListOption{
client.InNamespace(instance.Namespace),
client.MatchingLabels(deployment.Spec.Selector.MatchLabels),
}
if err := r.List(ctx, podList, listOpts...); err != nil {
return false, err
}

for _, pod := range podList.Items {
for _, cs := range pod.Status.ContainerStatuses {
if cs.State.Waiting != nil && cs.State.Waiting.Reason == StateReasonCrashLoopBackOff {
return false, nil
}
if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 {
return false, nil
}
}
}

return true, nil
}
}
Expand Down
89 changes: 89 additions & 0 deletions controllers/tas/destination_rule.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package tas

import (
"context"
"fmt"
"reflect"

trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/tas/v1alpha1"
templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/tas/templates"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/log"
)

const (
// destinationRuleTemplatePath is the template path passed to
// templateParser.ParseResource when rendering the DestinationRule.
destinationRuleTemplatePath = "service/destination-rule.tmpl.yaml"
// destinationRuleCDRName is the name of the DestinationRule
// CustomResourceDefinition looked up by isDestinationRuleCRDPresent.
// NOTE(review): "CDR" in the identifier looks like a transposition of "CRD".
destinationRuleCDRName = "destinationrules.networking.istio.io"
)

// DestinationRuleConfig has the variables for the DestinationRule template.
// ensureDestinationRule populates it and hands it to
// templateParser.ParseResource together with destinationRuleTemplatePath.
type DestinationRuleConfig struct {
// Name is set from the TrustyAIService instance's name.
Name string
// Namespace is set from the TrustyAIService instance's namespace.
Namespace string
// DestinationRuleName is the name of the DestinationRule to create;
// ensureDestinationRule uses the instance name suffixed with "-internal".
DestinationRuleName string
}

// isDestinationRuleCRDPresent reports whether the DestinationRule CRD
// (destinationRuleCDRName) can be fetched through the reconciler's client.
//
// It returns (true, nil) when the CRD is found and (false, nil) when the
// lookup fails with a NotFound error. Any other lookup error is returned
// wrapped with %w, so callers can still inspect the underlying error with
// errors.Is / errors.As.
func (r *TrustyAIServiceReconciler) isDestinationRuleCRDPresent(ctx context.Context) (bool, error) {
	crd := &apiextensionsv1.CustomResourceDefinition{}

	// CRDs are cluster-scoped, so the lookup key carries no namespace.
	err := r.Get(ctx, types.NamespacedName{Name: destinationRuleCDRName}, crd)
	switch {
	case err == nil:
		// Found
		return true, nil
	case errors.IsNotFound(err):
		// Not found
		return false, nil
	default:
		return false, fmt.Errorf("error getting "+destinationRuleCDRName+" CRD: %w", err)
	}
}

// ensureDestinationRule makes sure a DestinationRule named
// "<instance name>-internal" exists in the instance's namespace.
//
// If a DestinationRule with that name already exists it is left untouched
// (it is neither compared with the template nor updated). Otherwise one is
// rendered from destinationRuleTemplatePath, given the instance as its
// controller reference, and created.
//
// Lookup and creation failures are wrapped with %w so the underlying API
// error stays inspectable by callers; template parsing and owner-reference
// errors are returned unchanged.
func (r *TrustyAIServiceReconciler) ensureDestinationRule(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) error {
	destinationRuleName := instance.Name + "-internal"

	// DestinationRule is handled as an unstructured object, so the kind and
	// API version have to be set explicitly before the lookup.
	existingDestinationRule := &unstructured.Unstructured{}
	existingDestinationRule.SetKind("DestinationRule")
	existingDestinationRule.SetAPIVersion("networking.istio.io/v1beta1")

	// Check if the DestinationRule already exists
	err := r.Get(ctx, types.NamespacedName{Name: destinationRuleName, Namespace: instance.Namespace}, existingDestinationRule)
	if err == nil {
		// DestinationRule exists
		return nil
	}
	if !errors.IsNotFound(err) {
		return fmt.Errorf("failed to check for existing DestinationRule: %w", err)
	}

	destinationRuleConfig := DestinationRuleConfig{
		Name:                instance.Name,
		Namespace:           instance.Namespace,
		DestinationRuleName: destinationRuleName,
	}

	destinationRule, err := templateParser.ParseResource[unstructured.Unstructured](destinationRuleTemplatePath, destinationRuleConfig, reflect.TypeOf(&unstructured.Unstructured{}))
	if err != nil {
		log.FromContext(ctx).Error(err, "could not parse the DestinationRule template")
		return err
	}

	if err := ctrl.SetControllerReference(instance, destinationRule, r.Scheme); err != nil {
		return err
	}

	if err := r.Create(ctx, destinationRule); err != nil {
		return fmt.Errorf("failed to create DestinationRule: %w", err)
	}

	return nil
}
Loading

0 comments on commit 471738b

Please sign in to comment.