From c3868c11839beeb9fd57403a7eb58c9de7e18971 Mon Sep 17 00:00:00 2001
From: Sara-KS <50249410+Sara-KS@users.noreply.github.com>
Date: Mon, 20 Nov 2023 15:52:50 -0600
Subject: [PATCH] Fix: Distributed Training Rendezvous error with MCAD v.1.34.1 (#793)

* fix: distributed rendezvous error with MCAD v.1.34.1

* fix: Update tests
---
 torchx/schedulers/kubernetes_mcad_scheduler.py           | 2 +-
 torchx/schedulers/test/kubernetes_mcad_scheduler_test.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/torchx/schedulers/kubernetes_mcad_scheduler.py b/torchx/schedulers/kubernetes_mcad_scheduler.py
index 145036180..f6f651da0 100644
--- a/torchx/schedulers/kubernetes_mcad_scheduler.py
+++ b/torchx/schedulers/kubernetes_mcad_scheduler.py
@@ -436,7 +436,7 @@ def mcad_svc(
                     target_port=int(service_port),
                 )
             ],
-            selector={"appwrapper.workload.codeflare.dev": svc_name},
+            selector={LABEL_UNIQUE_NAME: svc_name},
             session_affinity="None",
             type="ClusterIP",
         ),
diff --git a/torchx/schedulers/test/kubernetes_mcad_scheduler_test.py b/torchx/schedulers/test/kubernetes_mcad_scheduler_test.py
index 71b5dd5ba..b60cb9bf2 100644
--- a/torchx/schedulers/test/kubernetes_mcad_scheduler_test.py
+++ b/torchx/schedulers/test/kubernetes_mcad_scheduler_test.py
@@ -450,7 +450,7 @@ def test_create_mcad_service(self) -> None:
                         target_port=int(service_port),
                     )
                 ],
-                selector={"appwrapper.workload.codeflare.dev": service_name},
+                selector={"app.kubernetes.io/instance": service_name},
                 session_affinity="None",
                 type="ClusterIP",
             ),
@@ -667,7 +667,7 @@ def test_submit_dryrun(self) -> None:
             targetPort: 1234
           publishNotReadyAddresses: true
           selector:
-            appwrapper.workload.codeflare.dev: app-name
+            app.kubernetes.io/instance: app-name
           sessionAffinity: None
           type: ClusterIP
         status: