perf: Improve TopologyGroup node domain iteration (#1820)
Co-authored-by: Jason Deal <[email protected]>
jonathan-innis and jmdeal authored Nov 18, 2024
1 parent b0ddeb8 commit 65aa1bf
Showing 2 changed files with 140 additions and 25 deletions.
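
The core of this change: when the node requirement is an explicit "In" set, iterate over that (usually small) set of node domains and look each one up in the large t.domains map, instead of scanning every tracked domain and filtering by the node requirement. Below is a minimal, self-contained sketch of that pattern under stated assumptions; the names (pickEmptyDomain, hasNodeDomain) are hypothetical and are not the actual Karpenter types or functions.

package main

import "fmt"

// pickEmptyDomain sketches the iteration strategy from this commit: when the node
// requirement explicitly lists allowed domains ("In"), iterate that small list and
// do map lookups into the potentially large domains map; otherwise fall back to
// scanning every tracked domain, as the pre-change code always did.
func pickEmptyDomain(domains map[string]int32, nodeDomainsIn []string, hasNodeDomain func(string) bool) (string, bool) {
	if nodeDomainsIn != nil { // analogous to nodeDomains.Operator() == In
		for _, d := range nodeDomainsIn {
			if count, ok := domains[d]; ok && count == 0 {
				return d, true
			}
		}
		return "", false
	}
	for d, count := range domains { // fallback: scan all tracked domains
		if hasNodeDomain(d) && count == 0 {
			return d, true
		}
	}
	return "", false
}

func main() {
	domains := map[string]int32{"node-1": 1, "node-2": 0, "node-3": 4}
	d, ok := pickEmptyDomain(domains, []string{"node-2"}, nil)
	fmt.Println(d, ok) // prints: node-2 true
}
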
54 changes: 53 additions & 1 deletion pkg/controllers/provisioning/scheduling/topology_test.go
@@ -1642,6 +1642,59 @@ var _ = Describe("Topology", func() {
}
Expect(len(nodeNames)).To(Equal(1))
})
It("should allow two nodes to be created when two pods with matching affinities have incompatible selectors", func() {
affLabels := map[string]string{"security": "s1"}
// the pod needs to provide its own zonal affinity, but we further limit it to only run in test-zone-2
pod1 := test.UnschedulablePod(test.PodOptions{
ObjectMeta: metav1.ObjectMeta{
Labels: affLabels,
},
PodRequirements: []corev1.PodAffinityTerm{{
LabelSelector: &metav1.LabelSelector{
MatchLabels: affLabels,
},
TopologyKey: corev1.LabelTopologyZone,
}},
NodeRequirements: []corev1.NodeSelectorRequirement{
{
Key: corev1.LabelTopologyZone,
Operator: corev1.NodeSelectorOpIn,
Values: []string{"test-zone-2"},
},
},
})
// the pod needs to provide its own zonal affinity, but we further limit it to only run in test-zone-3
pod2 := test.UnschedulablePod(test.PodOptions{
ObjectMeta: metav1.ObjectMeta{
Labels: affLabels,
},
PodRequirements: []corev1.PodAffinityTerm{{
LabelSelector: &metav1.LabelSelector{
MatchLabels: affLabels,
},
TopologyKey: corev1.LabelTopologyZone,
}},
NodeRequirements: []corev1.NodeSelectorRequirement{
{
Key: corev1.LabelTopologyZone,
Operator: corev1.NodeSelectorOpIn,
Values: []string{"test-zone-3"},
},
},
})
ExpectApplied(ctx, env.Client, nodePool)
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod1, pod2)
nodeNames := map[string]struct{}{}

n1 := ExpectScheduled(ctx, env.Client, pod1)
nodeNames[n1.Name] = struct{}{}
Expect(n1.Labels[corev1.LabelTopologyZone]).To(Equal("test-zone-2"))

n2 := ExpectScheduled(ctx, env.Client, pod2)
nodeNames[n2.Name] = struct{}{}
Expect(n2.Labels[corev1.LabelTopologyZone]).To(Equal("test-zone-3"))
Expect(len(nodeNames)).To(Equal(2))
})
It("should allow violation of preferred pod affinity", func() {
topology := []corev1.TopologySpreadConstraint{{
TopologyKey: corev1.LabelHostname,
@@ -1940,7 +1993,6 @@ var _ = Describe("Topology", func() {
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, affPod)
node2 := ExpectScheduled(ctx, env.Client, affPod)
Expect(node1.Labels[corev1.LabelTopologyZone]).ToNot(Equal(node2.Labels[corev1.LabelTopologyZone]))

})
It("should not violate pod anti-affinity on zone (inverse w/existing nodes)", func() {
affLabels := map[string]string{"security": "s2"}
111 changes: 87 additions & 24 deletions pkg/controllers/provisioning/scheduling/topologygroup.go
@@ -105,7 +105,7 @@ func (t *TopologyGroup) Get(pod *v1.Pod, podDomains, nodeDomains *scheduling.Req
case TopologyTypePodAffinity:
return t.nextDomainAffinity(pod, podDomains, nodeDomains)
case TopologyTypePodAntiAffinity:
return t.nextDomainAntiAffinity(podDomains)
return t.nextDomainAntiAffinity(podDomains, nodeDomains)
default:
panic(fmt.Sprintf("Unrecognized topology group type: %s", t.Type))
}
@@ -134,7 +134,6 @@ func (t *TopologyGroup) Register(domains ...string) {
}
}

// Unregister removes the topology group from being aware of the domain
func (t *TopologyGroup) Unregister(domains ...string) {
for _, domain := range domains {
delete(t.domains, domain)
@@ -178,25 +177,45 @@ func (t *TopologyGroup) Hash() uint64 {
// nextDomainTopologySpread returns a scheduling.Requirement that includes a node domain that a pod should be scheduled to.
// If there are multiple eligible domains, we return any random domain that satisfies the `maxSkew` configuration.
// If there are no eligible domains, we return a `DoesNotExist` requirement, implying that we could not satisfy the topologySpread requirement.
// nolint:gocyclo
func (t *TopologyGroup) nextDomainTopologySpread(pod *v1.Pod, podDomains, nodeDomains *scheduling.Requirement) *scheduling.Requirement {
// min count is calculated across all domains
min := t.domainMinCount(podDomains)
selfSelecting := t.selects(pod)

minDomain := ""
minCount := int32(math.MaxInt32)
for domain := range t.domains {
// but we can only choose from the node domains
if nodeDomains.Has(domain) {
// comment from kube-scheduler regarding the viable choices to schedule to based on skew is:
// 'existing matching num' + 'if self-match (1 or 0)' - 'global min matching num' <= 'maxSkew'
count := t.domains[domain]
if selfSelecting {
count++

// If we are explicitly selecting on specific node domains ("In" requirement),
// it is more efficient to iterate over those domains directly.
// This is particularly useful for the hostname topology key, which can have
// many entries in t.domains but only a single nodeDomain
if nodeDomains.Operator() == v1.NodeSelectorOpIn {
for _, domain := range nodeDomains.Values() {
if count, ok := t.domains[domain]; ok {
if selfSelecting {
count++
}
if count-min <= t.maxSkew && count < minCount {
minDomain = domain
minCount = count
}
}
if count-min <= t.maxSkew && count < minCount {
minDomain = domain
minCount = count
}
} else {
for domain := range t.domains {
// but we can only choose from the node domains
if nodeDomains.Has(domain) {
// comment from kube-scheduler regarding the viable choices to schedule to based on skew is:
// 'existing matching num' + 'if self-match (1 or 0)' - 'global min matching num' <= 'maxSkew'
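// Illustrative numbers (an added example, not from the kube-scheduler comment): with
// t.domains = {a: 3, b: 1, c: 1}, min = 1, maxSkew = 1, and a self-selecting pod,
// domain a is skipped (3+1-1 = 3 > 1) while b and c remain viable (1+1-1 = 1 <= 1);
// the first viable domain with the lowest count becomes minDomain.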
count := t.domains[domain]
if selfSelecting {
count++
}
if count-min <= t.maxSkew && count < minCount {
minDomain = domain
minCount = count
}
}
}
}
@@ -230,17 +249,34 @@ func (t *TopologyGroup) domainMinCount(domains *scheduling.Requirement) int32 {
return min
}

// nolint:gocyclo
func (t *TopologyGroup) nextDomainAffinity(pod *v1.Pod, podDomains *scheduling.Requirement, nodeDomains *scheduling.Requirement) *scheduling.Requirement {
options := scheduling.NewRequirement(podDomains.Key, v1.NodeSelectorOpDoesNotExist)
for domain := range t.domains {
if podDomains.Has(domain) && t.domains[domain] > 0 {
options.Insert(domain)

// If we are explicitly selecting on specific node domains ("In" requirement),
// it is more efficient to iterate over those domains directly.
// This is particularly useful for the hostname topology key, which can have
// many entries in t.domains but only a single nodeDomain
if nodeDomains.Operator() == v1.NodeSelectorOpIn {
for _, domain := range nodeDomains.Values() {
if count, ok := t.domains[domain]; podDomains.Has(domain) && ok && count > 0 {
options.Insert(domain)
}
}
} else {
for domain := range t.domains {
if podDomains.Has(domain) && t.domains[domain] > 0 && nodeDomains.Has(domain) {
options.Insert(domain)
}
}
}
if options.Len() != 0 {
return options
}

// If pod is self selecting and no pod has been scheduled yet, we can pick a domain at random to bootstrap scheduling

if options.Len() == 0 && t.selects(pod) {
// If pod is self-selecting and no pod has been scheduled yet OR the pods that have scheduled are
// incompatible with our podDomains, we can pick a domain at random to bootstrap scheduling.
if t.selects(pod) && (len(t.domains) == len(t.emptyDomains) || !t.anyCompatiblePodDomain(podDomains)) {
// First try to find a domain that is within the intersection of pod/node domains. In the case of an in-flight node
// this causes us to pick the domain that the existing in-flight node is already in if possible instead of picking
// a random viable domain.
@@ -263,16 +299,43 @@ func (t *TopologyGroup) nextDomainAffinity(pod *v1.Pod, podDomains *scheduling.R
return options
}

func (t *TopologyGroup) nextDomainAntiAffinity(domains *scheduling.Requirement) *scheduling.Requirement {
options := scheduling.NewRequirement(domains.Key, v1.NodeSelectorOpDoesNotExist)
// anyCompatiblePodDomain reports whether any domain in t.domains is compatible with our podDomains.
// This is only useful for affinity checking: it tells us whether we can schedule the pod to the
// current node because either it is the first pod in the TopologyGroup OR every other domain
// in the TopologyGroup is incompatible with the podDomains
func (t *TopologyGroup) anyCompatiblePodDomain(podDomains *scheduling.Requirement) bool {
for domain := range t.domains {
if podDomains.Has(domain) && t.domains[domain] > 0 {
return true
}
}
return false
}

// nolint:gocyclo
func (t *TopologyGroup) nextDomainAntiAffinity(podDomains, nodeDomains *scheduling.Requirement) *scheduling.Requirement {
options := scheduling.NewRequirement(podDomains.Key, v1.NodeSelectorOpDoesNotExist)
// pods with anti-affinity must schedule to a domain where there are currently none of those pods (an empty
// domain). If there are no empty domains, the pod can't schedule and we don't need to walk the list of
// domains at all. This optimization matters most when launching nodes for a deployment of pods with self
// anti-affinity: the domains map keeps growing, and we would otherwise fully scan it on each iteration.
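// Illustrative example (an assumption added here, not part of the original comment): for a 3-replica
// deployment with self anti-affinity on the hostname key, each placed pod makes its hostname non-empty,
// so later pods only need to consider the shrinking set of empty hostnames rather than every
// registered hostname.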
for domain := range t.emptyDomains {
if domains.Has(domain) && t.domains[domain] == 0 {
options.Insert(domain)

// If we are explicitly selecting on specific node domains ("In" requirement) and there are fewer
// node domains than empty domains, it is more efficient to iterate over the node domains directly.
// This is particularly useful for the hostname topology key, which can have
// many entries in t.domains but only a single nodeDomain
if nodeDomains.Operator() == v1.NodeSelectorOpIn && nodeDomains.Len() < len(t.emptyDomains) {
for _, domain := range nodeDomains.Values() {
if t.emptyDomains.Has(domain) && podDomains.Has(domain) {
options.Insert(domain)
}
}
} else {
for domain := range t.emptyDomains {
if nodeDomains.Has(domain) && podDomains.Has(domain) {
options.Insert(domain)
}
}
}
return options
