diff --git a/pkg/controllers/provisioning/scheduling/topology_test.go b/pkg/controllers/provisioning/scheduling/topology_test.go
index 42e9bf730d..86b06b63fa 100644
--- a/pkg/controllers/provisioning/scheduling/topology_test.go
+++ b/pkg/controllers/provisioning/scheduling/topology_test.go
@@ -1642,6 +1642,59 @@ var _ = Describe("Topology", func() {
             }
             Expect(len(nodeNames)).To(Equal(1))
         })
+        It("should allow two nodes to be created when two pods with matching affinities have incompatible selectors", func() {
+            affLabels := map[string]string{"security": "s1"}
+            // the pod needs to provide its own zonal affinity, but we further limit it to only being on test-zone-2
+            pod1 := test.UnschedulablePod(test.PodOptions{
+                ObjectMeta: metav1.ObjectMeta{
+                    Labels: affLabels,
+                },
+                PodRequirements: []corev1.PodAffinityTerm{{
+                    LabelSelector: &metav1.LabelSelector{
+                        MatchLabels: affLabels,
+                    },
+                    TopologyKey: corev1.LabelTopologyZone,
+                }},
+                NodeRequirements: []corev1.NodeSelectorRequirement{
+                    {
+                        Key:      corev1.LabelTopologyZone,
+                        Operator: corev1.NodeSelectorOpIn,
+                        Values:   []string{"test-zone-2"},
+                    },
+                },
+            })
+            // the pod needs to provide its own zonal affinity, but we further limit it to only being on test-zone-3
+            pod2 := test.UnschedulablePod(test.PodOptions{
+                ObjectMeta: metav1.ObjectMeta{
+                    Labels: affLabels,
+                },
+                PodRequirements: []corev1.PodAffinityTerm{{
+                    LabelSelector: &metav1.LabelSelector{
+                        MatchLabels: affLabels,
+                    },
+                    TopologyKey: corev1.LabelTopologyZone,
+                }},
+                NodeRequirements: []corev1.NodeSelectorRequirement{
+                    {
+                        Key:      corev1.LabelTopologyZone,
+                        Operator: corev1.NodeSelectorOpIn,
+                        Values:   []string{"test-zone-3"},
+                    },
+                },
+            })
+            ExpectApplied(ctx, env.Client, nodePool)
+            ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod1, pod2)
+            nodeNames := map[string]struct{}{}
+
+            n1 := ExpectScheduled(ctx, env.Client, pod1)
+            nodeNames[n1.Name] = struct{}{}
+            Expect(n1.Labels[corev1.LabelTopologyZone]).To(Equal("test-zone-2"))
+
+            n2 := ExpectScheduled(ctx, env.Client, pod2)
+            nodeNames[n2.Name] = struct{}{}
+            Expect(n2.Labels[corev1.LabelTopologyZone]).To(Equal("test-zone-3"))
+            Expect(len(nodeNames)).To(Equal(2))
+        })
         It("should allow violation of preferred pod affinity", func() {
             topology := []corev1.TopologySpreadConstraint{{
                 TopologyKey: corev1.LabelHostname,
@@ -1940,7 +1993,6 @@ var _ = Describe("Topology", func() {
             ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, affPod)
             node2 := ExpectScheduled(ctx, env.Client, affPod)
             Expect(node1.Labels[corev1.LabelTopologyZone]).ToNot(Equal(node2.Labels[corev1.LabelTopologyZone]))
-
         })
         It("should not violate pod anti-affinity on zone (inverse w/existing nodes)", func() {
             affLabels := map[string]string{"security": "s2"}
diff --git a/pkg/controllers/provisioning/scheduling/topologygroup.go b/pkg/controllers/provisioning/scheduling/topologygroup.go
index f19148e4d1..9dd5a9bea7 100644
--- a/pkg/controllers/provisioning/scheduling/topologygroup.go
+++ b/pkg/controllers/provisioning/scheduling/topologygroup.go
@@ -105,7 +105,7 @@ func (t *TopologyGroup) Get(pod *v1.Pod, podDomains, nodeDomains *scheduling.Req
     case TopologyTypePodAffinity:
         return t.nextDomainAffinity(pod, podDomains, nodeDomains)
     case TopologyTypePodAntiAffinity:
-        return t.nextDomainAntiAffinity(podDomains)
+        return t.nextDomainAntiAffinity(podDomains, nodeDomains)
     default:
         panic(fmt.Sprintf("Unrecognized topology group type: %s", t.Type))
     }
@@ -134,7 +134,6 @@ func (t *TopologyGroup) Register(domains ...string) {
     }
 }
 
-// Unregister removes the topology group from being aware of the domain
 func (t *TopologyGroup) Unregister(domains ...string) {
     for _, domain := range domains {
         delete(t.domains, domain)
     }
 }
@@ -178,6 +177,7 @@ func (t *TopologyGroup) Hash() uint64 {
 // nextDomainTopologySpread returns a scheduling.Requirement that includes a node domain that a pod should be scheduled to.
 // If there are multiple eligible domains, we return any random domain that satisfies the `maxSkew` configuration.
 // If there are no eligible domains, we return a `DoesNotExist` requirement, implying that we could not satisfy the topologySpread requirement.
+// nolint:gocyclo
 func (t *TopologyGroup) nextDomainTopologySpread(pod *v1.Pod, podDomains, nodeDomains *scheduling.Requirement) *scheduling.Requirement {
     // min count is calculated across all domains
     min := t.domainMinCount(podDomains)
@@ -185,18 +185,37 @@ func (t *TopologyGroup) nextDomainTopologySpread(pod *v1.Pod, podDomains, nodeDo
 
     minDomain := ""
     minCount := int32(math.MaxInt32)
-    for domain := range t.domains {
-        // but we can only choose from the node domains
-        if nodeDomains.Has(domain) {
-            // comment from kube-scheduler regarding the viable choices to schedule to based on skew is:
-            // 'existing matching num' + 'if self-match (1 or 0)' - 'global min matching num' <= 'maxSkew'
-            count := t.domains[domain]
-            if selfSelecting {
-                count++
+
+    // If we are explicitly selecting on specific node domains ("In" requirement),
+    // this is going to be more efficient to iterate through
+    // This is particularly useful when considering the hostname topology key that can have a
+    // lot of t.domains but only a single nodeDomain
+    if nodeDomains.Operator() == v1.NodeSelectorOpIn {
+        for _, domain := range nodeDomains.Values() {
+            if count, ok := t.domains[domain]; ok {
+                if selfSelecting {
+                    count++
+                }
+                if count-min <= t.maxSkew && count < minCount {
+                    minDomain = domain
+                    minCount = count
+                }
             }
-            if count-min <= t.maxSkew && count < minCount {
-                minDomain = domain
-                minCount = count
+        }
+    } else {
+        for domain := range t.domains {
+            // but we can only choose from the node domains
+            if nodeDomains.Has(domain) {
+                // comment from kube-scheduler regarding the viable choices to schedule to based on skew is:
+                // 'existing matching num' + 'if self-match (1 or 0)' - 'global min matching num' <= 'maxSkew'
+                count := t.domains[domain]
+                if selfSelecting {
+                    count++
+                }
+                if count-min <= t.maxSkew && count < minCount {
+                    minDomain = domain
+                    minCount = count
+                }
             }
         }
     }
@@ -230,17 +249,34 @@ func (t *TopologyGroup) domainMinCount(domains *scheduling.Requirement) int32 {
     return min
 }
 
+// nolint:gocyclo
 func (t *TopologyGroup) nextDomainAffinity(pod *v1.Pod, podDomains *scheduling.Requirement, nodeDomains *scheduling.Requirement) *scheduling.Requirement {
     options := scheduling.NewRequirement(podDomains.Key, v1.NodeSelectorOpDoesNotExist)
-    for domain := range t.domains {
-        if podDomains.Has(domain) && t.domains[domain] > 0 {
-            options.Insert(domain)
+
+    // If we are explicitly selecting on specific node domains ("In" requirement),
+    // this is going to be more efficient to iterate through
+    // This is particularly useful when considering the hostname topology key that can have a
+    // lot of t.domains but only a single nodeDomain
+    if nodeDomains.Operator() == v1.NodeSelectorOpIn {
+        for _, domain := range nodeDomains.Values() {
+            if count, ok := t.domains[domain]; podDomains.Has(domain) && ok && count > 0 {
+                options.Insert(domain)
+            }
+        }
+    } else {
+        for domain := range t.domains {
+            if podDomains.Has(domain) && t.domains[domain] > 0 && nodeDomains.Has(domain) {
+                options.Insert(domain)
+            }
         }
     }
+    if options.Len() != 0 {
+        return options
+    }
 
-    // If pod is self selecting and no pod has been scheduled yet, we can pick a domain at random to bootstrap scheduling
-
-    if options.Len() == 0 && t.selects(pod) {
+    // If pod is self-selecting and no pod has been scheduled yet OR the pods that have scheduled are
+    // incompatible with our podDomains, we can pick a domain at random to bootstrap scheduling.
+    if t.selects(pod) && (len(t.domains) == len(t.emptyDomains) || !t.anyCompatiblePodDomain(podDomains)) {
         // First try to find a domain that is within the intersection of pod/node domains. In the case of an in-flight node
         // this causes us to pick the domain that the existing in-flight node is already in if possible instead of picking
         // a random viable domain.
@@ -263,16 +299,43 @@ func (t *TopologyGroup) nextDomainAffinity(pod *v1.Pod, podDomains *scheduling.R
     return options
 }
 
-func (t *TopologyGroup) nextDomainAntiAffinity(domains *scheduling.Requirement) *scheduling.Requirement {
-    options := scheduling.NewRequirement(domains.Key, v1.NodeSelectorOpDoesNotExist)
+// anyCompatiblePodDomain validates whether any t.domain is compatible with our podDomains
+// This is only useful in affinity checking because it tells us whether we can schedule the pod
+// to the current node since it is the first pod that exists in the TopologyGroup OR all other domains
+// in the TopologyGroup are incompatible with the podDomains
+func (t *TopologyGroup) anyCompatiblePodDomain(podDomains *scheduling.Requirement) bool {
+    for domain := range t.domains {
+        if podDomains.Has(domain) && t.domains[domain] > 0 {
+            return true
+        }
+    }
+    return false
+}
+
+// nolint:gocyclo
+func (t *TopologyGroup) nextDomainAntiAffinity(podDomains, nodeDomains *scheduling.Requirement) *scheduling.Requirement {
+    options := scheduling.NewRequirement(podDomains.Key, v1.NodeSelectorOpDoesNotExist)
     // pods with anti-affinity must schedule to a domain where there are currently none of those pods (an empty
     // domain). If there are none of those domains, then the pod can't schedule and we don't need to walk this
     // list of domains. The use case where this optimization is really great is when we are launching nodes for
     // a deployment of pods with self anti-affinity. The domains map here continues to grow, and we continue to
     // fully scan it each iteration.
-    for domain := range t.emptyDomains {
-        if domains.Has(domain) && t.domains[domain] == 0 {
-            options.Insert(domain)
+
+    // If we are explicitly selecting on specific node domains ("In" requirement) and the number of node domains
+    // is less than our empty domains, this is going to be more efficient to iterate through
+    // This is particularly useful when considering the hostname topology key that can have a
+    // lot of t.domains but only a single nodeDomain
+    if nodeDomains.Operator() == v1.NodeSelectorOpIn && nodeDomains.Len() < len(t.emptyDomains) {
+        for _, domain := range nodeDomains.Values() {
+            if t.emptyDomains.Has(domain) && podDomains.Has(domain) {
+                options.Insert(domain)
+            }
+        }
+    } else {
+        for domain := range t.emptyDomains {
+            if nodeDomains.Has(domain) && podDomains.Has(domain) {
+                options.Insert(domain)
+            }
         }
     }
     return options
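
Note on the recurring pattern in nextDomainTopologySpread, nextDomainAffinity, and nextDomainAntiAffinity above: when the node requirement is an In selector (often a single hostname), the patch walks that small value set and probes the domain-count map, instead of scanning every tracked domain and filtering with nodeDomains.Has. The sketch below illustrates that iteration-order idea in isolation; the requirement type and candidateDomains helper are simplified stand-ins invented for this example, not Karpenter's scheduling.Requirement API, and the code is not part of the patch.

// iteration_sketch.go - illustrative only.
package main

import "fmt"

// requirement is a simplified stand-in for a scheduling requirement: either an
// explicit "In" set of allowed values, or unconstrained (an "Exists"-style match).
type requirement struct {
    in     bool
    values map[string]struct{}
}

func (r requirement) has(v string) bool {
    if !r.in {
        return true // unconstrained: every value matches
    }
    _, ok := r.values[v]
    return ok
}

// candidateDomains returns the domains that currently host at least one matching
// pod and are allowed by both the pod and the node requirement. When the node
// requirement is an "In" selector (often a single hostname), we walk its small
// value set and probe the count map; otherwise we fall back to a full scan.
func candidateDomains(domainCounts map[string]int32, podReq, nodeReq requirement) []string {
    var out []string
    if nodeReq.in {
        for domain := range nodeReq.values {
            if count, ok := domainCounts[domain]; ok && count > 0 && podReq.has(domain) {
                out = append(out, domain)
            }
        }
        return out
    }
    for domain, count := range domainCounts {
        if count > 0 && podReq.has(domain) && nodeReq.has(domain) {
            out = append(out, domain)
        }
    }
    return out
}

func main() {
    counts := map[string]int32{"node-1": 0, "node-2": 3, "node-3": 1}
    nodeReq := requirement{in: true, values: map[string]struct{}{"node-2": {}}}
    podReq := requirement{} // no pod-side restriction
    fmt.Println(candidateDomains(counts, podReq, nodeReq)) // prints [node-2]
}

The result is the same either way; only the cost differs, which matters most for the hostname topology key, where the set of tracked domains grows with every node while the node requirement usually names just one host.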