diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go
index bf9241797c2..0bc1fbf2c1f 100644
--- a/scheduler/reconcile.go
+++ b/scheduler/reconcile.go
@@ -462,8 +462,10 @@ func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool {
 	untainted, migrate, lost, disconnecting, reconnecting, ignore, expiring := all.filterByTainted(a.taintedNodes, a.supportsDisconnectedClients, a.now)
 	desiredChanges.Ignore += uint64(len(ignore))
 
-	// Determine what set of terminal allocations need to be rescheduled
-	untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, false, a.now, a.evalID, a.deployment)
+	// Determine what set of terminal allocations need to be rescheduled and
+	// see that we don't discard allocations that carry important information
+	// for future reschedules or deployments
+	untainted, rescheduleNow, rescheduleLater, informational := untainted.filterByRescheduleable(a.batch, false, a.now, a.evalID, a.deployment)
 
 	// If there are allocations reconnecting we need to reconcile them and
 	// their replacements first because there is specific logic when deciding
@@ -509,7 +511,7 @@ func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool {
 	// the reschedule policy won't be enabled and the lost allocations
 	// wont be rescheduled, and PreventRescheduleOnLost is ignored.
 	if tg.GetDisconnectLostTimeout() != 0 {
-		untaintedDisconnecting, rescheduleDisconnecting, laterDisconnecting := disconnecting.filterByRescheduleable(a.batch, true, a.now, a.evalID, a.deployment)
+		untaintedDisconnecting, rescheduleDisconnecting, laterDisconnecting, _ := disconnecting.filterByRescheduleable(a.batch, true, a.now, a.evalID, a.deployment)
 
 		rescheduleNow = rescheduleNow.union(rescheduleDisconnecting)
 		untainted = untainted.union(untaintedDisconnecting)
@@ -591,7 +593,7 @@ func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool {
 	// * An alloc was lost
 	var place []allocPlaceResult
 	if len(lostLater) == 0 {
-		place = a.computePlacements(tg, nameIndex, untainted, migrate, rescheduleNow, lost, isCanarying)
+		place = a.computePlacements(tg, nameIndex, untainted, migrate, rescheduleNow, lost, isCanarying, informational)
 		if !existingDeployment {
 			dstate.DesiredTotal += len(place)
 		}
@@ -798,7 +800,7 @@ func (a *allocReconciler) computeUnderProvisionedBy(group *structs.TaskGroup, un
 // Placements will meet or exceed group count.
 func (a *allocReconciler) computePlacements(group *structs.TaskGroup,
 	nameIndex *allocNameIndex, untainted, migrate, reschedule, lost allocSet,
-	isCanarying bool) []allocPlaceResult {
+	isCanarying bool, informational []*structs.Allocation) []allocPlaceResult {
 
 	// Add rescheduled placement results
 	var place []allocPlaceResult
@@ -843,9 +845,18 @@ func (a *allocReconciler) computePlacements(group *structs.TaskGroup,
 	// Add remaining placement results
 	if existing < group.Count {
 		for _, name := range nameIndex.Next(uint(group.Count - existing)) {
+
+			// if there are any informational allocs, pop and add them as previousAlloc to
+			// our new placement
+			var a *structs.Allocation
+			if len(informational) > 0 {
+				a, informational = informational[len(informational)-1], informational[:len(informational)-1]
+			}
+
 			place = append(place, allocPlaceResult{
 				name:               name,
 				taskGroup:          group,
+				previousAlloc:      a,
 				downgradeNonCanary: isCanarying,
 			})
 		}
diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go
index 653ea1d9a0d..08087a0b467 100644
--- a/scheduler/reconcile_test.go
+++ b/scheduler/reconcile_test.go
@@ -388,6 +388,9 @@ func TestReconciler_Place_Existing(t *testing.T) {
 		alloc.JobID = job.ID
 		alloc.NodeID = uuid.Generate()
 		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+		// set host volume IDs on running allocations to make sure their presence doesn't
+		// interfere with reconciler behavior
+		alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
 		allocs = append(allocs, alloc)
 	}
 
@@ -429,6 +432,9 @@ func TestReconciler_ScaleDown_Partial(t *testing.T) {
 		alloc.JobID = job.ID
 		alloc.NodeID = uuid.Generate()
 		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+		// set host volume IDs on running allocations to make sure their presence doesn't
+		// interfere with reconciler behavior
+		alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
 		allocs = append(allocs, alloc)
 	}
 
@@ -471,6 +477,9 @@ func TestReconciler_ScaleDown_Zero(t *testing.T) {
 		alloc.JobID = job.ID
 		alloc.NodeID = uuid.Generate()
 		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+		// set host volume IDs on running allocations to make sure their presence doesn't
+		// interfere with reconciler behavior
+		alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
 		allocs = append(allocs, alloc)
 	}
 
@@ -514,6 +523,9 @@ func TestReconciler_ScaleDown_Zero_DuplicateNames(t *testing.T) {
 		alloc.NodeID = uuid.Generate()
 		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i%2))
 		allocs = append(allocs, alloc)
+		// set host volume IDs on running allocations to make sure their presence doesn't
+		// interfere with reconciler behavior
+		alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
 		expectedStopped = append(expectedStopped, i%2)
 	}
 
@@ -552,6 +564,9 @@ func TestReconciler_Inplace(t *testing.T) {
 		alloc.JobID = job.ID
 		alloc.NodeID = uuid.Generate()
 		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+		// set host volume IDs on running allocations to make sure their presence doesn't
+		// interfere with reconciler behavior
+		alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
 		allocs = append(allocs, alloc)
 	}
 
@@ -593,6 +608,9 @@ func TestReconciler_Inplace_ScaleUp(t *testing.T) {
 		alloc.JobID = job.ID
 		alloc.NodeID = uuid.Generate()
 		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+		// set host volume IDs on running allocations to make sure their presence doesn't
+		// interfere with reconciler behavior
+		alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
 		allocs = append(allocs, alloc)
 	}
 
@@ -636,6 +654,9 @@ func TestReconciler_Inplace_ScaleDown(t *testing.T) {
 		alloc.JobID = job.ID
 		alloc.NodeID = uuid.Generate()
 		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+		// set host volume IDs on running allocations to make sure their presence doesn't
+		// interfere with reconciler behavior
+		alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
 		allocs = append(allocs, alloc)
 	}
 
@@ -686,6 +707,9 @@ func TestReconciler_Inplace_Rollback(t *testing.T) {
 		alloc.JobID = job.ID
 		alloc.NodeID = uuid.Generate()
 		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+		// set host volume IDs on running allocations to make sure their presence doesn't
+		// interfere with reconciler behavior
+		alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
 		allocs = append(allocs, alloc)
 	}
 	// allocs[0] is an allocation from version 0
@@ -746,6 +770,9 @@ func TestReconciler_Destructive(t *testing.T) {
 		alloc.JobID = job.ID
 		alloc.NodeID = uuid.Generate()
 		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+		// set host volume IDs on running allocations to make sure their presence doesn't
+		// interfere with reconciler behavior
+		alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
 		allocs = append(allocs, alloc)
 	}
 
@@ -782,6 +809,9 @@ func TestReconciler_DestructiveMaxParallel(t *testing.T) {
 		alloc.JobID = job.ID
 		alloc.NodeID = uuid.Generate()
 		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+		// set host volume IDs on running allocations to make sure their presence doesn't
+		// interfere with reconciler behavior
+		alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
 		allocs = append(allocs, alloc)
 	}
 
@@ -821,6 +851,9 @@ func TestReconciler_Destructive_ScaleUp(t *testing.T) {
 		alloc.JobID = job.ID
 		alloc.NodeID = uuid.Generate()
 		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+		// set host volume IDs on running allocations to make sure their presence doesn't
+		// interfere with reconciler behavior
+		alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
 		allocs = append(allocs, alloc)
 	}
 
@@ -863,6 +896,9 @@ func TestReconciler_Destructive_ScaleDown(t *testing.T) {
 		alloc.JobID = job.ID
 		alloc.NodeID = uuid.Generate()
 		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+		// set host volume IDs on running allocations to make sure their presence doesn't
+		// interfere with reconciler behavior
+		alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
 		allocs = append(allocs, alloc)
 	}
 
@@ -1018,6 +1054,10 @@ func TestReconciler_LostNode_PreventRescheduleOnLost(t *testing.T) {
 		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
 		alloc.DesiredStatus = structs.AllocDesiredStatusRun
 
+		// set host volume IDs on running allocations to make sure their presence doesn't
+		// interfere with reconciler behavior
+		alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
+
 		// Set one of the allocations to failed
 		if i == 4 {
 			alloc.ClientStatus = structs.AllocClientStatusFailed
@@ -1063,6 +1103,126 @@
 	}
 }
 
+func TestReconciler_InformationalAllocs(t *testing.T) {
+	disabledReschedulePolicy := &structs.ReschedulePolicy{
+		Attempts:  0,
+		Unlimited: false,
+	}
+
+	ci.Parallel(t)
+	now := time.Now()
+
+	testCases := []struct {
+		name             string
+		count            int
+		stoppedCount     int
+		failedCount      int
+		reschedulePolicy *structs.ReschedulePolicy
+		expectPlace      int
+		expectStop       int
+		expectIgnore     int
+	}{
+		{
+			name:             "Count 3, 2 allocs failed, 1 stopped, no reschedule",
+			count:            3,
+			stoppedCount:     1,
+			failedCount:      2,
+			reschedulePolicy: disabledReschedulePolicy,
+			expectPlace:      2,
+			expectStop:       1,
+			expectIgnore:     1,
+		},
+		{
+			name:         "Count 1, 1 alloc failed, 1 stopped, reschedule",
+			count:        1,
+			stoppedCount: 1,
+			failedCount:  1,
+			reschedulePolicy: &structs.ReschedulePolicy{
+				Attempts: 1,
+			},
+			expectPlace:  1,
+			expectStop:   2,
+			expectIgnore: 0,
+		},
+		{
+			name:             "Count 2, no allocs failed, 2 stopped, no reschedule",
+			count:            2,
+			stoppedCount:     2,
+			failedCount:      0,
+			reschedulePolicy: disabledReschedulePolicy,
+			expectPlace:      2,
+			expectStop:       1,
+			expectIgnore:     0,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			job := mock.Job()
+			job.TaskGroups[0].Count = tc.count
+			job.TaskGroups[0].ReschedulePolicy = tc.reschedulePolicy
+
+			var allocs []*structs.Allocation
+			for i := 0; i < tc.failedCount; i++ {
+				alloc := mock.Alloc()
+				alloc.Job = job
+				alloc.JobID = job.ID
+				alloc.NodeID = uuid.Generate()
+				alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+				alloc.HostVolumeIDs = []string{"foo"}
+				alloc.DesiredStatus = structs.AllocDesiredStatusRun
+				alloc.ClientStatus = structs.AllocClientStatusFailed
+
+				allocs = append(allocs, alloc)
+			}
+
+			for i := 0; i < tc.stoppedCount; i++ {
+				alloc := mock.Alloc()
+				alloc.Job = job
+				alloc.JobID = job.ID
+				alloc.NodeID = uuid.Generate()
+				alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+				alloc.HostVolumeIDs = []string{"foo"}
+				alloc.DesiredStatus = structs.AllocDesiredStatusStop
+				alloc.ClientStatus = structs.AllocClientStatusComplete
+
+				allocs = append(allocs, alloc)
+			}
+
+			// Build a map of tainted nodes, one down one disconnected
+			tainted := make(map[string]*structs.Node, 2)
+			downNode := mock.Node()
+			downNode.ID = allocs[0].NodeID
+			downNode.Status = structs.NodeStatusDown
+			tainted[downNode.ID] = downNode
+
+			disconnected := mock.Node()
+			disconnected.ID = allocs[1].NodeID
+			disconnected.Status = structs.NodeStatusDisconnected
+			tainted[disconnected.ID] = disconnected
+
+			reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
+				nil, allocs, tainted, "", 50, true, AllocRenconcilerWithNow(now))
+			r := reconciler.Compute()
+
+			// Assert the correct results
+			assertResults(t, r, &resultExpectation{
+				createDeployment:  nil,
+				deploymentUpdates: nil,
+				place:             tc.expectPlace,
+				stop:              tc.expectStop,
+				desiredTGUpdates: map[string]*structs.DesiredUpdates{
+					job.TaskGroups[0].Name: {
+						Place:  uint64(tc.expectPlace),
+						Stop:   uint64(tc.expectStop),
+						Ignore: uint64(tc.expectIgnore),
+					},
+				},
+			})
+		})
+	}
+}
+
 // Tests the reconciler properly handles lost nodes with allocations
 func TestReconciler_LostNode(t *testing.T) {
 	ci.Parallel(t)
diff --git a/scheduler/reconcile_util.go b/scheduler/reconcile_util.go
index 41a56503c7e..47c0f44fd32 100644
--- a/scheduler/reconcile_util.go
+++ b/scheduler/reconcile_util.go
@@ -218,8 +218,8 @@ func (a allocSet) fromKeys(keys ...[]string) allocSet {
 	return from
 }
 
-// filterByTainted takes a set of tainted nodes and filters the allocation set
-// into the following groups:
+// filterByTainted takes a set of tainted nodes and partitions the allocation
+// set into the following groups:
 // 1. Those that exist on untainted nodes
 // 2. Those exist on nodes that are draining
 // 3. Those that exist on lost nodes or have expired
@@ -385,14 +385,25 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS
 	return
 }
 
-// filterByRescheduleable filters the allocation set to return the set of allocations that are either
-// untainted or a set of allocations that must be rescheduled now. Allocations that can be rescheduled
-// at a future time are also returned so that we can create follow up evaluations for them. Allocs are
-// skipped or considered untainted according to logic defined in shouldFilter method.
-func (a allocSet) filterByRescheduleable(isBatch, isDisconnecting bool, now time.Time, evalID string, deployment *structs.Deployment) (allocSet, allocSet, []*delayedRescheduleInfo) {
+// filterByRescheduleable filters the allocation set to return the set of
+// allocations that are either untainted or a set of allocations that must be
+// rescheduled now. Allocations that can be rescheduled at a future time are
+// also returned so that we can create follow up evaluations for them. Allocs
+// are skipped or considered untainted according to logic defined in
+// shouldFilter method.
+//
+// filterByRescheduleable returns an extra slice of allocations as its last
+// output: these allocs are "informational." They will not be rescheduled now
+// or later, but they carry important information for future allocations that
+// might get rescheduled. An example of such allocations are stateful
+// deployments: allocs that require particular host volume IDs.
+func (a allocSet) filterByRescheduleable(
+	isBatch, isDisconnecting bool, now time.Time, evalID string, deployment *structs.Deployment,
+) (allocSet, allocSet, []*delayedRescheduleInfo, []*structs.Allocation) {
 	untainted := make(map[string]*structs.Allocation)
 	rescheduleNow := make(map[string]*structs.Allocation)
 	rescheduleLater := []*delayedRescheduleInfo{}
+	informational := []*structs.Allocation{}
 
 	for _, alloc := range a {
 		// Ignore disconnecting allocs that are already unknown. This can happen
@@ -411,6 +422,12 @@ func (a allocSet) filterByRescheduleable(isBatch, isDisconnecting bool, now time
 			continue
 		}
 
+		// Allocations with host volume IDs can be ignored, but we must keep
+		// the information they carry for future migrated allocs
+		if len(alloc.HostVolumeIDs) > 0 {
+			informational = append(informational, alloc)
+		}
+
 		isUntainted, ignore := shouldFilter(alloc, isBatch)
 		if isUntainted && !isDisconnecting {
 			untainted[alloc.ID] = alloc
@@ -436,7 +453,7 @@ func (a allocSet) filterByRescheduleable(isBatch, isDisconnecting bool, now time
 		}
 	}
 
-	return untainted, rescheduleNow, rescheduleLater
+	return untainted, rescheduleNow, rescheduleLater, informational
 }
 
 // shouldFilter returns whether the alloc should be ignored or considered untainted.
diff --git a/scheduler/reconcile_util_test.go b/scheduler/reconcile_util_test.go
index e4f01d0b675..d38a5db96e8 100644
--- a/scheduler/reconcile_util_test.go
+++ b/scheduler/reconcile_util_test.go
@@ -1875,6 +1875,8 @@ func TestAllocSet_filterByRescheduleable(t *testing.T) {
 	}
 	testJob.TaskGroups[0] = rescheduleTG
 
+	stickyDeploymentJob := mock.Job()
+
 	now := time.Now()
 
 	rt := &structs.RescheduleTracker{
@@ -1895,9 +1897,10 @@ func TestAllocSet_filterByRescheduleable(t *testing.T) {
 		deployment *structs.Deployment
 
 		// expected results
-		untainted allocSet
-		resNow    allocSet
-		resLater  []*delayedRescheduleInfo
+		untainted     allocSet
+		resNow        allocSet
+		resLater      []*delayedRescheduleInfo
+		informational []*structs.Allocation
 	}
 
 	testCases := []testCase{
@@ -1921,8 +1924,9 @@ func TestAllocSet_filterByRescheduleable(t *testing.T) {
 					TaskGroup:    "noRescheduleTG",
 				},
 			},
-			resNow:   allocSet{},
-			resLater: []*delayedRescheduleInfo{},
+			resNow:        allocSet{},
+			resLater:      []*delayedRescheduleInfo{},
+			informational: []*structs.Allocation{},
 		},
 		{
 			name: "batch ignore unknown disconnecting allocs",
@@ -1935,9 +1939,10 @@ func TestAllocSet_filterByRescheduleable(t *testing.T) {
 			all: allocSet{
 				"disconnecting1": {
 					ID:           "disconnection1",
 					ClientStatus: structs.AllocClientStatusUnknown,
 					Job:          testJob,
 				},
 			},
-			untainted: allocSet{},
-			resNow:    allocSet{},
-			resLater:  []*delayedRescheduleInfo{},
+			untainted:     allocSet{},
+			resNow:        allocSet{},
+			resLater:      []*delayedRescheduleInfo{},
+			informational: []*structs.Allocation{},
 		},
 		{
 			name: "batch disconnecting allocation reschedule",
@@ -1962,7 +1967,8 @@ func TestAllocSet_filterByRescheduleable(t *testing.T) {
 					RescheduleTracker: rt,
 				},
 			},
-			resLater: []*delayedRescheduleInfo{},
+			resLater:      []*delayedRescheduleInfo{},
+			informational: []*structs.Allocation{},
 		},
 
 		{
@@ -1991,8 +1997,9 @@ func TestAllocSet_filterByRescheduleable(t *testing.T) {
 						"task": {State: structs.TaskStateDead, Failed: false}},
 				},
 			},
-			resNow:   allocSet{},
-			resLater: []*delayedRescheduleInfo{},
+			resNow:        allocSet{},
+			resLater:      []*delayedRescheduleInfo{},
+			informational: []*structs.Allocation{},
 		},
 		{
 			name: "service disconnecting allocation no reschedule",
@@ -2014,8 +2021,9 @@ func TestAllocSet_filterByRescheduleable(t *testing.T) {
 					TaskGroup:    "noRescheduleTG",
 				},
 			},
-			resNow:   allocSet{},
-			resLater: []*delayedRescheduleInfo{},
+			resNow:        allocSet{},
+			resLater:      []*delayedRescheduleInfo{},
+			informational: []*structs.Allocation{},
 		},
 		{
 			name: "service disconnecting allocation reschedule",
@@ -2040,7 +2048,8 @@ func TestAllocSet_filterByRescheduleable(t *testing.T) {
 					RescheduleTracker: rt,
 				},
 			},
-			resLater: []*delayedRescheduleInfo{},
+			resLater:      []*delayedRescheduleInfo{},
+			informational: []*structs.Allocation{},
 		},
 		{
 			name: "service ignore unknown disconnecting allocs",
@@ -2053,9 +2062,10 @@ func TestAllocSet_filterByRescheduleable(t *testing.T) {
 			all: allocSet{
 				"disconnecting1": {
 					ID:           "disconnection1",
 					ClientStatus: structs.AllocClientStatusUnknown,
 					Job:          testJob,
 				},
 			},
-			untainted: allocSet{},
-			resNow:    allocSet{},
-			resLater:  []*delayedRescheduleInfo{},
+			untainted:     allocSet{},
+			resNow:        allocSet{},
+			resLater:      []*delayedRescheduleInfo{},
+			informational: []*structs.Allocation{},
 		},
 		{
 			name: "service previously rescheduled alloc should not reschedule",
@@ -2070,9 +2080,10 @@ func TestAllocSet_filterByRescheduleable(t *testing.T) {
 					TaskGroup:  "rescheduleTG",
 				},
 			},
-			untainted: allocSet{},
-			resNow:    allocSet{},
-			resLater:  []*delayedRescheduleInfo{},
+			untainted:     allocSet{},
+			resNow:        allocSet{},
+			resLater:      []*delayedRescheduleInfo{},
+			informational: []*structs.Allocation{},
 		},
 		{
 			name: "service complete should be ignored",
@@ -2087,9 +2098,10 @@ func TestAllocSet_filterByRescheduleable(t *testing.T) {
 					TaskGroup:    "rescheduleTG",
 				},
 			},
-			untainted: allocSet{},
-			resNow:    allocSet{},
-			resLater:  []*delayedRescheduleInfo{},
+			untainted:     allocSet{},
+			resNow:        allocSet{},
+			resLater:      []*delayedRescheduleInfo{},
+			informational: []*structs.Allocation{},
 		},
 		{
 			name: "service running allocation no reschedule",
@@ -2111,18 +2123,44 @@ func TestAllocSet_filterByRescheduleable(t *testing.T) {
 					TaskGroup:    "noRescheduleTG",
 				},
 			},
-			resNow:   allocSet{},
-			resLater: []*delayedRescheduleInfo{},
+			resNow:        allocSet{},
+			resLater:      []*delayedRescheduleInfo{},
+			informational: []*structs.Allocation{},
+		},
+		{
+			name:            "allocation with sticky volumes",
+			isDisconnecting: false,
+			isBatch:         false,
+			all: allocSet{
+				"sticky": {
+					ID:            "sticky",
+					DesiredStatus: structs.AllocDesiredStatusStop,
+					ClientStatus:  structs.AllocClientStatusComplete,
+					Job:           stickyDeploymentJob,
+					HostVolumeIDs: []string{"hostVolumeID1"},
+				},
+			},
+			untainted: allocSet{},
+			resNow:    allocSet{},
+			resLater:  []*delayedRescheduleInfo{},
+			informational: []*structs.Allocation{{
+				ID:            "sticky",
+				DesiredStatus: structs.AllocDesiredStatusStop,
+				ClientStatus:  structs.AllocClientStatusComplete,
+				Job:           stickyDeploymentJob,
+				HostVolumeIDs: []string{"hostVolumeID1"},
+			}},
 		},
 	}
 
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
-			untainted, resNow, resLater := tc.all.filterByRescheduleable(tc.isBatch,
+			untainted, resNow, resLater, informational := tc.all.filterByRescheduleable(tc.isBatch,
 				tc.isDisconnecting, now, "evailID", tc.deployment)
 			must.Eq(t, tc.untainted, untainted, must.Sprintf("with-nodes: untainted"))
 			must.Eq(t, tc.resNow, resNow, must.Sprintf("with-nodes: reschedule-now"))
 			must.Eq(t, tc.resLater, resLater, must.Sprintf("with-nodes: rescheduleLater"))
+			must.Eq(t, tc.informational, informational, must.Sprintf("with-nodes: informational"))
 		})
 	}
 }