// Copyright The Monogon Project Authors.
// SPDX-License-Identifier: Apache-2.0

package supervisor

import (
	"context"
	"errors"
	"fmt"
	"runtime/debug"
	"sort"
	"time"

	"source.monogon.dev/osbase/logtree"
)

// The processor maintains runnable goroutines - i.e., when requested it will
// start one, and once it exits it will record the result and act accordingly.
// It is also responsible for detecting and acting upon supervision subtrees
// that need to be restarted after death (via a 'GC' process).

// processorRequest is a request for the processor. Only one of the fields can
// be set.
type processorRequest struct {
	schedule    *processorRequestSchedule
	died        *processorRequestDied
	waitSettled *processorRequestWaitSettled
}
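// For illustration, requests are delivered to the processor over the s.pReq
// channel. For example, the GC below reschedules a runnable roughly like this
// (the DN "root.example" is hypothetical):
//
//	s.pReq <- &processorRequest{
//		schedule: &processorRequestSchedule{dn: "root.example"},
//	}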

// processorRequestSchedule requests that a given node's runnable be started.
type processorRequestSchedule struct {
	dn string
}

// processorRequestDied is a signal from a runnable goroutine that the runnable
// has died.
type processorRequestDied struct {
	dn  string
	err error
}

// processorRequestWaitSettled requests a notification (by closing the waiter
// channel) once the supervision tree has settled, ie. no GC work has been
// pending for a while.
type processorRequestWaitSettled struct {
	waiter chan struct{}
}

// processor is the main processing loop.
func (s *supervisor) processor(ctx context.Context) {
	s.ilogger.Info("supervisor processor started")

	// Waiters waiting for the GC to be settled.
	var waiters []chan struct{}

	// The GC will run every millisecond if needed. Any time the processor is
	// asked to change the supervision tree (ie a death or a new runnable) it
	// will mark the state as dirty and run the GC on the next millisecond
	// cycle.
	gc := time.NewTicker(1 * time.Millisecond)
	defer gc.Stop()
	clean := true

	// For how many GC cycles the tree has been clean. This is used to notify
	// 'settled' waiters.
	cleanCycles := 0

	markDirty := func() {
		clean = false
		cleanCycles = 0
	}

	for {
		select {
		case <-ctx.Done():
			s.ilogger.Infof("supervisor processor exiting: %v", ctx.Err())
			s.processKill()
			s.ilogger.Info("supervisor exited, starting liquidator to clean up remaining runnables...")
			go s.liquidator()
			return
		case <-gc.C:
			if !clean {
				s.processGC()
			}
			clean = true
			cleanCycles += 1

			// This threshold is somewhat arbitrary. It's a balance between
			// test speed and test reliability.
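			// As a rough estimate based on the constants in this file: with
			// the 1ms GC tick above, 50 clean cycles releases waitSettled
			// waiters about 50ms after the last change to the tree.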
			if cleanCycles > 50 {
				for _, w := range waiters {
					close(w)
				}
				waiters = nil
			}
		case r := <-s.pReq:
			switch {
			case r.schedule != nil:
				s.processSchedule(r.schedule)
				markDirty()
			case r.died != nil:
				s.processDied(r.died)
				markDirty()
			case r.waitSettled != nil:
				waiters = append(waiters, r.waitSettled.waiter)
			default:
				panic(fmt.Errorf("unhandled request %+v", r))
			}
		}
	}
}

// The liquidator is a context-free goroutine which the supervisor starts after
// its context has been canceled. Its job is to take over listening on the
// processing channels that the supervisor processor would usually listen on,
// and implement the minimum amount of logic required to mark existing runnables
// as DEAD.
//
// It exits when all runnables have exited one way or another, and the
// supervision tree is well and truly dead. This will also be reflected by
// liveRunnables returning an empty list.
func (s *supervisor) liquidator() {
	for {
		r := <-s.pReq
		switch {
		case r.schedule != nil:
			s.ilogger.Infof("liquidator: refusing to schedule %s", r.schedule.dn)
			s.mu.Lock()
			n := s.nodeByDN(r.schedule.dn)
			n.state = NodeStateDead
			s.mu.Unlock()
		case r.died != nil:
			s.ilogger.Infof("liquidator: %s exited", r.died.dn)
			s.mu.Lock()
			n := s.nodeByDN(r.died.dn)
			n.state = NodeStateDead
			s.mu.Unlock()
		}
		live := s.liveRunnables()
		if len(live) == 0 {
			s.ilogger.Infof("liquidator: complete, all runnables dead or done")
			return
		}
	}
}

// liveRunnables returns a list of runnable DNs that aren't DONE/DEAD/CANCELED.
// This is used by the liquidator to figure out when its job is done, and by the
// TestHarness to know when to unblock the test cleanup function.
func (s *supervisor) liveRunnables() []string {
	s.mu.RLock()
	defer s.mu.RUnlock()

	// Traverse the supervision tree, making note of live (non-DONE/DEAD/CANCELED)
	// runnables.
	var live []string
	seen := make(map[string]bool)
	q := []*node{s.root}
	for {
		if len(q) == 0 {
			break
		}

		// Pop from the traversal queue.
		el := q[0]
		q = q[1:]

		// Skip already visited runnables (this shouldn't happen because the supervision
		// tree is, well, a tree - but better to stay safe than to get stuck in a loop).
		eldn := el.dn()
		if seen[eldn] {
			continue
		}
		seen[eldn] = true

		if el.state != NodeStateDead && el.state != NodeStateDone && el.state != NodeStateCanceled {
			live = append(live, eldn)
		}

		// Recurse.
		for _, child := range el.children {
			q = append(q, child)
		}
	}

	sort.Strings(live)
	return live
}

// processKill cancels all nodes in the supervision tree. This is only called
// right before exiting the processor, so they do not get automatically
// restarted.
func (s *supervisor) processKill() {
	s.mu.Lock()
	defer s.mu.Unlock()

	// Gather all context cancel functions.
	var cancels []func()
	queue := []*node{s.root}
	for {
		if len(queue) == 0 {
			break
		}

		cur := queue[0]
		queue = queue[1:]

		cancels = append(cancels, cur.ctxC)
		for _, c := range cur.children {
			queue = append(queue, c)
		}
	}

	// Call all context cancels.
	for _, c := range cancels {
		c()
	}
}

// processSchedule starts a node's runnable in a goroutine and records its
// output once it's done.
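// If the runnable panics and propagatePanic is not set, the panic is recovered
// and reported back to the processor as a death whose error contains the panic
// value and a stack trace.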
func (s *supervisor) processSchedule(r *processorRequestSchedule) {
	s.mu.Lock()
	defer s.mu.Unlock()

	n := s.nodeByDN(r.dn)
	if n.state != NodeStateNew {
		panic("programming error: scheduled node not new")
	}
	s.metrics.NotifyNodeState(r.dn, n.state)
	go func() {
		if !s.propagatePanic {
			defer func() {
				if rec := recover(); rec != nil {
					s.pReq <- &processorRequest{
						died: &processorRequestDied{
							dn:  r.dn,
							err: fmt.Errorf("panic: %v, stacktrace: %s", rec, string(debug.Stack())),
						},
					}
				}
			}()
		}

		res := n.runnable(n.ctx)

		s.pReq <- &processorRequest{
			died: &processorRequestDied{
				dn:  r.dn,
				err: res,
			},
		}
	}()
}

// processDied records the result from a runnable goroutine, and updates its
// node state accordingly. If the result is a death and not an expected exit,
// related nodes (ie. children and group siblings) are canceled as well.
func (s *supervisor) processDied(r *processorRequestDied) {
	s.mu.Lock()
	defer s.mu.Unlock()

	// Okay, so a Runnable has quit. What now?
	n := s.nodeByDN(r.dn)
	ctx := n.ctx

	// Simple case: it has signaled Done and quit with no error.
	if n.signaledDone && r.err == nil {
		// Mark the node as DONE.
		n.state = NodeStateDone
		s.metrics.NotifyNodeState(r.dn, n.state)
		return
	}

	// Simple case: the context was canceled and the returned error is the
	// context error.
	if r.err != nil && ctx.Err() != nil && errors.Is(r.err, ctx.Err()) {
		// Mark the node as canceled successfully.
		n.state = NodeStateCanceled
		s.metrics.NotifyNodeState(r.dn, n.state)
		return
	}

	// Otherwise, the Runnable should not have died or quit. Handle
	// accordingly.
	err := r.err
	// A lack of returned error is also an error.
	if err == nil {
		err = fmt.Errorf("returned nil when %s", n.state)
	}

	s.logtree.MustLeveledFor(logtree.DN(n.dn())).Fatalf("runnable returned error: %v", err)

	// Mark as dead.
	n.state = NodeStateDead
	s.metrics.NotifyNodeState(r.dn, n.state)

	// Cancel that node's context, just in case something still depends on it.
	n.ctxC()

	// Cancel all group siblings.
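	// For example (hypothetical names): if runnables 'a' and 'b' were started
	// in the same group, an unexpected death of 'a' also cancels 'b', so that
	// the GC later restarts the pair together instead of leaving 'b' running
	// next to a dead sibling.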
	if n.parent != nil {
		for name := range n.parent.groupSiblings(n.name) {
			if name == n.name {
				continue
			}
			sibling := n.parent.children[name]
			// TODO(q3k): does this need to run in a goroutine, ie. can a
			// context cancel block?
			sibling.ctxC()
		}
	}
}

// processGC runs the GC process. It's not really Garbage Collection, as in, it
// doesn't remove unnecessary tree nodes - but it does find nodes that need to
// be restarted, finds the subset of those that can be restarted, and then
// schedules them for running. As such, it's less of a Garbage Collector and
// more of a Necromancer. However, GC is a friendlier name.
func (s *supervisor) processGC() {
	s.mu.Lock()
	defer s.mu.Unlock()

	// The 'GC' is the main business logic of the supervision tree. It
	// traverses a locked tree and tries to find subtrees that must be
	// restarted (because of a DEAD/CANCELED runnable). It then finds which of
	// these subtrees can actually be restarted, ie. which ones are fully
	// recursively DEAD/CANCELED. It also finds the smallest set of largest
	// subtrees that can be restarted, ie. if there are multiple DEAD runnables
	// that can be restarted at once, it will do so.

	// Phase one: Find all leaves.
	// This is a simple DFS that finds all the leaves of the tree, ie all nodes
	// that do not have child nodes.
	leaves := make(map[string]bool)
	queue := []*node{s.root}
	for {
		if len(queue) == 0 {
			break
		}
		cur := queue[0]
		queue = queue[1:]

		for _, c := range cur.children {
			queue = append([]*node{c}, queue...)
		}

		if len(cur.children) == 0 {
			leaves[cur.dn()] = true
		}
	}

	// Phase two: traverse the tree from the leaves to the root and make note
	// of all subtrees that can be restarted.
	// A subtree is restartable/ready iff every node in that subtree is either
	// CANCELED, DEAD or DONE. Such a 'ready' subtree can be restarted by the
	// supervisor if needed.

	// DNs that we already visited.
	visited := make(map[string]bool)
	// DNs whose subtrees are ready to be restarted.
	// These are all subtrees recursively - ie., root.a.a and root.a will both
	// be marked here.
	ready := make(map[string]bool)

	// We build a queue of nodes to visit, starting from the leaves.
	queue = []*node{}
	for l := range leaves {
		queue = append(queue, s.nodeByDN(l))
	}

	for {
		if len(queue) == 0 {
			break
		}

		cur := queue[0]
		curDn := cur.dn()

		queue = queue[1:]

		// Do we have a decision about our children?
		allVisited := true
		for _, c := range cur.children {
			if !visited[c.dn()] {
				allVisited = false
				break
			}
		}

		// If no decision about the children is available, it means we ended up
		// in this subtree via a shorter path from some lower-order leaf - ie.
		// there is a longer path to a leaf than the one that caused this node
		// to be enqueued. Easy solution: just push the current element back
		// onto the queue and retry later.
		if !allVisited {
			// Push back to queue and wait for a decision later.
			queue = append(queue, cur)
			continue
		}

		// All children have been visited and we have an idea about whether
		// they're ready/restartable. All of the node's children must be
		// restartable in order for this node to be restartable.
		childrenReady := true
		var childrenNotReady []string
		for _, c := range cur.children {
			if !ready[c.dn()] {
				childrenNotReady = append(childrenNotReady, c.dn())
				childrenReady = false
				break
			}
		}

		// In addition to children, the node itself must be restartable (ie.
		// DONE, DEAD or CANCELED).
		curReady := false
		switch cur.state {
		case NodeStateDone:
			curReady = true
		case NodeStateCanceled:
			curReady = true
		case NodeStateDead:
			curReady = true
		default:
		}

		if cur.state == NodeStateDead && !childrenReady {
			s.ilogger.Warningf("Not restarting %s: children not ready to be restarted: %v", curDn, childrenNotReady)
		}

		// Note down that we have an opinion on this node, and record that
		// opinion.
		visited[curDn] = true
		ready[curDn] = childrenReady && curReady

		// Now we can also enqueue the parent of this node for processing.
		if cur.parent != nil && !visited[cur.parent.dn()] {
			queue = append(queue, cur.parent)
		}
	}

	// Phase three: traverse tree from root to find largest subtrees that need
	// to be restarted and are ready to be restarted.

	// All DNs that need to be restarted by the GC process.
	want := make(map[string]bool)
	// All DNs that need to be restarted and can be restarted by the GC process
	// - a subset of 'want' DNs.
	can := make(map[string]bool)
	// The set difference between 'want' and 'can' is the set of all nodes that
	// should be restarted but can't be yet (ie. because a child is still in
	// the process of being canceled).

	// Traverse from the root.
	queue = []*node{s.root}
	for {
		if len(queue) == 0 {
			break
		}

		cur := queue[0]
		queue = queue[1:]

		// If this node's context is canceled and it has exited, it should be
		// restarted.
		exited := cur.state == NodeStateDead || cur.state == NodeStateCanceled || cur.state == NodeStateDone
		if cur.ctx.Err() != nil && exited {
			want[cur.dn()] = true
		}

		// If it should be restarted and is ready to be restarted...
		if want[cur.dn()] && ready[cur.dn()] {
			// And its parent context is valid (ie hasn't been canceled), mark
			// it as restartable.
			if cur.parent == nil || cur.parent.ctx.Err() == nil {
				can[cur.dn()] = true
				continue
			}
		}

		// Otherwise, traverse further down the tree to see if something else
		// needs to be done.
		for _, c := range cur.children {
			queue = append(queue, c)
		}
	}

	// Reinitialize and reschedule all subtrees that can be restarted.
	for dn := range can {
		n := s.nodeByDN(dn)

		// Only back off when the node unexpectedly died - not when it got
		// canceled.
		bo := time.Duration(0)
		if n.state == NodeStateDead {
			bo = n.bo.NextBackOff()
		}

		// Prepare node for rescheduling - remove its children, reset its state
		// to new.
		n.reset()
		s.ilogger.Infof("rescheduling supervised node %s with backoff %s", dn, bo.String())

		// Reschedule node runnable to run after backoff.
		go func(n *node, bo time.Duration) {
			select {
			case <-time.After(bo):
				s.pReq <- &processorRequest{
					schedule: &processorRequestSchedule{dn: n.dn()},
				}
			case <-n.ctx.Done():
				s.pReq <- &processorRequest{
					died: &processorRequestDied{
						dn:  n.dn(),
						err: n.ctx.Err(),
					},
				}
			}
		}(n, bo)
	}
}