blob: 7f2ffa4f3fe0c90608ded6cbb12481af7612afac [file] [log] [blame]
// Copyright 2020 The Monogon Project Authors.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
16
17package supervisor
18
19import (
20 "context"
21 "fmt"
22 "testing"
23 "time"
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010024)
25
Serge Bazanski35e43d12021-07-06 13:12:14 +020026// waitSettle waits until the supervisor reaches a 'settled' state - ie., one
27// where no actions have been performed for a number of GC cycles.
28// This is used in tests only.
29func (s *supervisor) waitSettle(ctx context.Context) error {
30 waiter := make(chan struct{})
31 s.pReq <- &processorRequest{
32 waitSettled: &processorRequestWaitSettled{
33 waiter: waiter,
34 },
35 }
36
37 select {
38 case <-ctx.Done():
39 return ctx.Err()
40 case <-waiter:
41 return nil
42 }
43}
44
45// waitSettleError wraps waitSettle to fail a test if an error occurs, eg. the
46// context is canceled.
47func (s *supervisor) waitSettleError(ctx context.Context, t *testing.T) {
48 err := s.waitSettle(ctx)
49 if err != nil {
50 t.Fatalf("waitSettle: %v", err)
51 }
52}
53
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010054func runnableBecomesHealthy(healthy, done chan struct{}) Runnable {
55 return func(ctx context.Context) error {
56 Signal(ctx, SignalHealthy)
57
58 go func() {
59 if healthy != nil {
60 healthy <- struct{}{}
61 }
62 }()
63
64 <-ctx.Done()
65
Serge Bazanski579015a2021-11-18 13:20:20 +010066 if done != nil {
67 done <- struct{}{}
68 }
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010069
70 return ctx.Err()
71 }
72}
73
74func runnableSpawnsMore(healthy, done chan struct{}, levels int) Runnable {
75 return func(ctx context.Context) error {
76 if levels > 0 {
77 err := RunGroup(ctx, map[string]Runnable{
78 "a": runnableSpawnsMore(nil, nil, levels-1),
79 "b": runnableSpawnsMore(nil, nil, levels-1),
80 })
81 if err != nil {
82 return err
83 }
84 }
85
86 Signal(ctx, SignalHealthy)
87
88 go func() {
89 if healthy != nil {
90 healthy <- struct{}{}
91 }
92 }()
93
94 <-ctx.Done()
95
Serge Bazanski579015a2021-11-18 13:20:20 +010096 if done != nil {
97 done <- struct{}{}
98 }
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010099 return ctx.Err()
100 }
101}
102
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200103// rc is a Remote Controlled runnable. It is a generic runnable used for
104// testing the supervisor.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100105type rc struct {
106 req chan rcRunnableRequest
107}
108
109type rcRunnableRequest struct {
110 cmd rcRunnableCommand
111 stateC chan rcRunnableState
112}
113
// rcRunnableCommand identifies an action requested from an rc runnable.
type rcRunnableCommand int

const (
	// rcRunnableCommandBecomeHealthy makes the runnable signal SignalHealthy.
	rcRunnableCommandBecomeHealthy rcRunnableCommand = iota
	// rcRunnableCommandBecomeDone makes the runnable signal SignalDone.
	rcRunnableCommandBecomeDone
	// rcRunnableCommandDie makes the runnable return an error.
	rcRunnableCommandDie
	// rcRunnableCommandPanic makes the runnable panic.
	rcRunnableCommandPanic
	// rcRunnableCommandState makes the runnable report its current state on
	// the request's stateC channel.
	rcRunnableCommandState
)
123
// rcRunnableState is the state an rc runnable reports about itself.
type rcRunnableState int

const (
	// rcRunnableStateNew is the state of a runnable that has just started and
	// has not yet been commanded to become healthy or done.
	rcRunnableStateNew rcRunnableState = iota
	// rcRunnableStateHealthy is the state after a become-healthy command.
	rcRunnableStateHealthy
	// rcRunnableStateDone is the state after a become-done command.
	rcRunnableStateDone
)
131
132func (r *rc) becomeHealthy() {
133 r.req <- rcRunnableRequest{cmd: rcRunnableCommandBecomeHealthy}
134}
135
136func (r *rc) becomeDone() {
137 r.req <- rcRunnableRequest{cmd: rcRunnableCommandBecomeDone}
138}
139func (r *rc) die() {
140 r.req <- rcRunnableRequest{cmd: rcRunnableCommandDie}
141}
142
143func (r *rc) panic() {
144 r.req <- rcRunnableRequest{cmd: rcRunnableCommandPanic}
145}
146
147func (r *rc) state() rcRunnableState {
148 c := make(chan rcRunnableState)
149 r.req <- rcRunnableRequest{
150 cmd: rcRunnableCommandState,
151 stateC: c,
152 }
153 return <-c
154}
155
156func (r *rc) waitState(s rcRunnableState) {
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200157 // This is poll based. Making it non-poll based would make the RC runnable
158 // logic a bit more complex for little gain.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100159 for {
160 got := r.state()
161 if got == s {
162 return
163 }
164 time.Sleep(10 * time.Millisecond)
165 }
166}
167
168func newRC() *rc {
169 return &rc{
170 req: make(chan rcRunnableRequest),
171 }
172}
173
174// Remote Controlled Runnable
175func (r *rc) runnable() Runnable {
176 return func(ctx context.Context) error {
177 state := rcRunnableStateNew
178
179 for {
180 select {
181 case <-ctx.Done():
182 return ctx.Err()
183 case r := <-r.req:
184 switch r.cmd {
185 case rcRunnableCommandBecomeHealthy:
186 Signal(ctx, SignalHealthy)
187 state = rcRunnableStateHealthy
188 case rcRunnableCommandBecomeDone:
189 Signal(ctx, SignalDone)
190 state = rcRunnableStateDone
191 case rcRunnableCommandDie:
192 return fmt.Errorf("died on request")
193 case rcRunnableCommandPanic:
194 panic("at the disco")
195 case rcRunnableCommandState:
196 r.stateC <- state
197 }
198 }
199 }
200 }
201}
202
203func TestSimple(t *testing.T) {
204 h1 := make(chan struct{})
205 d1 := make(chan struct{})
206 h2 := make(chan struct{})
207 d2 := make(chan struct{})
208
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100209 ctx, ctxC := context.WithCancel(context.Background())
210 defer ctxC()
Serge Bazanskic7359672020-10-30 16:38:57 +0100211 s := New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100212 err := RunGroup(ctx, map[string]Runnable{
213 "one": runnableBecomesHealthy(h1, d1),
214 "two": runnableBecomesHealthy(h2, d2),
215 })
216 if err != nil {
217 return err
218 }
219 Signal(ctx, SignalHealthy)
220 Signal(ctx, SignalDone)
221 return nil
Serge Bazanski19bb4122020-05-04 17:57:50 +0200222 }, WithPropagatePanic)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100223
224 // Expect both to start running.
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200225 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100226 select {
227 case <-h1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200228 default:
229 t.Fatalf("runnable 'one' didn't start")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100230 }
231 select {
232 case <-h2:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200233 default:
234 t.Fatalf("runnable 'one' didn't start")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100235 }
236}
237
238func TestSimpleFailure(t *testing.T) {
239 h1 := make(chan struct{})
240 d1 := make(chan struct{})
241 two := newRC()
242
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200243 ctx, ctxC := context.WithTimeout(context.Background(), 10*time.Second)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100244 defer ctxC()
Serge Bazanskic7359672020-10-30 16:38:57 +0100245 s := New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100246 err := RunGroup(ctx, map[string]Runnable{
247 "one": runnableBecomesHealthy(h1, d1),
248 "two": two.runnable(),
249 })
250 if err != nil {
251 return err
252 }
253 Signal(ctx, SignalHealthy)
254 Signal(ctx, SignalDone)
255 return nil
Serge Bazanski19bb4122020-05-04 17:57:50 +0200256 }, WithPropagatePanic)
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200257 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100258
259 two.becomeHealthy()
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200260 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100261 // Expect one to start running.
262 select {
263 case <-h1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200264 default:
265 t.Fatalf("runnable 'one' didn't start")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100266 }
267
268 // Kill off two, one should restart.
269 two.die()
Serge Bazanski579015a2021-11-18 13:20:20 +0100270 <-d1
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100271
272 // And one should start running again.
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200273 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100274 select {
275 case <-h1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200276 default:
277 t.Fatalf("runnable 'one' didn't restart")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100278 }
279}
280
281func TestDeepFailure(t *testing.T) {
282 h1 := make(chan struct{})
283 d1 := make(chan struct{})
284 two := newRC()
285
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200286 ctx, ctxC := context.WithTimeout(context.Background(), 10*time.Second)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100287 defer ctxC()
Serge Bazanskic7359672020-10-30 16:38:57 +0100288 s := New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100289 err := RunGroup(ctx, map[string]Runnable{
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200290 "one": runnableSpawnsMore(h1, d1, 5),
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100291 "two": two.runnable(),
292 })
293 if err != nil {
294 return err
295 }
296 Signal(ctx, SignalHealthy)
297 Signal(ctx, SignalDone)
298 return nil
Serge Bazanski19bb4122020-05-04 17:57:50 +0200299 }, WithPropagatePanic)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100300
301 two.becomeHealthy()
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200302 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100303 // Expect one to start running.
304 select {
305 case <-h1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200306 default:
307 t.Fatalf("runnable 'one' didn't start")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100308 }
309
310 // Kill off two, one should restart.
311 two.die()
Serge Bazanski579015a2021-11-18 13:20:20 +0100312 <-d1
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100313
314 // And one should start running again.
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200315 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100316 select {
317 case <-h1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200318 default:
319 t.Fatalf("runnable 'one' didn't restart")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100320 }
321}
322
323func TestPanic(t *testing.T) {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100324 h1 := make(chan struct{})
325 d1 := make(chan struct{})
326 two := newRC()
327
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100328 ctx, ctxC := context.WithCancel(context.Background())
329 defer ctxC()
Serge Bazanskic7359672020-10-30 16:38:57 +0100330 s := New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100331 err := RunGroup(ctx, map[string]Runnable{
332 "one": runnableBecomesHealthy(h1, d1),
333 "two": two.runnable(),
334 })
335 if err != nil {
336 return err
337 }
338 Signal(ctx, SignalHealthy)
339 Signal(ctx, SignalDone)
340 return nil
341 })
342
343 two.becomeHealthy()
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200344 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100345 // Expect one to start running.
346 select {
347 case <-h1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200348 default:
349 t.Fatalf("runnable 'one' didn't start")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100350 }
351
352 // Kill off two, one should restart.
353 two.panic()
Serge Bazanski579015a2021-11-18 13:20:20 +0100354 <-d1
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100355
356 // And one should start running again.
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200357 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100358 select {
359 case <-h1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200360 default:
361 t.Fatalf("runnable 'one' didn't restart")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100362 }
363}
364
365func TestMultipleLevelFailure(t *testing.T) {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100366 ctx, ctxC := context.WithCancel(context.Background())
367 defer ctxC()
Serge Bazanskic7359672020-10-30 16:38:57 +0100368 New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100369 err := RunGroup(ctx, map[string]Runnable{
370 "one": runnableSpawnsMore(nil, nil, 4),
371 "two": runnableSpawnsMore(nil, nil, 4),
372 })
373 if err != nil {
374 return err
375 }
376 Signal(ctx, SignalHealthy)
377 Signal(ctx, SignalDone)
378 return nil
Serge Bazanski19bb4122020-05-04 17:57:50 +0200379 }, WithPropagatePanic)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100380}
381
382func TestBackoff(t *testing.T) {
383 one := newRC()
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200384
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200385 ctx, ctxC := context.WithTimeout(context.Background(), 20*time.Second)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100386 defer ctxC()
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200387
Serge Bazanskic7359672020-10-30 16:38:57 +0100388 s := New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100389 if err := Run(ctx, "one", one.runnable()); err != nil {
390 return err
391 }
392 Signal(ctx, SignalHealthy)
393 Signal(ctx, SignalDone)
394 return nil
Serge Bazanski19bb4122020-05-04 17:57:50 +0200395 }, WithPropagatePanic)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100396
397 one.becomeHealthy()
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200398 // Die a bunch of times in a row, this brings up the next exponential
399 // backoff to over a second.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100400 for i := 0; i < 4; i += 1 {
401 one.die()
402 one.waitState(rcRunnableStateNew)
403 }
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200404 // Measure how long it takes for the runnable to respawn after a number of
405 // failures
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100406 start := time.Now()
407 one.die()
408 one.becomeHealthy()
409 one.waitState(rcRunnableStateHealthy)
410 taken := time.Since(start)
411 if taken < 1*time.Second {
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200412 t.Errorf("Runnable took %v to restart, wanted at least a second from backoff", taken)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100413 }
414
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200415 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100416 // Now that we've become healthy, die again. Becoming healthy resets the backoff.
417 start = time.Now()
418 one.die()
419 one.becomeHealthy()
420 one.waitState(rcRunnableStateHealthy)
421 taken = time.Since(start)
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200422 if taken > 1*time.Second || taken < 100*time.Millisecond {
423 t.Errorf("Runnable took %v to restart, wanted at least 100ms from backoff and at most 1s from backoff reset", taken)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100424 }
425}
426
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200427// TestResilience throws some curveballs at the supervisor - either programming
428// errors or high load. It then ensures that another runnable is running, and
429// that it restarts on its sibling failure.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100430func TestResilience(t *testing.T) {
431 // request/response channel for testing liveness of the 'one' runnable
432 req := make(chan chan struct{})
433
434 // A runnable that responds on the 'req' channel.
435 one := func(ctx context.Context) error {
436 Signal(ctx, SignalHealthy)
437 for {
438 select {
439 case <-ctx.Done():
440 return ctx.Err()
441 case r := <-req:
442 r <- struct{}{}
443 }
444 }
445 }
446 oneSibling := newRC()
447
448 oneTest := func() {
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200449 timeout := time.NewTicker(1000 * time.Millisecond)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100450 ping := make(chan struct{})
451 req <- ping
452 select {
453 case <-ping:
454 case <-timeout.C:
455 t.Fatalf("one ping response timeout")
456 }
457 timeout.Stop()
458 }
459
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200460 // A nasty runnable that calls Signal with the wrong context (this is a
461 // programming error)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100462 two := func(ctx context.Context) error {
463 Signal(context.TODO(), SignalHealthy)
464 return nil
465 }
466
467 // A nasty runnable that calls Signal wrong (this is a programming error).
468 three := func(ctx context.Context) error {
469 Signal(ctx, SignalDone)
470 return nil
471 }
472
473 // A nasty runnable that runs in a busy loop (this is a programming error).
474 four := func(ctx context.Context) error {
475 for {
476 time.Sleep(0)
477 }
478 }
479
480 // A nasty runnable that keeps creating more runnables.
481 five := func(ctx context.Context) error {
482 i := 1
483 for {
484 err := Run(ctx, fmt.Sprintf("r%d", i), runnableSpawnsMore(nil, nil, 2))
485 if err != nil {
486 return err
487 }
488
489 time.Sleep(100 * time.Millisecond)
490 i += 1
491 }
492 }
493
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100494 ctx, ctxC := context.WithCancel(context.Background())
495 defer ctxC()
Serge Bazanskic7359672020-10-30 16:38:57 +0100496 New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100497 RunGroup(ctx, map[string]Runnable{
498 "one": one,
499 "oneSibling": oneSibling.runnable(),
500 })
501 rs := map[string]Runnable{
502 "two": two, "three": three, "four": four, "five": five,
503 }
504 for k, v := range rs {
505 if err := Run(ctx, k, v); err != nil {
506 return err
507 }
508 }
509 Signal(ctx, SignalHealthy)
510 Signal(ctx, SignalDone)
511 return nil
512 })
513
514 // Five rounds of letting one run, then restarting it.
515 for i := 0; i < 5; i += 1 {
516 oneSibling.becomeHealthy()
517 oneSibling.waitState(rcRunnableStateHealthy)
518
519 // 'one' should work for at least a second.
520 deadline := time.Now().Add(1 * time.Second)
521 for {
522 if time.Now().After(deadline) {
523 break
524 }
525
526 oneTest()
527 }
528
529 // Killing 'oneSibling' should restart one.
530 oneSibling.panic()
531 }
532 // Make sure 'one' is still okay.
533 oneTest()
534}
535
536func ExampleNew() {
537 // Minimal runnable that is immediately done.
538 childC := make(chan struct{})
539 child := func(ctx context.Context) error {
540 Signal(ctx, SignalHealthy)
541 close(childC)
542 Signal(ctx, SignalDone)
543 return nil
544 }
545
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100546 // Start a supervision tree with a root runnable.
547 ctx, ctxC := context.WithCancel(context.Background())
548 defer ctxC()
Serge Bazanskic7359672020-10-30 16:38:57 +0100549 New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100550 err := Run(ctx, "child", child)
551 if err != nil {
552 return fmt.Errorf("could not run 'child': %w", err)
553 }
554 Signal(ctx, SignalHealthy)
555
556 t := time.NewTicker(time.Second)
557 defer t.Stop()
558
559 // Do something in the background, and exit on context cancel.
560 for {
561 select {
562 case <-t.C:
563 fmt.Printf("tick!")
564 case <-ctx.Done():
565 return ctx.Err()
566 }
567 }
568 })
569
570 // root.child will close this channel.
571 <-childC
572}