blob: db84163beb3472472c391eb963ca5751ef3dede3 [file] [log] [blame]
// Copyright 2020 The Monogon Project Authors.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
16
17package supervisor
18
19import (
20 "context"
21 "fmt"
22 "testing"
23 "time"
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010024)
25
26func runnableBecomesHealthy(healthy, done chan struct{}) Runnable {
27 return func(ctx context.Context) error {
28 Signal(ctx, SignalHealthy)
29
30 go func() {
31 if healthy != nil {
32 healthy <- struct{}{}
33 }
34 }()
35
36 <-ctx.Done()
37
38 go func() {
39 if done != nil {
40 done <- struct{}{}
41 }
42 }()
43
44 return ctx.Err()
45 }
46}
47
48func runnableSpawnsMore(healthy, done chan struct{}, levels int) Runnable {
49 return func(ctx context.Context) error {
50 if levels > 0 {
51 err := RunGroup(ctx, map[string]Runnable{
52 "a": runnableSpawnsMore(nil, nil, levels-1),
53 "b": runnableSpawnsMore(nil, nil, levels-1),
54 })
55 if err != nil {
56 return err
57 }
58 }
59
60 Signal(ctx, SignalHealthy)
61
62 go func() {
63 if healthy != nil {
64 healthy <- struct{}{}
65 }
66 }()
67
68 <-ctx.Done()
69
70 go func() {
71 if done != nil {
72 done <- struct{}{}
73 }
74 }()
75 return ctx.Err()
76 }
77}
78
Serge Bazanski216fe7b2021-05-21 18:36:16 +020079// rc is a Remote Controlled runnable. It is a generic runnable used for
80// testing the supervisor.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010081type rc struct {
82 req chan rcRunnableRequest
83}
84
85type rcRunnableRequest struct {
86 cmd rcRunnableCommand
87 stateC chan rcRunnableState
88}
89
// rcRunnableCommand identifies an action requested from a remote controlled
// runnable.
type rcRunnableCommand int

const (
	// rcRunnableCommandBecomeHealthy makes the runnable signal SignalHealthy.
	rcRunnableCommandBecomeHealthy rcRunnableCommand = iota
	// rcRunnableCommandBecomeDone makes the runnable signal SignalDone.
	rcRunnableCommandBecomeDone
	// rcRunnableCommandDie makes the runnable return an error.
	rcRunnableCommandDie
	// rcRunnableCommandPanic makes the runnable panic.
	rcRunnableCommandPanic
	// rcRunnableCommandState makes the runnable report its current state on
	// the request's stateC channel.
	rcRunnableCommandState
)
99
// rcRunnableState is the state a remote controlled runnable reports about
// itself.
type rcRunnableState int

const (
	// rcRunnableStateNew is the state of a freshly (re)started runnable that
	// has not been told to signal anything yet.
	rcRunnableStateNew rcRunnableState = iota
	// rcRunnableStateHealthy is set after the runnable signaled SignalHealthy.
	rcRunnableStateHealthy
	// rcRunnableStateDone is set after the runnable signaled SignalDone.
	rcRunnableStateDone
)
107
108func (r *rc) becomeHealthy() {
109 r.req <- rcRunnableRequest{cmd: rcRunnableCommandBecomeHealthy}
110}
111
112func (r *rc) becomeDone() {
113 r.req <- rcRunnableRequest{cmd: rcRunnableCommandBecomeDone}
114}
115func (r *rc) die() {
116 r.req <- rcRunnableRequest{cmd: rcRunnableCommandDie}
117}
118
119func (r *rc) panic() {
120 r.req <- rcRunnableRequest{cmd: rcRunnableCommandPanic}
121}
122
123func (r *rc) state() rcRunnableState {
124 c := make(chan rcRunnableState)
125 r.req <- rcRunnableRequest{
126 cmd: rcRunnableCommandState,
127 stateC: c,
128 }
129 return <-c
130}
131
132func (r *rc) waitState(s rcRunnableState) {
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200133 // This is poll based. Making it non-poll based would make the RC runnable
134 // logic a bit more complex for little gain.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100135 for {
136 got := r.state()
137 if got == s {
138 return
139 }
140 time.Sleep(10 * time.Millisecond)
141 }
142}
143
144func newRC() *rc {
145 return &rc{
146 req: make(chan rcRunnableRequest),
147 }
148}
149
150// Remote Controlled Runnable
151func (r *rc) runnable() Runnable {
152 return func(ctx context.Context) error {
153 state := rcRunnableStateNew
154
155 for {
156 select {
157 case <-ctx.Done():
158 return ctx.Err()
159 case r := <-r.req:
160 switch r.cmd {
161 case rcRunnableCommandBecomeHealthy:
162 Signal(ctx, SignalHealthy)
163 state = rcRunnableStateHealthy
164 case rcRunnableCommandBecomeDone:
165 Signal(ctx, SignalDone)
166 state = rcRunnableStateDone
167 case rcRunnableCommandDie:
168 return fmt.Errorf("died on request")
169 case rcRunnableCommandPanic:
170 panic("at the disco")
171 case rcRunnableCommandState:
172 r.stateC <- state
173 }
174 }
175 }
176 }
177}
178
179func TestSimple(t *testing.T) {
180 h1 := make(chan struct{})
181 d1 := make(chan struct{})
182 h2 := make(chan struct{})
183 d2 := make(chan struct{})
184
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100185 ctx, ctxC := context.WithCancel(context.Background())
186 defer ctxC()
Serge Bazanskic7359672020-10-30 16:38:57 +0100187 s := New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100188 err := RunGroup(ctx, map[string]Runnable{
189 "one": runnableBecomesHealthy(h1, d1),
190 "two": runnableBecomesHealthy(h2, d2),
191 })
192 if err != nil {
193 return err
194 }
195 Signal(ctx, SignalHealthy)
196 Signal(ctx, SignalDone)
197 return nil
Serge Bazanski19bb4122020-05-04 17:57:50 +0200198 }, WithPropagatePanic)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100199
200 // Expect both to start running.
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200201 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100202 select {
203 case <-h1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200204 default:
205 t.Fatalf("runnable 'one' didn't start")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100206 }
207 select {
208 case <-h2:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200209 default:
210 t.Fatalf("runnable 'one' didn't start")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100211 }
212}
213
214func TestSimpleFailure(t *testing.T) {
215 h1 := make(chan struct{})
216 d1 := make(chan struct{})
217 two := newRC()
218
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200219 ctx, ctxC := context.WithTimeout(context.Background(), 10*time.Second)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100220 defer ctxC()
Serge Bazanskic7359672020-10-30 16:38:57 +0100221 s := New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100222 err := RunGroup(ctx, map[string]Runnable{
223 "one": runnableBecomesHealthy(h1, d1),
224 "two": two.runnable(),
225 })
226 if err != nil {
227 return err
228 }
229 Signal(ctx, SignalHealthy)
230 Signal(ctx, SignalDone)
231 return nil
Serge Bazanski19bb4122020-05-04 17:57:50 +0200232 }, WithPropagatePanic)
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200233 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100234
235 two.becomeHealthy()
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200236 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100237 // Expect one to start running.
238 select {
239 case <-h1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200240 default:
241 t.Fatalf("runnable 'one' didn't start")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100242 }
243
244 // Kill off two, one should restart.
245 two.die()
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200246 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100247 select {
248 case <-d1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200249 default:
250 t.Fatalf("runnable 'one' didn't acknowledge cancel")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100251 }
252
253 // And one should start running again.
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200254 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100255 select {
256 case <-h1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200257 default:
258 t.Fatalf("runnable 'one' didn't restart")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100259 }
260}
261
262func TestDeepFailure(t *testing.T) {
263 h1 := make(chan struct{})
264 d1 := make(chan struct{})
265 two := newRC()
266
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200267 ctx, ctxC := context.WithTimeout(context.Background(), 10*time.Second)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100268 defer ctxC()
Serge Bazanskic7359672020-10-30 16:38:57 +0100269 s := New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100270 err := RunGroup(ctx, map[string]Runnable{
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200271 "one": runnableSpawnsMore(h1, d1, 5),
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100272 "two": two.runnable(),
273 })
274 if err != nil {
275 return err
276 }
277 Signal(ctx, SignalHealthy)
278 Signal(ctx, SignalDone)
279 return nil
Serge Bazanski19bb4122020-05-04 17:57:50 +0200280 }, WithPropagatePanic)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100281
282 two.becomeHealthy()
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200283 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100284 // Expect one to start running.
285 select {
286 case <-h1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200287 default:
288 t.Fatalf("runnable 'one' didn't start")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100289 }
290
291 // Kill off two, one should restart.
292 two.die()
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200293 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100294 select {
295 case <-d1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200296 default:
297 t.Fatalf("runnable 'one' didn't acknowledge cancel")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100298 }
299
300 // And one should start running again.
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200301 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100302 select {
303 case <-h1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200304 default:
305 t.Fatalf("runnable 'one' didn't restart")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100306 }
307}
308
309func TestPanic(t *testing.T) {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100310 h1 := make(chan struct{})
311 d1 := make(chan struct{})
312 two := newRC()
313
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100314 ctx, ctxC := context.WithCancel(context.Background())
315 defer ctxC()
Serge Bazanskic7359672020-10-30 16:38:57 +0100316 s := New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100317 err := RunGroup(ctx, map[string]Runnable{
318 "one": runnableBecomesHealthy(h1, d1),
319 "two": two.runnable(),
320 })
321 if err != nil {
322 return err
323 }
324 Signal(ctx, SignalHealthy)
325 Signal(ctx, SignalDone)
326 return nil
327 })
328
329 two.becomeHealthy()
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200330 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100331 // Expect one to start running.
332 select {
333 case <-h1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200334 default:
335 t.Fatalf("runnable 'one' didn't start")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100336 }
337
338 // Kill off two, one should restart.
339 two.panic()
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200340 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100341 select {
342 case <-d1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200343 default:
344 t.Fatalf("runnable 'one' didn't acknowledge cancel")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100345 }
346
347 // And one should start running again.
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200348 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100349 select {
350 case <-h1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200351 default:
352 t.Fatalf("runnable 'one' didn't restart")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100353 }
354}
355
356func TestMultipleLevelFailure(t *testing.T) {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100357 ctx, ctxC := context.WithCancel(context.Background())
358 defer ctxC()
Serge Bazanskic7359672020-10-30 16:38:57 +0100359 New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100360 err := RunGroup(ctx, map[string]Runnable{
361 "one": runnableSpawnsMore(nil, nil, 4),
362 "two": runnableSpawnsMore(nil, nil, 4),
363 })
364 if err != nil {
365 return err
366 }
367 Signal(ctx, SignalHealthy)
368 Signal(ctx, SignalDone)
369 return nil
Serge Bazanski19bb4122020-05-04 17:57:50 +0200370 }, WithPropagatePanic)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100371}
372
373func TestBackoff(t *testing.T) {
374 one := newRC()
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200375
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200376 ctx, ctxC := context.WithTimeout(context.Background(), 20*time.Second)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100377 defer ctxC()
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200378
Serge Bazanskic7359672020-10-30 16:38:57 +0100379 s := New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100380 if err := Run(ctx, "one", one.runnable()); err != nil {
381 return err
382 }
383 Signal(ctx, SignalHealthy)
384 Signal(ctx, SignalDone)
385 return nil
Serge Bazanski19bb4122020-05-04 17:57:50 +0200386 }, WithPropagatePanic)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100387
388 one.becomeHealthy()
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200389 // Die a bunch of times in a row, this brings up the next exponential
390 // backoff to over a second.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100391 for i := 0; i < 4; i += 1 {
392 one.die()
393 one.waitState(rcRunnableStateNew)
394 }
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200395 // Measure how long it takes for the runnable to respawn after a number of
396 // failures
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100397 start := time.Now()
398 one.die()
399 one.becomeHealthy()
400 one.waitState(rcRunnableStateHealthy)
401 taken := time.Since(start)
402 if taken < 1*time.Second {
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200403 t.Errorf("Runnable took %v to restart, wanted at least a second from backoff", taken)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100404 }
405
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200406 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100407 // Now that we've become healthy, die again. Becoming healthy resets the backoff.
408 start = time.Now()
409 one.die()
410 one.becomeHealthy()
411 one.waitState(rcRunnableStateHealthy)
412 taken = time.Since(start)
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200413 if taken > 1*time.Second || taken < 100*time.Millisecond {
414 t.Errorf("Runnable took %v to restart, wanted at least 100ms from backoff and at most 1s from backoff reset", taken)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100415 }
416}
417
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200418// TestResilience throws some curveballs at the supervisor - either programming
419// errors or high load. It then ensures that another runnable is running, and
420// that it restarts on its sibling failure.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100421func TestResilience(t *testing.T) {
422 // request/response channel for testing liveness of the 'one' runnable
423 req := make(chan chan struct{})
424
425 // A runnable that responds on the 'req' channel.
426 one := func(ctx context.Context) error {
427 Signal(ctx, SignalHealthy)
428 for {
429 select {
430 case <-ctx.Done():
431 return ctx.Err()
432 case r := <-req:
433 r <- struct{}{}
434 }
435 }
436 }
437 oneSibling := newRC()
438
439 oneTest := func() {
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200440 timeout := time.NewTicker(1000 * time.Millisecond)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100441 ping := make(chan struct{})
442 req <- ping
443 select {
444 case <-ping:
445 case <-timeout.C:
446 t.Fatalf("one ping response timeout")
447 }
448 timeout.Stop()
449 }
450
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200451 // A nasty runnable that calls Signal with the wrong context (this is a
452 // programming error)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100453 two := func(ctx context.Context) error {
454 Signal(context.TODO(), SignalHealthy)
455 return nil
456 }
457
458 // A nasty runnable that calls Signal wrong (this is a programming error).
459 three := func(ctx context.Context) error {
460 Signal(ctx, SignalDone)
461 return nil
462 }
463
464 // A nasty runnable that runs in a busy loop (this is a programming error).
465 four := func(ctx context.Context) error {
466 for {
467 time.Sleep(0)
468 }
469 }
470
471 // A nasty runnable that keeps creating more runnables.
472 five := func(ctx context.Context) error {
473 i := 1
474 for {
475 err := Run(ctx, fmt.Sprintf("r%d", i), runnableSpawnsMore(nil, nil, 2))
476 if err != nil {
477 return err
478 }
479
480 time.Sleep(100 * time.Millisecond)
481 i += 1
482 }
483 }
484
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100485 ctx, ctxC := context.WithCancel(context.Background())
486 defer ctxC()
Serge Bazanskic7359672020-10-30 16:38:57 +0100487 New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100488 RunGroup(ctx, map[string]Runnable{
489 "one": one,
490 "oneSibling": oneSibling.runnable(),
491 })
492 rs := map[string]Runnable{
493 "two": two, "three": three, "four": four, "five": five,
494 }
495 for k, v := range rs {
496 if err := Run(ctx, k, v); err != nil {
497 return err
498 }
499 }
500 Signal(ctx, SignalHealthy)
501 Signal(ctx, SignalDone)
502 return nil
503 })
504
505 // Five rounds of letting one run, then restarting it.
506 for i := 0; i < 5; i += 1 {
507 oneSibling.becomeHealthy()
508 oneSibling.waitState(rcRunnableStateHealthy)
509
510 // 'one' should work for at least a second.
511 deadline := time.Now().Add(1 * time.Second)
512 for {
513 if time.Now().After(deadline) {
514 break
515 }
516
517 oneTest()
518 }
519
520 // Killing 'oneSibling' should restart one.
521 oneSibling.panic()
522 }
523 // Make sure 'one' is still okay.
524 oneTest()
525}
526
527func ExampleNew() {
528 // Minimal runnable that is immediately done.
529 childC := make(chan struct{})
530 child := func(ctx context.Context) error {
531 Signal(ctx, SignalHealthy)
532 close(childC)
533 Signal(ctx, SignalDone)
534 return nil
535 }
536
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100537 // Start a supervision tree with a root runnable.
538 ctx, ctxC := context.WithCancel(context.Background())
539 defer ctxC()
Serge Bazanskic7359672020-10-30 16:38:57 +0100540 New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100541 err := Run(ctx, "child", child)
542 if err != nil {
543 return fmt.Errorf("could not run 'child': %w", err)
544 }
545 Signal(ctx, SignalHealthy)
546
547 t := time.NewTicker(time.Second)
548 defer t.Stop()
549
550 // Do something in the background, and exit on context cancel.
551 for {
552 select {
553 case <-t.C:
554 fmt.Printf("tick!")
555 case <-ctx.Done():
556 return ctx.Err()
557 }
558 }
559 })
560
561 // root.child will close this channel.
562 <-childC
563}