blob: 9c7bdb76ecbd266e626c63464114695345390197 [file] [log] [blame]
// Copyright 2020 The Monogon Project Authors.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
16
17package supervisor
18
19import (
20 "context"
21 "fmt"
22 "testing"
23 "time"
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010024)
25
26func runnableBecomesHealthy(healthy, done chan struct{}) Runnable {
27 return func(ctx context.Context) error {
28 Signal(ctx, SignalHealthy)
29
30 go func() {
31 if healthy != nil {
32 healthy <- struct{}{}
33 }
34 }()
35
36 <-ctx.Done()
37
38 go func() {
39 if done != nil {
40 done <- struct{}{}
41 }
42 }()
43
44 return ctx.Err()
45 }
46}
47
48func runnableSpawnsMore(healthy, done chan struct{}, levels int) Runnable {
49 return func(ctx context.Context) error {
50 if levels > 0 {
51 err := RunGroup(ctx, map[string]Runnable{
52 "a": runnableSpawnsMore(nil, nil, levels-1),
53 "b": runnableSpawnsMore(nil, nil, levels-1),
54 })
55 if err != nil {
56 return err
57 }
58 }
59
60 Signal(ctx, SignalHealthy)
61
62 go func() {
63 if healthy != nil {
64 healthy <- struct{}{}
65 }
66 }()
67
68 <-ctx.Done()
69
70 go func() {
71 if done != nil {
72 done <- struct{}{}
73 }
74 }()
75 return ctx.Err()
76 }
77}
78
79// rc is a Remote Controlled runnable. It is a generic runnable used for testing the supervisor.
80type rc struct {
81 req chan rcRunnableRequest
82}
83
84type rcRunnableRequest struct {
85 cmd rcRunnableCommand
86 stateC chan rcRunnableState
87}
88
// rcRunnableCommand enumerates the commands understood by an rc runnable.
type rcRunnableCommand int

const (
	// rcRunnableCommandBecomeHealthy makes the runnable signal SignalHealthy.
	rcRunnableCommandBecomeHealthy rcRunnableCommand = iota
	// rcRunnableCommandBecomeDone makes the runnable signal SignalDone.
	rcRunnableCommandBecomeDone
	// rcRunnableCommandDie makes the runnable return an error.
	rcRunnableCommandDie
	// rcRunnableCommandPanic makes the runnable panic.
	rcRunnableCommandPanic
	// rcRunnableCommandState makes the runnable report its current state on the request's stateC.
	rcRunnableCommandState
)
98
// rcRunnableState is the state an rc runnable reports about itself.
type rcRunnableState int

const (
	// rcRunnableStateNew is the state of a freshly started runnable that has not signaled anything yet.
	rcRunnableStateNew rcRunnableState = iota
	// rcRunnableStateHealthy is reported after the runnable was told to become healthy.
	rcRunnableStateHealthy
	// rcRunnableStateDone is reported after the runnable was told to become done.
	rcRunnableStateDone
)
106
107func (r *rc) becomeHealthy() {
108 r.req <- rcRunnableRequest{cmd: rcRunnableCommandBecomeHealthy}
109}
110
111func (r *rc) becomeDone() {
112 r.req <- rcRunnableRequest{cmd: rcRunnableCommandBecomeDone}
113}
114func (r *rc) die() {
115 r.req <- rcRunnableRequest{cmd: rcRunnableCommandDie}
116}
117
118func (r *rc) panic() {
119 r.req <- rcRunnableRequest{cmd: rcRunnableCommandPanic}
120}
121
122func (r *rc) state() rcRunnableState {
123 c := make(chan rcRunnableState)
124 r.req <- rcRunnableRequest{
125 cmd: rcRunnableCommandState,
126 stateC: c,
127 }
128 return <-c
129}
130
131func (r *rc) waitState(s rcRunnableState) {
132 // This is poll based. Making it non-poll based would make the RC runnable logic a bit more complex for little gain.
133 for {
134 got := r.state()
135 if got == s {
136 return
137 }
138 time.Sleep(10 * time.Millisecond)
139 }
140}
141
142func newRC() *rc {
143 return &rc{
144 req: make(chan rcRunnableRequest),
145 }
146}
147
148// Remote Controlled Runnable
149func (r *rc) runnable() Runnable {
150 return func(ctx context.Context) error {
151 state := rcRunnableStateNew
152
153 for {
154 select {
155 case <-ctx.Done():
156 return ctx.Err()
157 case r := <-r.req:
158 switch r.cmd {
159 case rcRunnableCommandBecomeHealthy:
160 Signal(ctx, SignalHealthy)
161 state = rcRunnableStateHealthy
162 case rcRunnableCommandBecomeDone:
163 Signal(ctx, SignalDone)
164 state = rcRunnableStateDone
165 case rcRunnableCommandDie:
166 return fmt.Errorf("died on request")
167 case rcRunnableCommandPanic:
168 panic("at the disco")
169 case rcRunnableCommandState:
170 r.stateC <- state
171 }
172 }
173 }
174 }
175}
176
177func TestSimple(t *testing.T) {
178 h1 := make(chan struct{})
179 d1 := make(chan struct{})
180 h2 := make(chan struct{})
181 d2 := make(chan struct{})
182
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100183 ctx, ctxC := context.WithCancel(context.Background())
184 defer ctxC()
Serge Bazanskic7359672020-10-30 16:38:57 +0100185 s := New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100186 err := RunGroup(ctx, map[string]Runnable{
187 "one": runnableBecomesHealthy(h1, d1),
188 "two": runnableBecomesHealthy(h2, d2),
189 })
190 if err != nil {
191 return err
192 }
193 Signal(ctx, SignalHealthy)
194 Signal(ctx, SignalDone)
195 return nil
Serge Bazanski19bb4122020-05-04 17:57:50 +0200196 }, WithPropagatePanic)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100197
198 // Expect both to start running.
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200199 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100200 select {
201 case <-h1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200202 default:
203 t.Fatalf("runnable 'one' didn't start")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100204 }
205 select {
206 case <-h2:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200207 default:
208 t.Fatalf("runnable 'one' didn't start")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100209 }
210}
211
212func TestSimpleFailure(t *testing.T) {
213 h1 := make(chan struct{})
214 d1 := make(chan struct{})
215 two := newRC()
216
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200217 ctx, ctxC := context.WithTimeout(context.Background(), 10*time.Second)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100218 defer ctxC()
Serge Bazanskic7359672020-10-30 16:38:57 +0100219 s := New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100220 err := RunGroup(ctx, map[string]Runnable{
221 "one": runnableBecomesHealthy(h1, d1),
222 "two": two.runnable(),
223 })
224 if err != nil {
225 return err
226 }
227 Signal(ctx, SignalHealthy)
228 Signal(ctx, SignalDone)
229 return nil
Serge Bazanski19bb4122020-05-04 17:57:50 +0200230 }, WithPropagatePanic)
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200231 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100232
233 two.becomeHealthy()
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200234 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100235 // Expect one to start running.
236 select {
237 case <-h1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200238 default:
239 t.Fatalf("runnable 'one' didn't start")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100240 }
241
242 // Kill off two, one should restart.
243 two.die()
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200244 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100245 select {
246 case <-d1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200247 default:
248 t.Fatalf("runnable 'one' didn't acknowledge cancel")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100249 }
250
251 // And one should start running again.
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200252 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100253 select {
254 case <-h1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200255 default:
256 t.Fatalf("runnable 'one' didn't restart")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100257 }
258}
259
260func TestDeepFailure(t *testing.T) {
261 h1 := make(chan struct{})
262 d1 := make(chan struct{})
263 two := newRC()
264
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200265 ctx, ctxC := context.WithTimeout(context.Background(), 10*time.Second)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100266 defer ctxC()
Serge Bazanskic7359672020-10-30 16:38:57 +0100267 s := New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100268 err := RunGroup(ctx, map[string]Runnable{
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200269 "one": runnableSpawnsMore(h1, d1, 5),
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100270 "two": two.runnable(),
271 })
272 if err != nil {
273 return err
274 }
275 Signal(ctx, SignalHealthy)
276 Signal(ctx, SignalDone)
277 return nil
Serge Bazanski19bb4122020-05-04 17:57:50 +0200278 }, WithPropagatePanic)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100279
280 two.becomeHealthy()
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200281 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100282 // Expect one to start running.
283 select {
284 case <-h1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200285 default:
286 t.Fatalf("runnable 'one' didn't start")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100287 }
288
289 // Kill off two, one should restart.
290 two.die()
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200291 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100292 select {
293 case <-d1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200294 default:
295 t.Fatalf("runnable 'one' didn't acknowledge cancel")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100296 }
297
298 // And one should start running again.
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200299 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100300 select {
301 case <-h1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200302 default:
303 t.Fatalf("runnable 'one' didn't restart")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100304 }
305}
306
307func TestPanic(t *testing.T) {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100308 h1 := make(chan struct{})
309 d1 := make(chan struct{})
310 two := newRC()
311
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100312 ctx, ctxC := context.WithCancel(context.Background())
313 defer ctxC()
Serge Bazanskic7359672020-10-30 16:38:57 +0100314 s := New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100315 err := RunGroup(ctx, map[string]Runnable{
316 "one": runnableBecomesHealthy(h1, d1),
317 "two": two.runnable(),
318 })
319 if err != nil {
320 return err
321 }
322 Signal(ctx, SignalHealthy)
323 Signal(ctx, SignalDone)
324 return nil
325 })
326
327 two.becomeHealthy()
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200328 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100329 // Expect one to start running.
330 select {
331 case <-h1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200332 default:
333 t.Fatalf("runnable 'one' didn't start")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100334 }
335
336 // Kill off two, one should restart.
337 two.panic()
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200338 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100339 select {
340 case <-d1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200341 default:
342 t.Fatalf("runnable 'one' didn't acknowledge cancel")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100343 }
344
345 // And one should start running again.
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200346 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100347 select {
348 case <-h1:
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200349 default:
350 t.Fatalf("runnable 'one' didn't restart")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100351 }
352}
353
354func TestMultipleLevelFailure(t *testing.T) {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100355 ctx, ctxC := context.WithCancel(context.Background())
356 defer ctxC()
Serge Bazanskic7359672020-10-30 16:38:57 +0100357 New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100358 err := RunGroup(ctx, map[string]Runnable{
359 "one": runnableSpawnsMore(nil, nil, 4),
360 "two": runnableSpawnsMore(nil, nil, 4),
361 })
362 if err != nil {
363 return err
364 }
365 Signal(ctx, SignalHealthy)
366 Signal(ctx, SignalDone)
367 return nil
Serge Bazanski19bb4122020-05-04 17:57:50 +0200368 }, WithPropagatePanic)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100369}
370
371func TestBackoff(t *testing.T) {
372 one := newRC()
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200373
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200374 ctx, ctxC := context.WithTimeout(context.Background(), 20*time.Second)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100375 defer ctxC()
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200376
Serge Bazanskic7359672020-10-30 16:38:57 +0100377 s := New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100378 if err := Run(ctx, "one", one.runnable()); err != nil {
379 return err
380 }
381 Signal(ctx, SignalHealthy)
382 Signal(ctx, SignalDone)
383 return nil
Serge Bazanski19bb4122020-05-04 17:57:50 +0200384 }, WithPropagatePanic)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100385
386 one.becomeHealthy()
387 // Die a bunch of times in a row, this brings up the next exponential backoff to over a second.
388 for i := 0; i < 4; i += 1 {
389 one.die()
390 one.waitState(rcRunnableStateNew)
391 }
392 // Measure how long it takes for the runnable to respawn after a number of failures
393 start := time.Now()
394 one.die()
395 one.becomeHealthy()
396 one.waitState(rcRunnableStateHealthy)
397 taken := time.Since(start)
398 if taken < 1*time.Second {
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200399 t.Errorf("Runnable took %v to restart, wanted at least a second from backoff", taken)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100400 }
401
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200402 s.waitSettleError(ctx, t)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100403 // Now that we've become healthy, die again. Becoming healthy resets the backoff.
404 start = time.Now()
405 one.die()
406 one.becomeHealthy()
407 one.waitState(rcRunnableStateHealthy)
408 taken = time.Since(start)
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200409 if taken > 1*time.Second || taken < 100*time.Millisecond {
410 t.Errorf("Runnable took %v to restart, wanted at least 100ms from backoff and at most 1s from backoff reset", taken)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100411 }
412}
413
414// TestResilience throws some curveballs at the supervisor - either programming errors or high load. It then ensures
415// that another runnable is running, and that it restarts on its sibling failure.
416func TestResilience(t *testing.T) {
417 // request/response channel for testing liveness of the 'one' runnable
418 req := make(chan chan struct{})
419
420 // A runnable that responds on the 'req' channel.
421 one := func(ctx context.Context) error {
422 Signal(ctx, SignalHealthy)
423 for {
424 select {
425 case <-ctx.Done():
426 return ctx.Err()
427 case r := <-req:
428 r <- struct{}{}
429 }
430 }
431 }
432 oneSibling := newRC()
433
434 oneTest := func() {
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200435 timeout := time.NewTicker(1000 * time.Millisecond)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100436 ping := make(chan struct{})
437 req <- ping
438 select {
439 case <-ping:
440 case <-timeout.C:
441 t.Fatalf("one ping response timeout")
442 }
443 timeout.Stop()
444 }
445
446 // A nasty runnable that calls Signal with the wrong context (this is a programming error)
447 two := func(ctx context.Context) error {
448 Signal(context.TODO(), SignalHealthy)
449 return nil
450 }
451
452 // A nasty runnable that calls Signal wrong (this is a programming error).
453 three := func(ctx context.Context) error {
454 Signal(ctx, SignalDone)
455 return nil
456 }
457
458 // A nasty runnable that runs in a busy loop (this is a programming error).
459 four := func(ctx context.Context) error {
460 for {
461 time.Sleep(0)
462 }
463 }
464
465 // A nasty runnable that keeps creating more runnables.
466 five := func(ctx context.Context) error {
467 i := 1
468 for {
469 err := Run(ctx, fmt.Sprintf("r%d", i), runnableSpawnsMore(nil, nil, 2))
470 if err != nil {
471 return err
472 }
473
474 time.Sleep(100 * time.Millisecond)
475 i += 1
476 }
477 }
478
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100479 ctx, ctxC := context.WithCancel(context.Background())
480 defer ctxC()
Serge Bazanskic7359672020-10-30 16:38:57 +0100481 New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100482 RunGroup(ctx, map[string]Runnable{
483 "one": one,
484 "oneSibling": oneSibling.runnable(),
485 })
486 rs := map[string]Runnable{
487 "two": two, "three": three, "four": four, "five": five,
488 }
489 for k, v := range rs {
490 if err := Run(ctx, k, v); err != nil {
491 return err
492 }
493 }
494 Signal(ctx, SignalHealthy)
495 Signal(ctx, SignalDone)
496 return nil
497 })
498
499 // Five rounds of letting one run, then restarting it.
500 for i := 0; i < 5; i += 1 {
501 oneSibling.becomeHealthy()
502 oneSibling.waitState(rcRunnableStateHealthy)
503
504 // 'one' should work for at least a second.
505 deadline := time.Now().Add(1 * time.Second)
506 for {
507 if time.Now().After(deadline) {
508 break
509 }
510
511 oneTest()
512 }
513
514 // Killing 'oneSibling' should restart one.
515 oneSibling.panic()
516 }
517 // Make sure 'one' is still okay.
518 oneTest()
519}
520
521func ExampleNew() {
522 // Minimal runnable that is immediately done.
523 childC := make(chan struct{})
524 child := func(ctx context.Context) error {
525 Signal(ctx, SignalHealthy)
526 close(childC)
527 Signal(ctx, SignalDone)
528 return nil
529 }
530
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100531 // Start a supervision tree with a root runnable.
532 ctx, ctxC := context.WithCancel(context.Background())
533 defer ctxC()
Serge Bazanskic7359672020-10-30 16:38:57 +0100534 New(ctx, func(ctx context.Context) error {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100535 err := Run(ctx, "child", child)
536 if err != nil {
537 return fmt.Errorf("could not run 'child': %w", err)
538 }
539 Signal(ctx, SignalHealthy)
540
541 t := time.NewTicker(time.Second)
542 defer t.Stop()
543
544 // Do something in the background, and exit on context cancel.
545 for {
546 select {
547 case <-t.C:
548 fmt.Printf("tick!")
549 case <-ctx.Done():
550 return ctx.Err()
551 }
552 }
553 })
554
555 // root.child will close this channel.
556 <-childC
557}