osbase/supervisor: make restart sleep cancelable
Previously, if a runnable died and its parent died immediately
afterwards, the parent was only restarted after waiting for the child's
restart backoff sleep and then for the parent's own restart backoff
sleep. Additionally, the child was restarted with an already canceled
context.
Now the restart sleep can be canceled. If it is canceled, the runnable
goes directly to the CANCELED state without being started first.
Change-Id: Ie986db680d4df12d590881d1a7e468c741a732d9
Reviewed-on: https://review.monogon.dev/c/monogon/+/3714
Tested-by: Jenkins CI
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/osbase/supervisor/supervisor_test.go b/osbase/supervisor/supervisor_test.go
index feb5510..3b81291 100644
--- a/osbase/supervisor/supervisor_test.go
+++ b/osbase/supervisor/supervisor_test.go
@@ -426,6 +426,46 @@
}
}
+// TestCancelRestart fails a runnable and then, before that runnable's restart
+// backoff has expired, fails its parent. This should cancel the pending restart.
+func TestCancelRestart(t *testing.T) {
+ startedOuter := make(chan struct{})
+ failInner := make(chan struct{})
+ failOuter := make(chan struct{})
+
+ ctx, ctxC := context.WithCancel(context.Background())
+ defer ctxC()
+
+ New(ctx, func(ctx context.Context) error {
+ <-startedOuter // Each (re)start of the outer runnable waits for the test.
+ err := Run(ctx, "inner", func(ctx context.Context) error {
+ <-failInner // Block until the test triggers the inner failure.
+ return fmt.Errorf("failed")
+ })
+ if err != nil {
+ return err
+ }
+ <-failOuter // Block until the test triggers the outer failure.
+ return fmt.Errorf("failed")
+ }, WithPropagatePanic)
+
+ startedOuter <- struct{}{}
+ failInner <- struct{}{}
+ time.Sleep(10 * time.Millisecond) // Presumably lets the inner failure be handled.
+ // Fail the outer runnable before the inner runnable has been restarted.
+ failOuter <- struct{}{}
+
+ start := time.Now()
+ startedOuter <- struct{}{} // Completes once the outer runnable runs again.
+ taken := time.Since(start)
+ // With the default backoff parameters, the initial backoff time is
+ // 0.5s +- 0.25s because of randomization. If the inner restart sleep is not
+ // canceled, the outer restart waits for both backoffs, taking twice as long.
+ if taken > 1*time.Second {
+ t.Errorf("Runnable took %v to restart, wanted at most 1s", taken)
+ }
+}
+
// TestResilience throws some curveballs at the supervisor - either programming
// errors or high load. It then ensures that another runnable is running, and
// that it restarts on its sibling failure.