osbase/supervisor: restart DONE nodes when needed
Nodes in DONE state are normally not restarted. However, if a sibling of
the node dies, then the node in DONE state and all its children are
canceled and need to be restarted. Previously, this did not happen, so
the newly added test got stuck waiting forever for the inner runnable to
start again.
Change-Id: I8c437937d21edb822985ec6fa3f36425e12ff218
Reviewed-on: https://review.monogon.dev/c/monogon/+/3716
Tested-by: Jenkins CI
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/osbase/supervisor/supervisor_processor.go b/osbase/supervisor/supervisor_processor.go
index 667b2ab..a65a364 100644
--- a/osbase/supervisor/supervisor_processor.go
+++ b/osbase/supervisor/supervisor_processor.go
@@ -468,8 +468,10 @@
cur := queue[0]
queue = queue[1:]
- // If this node is DEAD or CANCELED it should be restarted.
- if cur.state == NodeStateDead || cur.state == NodeStateCanceled {
+ // If this node's context is canceled and it has exited, it should be
+ // restarted.
+ exited := cur.state == NodeStateDead || cur.state == NodeStateCanceled || cur.state == NodeStateDone
+ if cur.ctx.Err() != nil && exited {
want[cur.dn()] = true
}
diff --git a/osbase/supervisor/supervisor_test.go b/osbase/supervisor/supervisor_test.go
index fabfb8b..e426c8b 100644
--- a/osbase/supervisor/supervisor_test.go
+++ b/osbase/supervisor/supervisor_test.go
@@ -497,6 +497,52 @@
time.Sleep(20 * time.Millisecond)
}
+// TestCancelDoneSibling tests that a node in state DONE is restarted if it is
+// canceled because a sibling has died.
+func TestCancelDoneSibling(t *testing.T) {
+ innerRunning := make(chan struct{})
+ innerExit := make(chan struct{})
+ failSibling := make(chan struct{})
+
+ ctx, ctxC := context.WithCancel(context.Background())
+ defer ctxC()
+
+ New(ctx, func(ctx context.Context) error {
+ err := RunGroup(ctx, map[string]Runnable{
+ "done": func(ctx context.Context) error {
+ err := Run(ctx, "inner", func(ctx context.Context) error {
+ <-innerRunning
+ <-ctx.Done()
+ <-innerExit
+ return ctx.Err()
+ })
+ if err != nil {
+ return err
+ }
+ Signal(ctx, SignalHealthy)
+ Signal(ctx, SignalDone)
+ return nil
+ },
+ "sibling": func(ctx context.Context) error {
+ <-failSibling
+ return fmt.Errorf("failed")
+ },
+ })
+ if err != nil {
+ return err
+ }
+ Signal(ctx, SignalHealthy)
+ Signal(ctx, SignalDone)
+ return nil
+ }, WithPropagatePanic)
+
+ innerRunning <- struct{}{}
+ failSibling <- struct{}{}
+ // The inner node should exit and start running again.
+ innerExit <- struct{}{}
+ innerRunning <- struct{}{}
+}
+
// TestResilience throws some curveballs at the supervisor - either programming
// errors or high load. It then ensures that another runnable is running, and
// that it restarts on its sibling failure.