osbase/supervisor: restart DONE nodes when needed Nodes in DONE state are normally not restarted. However, if a sibling of the node dies, then the node in DONE state and all its children are canceled and need to be restarted. Previously, this did not happen, so the newly added test got stuck waiting forever for the inner runnable to start again. Change-Id: I8c437937d21edb822985ec6fa3f36425e12ff218 Reviewed-on: https://review.monogon.dev/c/monogon/+/3716 Tested-by: Jenkins CI Reviewed-by: Lorenz Brun <lorenz@monogon.tech>

commit: fce7c765ace0ae7058af9498e228eca30dd4695c [log] [tgz]
author: Jan Schär <jan@monogon.tech> Thu Dec 19 14:07:24 2024 +0100
committer: Jan Schär <jan@monogon.tech> Mon Dec 23 10:13:43 2024 +0000
tree: 4d331860270a59969d0f917a72e87e7dde74b9a6
parent: 08c1c72246da4f8f18dfb7e94a1da813b094c7a4 [diff]
diff --git a/osbase/supervisor/supervisor_processor.go b/osbase/supervisor/supervisor_processor.go
index 667b2ab..a65a364 100644
--- a/osbase/supervisor/supervisor_processor.go
+++ b/osbase/supervisor/supervisor_processor.go

@@ -468,8 +468,10 @@
 		cur := queue[0]
 		queue = queue[1:]
 
-		// If this node is DEAD or CANCELED it should be restarted.
-		if cur.state == NodeStateDead || cur.state == NodeStateCanceled {
+		// If this node's context is canceled and it has exited, it should be
+		// restarted.
+		exited := cur.state == NodeStateDead || cur.state == NodeStateCanceled || cur.state == NodeStateDone
+		if cur.ctx.Err() != nil && exited {
 			want[cur.dn()] = true
 		}
 

diff --git a/osbase/supervisor/supervisor_test.go b/osbase/supervisor/supervisor_test.go
index fabfb8b..e426c8b 100644
--- a/osbase/supervisor/supervisor_test.go
+++ b/osbase/supervisor/supervisor_test.go

@@ -497,6 +497,52 @@
 	time.Sleep(20 * time.Millisecond)
 }
 
+// TestCancelDoneSibling tests that a node in state DONE is restarted if it is
+// canceled because a sibling has died.
+func TestCancelDoneSibling(t *testing.T) {
+	innerRunning := make(chan struct{})
+	innerExit := make(chan struct{})
+	failSibling := make(chan struct{})
+
+	ctx, ctxC := context.WithCancel(context.Background())
+	defer ctxC()
+
+	New(ctx, func(ctx context.Context) error {
+		err := RunGroup(ctx, map[string]Runnable{
+			"done": func(ctx context.Context) error {
+				err := Run(ctx, "inner", func(ctx context.Context) error {
+					<-innerRunning
+					<-ctx.Done()
+					<-innerExit
+					return ctx.Err()
+				})
+				if err != nil {
+					return err
+				}
+				Signal(ctx, SignalHealthy)
+				Signal(ctx, SignalDone)
+				return nil
+			},
+			"sibling": func(ctx context.Context) error {
+				<-failSibling
+				return fmt.Errorf("failed")
+			},
+		})
+		if err != nil {
+			return err
+		}
+		Signal(ctx, SignalHealthy)
+		Signal(ctx, SignalDone)
+		return nil
+	}, WithPropagatePanic)
+
+	innerRunning <- struct{}{}
+	failSibling <- struct{}{}
+	// The inner node should exit and start running again.
+	innerExit <- struct{}{}
+	innerRunning <- struct{}{}
+}
+
 // TestResilience throws some curveballs at the supervisor - either programming
 // errors or high load. It then ensures that another runnable is running, and
 // that it restarts on its sibling failure.
commit	fce7c765ace0ae7058af9498e228eca30dd4695c	[log] [tgz]
author	Jan Schär <jan@monogon.tech>	Thu Dec 19 14:07:24 2024 +0100
committer	Jan Schär <jan@monogon.tech>	Mon Dec 23 10:13:43 2024 +0000
tree	4d331860270a59969d0f917a72e87e7dde74b9a6
parent	08c1c72246da4f8f18dfb7e94a1da813b094c7a4 [diff]