osbase/supervisor: only enter DONE state after runnable returns

Previously, a node was marked DONE as soon as it signaled DONE. If a GC run happens between the time the runnable signals DONE and the time the runnable's exit is processed, this causes problems. The test which I added panics without the other changes:

    panic: could not find [inner] (root.inner) in root (NODE_STATE_NEW)

If the delay is long enough that the inner node has already restarted, then this could even end up with multiple instances of the same runnable running simultaneously.

I fixed this problem by only entering the DONE state after the runnable has returned.

Change-Id: If73b73f104c4cc204bce4374f4ba5f7e163e4a0b
Reviewed-on: https://review.monogon.dev/c/monogon/+/3715
Tested-by: Jenkins CI
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>

commit: 08c1c72246da4f8f18dfb7e94a1da813b094c7a4 [log] [tgz]
author: Jan Schär <jan@monogon.tech> Thu Dec 19 12:03:17 2024 +0100
committer: Jan Schär <jan@monogon.tech> Mon Dec 23 10:13:43 2024 +0000
tree: e8dd5a474edb79f2658b951e64b993b38c46b210
parent: 65602097b716674316a318d5594a8b2b0e52d239 [diff]
diff --git a/osbase/supervisor/supervisor_node.go b/osbase/supervisor/supervisor_node.go
index 272b650..4f4d180 100644
--- a/osbase/supervisor/supervisor_node.go
+++ b/osbase/supervisor/supervisor_node.go

@@ -56,6 +56,10 @@
 	// The current state of the runnable in this node.
 	state NodeState
 
+	// signaledDone is set when the runnable has signaled Done. The transition to
+	// DONE state only happens after the runnable returns.
+	signaledDone bool
+
 	// Backoff used to keep runnables from being restarted too fast.
 	bo *backoff.ExponentialBackOff
 
@@ -211,6 +215,7 @@
 
 	// Clear children and state
 	n.state = NodeStateNew
+	n.signaledDone = false
 	n.children = make(map[string]*node)
 	n.reserved = make(map[string]bool)
 	n.groups = nil
@@ -307,8 +312,10 @@
 		if n.state != NodeStateHealthy {
 			panic(fmt.Errorf("node %s signaled done", n))
 		}
-		n.state = NodeStateDone
-		n.sup.metrics.NotifyNodeState(n.dn(), n.state)
+		if n.signaledDone {
+			panic(fmt.Errorf("node %s signaled done twice", n))
+		}
+		n.signaledDone = true
 		n.bo.Reset()
 	}
 }

diff --git a/osbase/supervisor/supervisor_processor.go b/osbase/supervisor/supervisor_processor.go
index 595e2be..667b2ab 100644
--- a/osbase/supervisor/supervisor_processor.go
+++ b/osbase/supervisor/supervisor_processor.go

@@ -271,10 +271,11 @@
 	n := s.nodeByDN(r.dn)
 	ctx := n.ctx
 
-	// Simple case: it was marked as Done and quit with no error.
-	if n.state == NodeStateDone && r.err == nil {
+	// Simple case: it has signaled Done and quit with no error.
+	if n.signaledDone && r.err == nil {
+		// Mark the node as DONE.
+		n.state = NodeStateDone
 		s.metrics.NotifyNodeState(r.dn, n.state)
-		// Do nothing. This was supposed to happen. Keep the process as DONE.
 		return
 	}
 

diff --git a/osbase/supervisor/supervisor_test.go b/osbase/supervisor/supervisor_test.go
index 3b81291..fabfb8b 100644
--- a/osbase/supervisor/supervisor_test.go
+++ b/osbase/supervisor/supervisor_test.go

@@ -466,6 +466,37 @@
 	}
 }
 
+// TestDoneDelay tests that a node is only considered restartable once it has
+// returned, not already when it has signaled Done. Otherwise, we can get into
+// an inconsistent state and for example panic because the node no longer
+// exists once the runnable returns.
+func TestDoneDelay(t *testing.T) {
+	startedInner := make(chan struct{})
+	failOuter := make(chan struct{})
+
+	ctx, ctxC := context.WithCancel(context.Background())
+	defer ctxC()
+
+	New(ctx, func(ctx context.Context) error {
+		err := Run(ctx, "inner", func(ctx context.Context) error {
+			Signal(ctx, SignalHealthy)
+			Signal(ctx, SignalDone)
+			<-startedInner
+			time.Sleep(10 * time.Millisecond)
+			return nil
+		})
+		if err != nil {
+			return err
+		}
+		<-failOuter
+		return fmt.Errorf("failed")
+	}, WithPropagatePanic)
+
+	startedInner <- struct{}{}
+	failOuter <- struct{}{}
+	time.Sleep(20 * time.Millisecond)
+}
+
 // TestResilience throws some curveballs at the supervisor - either programming
 // errors or high load. It then ensures that another runnable is running, and
 // that it restarts on its sibling failure.
commit	08c1c72246da4f8f18dfb7e94a1da813b094c7a4	[log] [tgz]
author	Jan Schär <jan@monogon.tech>	Thu Dec 19 12:03:17 2024 +0100
committer	Jan Schär <jan@monogon.tech>	Mon Dec 23 10:13:43 2024 +0000
tree	e8dd5a474edb79f2658b951e64b993b38c46b210
parent	65602097b716674316a318d5594a8b2b0e52d239 [diff]