osbase/supervisor: only enter DONE state after runnable returns
Previously, a node was marked DONE as soon as it signaled DONE. If a
GC run happened between the time the runnable signaled DONE and the
time its exit was processed, this caused problems. The test I added
panics without the other changes:
panic: could not find [inner] (root.inner) in root (NODE_STATE_NEW)
If the delay is long enough that the inner node has already
restarted, this could even result in multiple instances of the same
runnable running simultaneously.
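As an illustration, the racy pattern looks roughly like this (a
minimal sketch written as a caller outside the package would, using
supervisor.Run/Signal; the parent runnable that fails and triggers
the GC run is assumed but not shown):

    err := supervisor.Run(ctx, "inner", func(ctx context.Context) error {
        supervisor.Signal(ctx, supervisor.SignalHealthy)
        supervisor.Signal(ctx, supervisor.SignalDone)
        // Previously the node entered DONE at this point. If the parent
        // failed and a GC run happened before this function returned,
        // the node was already cleaned up (and possibly restarted) by
        // the time its exit was processed.
        time.Sleep(10 * time.Millisecond)
        return nil
    })
    if err != nil {
        return err
    }
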
I fixed this problem by only entering the DONE state after the runnable
has returned.
Change-Id: If73b73f104c4cc204bce4374f4ba5f7e163e4a0b
Reviewed-on: https://review.monogon.dev/c/monogon/+/3715
Tested-by: Jenkins CI
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/osbase/supervisor/supervisor_node.go b/osbase/supervisor/supervisor_node.go
index 272b650..4f4d180 100644
--- a/osbase/supervisor/supervisor_node.go
+++ b/osbase/supervisor/supervisor_node.go
@@ -56,6 +56,10 @@
// The current state of the runnable in this node.
state NodeState
+ // signaledDone is set when the runnable has signaled Done. The transition to
+ // the DONE state only happens after the runnable returns.
+ signaledDone bool
+
// Backoff used to keep runnables from being restarted too fast.
bo *backoff.ExponentialBackOff
@@ -211,6 +215,7 @@
// Clear children and state
n.state = NodeStateNew
+ n.signaledDone = false
n.children = make(map[string]*node)
n.reserved = make(map[string]bool)
n.groups = nil
@@ -307,8 +312,10 @@
if n.state != NodeStateHealthy {
panic(fmt.Errorf("node %s signaled done", n))
}
- n.state = NodeStateDone
- n.sup.metrics.NotifyNodeState(n.dn(), n.state)
+ if n.signaledDone {
+ panic(fmt.Errorf("node %s signaled done twice", n))
+ }
+ n.signaledDone = true
n.bo.Reset()
}
}
diff --git a/osbase/supervisor/supervisor_processor.go b/osbase/supervisor/supervisor_processor.go
index 595e2be..667b2ab 100644
--- a/osbase/supervisor/supervisor_processor.go
+++ b/osbase/supervisor/supervisor_processor.go
@@ -271,10 +271,11 @@
n := s.nodeByDN(r.dn)
ctx := n.ctx
- // Simple case: it was marked as Done and quit with no error.
- if n.state == NodeStateDone && r.err == nil {
+ // Simple case: it has signaled Done and quit with no error.
+ if n.signaledDone && r.err == nil {
+ // Mark the node as DONE.
+ n.state = NodeStateDone
s.metrics.NotifyNodeState(r.dn, n.state)
- // Do nothing. This was supposed to happen. Keep the process as DONE.
return
}
diff --git a/osbase/supervisor/supervisor_test.go b/osbase/supervisor/supervisor_test.go
index 3b81291..fabfb8b 100644
--- a/osbase/supervisor/supervisor_test.go
+++ b/osbase/supervisor/supervisor_test.go
@@ -466,6 +466,37 @@
}
}
+// TestDoneDelay tests that a node is only considered restartable once it has
+// returned, not as soon as it has signaled Done. Otherwise, we can get into
+// an inconsistent state and, for example, panic because the node no longer
+// exists by the time the runnable returns.
+func TestDoneDelay(t *testing.T) {
+ startedInner := make(chan struct{})
+ failOuter := make(chan struct{})
+
+ ctx, ctxC := context.WithCancel(context.Background())
+ defer ctxC()
+
+ New(ctx, func(ctx context.Context) error {
+ err := Run(ctx, "inner", func(ctx context.Context) error {
+ Signal(ctx, SignalHealthy)
+ Signal(ctx, SignalDone)
+ <-startedInner
+ time.Sleep(10 * time.Millisecond)
+ return nil
+ })
+ if err != nil {
+ return err
+ }
+ <-failOuter
+ return fmt.Errorf("failed")
+ }, WithPropagatePanic)
+
+ startedInner <- struct{}{}
+ failOuter <- struct{}{}
+ time.Sleep(20 * time.Millisecond)
+}
+
// TestResilience throws some curveballs at the supervisor - either programming
// errors or high load. It then ensures that another runnable is running, and
// that it restarts on its sibling failure.