osbase/supervisor: implement Metrics API
This is a base building block for exporting per-DN/runnable status from
the supervisor into an external system. A sample implementation is
provided which can be used in simple debug facilities to inspect the
current supervision tree.
A follow-up change will use the same API to implement Prometheus
metrics.
Change-Id: I0d586b03a397a3ccf8dac2d8043b9dd2f319be4e
Reviewed-on: https://review.monogon.dev/c/monogon/+/3290
Tested-by: Jenkins CI
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/osbase/supervisor/supervisor_test.go b/osbase/supervisor/supervisor_test.go
index f812531..feb5510 100644
--- a/osbase/supervisor/supervisor_test.go
+++ b/osbase/supervisor/supervisor_test.go
@@ -616,6 +616,108 @@
}
}
+func TestMetrics(t *testing.T) {
+ ctx, ctxC := context.WithCancel(context.Background())
+ defer ctxC()
+
+ // Build a supervision tree with 'wait'/step channels per runnable:
+ //
+ // root: wait, start one, wait, healthy
+ // one: wait, start two, crash, wait, start two, healthy, wait, done
+ // two: wait, healthy, run forever
+ //
+ // This tree allows us to exercise a few flows, like two getting canceled when
+ // one crashes, runnables returning done, runnables staying healthy, etc.
+
+ stepRoot := make(chan struct{})
+ stepOne := make(chan struct{})
+ stepTwo := make(chan struct{})
+ m := InMemoryMetrics{}
+
+ New(ctx, func(ctx context.Context) error {
+ <-stepRoot
+
+ attempts := 0
+ Run(ctx, "one", func(ctx context.Context) error {
+ <-stepOne
+ Run(ctx, "two", func(ctx context.Context) error {
+ <-stepTwo
+ Signal(ctx, SignalHealthy)
+ <-ctx.Done()
+ return ctx.Err()
+ })
+ if attempts == 0 {
+ attempts += 1
+ return fmt.Errorf("failed")
+ }
+ Signal(ctx, SignalHealthy)
+ <-stepOne
+ Signal(ctx, SignalDone)
+ return nil
+ })
+
+ <-stepRoot
+ Signal(ctx, SignalHealthy)
+ return nil
+ }, WithPropagatePanic, WithMetrics(&m))
+
+ // expectDN waits a second until a given DN is at a given state and fails the
+ // test otherwise.
+ expectDN := func(dn string, state NodeState) {
+ t.Helper()
+ start := time.Now()
+ for {
+ snap := m.DNs()
+ if _, ok := snap[dn]; !ok {
+ if time.Since(start) > time.Second {
+ t.Fatalf("No DN %q", dn)
+ } else {
+ time.Sleep(100 * time.Millisecond)
+ continue
+ }
+ }
+ if want, got := state, snap[dn].State; want != got {
+ if time.Since(start) > time.Second {
+ t.Fatalf("Expected %q to be %s, got %s", dn, want, got)
+ } else {
+ time.Sleep(100 * time.Millisecond)
+ continue
+ }
+ }
+ break
+ }
+ }
+
+ // Make progress thorugh the runnable tree and check expected states.
+
+ expectDN("root", NodeStateNew)
+
+ stepRoot <- struct{}{}
+ expectDN("root", NodeStateNew)
+ expectDN("root.one", NodeStateNew)
+
+ stepOne <- struct{}{}
+ stepTwo <- struct{}{}
+ expectDN("root", NodeStateNew)
+ expectDN("root.one", NodeStateDead)
+ expectDN("root.one.two", NodeStateCanceled)
+
+ stepOne <- struct{}{}
+ expectDN("root", NodeStateNew)
+ expectDN("root.one", NodeStateHealthy)
+ expectDN("root.one.two", NodeStateNew)
+
+ stepOne <- struct{}{}
+ expectDN("root", NodeStateNew)
+ expectDN("root.one", NodeStateDone)
+ expectDN("root.one.two", NodeStateNew)
+
+ stepTwo <- struct{}{}
+ expectDN("root", NodeStateNew)
+ expectDN("root.one", NodeStateDone)
+ expectDN("root.one.two", NodeStateHealthy)
+}
+
func ExampleNew() {
// Minimal runnable that is immediately done.
childC := make(chan struct{})