osbase/supervisor: implement Metrics API
This is a basic building block for exporting per-DN/runnable status from
the supervisor to an external system. A sample implementation is
provided, which can be used in simple debug facilities to inspect the
current supervision tree.
A follow-up change will use the same API to implement Prometheus
metrics.
Change-Id: I0d586b03a397a3ccf8dac2d8043b9dd2f319be4e
Reviewed-on: https://review.monogon.dev/c/monogon/+/3290
Tested-by: Jenkins CI
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/osbase/supervisor/supervisor_processor.go b/osbase/supervisor/supervisor_processor.go
index 2a01cf7..6304b09 100644
--- a/osbase/supervisor/supervisor_processor.go
+++ b/osbase/supervisor/supervisor_processor.go
@@ -230,6 +230,10 @@
defer s.mu.Unlock()
n := s.nodeByDN(r.dn)
+ if n.state != NodeStateNew {
+ panic("programming error: scheduled node not new")
+ }
+ s.metrics.NotifyNodeState(r.dn, n.state)
go func() {
if !s.propagatePanic {
defer func() {
@@ -268,6 +272,7 @@
// Simple case: it was marked as Done and quit with no error.
if n.state == NodeStateDone && r.err == nil {
+ s.metrics.NotifyNodeState(r.dn, n.state)
// Do nothing. This was supposed to happen. Keep the process as DONE.
return
}
@@ -277,6 +282,7 @@
if r.err != nil && ctx.Err() != nil && errors.Is(r.err, ctx.Err()) {
// Mark the node as canceled successfully.
n.state = NodeStateCanceled
+ s.metrics.NotifyNodeState(r.dn, n.state)
return
}
@@ -291,6 +297,7 @@
s.ilogger.Errorf("%s: %v", n.dn(), err)
// Mark as dead.
n.state = NodeStateDead
+ s.metrics.NotifyNodeState(r.dn, n.state)
// Cancel that node's context, just in case something still depends on it.
n.ctxC()