osbase/supervisor: implement Prometheus metrics

This is a simple implementation of supervisor.Metrics which will be used
in Metropolis.

Change-Id: I8d47b3aa631dec5b07295d5498b3b0d1ad36c0f7
Reviewed-on: https://review.monogon.dev/c/monogon/+/3291
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
Tested-by: Jenkins CI
diff --git a/osbase/supervisor/supervisor_metrics_prometheus.go b/osbase/supervisor/supervisor_metrics_prometheus.go
new file mode 100644
index 0000000..49ee973
--- /dev/null
+++ b/osbase/supervisor/supervisor_metrics_prometheus.go
@@ -0,0 +1,88 @@
+package supervisor
+
+import (
+	"fmt"
+
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// MetricsPrometheus is a Metrics implementation which exports the supervisor
+// metrics over some prometheus registry.
+//
+// This structure must be constructed with NewMetricsPrometheus.
+//
+// The metrics exported are:
+//   - monogon_supervisor_dn_state_total
+//   - monogon_supervisor_dn_state_transition_count
+type MetricsPrometheus struct {
+	exportedState *prometheus.GaugeVec   // per (dn, state): 1 if the DN is in that state, else 0
+	exportedEdge  *prometheus.CounterVec // per (dn, old_state, new_state): notifications seen
+	cachedState   map[string]*NodeState  // last state notified per DN; no entry if never seen
+}
+
+// NewMetricsPrometheus registers the supervisor metrics in the given prometheus
+// registry and returns a Metrics instance to be used with WithMetrics. It should
+// only be called once per registry. If a registration fails, the wrapped error
+// is returned and nothing registered by this call is left in the registry.
+func NewMetricsPrometheus(registry *prometheus.Registry) (*MetricsPrometheus, error) {
+	res := &MetricsPrometheus{
+		exportedState: prometheus.NewGaugeVec(prometheus.GaugeOpts{
+			Namespace: "monogon",
+			Subsystem: "supervisor",
+			Name:      "dn_state_total",
+			Help:      "Total count of supervisor runnables, broken up by DN and state",
+		}, []string{"dn", "state"}),
+		exportedEdge: prometheus.NewCounterVec(prometheus.CounterOpts{
+			Namespace: "monogon",
+			Subsystem: "supervisor",
+			Name:      "dn_state_transition_count",
+			Help:      "Total count of supervisor runnable state transitions, broken up by DN and (old_state, new_state) tuple",
+		}, []string{"dn", "old_state", "new_state"}),
+		cachedState: make(map[string]*NodeState),
+	}
+	if err := registry.Register(res.exportedState); err != nil {
+		return nil, fmt.Errorf("when registering dn_state_total: %w", err)
+	}
+	if err := registry.Register(res.exportedEdge); err != nil {
+		registry.Unregister(res.exportedState) // roll back the partial registration
+		return nil, fmt.Errorf("when registering dn_state_transition_count: %w", err)
+	}
+	return res, nil
+}
+
+// exportState writes value into the state gauge series identified by the given
+// DN and state labels.
+func (m *MetricsPrometheus) exportState(dn string, state NodeState, value float64) {
+	labels := map[string]string{"dn": dn, "state": state.String()}
+	m.exportedState.With(labels).Set(value)
+}
+
+// exportEdge increments the transition counter series identified by the given
+// DN and (old_state, new_state) pair.
+func (m *MetricsPrometheus) exportEdge(dn string, oldState, newState NodeState) {
+	from, to := oldState.String(), newState.String()
+	labels := map[string]string{"dn": dn, "old_state": from, "new_state": to}
+	m.exportedEdge.With(labels).Inc()
+}
+
+// NotifyNodeState records that the runnable identified by dn is now in state:
+// the state gauge is updated so that only state reads 1 for this DN, and the
+// transition from the previously notified state is counted.
+func (m *MetricsPrometheus) NotifyNodeState(dn string, state NodeState) {
+	// A DN is exported in exactly one state at a time: zero every other known
+	// state first, then raise the new one.
+	for _, other := range NodeStates {
+		if other != state {
+			m.exportState(dn, other, 0.0)
+		}
+	}
+	m.exportState(dn, state, 1.0)
+
+	// A DN that was never notified before is counted as coming from Dead.
+	from := NodeStateDead
+	if last := m.cachedState[dn]; last != nil {
+		from = *last
+	}
+	m.exportEdge(dn, from, state)
+	m.cachedState[dn] = &state
+}