osbase/supervisor: implement Prometheus metrics

This is a simple implementation of supervisor.Metrics which will be used
in Metropolis.

Change-Id: I8d47b3aa631dec5b07295d5498b3b0d1ad36c0f7
Reviewed-on: https://review.monogon.dev/c/monogon/+/3291
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
Tested-by: Jenkins CI
diff --git a/osbase/supervisor/BUILD.bazel b/osbase/supervisor/BUILD.bazel
index 1997f45..b6b4861 100644
--- a/osbase/supervisor/BUILD.bazel
+++ b/osbase/supervisor/BUILD.bazel
@@ -5,6 +5,7 @@
     srcs = [
         "supervisor.go",
         "supervisor_metrics.go",
+        "supervisor_metrics_prometheus.go",
         "supervisor_node.go",
         "supervisor_processor.go",
         "supervisor_support.go",
@@ -16,6 +17,7 @@
     deps = [
         "//osbase/logtree",
         "@com_github_cenkalti_backoff_v4//:backoff",
+        "@com_github_prometheus_client_golang//prometheus",
         "@org_golang_google_grpc//:grpc",
     ],
 )
diff --git a/osbase/supervisor/supervisor_metrics_prometheus.go b/osbase/supervisor/supervisor_metrics_prometheus.go
new file mode 100644
index 0000000..49ee973
--- /dev/null
+++ b/osbase/supervisor/supervisor_metrics_prometheus.go
@@ -0,0 +1,88 @@
+package supervisor
+
+import (
+	"fmt"
+
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// MetricsPrometheus is a Metrics implementation which exports the supervisor
+// metrics over some prometheus registry.
+//
+// This structure must be constructed with NewMetricsPrometheus.
+//
+// The metrics exported are:
+//   - monogon_supervisor_dn_state_total
+//   - monogon_supervisor_dn_state_transition_count
+type MetricsPrometheus struct {
+	exportedState *prometheus.GaugeVec   // gauge labelled by (dn, state)
+	exportedEdge  *prometheus.CounterVec // counter labelled by (dn, old_state, new_state)
+	cachedState   map[string]*NodeState  // last state notified per DN; absent until first seen
+}
+
+// NewMetricsPrometheus initializes Supervisor metrics in a prometheus registry
+// and returns a Metrics instance to be used with WithMetrics. On failure, an
+// error is returned and neither metric is left registered in the registry.
+// This should only be called once for a given registry.
+func NewMetricsPrometheus(registry *prometheus.Registry) (*MetricsPrometheus, error) {
+	res := &MetricsPrometheus{
+		exportedState: prometheus.NewGaugeVec(prometheus.GaugeOpts{
+			Namespace: "monogon",
+			Subsystem: "supervisor",
+			Name:      "dn_state_total",
+			Help:      "Total count of supervisor runnables, broken up by DN and state",
+		}, []string{"dn", "state"}),
+		exportedEdge: prometheus.NewCounterVec(prometheus.CounterOpts{
+			Namespace: "monogon",
+			Subsystem: "supervisor",
+			Name:      "dn_state_transition_count",
+			Help:      "Total count of supervisor runnable state transitions, broken up by DN and (old_state, new_state) tuple",
+		}, []string{"dn", "old_state", "new_state"}),
+		cachedState: make(map[string]*NodeState),
+	}
+	if err := registry.Register(res.exportedState); err != nil {
+		return nil, fmt.Errorf("when registering dn_state_total: %w", err)
+	}
+	if err := registry.Register(res.exportedEdge); err != nil {
+		registry.Unregister(res.exportedState) // roll back so the registry is not left half-populated
+		return nil, fmt.Errorf("when registering dn_state_transition_count: %w", err)
+	}
+	return res, nil
+}
+
+// exportState sets the gauge series for the given (dn, state) pair to value.
+func (m *MetricsPrometheus) exportState(dn string, state NodeState, value float64) {
+	labels := prometheus.Labels{"state": state.String(), "dn": dn}
+	series := m.exportedState.With(labels)
+	series.Set(value)
+}
+
+// exportEdge bumps the transition counter for dn's oldState -> newState edge.
+func (m *MetricsPrometheus) exportEdge(dn string, oldState, newState NodeState) {
+	from, to := oldState.String(), newState.String()
+	labels := prometheus.Labels{"old_state": from, "new_state": to, "dn": dn}
+	counter := m.exportedEdge.With(labels)
+	counter.Inc()
+}
+
+// NotifyNodeState records that the runnable identified by dn is now in state,
+// updating both the per-state gauge and the state transition counter.
+func (m *MetricsPrometheus) NotifyNodeState(dn string, state NodeState) {
+	// Zero out every state other than the new one before raising the new one,
+	// so that a given DN is exported as being in a single state.
+	for _, other := range NodeStates {
+		if other != state {
+			m.exportState(dn, other, 0.0)
+		}
+	}
+	m.exportState(dn, state, 1.0)
+
+	// A DN that has not been seen before is treated as coming from Dead.
+	from := NodeStateDead
+	if last, seen := m.cachedState[dn]; seen && last != nil {
+		from = *last
+	}
+	m.exportEdge(dn, from, state)
+	current := state
+	m.cachedState[dn] = &current
+}
diff --git a/osbase/supervisor/supervisor_node.go b/osbase/supervisor/supervisor_node.go
index 44e8c84..272b650 100644
--- a/osbase/supervisor/supervisor_node.go
+++ b/osbase/supervisor/supervisor_node.go
@@ -84,6 +84,15 @@
 	NodeStateCanceled
 )
 
+// NodeStates lists every possible NodeState value, for code that enumerates them.
+var NodeStates = []NodeState{
+	NodeStateNew,
+	NodeStateHealthy,
+	NodeStateDead,
+	NodeStateDone,
+	NodeStateCanceled,
+}
+
 func (s NodeState) String() string {
 	switch s {
 	case NodeStateNew: