osbase/supervisor: implement Prometheus metrics
This is a simple implementation of supervisor.Metrics which will be used
in Metropolis.
Change-Id: I8d47b3aa631dec5b07295d5498b3b0d1ad36c0f7
Reviewed-on: https://review.monogon.dev/c/monogon/+/3291
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
Tested-by: Jenkins CI
diff --git a/osbase/supervisor/BUILD.bazel b/osbase/supervisor/BUILD.bazel
index 1997f45..b6b4861 100644
--- a/osbase/supervisor/BUILD.bazel
+++ b/osbase/supervisor/BUILD.bazel
@@ -5,6 +5,7 @@
srcs = [
"supervisor.go",
"supervisor_metrics.go",
+ "supervisor_metrics_prometheus.go",
"supervisor_node.go",
"supervisor_processor.go",
"supervisor_support.go",
@@ -16,6 +17,7 @@
deps = [
"//osbase/logtree",
"@com_github_cenkalti_backoff_v4//:backoff",
+ "@com_github_prometheus_client_golang//prometheus",
"@org_golang_google_grpc//:grpc",
],
)
diff --git a/osbase/supervisor/supervisor_metrics_prometheus.go b/osbase/supervisor/supervisor_metrics_prometheus.go
new file mode 100644
index 0000000..49ee973
--- /dev/null
+++ b/osbase/supervisor/supervisor_metrics_prometheus.go
@@ -0,0 +1,88 @@
+package supervisor
+
+import (
+ "fmt"
+
+ "github.com/prometheus/client_golang/prometheus"
+)
+
+// MetricsPrometheus is a Metrics implementation which exports the supervisor
+// metrics over some prometheus registry.
+//
+// This structure must be constructed with NewMetricsPrometheus.
+//
+// The metrics exported are:
+// - monogon_supervisor_dn_state_total
+// - monogon_supervisor_dn_state_transition_count
+type MetricsPrometheus struct {
+ exportedState *prometheus.GaugeVec // gauge labelled by (dn, state)
+ exportedEdge *prometheus.CounterVec // counter labelled by (dn, old_state, new_state)
+ cachedState map[string]*NodeState // last state notified per DN, used to derive transitions
+}
+
+// NewMetricsPrometheus creates the supervisor metric vectors, registers them in
+// the given prometheus registry and returns a Metrics instance to be used with
+// WithMetrics. On error, this call leaves none of its collectors registered.
+// This should only be called once for a given registry.
+func NewMetricsPrometheus(registry *prometheus.Registry) (*MetricsPrometheus, error) {
+	res := &MetricsPrometheus{
+		exportedState: prometheus.NewGaugeVec(prometheus.GaugeOpts{
+			Namespace: "monogon",
+			Subsystem: "supervisor",
+			Name:      "dn_state_total",
+			Help:      "Total count of supervisor runnables, broken up by DN and state",
+		}, []string{"dn", "state"}),
+		exportedEdge: prometheus.NewCounterVec(prometheus.CounterOpts{
+			Namespace: "monogon",
+			Subsystem: "supervisor",
+			Name:      "dn_state_transition_count",
+			Help:      "Total count of supervisor runnable state transitions, broken up by DN and (old_state, new_state) tuple",
+		}, []string{"dn", "old_state", "new_state"}),
+		cachedState: make(map[string]*NodeState),
+	}
+	if err := registry.Register(res.exportedState); err != nil {
+		return nil, fmt.Errorf("when registering dn_state_total: %w", err)
+	}
+	if err := registry.Register(res.exportedEdge); err != nil {
+		registry.Unregister(res.exportedState) // roll back the first registration
+		return nil, fmt.Errorf("when registering dn_state_transition_count: %w", err)
+	}
+	return res, nil
+}
+
+// exportState sets the state gauge identified by the (dn, state) label pair
+// to the given value.
+func (m *MetricsPrometheus) exportState(dn string, state NodeState, value float64) {
+	labels := prometheus.Labels{"state": state.String(), "dn": dn}
+	m.exportedState.With(labels).Set(value)
+}
+
+// exportEdge increments by one the transition counter for dn, labelled with
+// the state being left (old_state) and the state being entered (new_state).
+func (m *MetricsPrometheus) exportEdge(dn string, oldState, newState NodeState) {
+	from, to := oldState.String(), newState.String()
+	labels := prometheus.Labels{"old_state": from, "new_state": to, "dn": dn}
+	m.exportedEdge.With(labels).Inc()
+}
+
+// NotifyNodeState records that dn is now in the given state. The state gauges
+// for dn are rewritten so that the new state reads 1 and every other listed
+// state reads 0, then the (previous, new) transition counter is incremented.
+func (m *MetricsPrometheus) NotifyNodeState(dn string, state NodeState) {
+	// Zero the gauge of every state except the new one, so that a given DN
+	// is only ever reported in a single state, then flag the new state.
+	for _, candidate := range NodeStates {
+		if candidate != state {
+			m.exportState(dn, candidate, 0.0)
+		}
+	}
+	m.exportState(dn, state, 1.0)
+
+	// A DN that has not been seen before is treated as coming from Dead.
+	from := NodeStateDead
+	if cached, ok := m.cachedState[dn]; ok && cached != nil {
+		from = *cached
+	}
+	m.exportEdge(dn, from, state)
+	m.cachedState[dn] = &state
+}
diff --git a/osbase/supervisor/supervisor_node.go b/osbase/supervisor/supervisor_node.go
index 44e8c84..272b650 100644
--- a/osbase/supervisor/supervisor_node.go
+++ b/osbase/supervisor/supervisor_node.go
@@ -84,6 +84,15 @@
NodeStateCanceled
)
+// NodeStates lists all NodeState values. Keep in sync with the const block.
+var NodeStates = []NodeState{
+ NodeStateNew,
+ NodeStateHealthy,
+ NodeStateDead,
+ NodeStateDone,
+ NodeStateCanceled,
+}
+
func (s NodeState) String() string {
switch s {
case NodeStateNew: