m/n/kubernetes: add metricsprovider

Kubernetes has a metrics provider interface, add an adapter to be able
to get these into our Prometheus registry. This code exists in a similar
form inside K8s but against their custom metrics architecture, not plain
Prometheus.

As these metrics are shared across all workqueues we follow K8s in
implementing this with a singleton/global. It's not the prettiest, but
otherwise we may get issues with Prometheus and duplicate metrics.

Change-Id: I0b6d608d14793e44859166a5a59d446c8f662a25
Reviewed-on: https://review.monogon.dev/c/monogon/+/3829
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/node/kubernetes/metricsprovider/metricsprovider.go b/metropolis/node/kubernetes/metricsprovider/metricsprovider.go
new file mode 100644
index 0000000..23aa254
--- /dev/null
+++ b/metropolis/node/kubernetes/metricsprovider/metricsprovider.go
@@ -0,0 +1,114 @@
+// Copyright The Monogon Project Authors.
+// Copyright 2019 The Kubernetes Authors.
+// SPDX-License-Identifier: Apache-2.0
+
+// Package metricsprovider provides a Prometheus registry for code in K8s
+// client-go capable of providing metrics. Currently it registers itself
+// as a metrics backend for workqueues, more can be added in the future.
+// The registry with all the metrics is available as `Registry`.
+package metricsprovider
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+	"k8s.io/client-go/util/workqueue"
+)
+
+// Metrics subsystem and keys used by the workqueue.
+const (
+	WorkQueueSubsystem         = "workqueue"
+	DepthKey                   = "depth"
+	AddsKey                    = "adds_total"
+	QueueLatencyKey            = "queue_duration_seconds"
+	WorkDurationKey            = "work_duration_seconds"
+	UnfinishedWorkKey          = "unfinished_work_seconds"
+	LongestRunningProcessorKey = "longest_running_processor_seconds"
+	RetriesKey                 = "retries_total"
+)
+
+var Registry = prometheus.NewRegistry()
+
+var (
+	depth = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Subsystem: WorkQueueSubsystem,
+		Name:      DepthKey,
+		Help:      "Current depth of workqueue",
+	}, []string{"name"})
+
+	adds = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Subsystem: WorkQueueSubsystem,
+		Name:      AddsKey,
+		Help:      "Total number of adds handled by workqueue",
+	}, []string{"name"})
+
+	latency = prometheus.NewHistogramVec(prometheus.HistogramOpts{
+		Subsystem: WorkQueueSubsystem,
+		Name:      QueueLatencyKey,
+		Help:      "How long in seconds an item stays in the workqueue before being requested.",
+		Buckets:   prometheus.ExponentialBuckets(10e-9, 10, 10),
+	}, []string{"name"})
+
+	workDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
+		Subsystem: WorkQueueSubsystem,
+		Name:      WorkDurationKey,
+		Help:      "How long in seconds processing an item from workqueue takes.",
+		Buckets:   prometheus.ExponentialBuckets(10e-9, 10, 10),
+	}, []string{"name"})
+
+	unfinished = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Subsystem: WorkQueueSubsystem,
+		Name:      UnfinishedWorkKey,
+		Help: "How many seconds of work has done that " +
+			"is in progress and hasn't been observed by work_duration. Large " +
+			"values indicate stuck threads. One can deduce the number of stuck " +
+			"threads by observing the rate at which this increases.",
+	}, []string{"name"})
+
+	longestRunningProcessor = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Subsystem: WorkQueueSubsystem,
+		Name:      LongestRunningProcessorKey,
+		Help: "How many seconds has the longest running " +
+			"processor for workqueue been running.",
+	}, []string{"name"})
+
+	retries = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Subsystem: WorkQueueSubsystem,
+		Name:      RetriesKey,
+		Help:      "Total number of retries handled by workqueue",
+	}, []string{"name"})
+)
+
+func init() {
+	Registry.MustRegister(depth, adds, latency, workDuration, unfinished, longestRunningProcessor, retries)
+	workqueue.SetProvider(&promProvider{})
+}
+
+type promProvider struct {
+}
+
+func (promProvider) NewDepthMetric(name string) workqueue.GaugeMetric {
+	return depth.WithLabelValues(name)
+}
+
+func (promProvider) NewAddsMetric(name string) workqueue.CounterMetric {
+	return adds.WithLabelValues(name)
+}
+
+func (promProvider) NewLatencyMetric(name string) workqueue.HistogramMetric {
+	return latency.WithLabelValues(name)
+}
+
+func (promProvider) NewWorkDurationMetric(name string) workqueue.HistogramMetric {
+	return workDuration.WithLabelValues(name)
+}
+
+func (promProvider) NewUnfinishedWorkSecondsMetric(name string) workqueue.SettableGaugeMetric {
+	return unfinished.WithLabelValues(name)
+}
+
+func (promProvider) NewLongestRunningProcessorSecondsMetric(name string) workqueue.SettableGaugeMetric {
+	return longestRunningProcessor.WithLabelValues(name)
+}
+
+func (promProvider) NewRetriesMetric(name string) workqueue.CounterMetric {
+	return retries.WithLabelValues(name)
+}