m/n/kubernetes: add metricsprovider
Kubernetes has a metrics provider interface; add an adapter so that
the metrics it produces end up in our Prometheus registry. Similar
code exists inside K8s, but it is written against their custom
metrics architecture, not plain Prometheus.
As these metrics are shared across all workqueues, we follow K8s in
implementing this as a singleton/global. It's not the prettiest, but
otherwise we risk duplicate-metric registration errors from
Prometheus.
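For context, a minimal sketch (not part of this change, only standard
client_golang behaviour) of the duplicate-registration failure that a
second, non-singleton provider instance would trigger:

  package main

  import (
      "fmt"

      "github.com/prometheus/client_golang/prometheus"
  )

  // newDepth builds a GaugeVec with the same name the workqueue
  // provider registers.
  func newDepth() *prometheus.GaugeVec {
      return prometheus.NewGaugeVec(prometheus.GaugeOpts{
          Subsystem: "workqueue",
          Name:      "depth",
          Help:      "Current depth of workqueue",
      }, []string{"name"})
  }

  func main() {
      reg := prometheus.NewRegistry()
      // The first registration of the collector succeeds.
      fmt.Println(reg.Register(newDepth())) // <nil>
      // A second collector with the same name/labels is rejected;
      // MustRegister would panic here instead of returning an error.
      fmt.Println(reg.Register(newDepth()))
  }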
Change-Id: I0b6d608d14793e44859166a5a59d446c8f662a25
Reviewed-on: https://review.monogon.dev/c/monogon/+/3829
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
Tested-by: Jenkins CI
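A hedged usage sketch, assuming a plain net/http server rather than the
node's metrics service: since metricsprovider.Registry is a
*prometheus.Registry (and therefore a prometheus.Gatherer), it could
also be scraped on its own; in this change it is instead folded into
metrics.CoreRegistry (see service_worker.go below).

  package main

  import (
      "net/http"

      "github.com/prometheus/client_golang/prometheus/promhttp"

      "source.monogon.dev/metropolis/node/kubernetes/metricsprovider"
  )

  func main() {
      // Serve only the workqueue metrics collected by the shared
      // registry on a standalone endpoint.
      http.Handle("/metrics", promhttp.HandlerFor(
          metricsprovider.Registry, promhttp.HandlerOpts{}))
      _ = http.ListenAndServe(":9090", nil)
  }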
diff --git a/build/analysis/BUILD.bazel b/build/analysis/BUILD.bazel
index 91a2f20..ae0cd74 100644
--- a/build/analysis/BUILD.bazel
+++ b/build/analysis/BUILD.bazel
@@ -196,6 +196,14 @@
"cgo/": "cgo",
},
},
+ "haslicense": {
+ "exclude_files": {
+ "metropolis/node/kubernetes/metricsprovider": "Multi-Party copyright statement (#376)",
+ "external/": "third_party",
+ "bazel-out/": "generated_output",
+ "cgo/": "cgo",
+ },
+ },
}
# All analyzers that should be disabled for external, generated or cgo code.
@@ -215,7 +223,6 @@
"hash",
"errcmp",
"gofmt",
- "haslicense",
] + ALL_STATICCHECK_ANALYZERS
# We override the variable with itself unioned with the other
diff --git a/metropolis/node/kubernetes/BUILD.bazel b/metropolis/node/kubernetes/BUILD.bazel
index 16f773c..636295c 100644
--- a/metropolis/node/kubernetes/BUILD.bazel
+++ b/metropolis/node/kubernetes/BUILD.bazel
@@ -27,9 +27,11 @@
"//metropolis/node/core/curator/watcher",
"//metropolis/node/core/identity",
"//metropolis/node/core/localstorage",
+ "//metropolis/node/core/metrics",
"//metropolis/node/core/network",
"//metropolis/node/kubernetes/authproxy",
"//metropolis/node/kubernetes/clusternet",
+ "//metropolis/node/kubernetes/metricsprovider",
"//metropolis/node/kubernetes/metricsproxy",
"//metropolis/node/kubernetes/nfproxy",
"//metropolis/node/kubernetes/pki",
diff --git a/metropolis/node/kubernetes/metricsprovider/BUILD.bazel b/metropolis/node/kubernetes/metricsprovider/BUILD.bazel
new file mode 100644
index 0000000..4bba439
--- /dev/null
+++ b/metropolis/node/kubernetes/metricsprovider/BUILD.bazel
@@ -0,0 +1,12 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+ name = "metricsprovider",
+ srcs = ["metricsprovider.go"],
+ importpath = "source.monogon.dev/metropolis/node/kubernetes/metricsprovider",
+ visibility = ["//visibility:public"],
+ deps = [
+ "@com_github_prometheus_client_golang//prometheus",
+ "@io_k8s_client_go//util/workqueue",
+ ],
+)
diff --git a/metropolis/node/kubernetes/metricsprovider/metricsprovider.go b/metropolis/node/kubernetes/metricsprovider/metricsprovider.go
new file mode 100644
index 0000000..23aa254
--- /dev/null
+++ b/metropolis/node/kubernetes/metricsprovider/metricsprovider.go
@@ -0,0 +1,114 @@
+// Copyright The Monogon Project Authors.
+// Copyright 2019 The Kubernetes Authors.
+// SPDX-License-Identifier: Apache-2.0
+
+// Package metricsprovider provides a Prometheus registry for code in K8s
+// client-go that can expose metrics. Currently it registers itself as a
+// metrics backend for workqueues; more can be added in the future. The
+// registry with all the metrics is available as `Registry`.
+package metricsprovider
+
+import (
+ "github.com/prometheus/client_golang/prometheus"
+ "k8s.io/client-go/util/workqueue"
+)
+
+// Metrics subsystem and keys used by the workqueue.
+const (
+ WorkQueueSubsystem = "workqueue"
+ DepthKey = "depth"
+ AddsKey = "adds_total"
+ QueueLatencyKey = "queue_duration_seconds"
+ WorkDurationKey = "work_duration_seconds"
+ UnfinishedWorkKey = "unfinished_work_seconds"
+ LongestRunningProcessorKey = "longest_running_processor_seconds"
+ RetriesKey = "retries_total"
+)
+
+var Registry = prometheus.NewRegistry()
+
+var (
+ depth = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+ Subsystem: WorkQueueSubsystem,
+ Name: DepthKey,
+ Help: "Current depth of workqueue",
+ }, []string{"name"})
+
+ adds = prometheus.NewCounterVec(prometheus.CounterOpts{
+ Subsystem: WorkQueueSubsystem,
+ Name: AddsKey,
+ Help: "Total number of adds handled by workqueue",
+ }, []string{"name"})
+
+ latency = prometheus.NewHistogramVec(prometheus.HistogramOpts{
+ Subsystem: WorkQueueSubsystem,
+ Name: QueueLatencyKey,
+ Help: "How long in seconds an item stays in the workqueue before being requested.",
+ Buckets: prometheus.ExponentialBuckets(10e-9, 10, 10),
+ }, []string{"name"})
+
+ workDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
+ Subsystem: WorkQueueSubsystem,
+ Name: WorkDurationKey,
+ Help: "How long in seconds processing an item from workqueue takes.",
+ Buckets: prometheus.ExponentialBuckets(10e-9, 10, 10),
+ }, []string{"name"})
+
+ unfinished = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+ Subsystem: WorkQueueSubsystem,
+ Name: UnfinishedWorkKey,
+ Help: "How many seconds of work has done that " +
+ "is in progress and hasn't been observed by work_duration. Large " +
+ "values indicate stuck threads. One can deduce the number of stuck " +
+ "threads by observing the rate at which this increases.",
+ }, []string{"name"})
+
+ longestRunningProcessor = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+ Subsystem: WorkQueueSubsystem,
+ Name: LongestRunningProcessorKey,
+ Help: "How many seconds has the longest running " +
+ "processor for workqueue been running.",
+ }, []string{"name"})
+
+ retries = prometheus.NewCounterVec(prometheus.CounterOpts{
+ Subsystem: WorkQueueSubsystem,
+ Name: RetriesKey,
+ Help: "Total number of retries handled by workqueue",
+ }, []string{"name"})
+)
+
+func init() {
+ Registry.MustRegister(depth, adds, latency, workDuration, unfinished, longestRunningProcessor, retries)
+ workqueue.SetProvider(&promProvider{})
+}
+
+type promProvider struct {
+}
+
+func (promProvider) NewDepthMetric(name string) workqueue.GaugeMetric {
+ return depth.WithLabelValues(name)
+}
+
+func (promProvider) NewAddsMetric(name string) workqueue.CounterMetric {
+ return adds.WithLabelValues(name)
+}
+
+func (promProvider) NewLatencyMetric(name string) workqueue.HistogramMetric {
+ return latency.WithLabelValues(name)
+}
+
+func (promProvider) NewWorkDurationMetric(name string) workqueue.HistogramMetric {
+ return workDuration.WithLabelValues(name)
+}
+
+func (promProvider) NewUnfinishedWorkSecondsMetric(name string) workqueue.SettableGaugeMetric {
+ return unfinished.WithLabelValues(name)
+}
+
+func (promProvider) NewLongestRunningProcessorSecondsMetric(name string) workqueue.SettableGaugeMetric {
+ return longestRunningProcessor.WithLabelValues(name)
+}
+
+func (promProvider) NewRetriesMetric(name string) workqueue.CounterMetric {
+ return retries.WithLabelValues(name)
+}
diff --git a/metropolis/node/kubernetes/service_worker.go b/metropolis/node/kubernetes/service_worker.go
index e65d39b..6f6633b 100644
--- a/metropolis/node/kubernetes/service_worker.go
+++ b/metropolis/node/kubernetes/service_worker.go
@@ -19,8 +19,10 @@
"source.monogon.dev/metropolis/node"
oclusternet "source.monogon.dev/metropolis/node/core/clusternet"
"source.monogon.dev/metropolis/node/core/localstorage"
+ "source.monogon.dev/metropolis/node/core/metrics"
"source.monogon.dev/metropolis/node/core/network"
"source.monogon.dev/metropolis/node/kubernetes/clusternet"
+ "source.monogon.dev/metropolis/node/kubernetes/metricsprovider"
"source.monogon.dev/metropolis/node/kubernetes/nfproxy"
kpki "source.monogon.dev/metropolis/node/kubernetes/pki"
"source.monogon.dev/metropolis/node/kubernetes/plugins/kvmdevice"
@@ -56,6 +58,8 @@
}
func (s *Worker) Run(ctx context.Context) error {
+ metrics.CoreRegistry.MustRegister(metricsprovider.Registry)
+ defer metrics.CoreRegistry.Unregister(metricsprovider.Registry)
// Run apiproxy, which load-balances connections from worker components to this
// cluster's api servers. This is necessary as we want to round-robin across all
// available apiservers, and Kubernetes components do not implement client-side