metropolis/node: export core/supervisor metrics

Change-Id: Ibe3be27f9a5b3fc5e36babecc74d7d784d1f5e10
Reviewed-on: https://review.monogon.dev/c/monogon/+/3292
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/cli/metroctl/cmd_node_metrics.go b/metropolis/cli/metroctl/cmd_node_metrics.go
index e445086..f94f020 100644
--- a/metropolis/cli/metroctl/cmd_node_metrics.go
+++ b/metropolis/cli/metroctl/cmd_node_metrics.go
@@ -28,6 +28,8 @@
 
 A node ID and exporter must be provided. Currently available exporters are:
 
+  - core: metrics from the core process of the node (which contains the
+    supervision tree)
   - node: node_exporter metrics for the node
   - etcd: etcd metrics, if the node is running the cluster control plane
   - kubernetes-scheduler, kubernetes-controller-manager, kubernetes-apiserver:
diff --git a/metropolis/node/core/BUILD.bazel b/metropolis/node/core/BUILD.bazel
index 47ee2c1..f881f04 100644
--- a/metropolis/node/core/BUILD.bazel
+++ b/metropolis/node/core/BUILD.bazel
@@ -28,6 +28,7 @@
         "//metropolis/node/core/devmgr",
         "//metropolis/node/core/localstorage",
         "//metropolis/node/core/localstorage/declarative",
+        "//metropolis/node/core/metrics",
         "//metropolis/node/core/mgmt",
         "//metropolis/node/core/network",
         "//metropolis/node/core/roleserve",
diff --git a/metropolis/node/core/main.go b/metropolis/node/core/main.go
index e9d1ad1..ede3478 100644
--- a/metropolis/node/core/main.go
+++ b/metropolis/node/core/main.go
@@ -31,6 +31,7 @@
 	"source.monogon.dev/metropolis/node/core/devmgr"
 	"source.monogon.dev/metropolis/node/core/localstorage"
 	"source.monogon.dev/metropolis/node/core/localstorage/declarative"
+	"source.monogon.dev/metropolis/node/core/metrics"
 	"source.monogon.dev/metropolis/node/core/network"
 	"source.monogon.dev/metropolis/node/core/roleserve"
 	"source.monogon.dev/metropolis/node/core/rpc/resolver"
@@ -221,6 +222,12 @@
 		return m.Run(ctx)
 	}
 
+	pm, err := supervisor.NewMetricsPrometheus(metrics.CoreRegistry)
+	if err != nil {
+		// Fatal, because this generally shouldn't happen.
+		logger.Fatalf("Failed to register supervisor metrics: %v", err)
+	}
+
 	// Start the init function in a one-shot runnable. Smuggle out any errors from
 	// the init function and stuff them into the fatal channel. This is where the
 	// system supervisor takes over as the main process management system.
@@ -232,11 +239,11 @@
 			select {}
 		}
 		return nil
-	}, supervisor.WithExistingLogtree(lt))
+	}, supervisor.WithExistingLogtree(lt), supervisor.WithMetrics(pm))
 
 	// Meanwhile, wait for any fatal error from the init process, and handle it
 	// accordingly.
-	err := <-fatal
+	err = <-fatal
 	// Log error with primary logging mechanism still active.
 	logger.Infof("Node startup failed: %v", err)
 	// Start shutting down the supervision tree...
diff --git a/metropolis/node/core/metrics/BUILD.bazel b/metropolis/node/core/metrics/BUILD.bazel
index 6385bb8..30d08a7 100644
--- a/metropolis/node/core/metrics/BUILD.bazel
+++ b/metropolis/node/core/metrics/BUILD.bazel
@@ -16,6 +16,8 @@
         "//metropolis/node/core/curator/watcher",
         "//metropolis/node/core/identity",
         "//osbase/supervisor",
+        "@com_github_prometheus_client_golang//prometheus",
+        "@com_github_prometheus_client_golang//prometheus/promhttp",
     ],
 )
 
diff --git a/metropolis/node/core/metrics/exporters.go b/metropolis/node/core/metrics/exporters.go
index 2dd2cfc..2e8c1df 100644
--- a/metropolis/node/core/metrics/exporters.go
+++ b/metropolis/node/core/metrics/exporters.go
@@ -5,21 +5,37 @@
 	"io"
 	"net/http"
 
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
+
 	"source.monogon.dev/metropolis/node"
 	"source.monogon.dev/osbase/supervisor"
 )
 
-// An Exporter is a Prometheus binary running under the Metrics service which
-// collects some metrics and exposes them on a locally bound TCP port.
+// An Exporter is a source of Prometheus metrics. There are two possible kinds of
+// exporters:
+//
+// 1. A binary running under the Metrics service which collects some metrics and
+// exposes them on a locally bound TCP port (either started by the Exporter or
+// already running as part of Metropolis).
+//
+// 2. An in-memory Prometheus registry/gatherer for metrics generated by the
+// Metropolis core process.
 //
 // The Metrics Service will forward requests from /metrics/<name> to the
 // exporter.
 type Exporter struct {
 	// Name of the exporter, which becomes part of the metrics URL for this exporter.
 	Name string
-	// Port on which this exporter will be running.
+	// Gatherer, if provided, is a Prometheus registry (or other Gatherer) that will
+	// be queried for metrics for this exporter. Exactly one of Gatherer or Port must
+	// be set.
+	Gatherer prometheus.Gatherer
+	// Port on which an exporter is/will be running, and to which metrics requests
+	// will be proxied. Exactly one of Gatherer or Port must be set.
 	Port node.Port
-	// Executable to run to start the exporter.
+	// Executable to run to start the exporter. If empty, no executable will be
+	// started.
 	Executable string
 	// Arguments to start the exporter. The exporter should listen at 127.0.0.1 and
 	// the port specified by Port, and serve its metrics on /metrics.
@@ -28,9 +44,17 @@
 	Path string
 }
 
+// CoreRegistry is the metrics registry that will be served at /metrics/core.
+// All Prometheus metrics exported by the node core should register here.
+var CoreRegistry = prometheus.NewRegistry()
+
 // DefaultExporters are the exporters which we run by default in Metropolis.
 var DefaultExporters = []*Exporter{
 	{
+		Name:     "core",
+		Gatherer: CoreRegistry,
+	},
+	{
 		Name:       "node",
 		Port:       node.MetricsNodeListenerPort,
 		Executable: "/metrics/bin/node_exporter",
@@ -71,12 +95,7 @@
 	},
 }
 
-func (e *Exporter) ServeHTTP(w http.ResponseWriter, r *http.Request) {
-	if r.Method != http.MethodGet {
-		http.Error(w, fmt.Sprintf("method %q not allowed", r.Method), http.StatusMethodNotAllowed)
-		return
-	}
-
+func (e *Exporter) serveHTTPForward(w http.ResponseWriter, r *http.Request) {
 	ctx := r.Context()
 
 	// We are supplying the http.Server with a BaseContext that contains the
@@ -113,6 +132,27 @@
 	}
 }
 
+// ServeHTTP implements http.Handler. Only GET requests are accepted; anything
+// else is rejected with 405. If Port is set the request is handed to
+// serveHTTPForward; otherwise, if Gatherer is set, it is served by a promhttp
+// handler for that Gatherer. An Exporter with neither set yields a 500.
+func (e *Exporter) ServeHTTP(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodGet {
+		http.Error(w, fmt.Sprintf("method %q not allowed", r.Method), http.StatusMethodNotAllowed)
+		return
+	}
+
+	switch {
+	case e.Port != 0:
+		// Port takes precedence if both Port and Gatherer are set.
+		e.serveHTTPForward(w, r)
+	case e.Gatherer != nil:
+		promhttp.HandlerFor(e.Gatherer, promhttp.HandlerOpts{}).ServeHTTP(w, r)
+	default:
+		http.Error(w, "invalid exporter configuration (no port, no gatherer)", http.StatusInternalServerError)
+	}
+}
+
 func copyHeader(dst, src http.Header) {
 	for k, vv := range src {
 		for _, v := range vv {