metropolis/node/core/metrics: expose containerd metrics endpoint

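This adds containerd as another metrics endpoint. It is only available
on nodes with the KubernetesWorker role.

The containerd exporter is scraped at /v1/metrics instead of /metrics,
so exporters gain an optional Path field which defaults to /metrics
when left empty.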

Change-Id: I5f6269165a81d9a4c4cff48d3ed6b6a55d7f4f46
Reviewed-on: https://review.monogon.dev/c/monogon/+/2861
Tested-by: Jenkins CI
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/metropolis/node/core/metrics/exporters.go b/metropolis/node/core/metrics/exporters.go
index 8771539..c14abcc 100644
--- a/metropolis/node/core/metrics/exporters.go
+++ b/metropolis/node/core/metrics/exporters.go
@@ -24,6 +24,8 @@
 	// Arguments to start the exporter. The exporter should listen at 127.0.0.1 and
 	// the port specified by Port, and serve its metrics on /metrics.
 	Arguments []string
+	// Path is the HTTP path to scrape metrics at. Defaults to /metrics if empty.
+	Path string
 }
 
 // DefaultExporters are the exporters which we run by default in Metropolis.
@@ -62,6 +64,11 @@
 		Name: "kubernetes-apiserver",
 		Port: node.MetricsKubeAPIServerListenerPort,
 	},
+	{
+		Name: "containerd",
+		Port: node.MetricsContainerdListenerPort,
+		Path: "/v1/metrics",
+	},
 }
 
 func (e *Exporter) ServeHTTP(w http.ResponseWriter, r *http.Request) {
@@ -76,7 +83,12 @@
 	// context from our runnable which contains the logger.
 	logger := supervisor.Logger(ctx)
 
-	url := "http://127.0.0.1:" + e.Port.PortString() + "/metrics"
+	path := e.Path
+	if e.Path == "" {
+		path = "/metrics"
+	}
+
+	url := "http://127.0.0.1:" + e.Port.PortString() + path
 	outReq, err := http.NewRequestWithContext(ctx, "GET", url, nil)
 	if err != nil {
 		logger.Errorf("%s: forwarding to %q failed: %v", r.RemoteAddr, e.Name, err)
diff --git a/metropolis/node/kubernetes/containerd/config.toml b/metropolis/node/kubernetes/containerd/config.toml
index 4f6e31c..177e0d0 100644
--- a/metropolis/node/kubernetes/containerd/config.toml
+++ b/metropolis/node/kubernetes/containerd/config.toml
@@ -28,7 +28,7 @@
   level = ""
 
 [metrics]
-  address = ""
+  address = "127.0.0.1:7846"
   grpc_histogram = false
 
 [cgroup]
diff --git a/metropolis/node/ports.go b/metropolis/node/ports.go
index 793a68c..ded1815 100644
--- a/metropolis/node/ports.go
+++ b/metropolis/node/ports.go
@@ -63,6 +63,9 @@
 	// proxy for the api-server runs, bound to 127.0.0.1. The metrics
 	// service proxies traffic to it from the public MetricsPort.
 	MetricsKubeAPIServerListenerPort Port = 7845
+	// MetricsContainerdListenerPort is the TCP port on which the
+	// containerd metrics endpoint, bound to 127.0.0.1, is exposed.
+	MetricsContainerdListenerPort Port = 7846
 	// KubernetesAPIPort is the TCP port on which the Kubernetes API is
 	// exposed.
 	KubernetesAPIPort Port = 6443
diff --git a/metropolis/test/e2e/main_test.go b/metropolis/test/e2e/main_test.go
index 8617da4..19df697 100644
--- a/metropolis/test/e2e/main_test.go
+++ b/metropolis/test/e2e/main_test.go
@@ -436,6 +436,44 @@
 		}
 		return nil
 	})
+	util.TestEventual(t, "containerd metrics retrieved", ctx, smallTestTimeout, func(ctx context.Context) error {
+		pool := x509.NewCertPool()
+		pool.AddCert(cluster.CACertificate)
+		cl := http.Client{
+			Transport: &http.Transport{
+				TLSClientConfig: &tls.Config{
+					Certificates: []tls.Certificate{cluster.Owner},
+					RootCAs:      pool,
+				},
+				DialContext: func(ctx context.Context, _, addr string) (net.Conn, error) {
+					return cluster.DialNode(ctx, addr)
+				},
+			},
+		}
+		u := url.URL{
+			Scheme: "https",
+			Host:   net.JoinHostPort(cluster.NodeIDs[1], common.MetricsPort.PortString()),
+			Path:   "/metrics/containerd",
+		}
+		res, err := cl.Get(u.String())
+		if err != nil {
+			return err
+		}
+		defer res.Body.Close()
+		if res.StatusCode != 200 {
+			return fmt.Errorf("status code %d", res.StatusCode)
+		}
+
+		body, err := io.ReadAll(res.Body)
+		if err != nil {
+			return err
+		}
+		needle := "containerd_build_info_total"
+		if !strings.Contains(string(body), needle) {
+			return util.Permanent(fmt.Errorf("could not find %q in returned response", needle))
+		}
+		return nil
+	})
 	if os.Getenv("HAVE_NESTED_KVM") != "" {
 		util.TestEventual(t, "Pod for KVM/QEMU smoke test", ctx, smallTestTimeout, func(ctx context.Context) error {
 			runcRuntimeClass := "runc"