package kubernetes

import (
	"context"
	"crypto/tls"
	"crypto/x509"
	"errors"
	"fmt"
	"io"
	"net"
	"net/http"
	_ "net/http/pprof"
	"net/url"
	"os"
	"strings"
	"testing"
	"time"

	"github.com/bazelbuild/rules_go/go/runfiles"
	"google.golang.org/protobuf/types/known/fieldmaskpb"
	corev1 "k8s.io/api/core/v1"
	kerrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	podv1 "k8s.io/kubernetes/pkg/api/v1/pod"

	common "source.monogon.dev/metropolis/node"
	apb "source.monogon.dev/metropolis/proto/api"
	cpb "source.monogon.dev/metropolis/proto/common"
	mlaunch "source.monogon.dev/metropolis/test/launch"
	"source.monogon.dev/metropolis/test/localregistry"
	"source.monogon.dev/metropolis/test/util"
)

var (
	// These are filled by Bazel at link time with the canonical paths of
	// their corresponding files. Inside the init function we resolve them
	// with the rules_go runfiles package to the real paths.
	xTestImagesManifestPath string
)

func init() {
	var err error
	for _, path := range []*string{
		&xTestImagesManifestPath,
	} {
		*path, err = runfiles.Rlocation(*path)
		if err != nil {
			panic(err)
		}
	}
}

const (
	// Timeout for the global test context.
	//
	// Bazel would eventually time out the test after 900s ("large") if, for
	// some reason, the context cancellation fails to abort it.
	globalTestTimeout = 600 * time.Second

	// Timeouts for individual end-to-end tests of different sizes.
	smallTestTimeout = 60 * time.Second
	largeTestTimeout = 120 * time.Second
)

// TestE2EKubernetesLabels verifies that Kubernetes node labels are being
// updated when the cluster state changes.
func TestE2EKubernetesLabels(t *testing.T) {
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()

	clusterOptions := mlaunch.ClusterOptions{
		NumNodes: 2,
		InitialClusterConfiguration: &cpb.ClusterConfiguration{
			ClusterDomain:         "cluster.test",
			TpmMode:               cpb.ClusterConfiguration_TPM_MODE_DISABLED,
			StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
			Kubernetes: &cpb.ClusterConfiguration_Kubernetes{
				NodeLabelsToSynchronize: []*cpb.ClusterConfiguration_Kubernetes_NodeLabelsToSynchronize{
					{Regexp: `^test\.monogon\.dev/`},
				},
			},
		},
	}
	cluster, err := mlaunch.LaunchCluster(ctx, clusterOptions)
	if err != nil {
		t.Fatalf("LaunchCluster failed: %v", err)
	}
	defer func() {
		err := cluster.Close()
		if err != nil {
			t.Fatalf("cluster Close failed: %v", err)
		}
	}()

	con, err := cluster.CuratorClient()
	if err != nil {
		t.Fatalf("Could not get curator client: %v", err)
	}
	mgmt := apb.NewManagementClient(con)
	clientSet, err := cluster.GetKubeClientSet()
	if err != nil {
		t.Fatal(err)
	}

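	// getLabelsForNode fetches the labels of a single Kubernetes node,
	// filtered down to the node-role and test labels this test asserts on.
	// It returns nil if the node does not exist (yet).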
	getLabelsForNode := func(nid string) common.Labels {
		node, err := clientSet.CoreV1().Nodes().Get(ctx, nid, metav1.GetOptions{})
		if kerrors.IsNotFound(err) {
			return nil
		}
		if err != nil {
			t.Fatalf("Could not get node %s: %v", nid, err)
			return nil
		}
		return common.Labels(node.Labels).Filter(func(k, v string) bool {
			if strings.HasPrefix(k, "node-role.kubernetes.io/") {
				return true
			}
			if strings.HasPrefix(k, "test.monogon.dev/") {
				return true
			}
			return false
		})
	}

	// Nodes should have no labels at first.
	for _, nid := range cluster.NodeIDs {
		if labels := getLabelsForNode(nid); !labels.Equals(nil) {
			t.Errorf("Node %s should have no labels, has %s", nid, labels)
		}
	}
	// Nominate both nodes to be Kubernetes workers.
	for _, nid := range cluster.NodeIDs {
		yes := true
		_, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
			Node: &apb.UpdateNodeRolesRequest_Id{
				Id: nid,
			},
			KubernetesWorker: &yes,
		})
		if err != nil {
			t.Fatalf("Could not make %s a KubernetesWorker: %v", nid, err)
		}
	}

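	// The first node is the cluster's bootstrap node, so besides the worker
	// role it also carries the KubernetesController and ConsensusMember
	// roles.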
	util.MustTestEventual(t, "Labels added", ctx, smallTestTimeout, func(ctx context.Context) error {
		// Nodes should have role labels now.
		for _, nid := range cluster.NodeIDs {
			want := common.Labels{
				"node-role.kubernetes.io/KubernetesWorker": "",
			}
			if nid == cluster.NodeIDs[0] {
				want["node-role.kubernetes.io/KubernetesController"] = ""
				want["node-role.kubernetes.io/ConsensusMember"] = ""
			}
			if labels := getLabelsForNode(nid); !want.Equals(labels) {
				return fmt.Errorf("node %s should have labels %s, has %s", nid, want, labels)
			}
		}
		return nil
	})

	// Remove KubernetesWorker from the first node again. It will stay in k8s
	// (arguably, this is a bug), but its role label should be removed.
	no := false
	_, err = mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
		Node: &apb.UpdateNodeRolesRequest_Id{
			Id: cluster.NodeIDs[0],
		},
		KubernetesWorker: &no,
	})
	if err != nil {
		t.Fatalf("Could not remove KubernetesWorker from %s: %v", cluster.NodeIDs[0], err)
	}

	util.MustTestEventual(t, "Labels removed", ctx, smallTestTimeout, func(ctx context.Context) error {
		for _, nid := range cluster.NodeIDs {
			want := make(common.Labels)
			if nid == cluster.NodeIDs[0] {
				want["node-role.kubernetes.io/KubernetesController"] = ""
				want["node-role.kubernetes.io/ConsensusMember"] = ""
			} else {
				want["node-role.kubernetes.io/KubernetesWorker"] = ""
			}
			if labels := getLabelsForNode(nid); !want.Equals(labels) {
				return fmt.Errorf("node %s should have labels %s, has %s", nid, want, labels)
			}
		}
		return nil
	})

	// Add a Metropolis node label and ensure it gets reflected on the
	// Kubernetes node.
	_, err = mgmt.UpdateNodeLabels(ctx, &apb.UpdateNodeLabelsRequest{
		Node: &apb.UpdateNodeLabelsRequest_Id{
			Id: cluster.NodeIDs[1],
		},
		Upsert: []*apb.UpdateNodeLabelsRequest_Pair{
			{Key: "test.monogon.dev/foo", Value: "bar"},
		},
	})
	if err != nil {
		t.Fatalf("Could not add label to node: %v", err)
	}

	util.MustTestEventual(t, "Metropolis labels added", ctx, smallTestTimeout, func(ctx context.Context) error {
		want := common.Labels{
			"node-role.kubernetes.io/KubernetesWorker": "",
			"test.monogon.dev/foo":                     "bar",
		}
		if labels := getLabelsForNode(cluster.NodeIDs[1]); !want.Equals(labels) {
			return fmt.Errorf("node %s should have labels %s, has %s", cluster.NodeIDs[1], want, labels)
		}
		return nil
	})

	// Reconfigure node label rules.
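	// The UpdateMask selects which fields of NewConfig are applied; supplying
	// an empty Kubernetes message under kubernetes.node_labels_to_synchronize
	// clears all label rules.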
	_, err = mgmt.ConfigureCluster(ctx, &apb.ConfigureClusterRequest{
		BaseConfig: &cpb.ClusterConfiguration{
			Kubernetes: &cpb.ClusterConfiguration_Kubernetes{
				NodeLabelsToSynchronize: []*cpb.ClusterConfiguration_Kubernetes_NodeLabelsToSynchronize{
					{Regexp: `^test\.monogon\.dev/`},
				},
			},
		},
		NewConfig: &cpb.ClusterConfiguration{
			Kubernetes: &cpb.ClusterConfiguration_Kubernetes{},
		},
		UpdateMask: &fieldmaskpb.FieldMask{
			Paths: []string{"kubernetes.node_labels_to_synchronize"},
		},
	})
	if err != nil {
		t.Fatalf("Could not update cluster configuration: %v", err)
	}

	ci, err := mgmt.GetClusterInfo(ctx, &apb.GetClusterInfoRequest{})
	if err != nil {
		t.Fatalf("Could not get cluster info: %v", err)
	}
	// See if the config changed.
	if rules := ci.ClusterConfiguration.Kubernetes.NodeLabelsToSynchronize; len(rules) != 0 {
		t.Fatalf("Wanted 0 label rules in config after reconfiguration, have %d: %v", len(rules), rules)
	}
	// TODO: ensure new rules get applied, but that will require watching the
	// cluster config for changes in the labelmaker.
}

// TestE2EKubernetes exercises the Kubernetes functionality of Metropolis.
//
// The tests are performed against an in-memory cluster.
func TestE2EKubernetes(t *testing.T) {
	// Set a global timeout to make sure this terminates.
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()
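
	// Build a local container image registry from the Bazel-generated
	// manifest; the launched cluster pulls the test images from it instead of
	// an external registry.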
	df, err := os.ReadFile(xTestImagesManifestPath)
	if err != nil {
		t.Fatalf("Reading registry manifest failed: %v", err)
	}
	lr, err := localregistry.FromBazelManifest(df)
	if err != nil {
		t.Fatalf("Creating test image registry failed: %v", err)
	}

	// Launch cluster.
	clusterOptions := mlaunch.ClusterOptions{
		NumNodes:      2,
		LocalRegistry: lr,
		InitialClusterConfiguration: &cpb.ClusterConfiguration{
			ClusterDomain:         "cluster.test",
			TpmMode:               cpb.ClusterConfiguration_TPM_MODE_DISABLED,
			StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
		},
	}
	cluster, err := mlaunch.LaunchCluster(ctx, clusterOptions)
	if err != nil {
		t.Fatalf("LaunchCluster failed: %v", err)
	}
	defer func() {
		err := cluster.Close()
		if err != nil {
			t.Fatalf("cluster Close failed: %v", err)
		}
	}()

	clientSet, err := cluster.GetKubeClientSet()
	if err != nil {
		t.Fatal(err)
	}
	util.TestEventual(t, "Add KubernetesWorker roles", ctx, smallTestTimeout, func(ctx context.Context) error {
		// Make everything but the first node into KubernetesWorkers.
		for i := 1; i < clusterOptions.NumNodes; i++ {
			err := cluster.MakeKubernetesWorker(ctx, cluster.NodeIDs[i])
			if err != nil {
				return util.Permanent(fmt.Errorf("MakeKubernetesWorker: %w", err))
			}
		}
		return nil
	})
	util.TestEventual(t, "Node is registered and ready", ctx, largeTestTimeout, func(ctx context.Context) error {
		nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			return err
		}
		if len(nodes.Items) < 1 {
			return errors.New("node not yet registered")
		}
		node := nodes.Items[0]
		for _, cond := range node.Status.Conditions {
			if cond.Type != corev1.NodeReady {
				continue
			}
			if cond.Status != corev1.ConditionTrue {
				return fmt.Errorf("node not ready: %v", cond.Message)
			}
		}
		return nil
	})
	util.TestEventual(t, "Simple deployment", ctx, largeTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeTestDeploymentSpec("test-deploy-1"), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple deployment is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-1"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
		}
	})
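	// gVisor is exposed as a Kubernetes RuntimeClass; setting
	// RuntimeClassName on the pod spec makes the pod run under the gVisor
	// sandbox instead of runc.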
	util.TestEventual(t, "Simple deployment with gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
		deployment := makeTestDeploymentSpec("test-deploy-2")
		gvisorStr := "gvisor"
		deployment.Spec.Template.Spec.RuntimeClassName = &gvisorStr
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, deployment, metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple deployment is running on gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-2"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			var errorMsg strings.Builder
			for _, msg := range events.Items {
				errorMsg.WriteString(" | ")
				errorMsg.WriteString(msg.Message)
			}
			return fmt.Errorf("pod is not ready: %s", errorMsg.String())
		}
	})
	util.TestEventual(t, "Simple StatefulSet with PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-1", corev1.PersistentVolumeFilesystem), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple StatefulSet with PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-1"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
		}
	})
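	// Same as above, but with a PVC in block volume mode, exercising the raw
	// block device path of the storage stack.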
	util.TestEventual(t, "Simple StatefulSet with Block PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-2", corev1.PersistentVolumeBlock), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple StatefulSet with Block PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-2"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
		}
	})
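	// The self-test job runs its checks from inside the cluster; on failure,
	// the last line of the job's pod log is surfaced to aid debugging.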
	util.TestEventual(t, "In-cluster self-test job", ctx, smallTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.BatchV1().Jobs("default").Create(ctx, makeSelftestSpec("selftest"), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "In-cluster self-test job passed", ctx, smallTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.BatchV1().Jobs("default").Get(ctx, "selftest", metav1.GetOptions{})
		if err != nil {
			return err
		}
		if res.Status.Failed > 0 {
			pods, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{
				LabelSelector: "job-name=selftest",
			})
			if err != nil {
				return util.Permanent(fmt.Errorf("job failed but failed to find pod: %w", err))
			}
			if len(pods.Items) < 1 {
				return fmt.Errorf("job failed but pod does not exist")
			}
			lines, err := getPodLogLines(ctx, clientSet, pods.Items[0].Name, 1)
			if err != nil {
				return fmt.Errorf("job failed but could not get logs: %w", err)
			}
			if len(lines) > 0 {
				return util.Permanent(fmt.Errorf("job failed, last log line: %s", lines[0]))
			}
			return util.Permanent(fmt.Errorf("job failed, empty log"))
		}
		if res.Status.Succeeded > 0 {
			return nil
		}
		return fmt.Errorf("job still running")
	})
	util.TestEventual(t, "Start NodePort test setup", ctx, smallTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeHTTPServerDeploymentSpec("nodeport-server"), metav1.CreateOptions{})
		if err != nil && !kerrors.IsAlreadyExists(err) {
			return err
		}
		_, err = clientSet.CoreV1().Services("default").Create(ctx, makeHTTPServerNodePortService("nodeport-server"), metav1.CreateOptions{})
		if err != nil && !kerrors.IsAlreadyExists(err) {
			return err
		}
		return nil
	})
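	// Node IPs live on the cluster's internal test network, so HTTP requests
	// are dialed through the launch harness' SOCKS proxy.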
	util.TestEventual(t, "NodePort accessible from all nodes", ctx, smallTestTimeout, func(ctx context.Context) error {
		nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			return err
		}
		// Use a new client for each attempt.
		hc := http.Client{
			Timeout: 2 * time.Second,
			Transport: &http.Transport{
				Dial: cluster.SOCKSDialer.Dial,
			},
		}
		for _, n := range nodes.Items {
			var addr string
			for _, a := range n.Status.Addresses {
				if a.Type == corev1.NodeInternalIP {
					addr = a.Address
				}
			}
			u := url.URL{Scheme: "http", Host: addr, Path: "/"}
			res, err := hc.Get(u.String())
			if err != nil {
				return fmt.Errorf("failed getting from node %q: %w", n.Name, err)
			}
			if res.StatusCode != http.StatusOK {
				return fmt.Errorf("getting from node %q: HTTP %d", n.Name, res.StatusCode)
			}
			t.Logf("Got response from %q", n.Name)
		}
		return nil
	})
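	// The node metrics endpoint is served over TLS and authenticated with the
	// cluster owner credentials; nodes are dialed by node ID through the
	// cluster's DialNode helper.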
	util.TestEventual(t, "containerd metrics retrieved", ctx, smallTestTimeout, func(ctx context.Context) error {
		pool := x509.NewCertPool()
		pool.AddCert(cluster.CACertificate)
		cl := http.Client{
			Transport: &http.Transport{
				TLSClientConfig: &tls.Config{
					Certificates: []tls.Certificate{cluster.Owner},
					RootCAs:      pool,
				},
				DialContext: func(ctx context.Context, _, addr string) (net.Conn, error) {
					return cluster.DialNode(ctx, addr)
				},
			},
		}
		u := url.URL{
			Scheme: "https",
			Host:   net.JoinHostPort(cluster.NodeIDs[1], common.MetricsPort.PortString()),
			Path:   "/metrics/containerd",
		}
		res, err := cl.Get(u.String())
		if err != nil {
			return err
		}
		defer res.Body.Close()
		if res.StatusCode != 200 {
			return fmt.Errorf("status code %d", res.StatusCode)
		}

		body, err := io.ReadAll(res.Body)
		if err != nil {
			return err
		}
		needle := "containerd_build_info_total"
		if !strings.Contains(string(body), needle) {
			return util.Permanent(fmt.Errorf("could not find %q in returned response", needle))
		}
		return nil
	})
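	// The KVM/QEMU smoke test requires nested virtualization, so it only runs
	// when the test environment advertises it. The pod requests one
	// devices.monogon.dev/kvm device and runs under the runc runtime class.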
	if os.Getenv("HAVE_NESTED_KVM") != "" {
		util.TestEventual(t, "Pod for KVM/QEMU smoke test", ctx, smallTestTimeout, func(ctx context.Context) error {
			runcRuntimeClass := "runc"
			_, err := clientSet.CoreV1().Pods("default").Create(ctx, &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name: "vm-smoketest",
				},
				Spec: corev1.PodSpec{
					Containers: []corev1.Container{{
						Name:            "vm-smoketest",
						ImagePullPolicy: corev1.PullNever,
						Image:           "test.monogon.internal/metropolis/vm/smoketest:smoketest_container",
						Resources: corev1.ResourceRequirements{
							Limits: corev1.ResourceList{
								"devices.monogon.dev/kvm": *resource.NewQuantity(1, ""),
							},
						},
					}},
					RuntimeClassName: &runcRuntimeClass,
					RestartPolicy:    corev1.RestartPolicyNever,
				},
			}, metav1.CreateOptions{})
			return err
		})
		util.TestEventual(t, "KVM/QEMU smoke test completion", ctx, smallTestTimeout, func(ctx context.Context) error {
			pod, err := clientSet.CoreV1().Pods("default").Get(ctx, "vm-smoketest", metav1.GetOptions{})
			if err != nil {
				return fmt.Errorf("failed to get pod: %w", err)
			}
			if pod.Status.Phase == corev1.PodSucceeded {
				return nil
			}
			events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
			if err != nil || len(events.Items) == 0 {
				return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
			} else {
				return fmt.Errorf("pod is not ready: %v", events.Items[len(events.Items)-1].Message)
			}
		})
	}
}