package kubernetes

import (
	"context"
	"crypto/tls"
	"crypto/x509"
	"errors"
	"fmt"
	"io"
	"net"
	"net/http"
	_ "net/http/pprof"
	"net/url"
	"os"
	"strings"
	"testing"
	"time"

	"github.com/bazelbuild/rules_go/go/runfiles"
	"google.golang.org/protobuf/types/known/fieldmaskpb"
	corev1 "k8s.io/api/core/v1"
	kerrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	podv1 "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/utils/ptr"

	common "source.monogon.dev/metropolis/node"
	apb "source.monogon.dev/metropolis/proto/api"
	cpb "source.monogon.dev/metropolis/proto/common"
	mlaunch "source.monogon.dev/metropolis/test/launch"
	"source.monogon.dev/metropolis/test/localregistry"
	"source.monogon.dev/metropolis/test/util"
)

var (
	// These are filled by Bazel at link time with the canonical paths of
	// their corresponding files. Inside the init function we resolve them
	// with the rules_go runfiles package to their real paths.
	xTestImagesManifestPath string
)
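
// A rough sketch of how such a path gets injected at link time via x_defs
// (illustrative only; the actual BUILD target and attribute values differ):
//
//	go_test(
//	    name = "kubernetes_test",
//	    x_defs = {
//	        "xTestImagesManifestPath": "$(rlocationpath :testimages_manifest)",
//	    },
//	)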

func init() {
	var err error
	for _, path := range []*string{
		&xTestImagesManifestPath,
	} {
		*path, err = runfiles.Rlocation(*path)
		if err != nil {
			panic(err)
		}
	}
}

const (
	// Timeout for the global test context.
	//
	// Bazel would eventually time out the test after 900s ("large") if, for
	// some reason, the context cancellation fails to abort it.
	globalTestTimeout = 600 * time.Second

	// Timeouts for individual end-to-end tests of different sizes.
	smallTestTimeout = 60 * time.Second
	largeTestTimeout = 120 * time.Second
)

// TestE2EKubernetesLabels verifies that Kubernetes node labels are being
// updated when the cluster state changes.
func TestE2EKubernetesLabels(t *testing.T) {
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()

	clusterOptions := mlaunch.ClusterOptions{
		NumNodes: 2,
		InitialClusterConfiguration: &cpb.ClusterConfiguration{
			ClusterDomain:         "cluster.test",
			TpmMode:               cpb.ClusterConfiguration_TPM_MODE_DISABLED,
			StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
			Kubernetes: &cpb.ClusterConfiguration_Kubernetes{
				NodeLabelsToSynchronize: []*cpb.ClusterConfiguration_Kubernetes_NodeLabelsToSynchronize{
					{Regexp: `^test\.monogon\.dev/`},
				},
			},
		},
	}
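
	// With the rule above, any Metropolis node label matching
	// `^test\.monogon\.dev/` (e.g. "test.monogon.dev/foo") will be mirrored
	// to the corresponding Kubernetes node object; labels outside that
	// prefix stay Metropolis-only.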
	cluster, err := mlaunch.LaunchCluster(ctx, clusterOptions)
	if err != nil {
		t.Fatalf("LaunchCluster failed: %v", err)
	}
	defer func() {
		err := cluster.Close()
		if err != nil {
			t.Fatalf("cluster Close failed: %v", err)
		}
	}()

	con, err := cluster.CuratorClient()
	if err != nil {
		t.Fatalf("Could not get curator client: %v", err)
	}
	mgmt := apb.NewManagementClient(con)
	clientSet, err := cluster.GetKubeClientSet()
	if err != nil {
		t.Fatal(err)
	}

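	// getLabelsForNode returns the given node's Kubernetes labels, filtered
	// down to the node-role labels set by Metropolis and the test labels
	// synchronized from Metropolis node labels; unrelated kubelet-set labels
	// (hostname, OS, architecture, ...) are ignored.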
	getLabelsForNode := func(nid string) common.Labels {
		node, err := clientSet.CoreV1().Nodes().Get(ctx, nid, metav1.GetOptions{})
		if kerrors.IsNotFound(err) {
			return nil
		}
		if err != nil {
			t.Fatalf("Could not get node %s: %v", nid, err)
			return nil
		}
		return common.Labels(node.Labels).Filter(func(k, v string) bool {
			if strings.HasPrefix(k, "node-role.kubernetes.io/") {
				return true
			}
			if strings.HasPrefix(k, "test.monogon.dev/") {
				return true
			}
			return false
		})
	}

	// Nodes should have no labels at first.
	for _, nid := range cluster.NodeIDs {
		if labels := getLabelsForNode(nid); !labels.Equals(nil) {
			t.Errorf("Node %s should have no labels, has %s", nid, labels)
		}
	}
	// Nominate both nodes to be Kubernetes workers.
	for _, nid := range cluster.NodeIDs {
		yes := true
		_, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
			Node: &apb.UpdateNodeRolesRequest_Id{
				Id: nid,
			},
			KubernetesWorker: &yes,
		})
		if err != nil {
			t.Fatalf("Could not make %s a KubernetesWorker: %v", nid, err)
		}
	}

	util.MustTestEventual(t, "Labels added", ctx, smallTestTimeout, func(ctx context.Context) error {
		// Nodes should have role labels now.
		for _, nid := range cluster.NodeIDs {
			want := common.Labels{
				"node-role.kubernetes.io/KubernetesWorker": "",
			}
			if nid == cluster.NodeIDs[0] {
				want["node-role.kubernetes.io/KubernetesController"] = ""
				want["node-role.kubernetes.io/ConsensusMember"] = ""
			}
			if labels := getLabelsForNode(nid); !want.Equals(labels) {
				return fmt.Errorf("node %s should have labels %s, has %s", nid, want, labels)
			}
		}
		return nil
	})

	// Remove KubernetesWorker from the first node again. It will stay in
	// Kubernetes (arguably, this is a bug) but its role label should be
	// removed.
	no := false
	_, err = mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
		Node: &apb.UpdateNodeRolesRequest_Id{
			Id: cluster.NodeIDs[0],
		},
		KubernetesWorker: &no,
	})
	if err != nil {
		t.Fatalf("Could not remove KubernetesWorker from %s: %v", cluster.NodeIDs[0], err)
	}

	util.MustTestEventual(t, "Labels removed", ctx, smallTestTimeout, func(ctx context.Context) error {
		for _, nid := range cluster.NodeIDs {
			want := make(common.Labels)
			if nid == cluster.NodeIDs[0] {
				want["node-role.kubernetes.io/KubernetesController"] = ""
				want["node-role.kubernetes.io/ConsensusMember"] = ""
			} else {
				want["node-role.kubernetes.io/KubernetesWorker"] = ""
			}
			if labels := getLabelsForNode(nid); !want.Equals(labels) {
				return fmt.Errorf("node %s should have labels %s, has %s", nid, want, labels)
			}
		}
		return nil
	})

	// Add a Metropolis node label, ensure it gets reflected on the
	// Kubernetes node.
	_, err = mgmt.UpdateNodeLabels(ctx, &apb.UpdateNodeLabelsRequest{
		Node: &apb.UpdateNodeLabelsRequest_Id{
			Id: cluster.NodeIDs[1],
		},
		Upsert: []*apb.UpdateNodeLabelsRequest_Pair{
			{Key: "test.monogon.dev/foo", Value: "bar"},
		},
	})
	if err != nil {
		t.Fatalf("Could not add label to node: %v", err)
	}

	util.MustTestEventual(t, "Metropolis labels added", ctx, smallTestTimeout, func(ctx context.Context) error {
		want := common.Labels{
			"node-role.kubernetes.io/KubernetesWorker": "",
			"test.monogon.dev/foo":                     "bar",
		}
		if labels := getLabelsForNode(cluster.NodeIDs[1]); !want.Equals(labels) {
			return fmt.Errorf("node %s should have labels %s, has %s", cluster.NodeIDs[1], want, labels)
		}
		return nil
	})

	// Reconfigure the node label rules, removing all of them.
	_, err = mgmt.ConfigureCluster(ctx, &apb.ConfigureClusterRequest{
		BaseConfig: &cpb.ClusterConfiguration{
			Kubernetes: &cpb.ClusterConfiguration_Kubernetes{
				NodeLabelsToSynchronize: []*cpb.ClusterConfiguration_Kubernetes_NodeLabelsToSynchronize{
					{Regexp: `^test\.monogon\.dev/`},
				},
			},
		},
		NewConfig: &cpb.ClusterConfiguration{
			Kubernetes: &cpb.ClusterConfiguration_Kubernetes{},
		},
		UpdateMask: &fieldmaskpb.FieldMask{
			Paths: []string{"kubernetes.node_labels_to_synchronize"},
		},
	})
	if err != nil {
		t.Fatalf("Could not update cluster configuration: %v", err)
	}
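
	// Only the field selected by the UpdateMask was changed: NewConfig
	// clears the rule list, while BaseConfig carries the value the client
	// expects to be replacing (presumably a guard against concurrent
	// reconfiguration).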

	ci, err := mgmt.GetClusterInfo(ctx, &apb.GetClusterInfoRequest{})
	if err != nil {
		t.Fatalf("Could not get cluster info: %v", err)
	}
	// See if the config changed.
	if rules := ci.ClusterConfiguration.Kubernetes.NodeLabelsToSynchronize; len(rules) != 0 {
		t.Fatalf("Wanted 0 label rules in config after reconfiguration, have %d: %v", len(rules), rules)
	}
	// TODO: ensure new rules get applied, but that will require watching the
	// cluster config for changes in the labelmaker.
}

// TestE2EKubernetes exercises the Kubernetes functionality of Metropolis.
//
// The tests are performed against an in-memory cluster.
func TestE2EKubernetes(t *testing.T) {
	// Set a global timeout to make sure this terminates.
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()

	df, err := os.ReadFile(xTestImagesManifestPath)
	if err != nil {
		t.Fatalf("Reading registry manifest failed: %v", err)
	}
	lr, err := localregistry.FromBazelManifest(df)
	if err != nil {
		t.Fatalf("Creating test image registry failed: %v", err)
	}
266 // Launch cluster.
Tim Windelschmidt9f21f532024-05-07 15:14:20 +0200267 clusterOptions := mlaunch.ClusterOptions{
Serge Bazanski99b02142024-04-17 16:33:28 +0200268 NumNodes: 2,
269 LocalRegistry: lr,
Lorenz Brun732a8842024-08-26 23:25:37 +0200270 InitialClusterConfiguration: &cpb.ClusterConfiguration{
Jan Schär39f4f5c2024-10-29 09:41:50 +0100271 ClusterDomain: "cluster.test",
Lorenz Brun732a8842024-08-26 23:25:37 +0200272 TpmMode: cpb.ClusterConfiguration_TPM_MODE_DISABLED,
273 StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
274 },
Serge Bazanski99b02142024-04-17 16:33:28 +0200275 }
Tim Windelschmidt9f21f532024-05-07 15:14:20 +0200276 cluster, err := mlaunch.LaunchCluster(ctx, clusterOptions)
Serge Bazanski99b02142024-04-17 16:33:28 +0200277 if err != nil {
278 t.Fatalf("LaunchCluster failed: %v", err)
279 }
280 defer func() {
281 err := cluster.Close()
282 if err != nil {
283 t.Fatalf("cluster Close failed: %v", err)
284 }
285 }()
286
287 clientSet, err := cluster.GetKubeClientSet()
288 if err != nil {
289 t.Fatal(err)
290 }
	util.TestEventual(t, "Add KubernetesWorker roles", ctx, smallTestTimeout, func(ctx context.Context) error {
		// Make everything but the first node into KubernetesWorkers.
		for i := 1; i < clusterOptions.NumNodes; i++ {
			err := cluster.MakeKubernetesWorker(ctx, cluster.NodeIDs[i])
			if err != nil {
				return util.Permanent(fmt.Errorf("MakeKubernetesWorker: %w", err))
			}
		}
		return nil
	})
	util.TestEventual(t, "Node is registered and ready", ctx, largeTestTimeout, func(ctx context.Context) error {
		nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			return err
		}
		if len(nodes.Items) < 1 {
			return errors.New("node not yet registered")
		}
		node := nodes.Items[0]
		for _, cond := range node.Status.Conditions {
			if cond.Type != corev1.NodeReady {
				continue
			}
			if cond.Status != corev1.ConditionTrue {
				return fmt.Errorf("node not ready: %v", cond.Message)
			}
		}
		return nil
	})
	util.TestEventual(t, "Simple deployment", ctx, largeTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeTestDeploymentSpec("test-deploy-1"), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple deployment is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-1"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		}
		return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
	})
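	// The "gvisor" RuntimeClass runs the pod under gVisor's user-space
	// kernel (runsc) instead of the default runc runtime.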
	util.TestEventual(t, "Simple deployment with gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
		deployment := makeTestDeploymentSpec("test-deploy-2")
		gvisorStr := "gvisor"
		deployment.Spec.Template.Spec.RuntimeClassName = &gvisorStr
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, deployment, metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple deployment is running on gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-2"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		}
		var errorMsg strings.Builder
		for _, msg := range events.Items {
			errorMsg.WriteString(" | ")
			errorMsg.WriteString(msg.Message)
		}
		return fmt.Errorf("pod is not ready: %s", errorMsg.String())
	})
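	// The StatefulSet test image is expected to run its own checks and to
	// report the verdict as a final "[TESTS-PASSED]" or "[TESTS-FAILED]"
	// line on its log, which the polling below looks for.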
	for _, runtimeClass := range []string{"runc", "gvisor"} {
		statefulSetName := fmt.Sprintf("test-statefulset-%s", runtimeClass)
		util.TestEventual(t, fmt.Sprintf("StatefulSet with %s tests", runtimeClass), ctx, smallTestTimeout, func(ctx context.Context) error {
			_, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet(statefulSetName, runtimeClass), metav1.CreateOptions{})
			return err
		})
		util.TestEventual(t, fmt.Sprintf("StatefulSet with %s tests successful", runtimeClass), ctx, smallTestTimeout, func(ctx context.Context) error {
			res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: fmt.Sprintf("name=%s", statefulSetName)})
			if err != nil {
				return err
			}
			if len(res.Items) == 0 {
				return errors.New("pod didn't get created")
			}
			pod := res.Items[0]
			lines, err := getPodLogLines(ctx, clientSet, pod.Name, 50)
			if err != nil {
				return fmt.Errorf("could not get logs: %w", err)
			}
			if len(lines) > 0 {
				switch lines[len(lines)-1] {
				case "[TESTS-PASSED]":
					return nil
				case "[TESTS-FAILED]":
					return util.Permanent(fmt.Errorf("tests failed, log:\n %s", strings.Join(lines, "\n ")))
				}
			}
			return fmt.Errorf("pod is not ready: %v, log:\n %s", pod.Status.Phase, strings.Join(lines, "\n "))
		})
	}
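	// Setting HostUsers to false asks the kubelet to run the pod in a
	// separate user namespace; the test image is assumed to report readiness
	// on /ready_userns only when it actually finds itself in one.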
	util.TestEventual(t, "Deployment in user namespace", ctx, largeTestTimeout, func(ctx context.Context) error {
		deployment := makeTestDeploymentSpec("test-userns-1")
		deployment.Spec.Template.Spec.HostUsers = ptr.To(false)
		deployment.Spec.Template.Spec.Containers[0].ReadinessProbe.HTTPGet.Path = "/ready_userns"
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, deployment, metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Deployment in user namespace is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-userns-1"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		}
		return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
	})
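	// The self-test job (see makeSelftestSpec) runs its checks from inside
	// the cluster; its Job status decides the outcome, and on failure the
	// pod's last log line is surfaced below.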
	util.TestEventual(t, "In-cluster self-test job", ctx, smallTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.BatchV1().Jobs("default").Create(ctx, makeSelftestSpec("selftest"), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "In-cluster self-test job passed", ctx, smallTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.BatchV1().Jobs("default").Get(ctx, "selftest", metav1.GetOptions{})
		if err != nil {
			return err
		}
		if res.Status.Failed > 0 {
			pods, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{
				LabelSelector: "job-name=selftest",
			})
			if err != nil {
				return util.Permanent(fmt.Errorf("job failed but failed to find pod: %w", err))
			}
			if len(pods.Items) < 1 {
				return fmt.Errorf("job failed but pod does not exist")
			}
			lines, err := getPodLogLines(ctx, clientSet, pods.Items[0].Name, 1)
			if err != nil {
				return fmt.Errorf("job failed but could not get logs: %w", err)
			}
			if len(lines) > 0 {
				return util.Permanent(fmt.Errorf("job failed, last log line: %s", lines[0]))
			}
			return util.Permanent(fmt.Errorf("job failed, empty log"))
		}
		if res.Status.Succeeded > 0 {
			return nil
		}
		return fmt.Errorf("job still running")
	})
	util.TestEventual(t, "Start NodePort test setup", ctx, smallTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeHTTPServerDeploymentSpec("nodeport-server"), metav1.CreateOptions{})
		if err != nil && !kerrors.IsAlreadyExists(err) {
			return err
		}
		_, err = clientSet.CoreV1().Services("default").Create(ctx, makeHTTPServerNodePortService("nodeport-server"), metav1.CreateOptions{})
		if err != nil && !kerrors.IsAlreadyExists(err) {
			return err
		}
		return nil
	})
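	// The nodes are only reachable from inside the test network, so HTTP
	// requests to the NodePort are tunneled through the cluster's SOCKS
	// proxy.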
	util.TestEventual(t, "NodePort accessible from all nodes", ctx, smallTestTimeout, func(ctx context.Context) error {
		nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			return err
		}
		// Use a new client for each attempt.
		hc := http.Client{
			Timeout: 2 * time.Second,
			Transport: &http.Transport{
				Dial: cluster.SOCKSDialer.Dial,
			},
		}
		for _, n := range nodes.Items {
			var addr string
			for _, a := range n.Status.Addresses {
				if a.Type == corev1.NodeInternalIP {
					addr = a.Address
				}
			}
			u := url.URL{Scheme: "http", Host: addr, Path: "/"}
			res, err := hc.Get(u.String())
			if err != nil {
				return fmt.Errorf("failed getting from node %q: %w", n.Name, err)
			}
			if res.StatusCode != http.StatusOK {
				return fmt.Errorf("getting from node %q: HTTP %d", n.Name, res.StatusCode)
			}
			t.Logf("Got response from %q", n.Name)
		}
		return nil
	})
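	// The metrics endpoint requires cluster-authenticated TLS: the client
	// presents the cluster owner credentials and verifies the node against
	// the cluster CA.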
	util.TestEventual(t, "containerd metrics retrieved", ctx, smallTestTimeout, func(ctx context.Context) error {
		pool := x509.NewCertPool()
		pool.AddCert(cluster.CACertificate)
		cl := http.Client{
			Transport: &http.Transport{
				TLSClientConfig: &tls.Config{
					Certificates: []tls.Certificate{cluster.Owner},
					RootCAs:      pool,
				},
				DialContext: func(ctx context.Context, _, addr string) (net.Conn, error) {
					return cluster.DialNode(ctx, addr)
				},
			},
		}
		u := url.URL{
			Scheme: "https",
			Host:   net.JoinHostPort(cluster.NodeIDs[1], common.MetricsPort.PortString()),
			Path:   "/metrics/containerd",
		}
		res, err := cl.Get(u.String())
		if err != nil {
			return err
		}
		defer res.Body.Close()
		if res.StatusCode != 200 {
			return fmt.Errorf("status code %d", res.StatusCode)
		}

		body, err := io.ReadAll(res.Body)
		if err != nil {
			return err
		}
		needle := "containerd_build_info_total"
		if !strings.Contains(string(body), needle) {
			return util.Permanent(fmt.Errorf("could not find %q in returned response", needle))
		}
		return nil
	})
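	// The KVM/QEMU smoke test needs nested virtualization (a /dev/kvm device
	// inside the test VM), which not every environment provides, so it is
	// gated behind the HAVE_NESTED_KVM environment variable.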
	if os.Getenv("HAVE_NESTED_KVM") != "" {
		util.TestEventual(t, "Pod for KVM/QEMU smoke test", ctx, smallTestTimeout, func(ctx context.Context) error {
			runcRuntimeClass := "runc"
			_, err := clientSet.CoreV1().Pods("default").Create(ctx, &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name: "vm-smoketest",
				},
				Spec: corev1.PodSpec{
					Containers: []corev1.Container{{
						Name:            "vm-smoketest",
						ImagePullPolicy: corev1.PullNever,
						Image:           "test.monogon.internal/metropolis/vm/smoketest:smoketest_container",
						Resources: corev1.ResourceRequirements{
							Limits: corev1.ResourceList{
								"devices.monogon.dev/kvm": *resource.NewQuantity(1, ""),
							},
						},
					}},
					RuntimeClassName: &runcRuntimeClass,
					RestartPolicy:    corev1.RestartPolicyNever,
				},
			}, metav1.CreateOptions{})
			return err
		})
		util.TestEventual(t, "KVM/QEMU smoke test completion", ctx, smallTestTimeout, func(ctx context.Context) error {
			pod, err := clientSet.CoreV1().Pods("default").Get(ctx, "vm-smoketest", metav1.GetOptions{})
			if err != nil {
				return fmt.Errorf("failed to get pod: %w", err)
			}
			if pod.Status.Phase == corev1.PodSucceeded {
				return nil
			}
			events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
			if err != nil || len(events.Items) == 0 {
				return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
			}
			return fmt.Errorf("pod is not ready: %v", events.Items[len(events.Items)-1].Message)
		})
	}
}