package kubernetes

import (
	"context"
	"crypto/tls"
	"crypto/x509"
	"errors"
	"fmt"
	"io"
	"net"
	"net/http"
	_ "net/http/pprof"
	"net/url"
	"os"
	"strings"
	"testing"
	"time"

	"github.com/bazelbuild/rules_go/go/runfiles"
	"google.golang.org/protobuf/types/known/fieldmaskpb"
	corev1 "k8s.io/api/core/v1"
	kerrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	podv1 "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/utils/ptr"

	common "source.monogon.dev/metropolis/node"
	apb "source.monogon.dev/metropolis/proto/api"
	cpb "source.monogon.dev/metropolis/proto/common"
	mlaunch "source.monogon.dev/metropolis/test/launch"
	"source.monogon.dev/metropolis/test/localregistry"
	"source.monogon.dev/metropolis/test/util"
)

var (
	// These are filled by Bazel at link time with the canonical path of
	// their corresponding file. The init function then resolves them to
	// real paths with the rules_go runfiles package.
	xTestImagesManifestPath string
)

func init() {
	var err error
	for _, path := range []*string{
		&xTestImagesManifestPath,
	} {
		*path, err = runfiles.Rlocation(*path)
		if err != nil {
			panic(err)
		}
	}
}

const (
	// Timeout for the global test context.
	//
	// Bazel would eventually time out the test after 900s ("large") if, for
	// some reason, the context cancellation fails to abort it.
	globalTestTimeout = 600 * time.Second

	// Timeouts for individual end-to-end tests of different sizes.
	smallTestTimeout = 60 * time.Second
	largeTestTimeout = 120 * time.Second
)

// TestE2EKubernetesLabels verifies that Kubernetes node labels are being
// updated when the cluster state changes.
func TestE2EKubernetesLabels(t *testing.T) {
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()

	clusterOptions := mlaunch.ClusterOptions{
		NumNodes: 2,
		InitialClusterConfiguration: &cpb.ClusterConfiguration{
			ClusterDomain:         "cluster.test",
			TpmMode:               cpb.ClusterConfiguration_TPM_MODE_DISABLED,
			StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
			Kubernetes: &cpb.ClusterConfiguration_Kubernetes{
				NodeLabelsToSynchronize: []*cpb.ClusterConfiguration_Kubernetes_NodeLabelsToSynchronize{
					{Regexp: `^test\.monogon\.dev/`},
				},
			},
		},
	}
	cluster, err := mlaunch.LaunchCluster(ctx, clusterOptions)
	if err != nil {
		t.Fatalf("LaunchCluster failed: %v", err)
	}
	defer func() {
		err := cluster.Close()
		if err != nil {
			t.Fatalf("cluster Close failed: %v", err)
		}
	}()

	con, err := cluster.CuratorClient()
	if err != nil {
		t.Fatalf("Could not get curator client: %v", err)
	}
	mgmt := apb.NewManagementClient(con)
	clientSet, err := cluster.GetKubeClientSet()
	if err != nil {
		t.Fatal(err)
	}

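	// getLabelsForNode returns the labels of the given Kubernetes node,
	// filtered down to role and test labels so that unrelated built-in
	// labels don't affect comparisons. It returns nil if the node does not
	// exist.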
	getLabelsForNode := func(nid string) common.Labels {
		node, err := clientSet.CoreV1().Nodes().Get(ctx, nid, metav1.GetOptions{})
		if kerrors.IsNotFound(err) {
			return nil
		}
		if err != nil {
			t.Fatalf("Could not get node %s: %v", nid, err)
			return nil
		}
		return common.Labels(node.Labels).Filter(func(k, v string) bool {
			if strings.HasPrefix(k, "node-role.kubernetes.io/") {
				return true
			}
			if strings.HasPrefix(k, "test.monogon.dev/") {
				return true
			}
			return false
		})
	}

	// Nodes should have no labels at first.
	for _, nid := range cluster.NodeIDs {
		if labels := getLabelsForNode(nid); !labels.Equals(nil) {
			t.Errorf("Node %s should have no labels, has %s", nid, labels)
		}
	}
	// Nominate both nodes to be Kubernetes workers.
	for _, nid := range cluster.NodeIDs {
		_, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
			Node: &apb.UpdateNodeRolesRequest_Id{
				Id: nid,
			},
			KubernetesWorker: ptr.To(true),
		})
		if err != nil {
			t.Fatalf("Could not make %s a KubernetesWorker: %v", nid, err)
		}
	}

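	// The first node is the cluster's initial controller and consensus
	// member, so it is expected to carry the corresponding role labels on
	// top of the worker label.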
	util.MustTestEventual(t, "Labels added", ctx, smallTestTimeout, func(ctx context.Context) error {
		// Nodes should have role labels now.
		for _, nid := range cluster.NodeIDs {
			want := common.Labels{
				"node-role.kubernetes.io/KubernetesWorker": "",
			}
			if nid == cluster.NodeIDs[0] {
				want["node-role.kubernetes.io/KubernetesController"] = ""
				want["node-role.kubernetes.io/ConsensusMember"] = ""
			}
			if labels := getLabelsForNode(nid); !want.Equals(labels) {
				return fmt.Errorf("node %s should have labels %s, has %s", nid, want, labels)
			}
		}
		return nil
	})

	// Remove the KubernetesWorker role from the first node again. The node
	// will remain registered in Kubernetes (arguably, this is a bug), but
	// its role label should be removed.
	_, err = mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
		Node: &apb.UpdateNodeRolesRequest_Id{
			Id: cluster.NodeIDs[0],
		},
		KubernetesWorker: ptr.To(false),
	})
	if err != nil {
		t.Fatalf("Could not remove KubernetesWorker from %s: %v", cluster.NodeIDs[0], err)
	}

	util.MustTestEventual(t, "Labels removed", ctx, smallTestTimeout, func(ctx context.Context) error {
		for _, nid := range cluster.NodeIDs {
			want := make(common.Labels)
			if nid == cluster.NodeIDs[0] {
				want["node-role.kubernetes.io/KubernetesController"] = ""
				want["node-role.kubernetes.io/ConsensusMember"] = ""
			} else {
				want["node-role.kubernetes.io/KubernetesWorker"] = ""
			}
			if labels := getLabelsForNode(nid); !want.Equals(labels) {
				return fmt.Errorf("node %s should have labels %s, has %s", nid, want, labels)
			}
		}
		return nil
	})

	// Add a Metropolis node label and ensure it gets reflected on the
	// Kubernetes node.
	_, err = mgmt.UpdateNodeLabels(ctx, &apb.UpdateNodeLabelsRequest{
		Node: &apb.UpdateNodeLabelsRequest_Id{
			Id: cluster.NodeIDs[1],
		},
		Upsert: []*apb.UpdateNodeLabelsRequest_Pair{
			{Key: "test.monogon.dev/foo", Value: "bar"},
		},
	})
	if err != nil {
		t.Fatalf("Could not add label to node: %v", err)
	}

	util.MustTestEventual(t, "Metropolis labels added", ctx, smallTestTimeout, func(ctx context.Context) error {
		want := common.Labels{
			"node-role.kubernetes.io/KubernetesWorker": "",
			"test.monogon.dev/foo":                     "bar",
		}
		if labels := getLabelsForNode(cluster.NodeIDs[1]); !want.Equals(labels) {
			return fmt.Errorf("node %s should have labels %s, has %s", cluster.NodeIDs[1], want, labels)
		}
		return nil
	})

	// Reconfigure node label rules: drop them all.
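	// ConfigureCluster applies the fields selected by UpdateMask from
	// NewConfig; BaseConfig carries the configuration the caller expects to
	// be current, guarding against concurrent modification.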
	_, err = mgmt.ConfigureCluster(ctx, &apb.ConfigureClusterRequest{
		BaseConfig: &cpb.ClusterConfiguration{
			Kubernetes: &cpb.ClusterConfiguration_Kubernetes{
				NodeLabelsToSynchronize: []*cpb.ClusterConfiguration_Kubernetes_NodeLabelsToSynchronize{
					{Regexp: `^test\.monogon\.dev/`},
				},
			},
		},
		NewConfig: &cpb.ClusterConfiguration{
			Kubernetes: &cpb.ClusterConfiguration_Kubernetes{},
		},
		UpdateMask: &fieldmaskpb.FieldMask{
			Paths: []string{"kubernetes.node_labels_to_synchronize"},
		},
	})
	if err != nil {
		t.Fatalf("Could not update cluster configuration: %v", err)
	}

	ci, err := mgmt.GetClusterInfo(ctx, &apb.GetClusterInfoRequest{})
	if err != nil {
		t.Fatalf("Could not get cluster info: %v", err)
	}
	// See if the config changed.
	if rules := ci.ClusterConfiguration.Kubernetes.NodeLabelsToSynchronize; len(rules) != 0 {
		t.Fatalf("Wanted 0 label rules in config after reconfiguration, have %d: %v", len(rules), rules)
	}
	// TODO: ensure new rules get applied, but that will require watching the
	// cluster config for changes in the labelmaker.
}

// TestE2EKubernetes exercises the Kubernetes functionality of Metropolis.
//
// The tests are performed against an in-memory cluster.
func TestE2EKubernetes(t *testing.T) {
	// Set a global timeout to make sure this test terminates.
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()

	df, err := os.ReadFile(xTestImagesManifestPath)
	if err != nil {
		t.Fatalf("Reading registry manifest failed: %v", err)
	}
	lr, err := localregistry.FromBazelManifest(df)
	if err != nil {
		t.Fatalf("Creating test image registry failed: %v", err)
	}

	// Launch cluster.
	clusterOptions := mlaunch.ClusterOptions{
		NumNodes:      2,
		LocalRegistry: lr,
		InitialClusterConfiguration: &cpb.ClusterConfiguration{
			ClusterDomain:         "cluster.test",
			TpmMode:               cpb.ClusterConfiguration_TPM_MODE_DISABLED,
			StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
		},
	}
	cluster, err := mlaunch.LaunchCluster(ctx, clusterOptions)
	if err != nil {
		t.Fatalf("LaunchCluster failed: %v", err)
	}
	defer func() {
		err := cluster.Close()
		if err != nil {
			t.Fatalf("cluster Close failed: %v", err)
		}
	}()

	clientSet, err := cluster.GetKubeClientSet()
	if err != nil {
		t.Fatal(err)
	}
	util.TestEventual(t, "Add KubernetesWorker roles", ctx, smallTestTimeout, func(ctx context.Context) error {
		// Make everything but the first node into KubernetesWorkers.
		for i := 1; i < clusterOptions.NumNodes; i++ {
			err := cluster.MakeKubernetesWorker(ctx, cluster.NodeIDs[i])
			if err != nil {
				return util.Permanent(fmt.Errorf("MakeKubernetesWorker: %w", err))
			}
		}
		return nil
	})
	util.TestEventual(t, "Node is registered and ready", ctx, largeTestTimeout, func(ctx context.Context) error {
		nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			return err
		}
		if len(nodes.Items) < 1 {
			return errors.New("node not yet registered")
		}
		node := nodes.Items[0]
		for _, cond := range node.Status.Conditions {
			if cond.Type != corev1.NodeReady {
				continue
			}
			if cond.Status != corev1.ConditionTrue {
				return fmt.Errorf("node not ready: %v", cond.Message)
			}
		}
		return nil
	})
	util.TestEventual(t, "Simple deployment", ctx, largeTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeTestDeploymentSpec("test-deploy-1"), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple deployment is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-1"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
		}
	})
	util.TestEventual(t, "Simple deployment with gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
		deployment := makeTestDeploymentSpec("test-deploy-2")
		gvisorStr := "gvisor"
		deployment.Spec.Template.Spec.RuntimeClassName = &gvisorStr
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, deployment, metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple deployment is running on gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-2"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			var errorMsg strings.Builder
			for _, msg := range events.Items {
				errorMsg.WriteString(" | ")
				errorMsg.WriteString(msg.Message)
			}
			return fmt.Errorf("pod is not ready: %s", errorMsg.String())
		}
	})
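	// Run the StatefulSet test workload under both the runc and gvisor
	// runtimes. The workload reports its result through its log output,
	// ending in either [TESTS-PASSED] or [TESTS-FAILED].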
	for _, runtimeClass := range []string{"runc", "gvisor"} {
		statefulSetName := fmt.Sprintf("test-statefulset-%s", runtimeClass)
		util.TestEventual(t, fmt.Sprintf("StatefulSet with %s tests", runtimeClass), ctx, smallTestTimeout, func(ctx context.Context) error {
			_, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet(statefulSetName, runtimeClass), metav1.CreateOptions{})
			return err
		})
		util.TestEventual(t, fmt.Sprintf("StatefulSet with %s tests successful", runtimeClass), ctx, smallTestTimeout, func(ctx context.Context) error {
			res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: fmt.Sprintf("name=%s", statefulSetName)})
			if err != nil {
				return err
			}
			if len(res.Items) == 0 {
				return errors.New("pod didn't get created")
			}
			pod := res.Items[0]
			lines, err := getPodLogLines(ctx, clientSet, pod.Name, 50)
			if err != nil {
				return fmt.Errorf("could not get logs: %w", err)
			}
			if len(lines) > 0 {
				switch lines[len(lines)-1] {
				case "[TESTS-PASSED]":
					return nil
				case "[TESTS-FAILED]":
					return util.Permanent(fmt.Errorf("tests failed, log:\n %s", strings.Join(lines, "\n ")))
				}
			}
			return fmt.Errorf("pod is not ready: %v, log:\n %s", pod.Status.Phase, strings.Join(lines, "\n "))
		})
	}
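	// Run a deployment with host user namespaces disabled, i.e. inside a
	// user namespace. The readiness probe is switched to /ready_userns so
	// that readiness additionally verifies user-namespace-specific behavior
	// in the test server.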
	util.TestEventual(t, "Deployment in user namespace", ctx, largeTestTimeout, func(ctx context.Context) error {
		deployment := makeTestDeploymentSpec("test-userns-1")
		deployment.Spec.Template.Spec.HostUsers = ptr.To(false)
		deployment.Spec.Template.Spec.Containers[0].ReadinessProbe.HTTPGet.Path = "/ready_userns"
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, deployment, metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Deployment in user namespace is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-userns-1"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
		}
	})
	util.TestEventual(t, "In-cluster self-test job", ctx, smallTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.BatchV1().Jobs("default").Create(ctx, makeSelftestSpec("selftest"), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "In-cluster self-test job passed", ctx, smallTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.BatchV1().Jobs("default").Get(ctx, "selftest", metav1.GetOptions{})
		if err != nil {
			return err
		}
		if res.Status.Failed > 0 {
			pods, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{
				LabelSelector: "job-name=selftest",
			})
			if err != nil {
				return util.Permanent(fmt.Errorf("job failed but failed to find pod: %w", err))
			}
			if len(pods.Items) < 1 {
				return fmt.Errorf("job failed but pod does not exist")
			}
			lines, err := getPodLogLines(ctx, clientSet, pods.Items[0].Name, 1)
			if err != nil {
				return fmt.Errorf("job failed but could not get logs: %w", err)
			}
			if len(lines) > 0 {
				return util.Permanent(fmt.Errorf("job failed, last log line: %s", lines[0]))
			}
			return util.Permanent(fmt.Errorf("job failed, empty log"))
		}
		if res.Status.Succeeded > 0 {
			return nil
		}
		return fmt.Errorf("job still running")
	})
	util.TestEventual(t, "Start NodePort test setup", ctx, smallTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeHTTPServerDeploymentSpec("nodeport-server"), metav1.CreateOptions{})
		if err != nil && !kerrors.IsAlreadyExists(err) {
			return err
		}
		_, err = clientSet.CoreV1().Services("default").Create(ctx, makeHTTPServerNodePortService("nodeport-server"), metav1.CreateOptions{})
		if err != nil && !kerrors.IsAlreadyExists(err) {
			return err
		}
		return nil
	})
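	// The nodes' IPs are only reachable on the test cluster's network, so
	// HTTP requests are dialed through the launch harness' SOCKS proxy.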
	util.TestEventual(t, "NodePort accessible from all nodes", ctx, smallTestTimeout, func(ctx context.Context) error {
		nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			return err
		}
		// Use a new client for each attempt.
		hc := http.Client{
			Timeout: 2 * time.Second,
			Transport: &http.Transport{
				Dial: cluster.SOCKSDialer.Dial,
			},
		}
		for _, n := range nodes.Items {
			var addr string
			for _, a := range n.Status.Addresses {
				if a.Type == corev1.NodeInternalIP {
					addr = a.Address
				}
			}
			u := url.URL{Scheme: "http", Host: addr, Path: "/"}
			res, err := hc.Get(u.String())
			if err != nil {
				return fmt.Errorf("failed getting from node %q: %w", n.Name, err)
			}
			if res.StatusCode != http.StatusOK {
				return fmt.Errorf("getting from node %q: HTTP %d", n.Name, res.StatusCode)
			}
			t.Logf("Got response from %q", n.Name)
		}
		return nil
	})
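	// The node metrics endpoint requires mutual TLS: the client
	// authenticates with the cluster owner certificate and verifies the node
	// against the cluster CA, dialing the node through the cluster harness.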
	util.TestEventual(t, "containerd metrics retrieved", ctx, smallTestTimeout, func(ctx context.Context) error {
		pool := x509.NewCertPool()
		pool.AddCert(cluster.CACertificate)
		cl := http.Client{
			Transport: &http.Transport{
				TLSClientConfig: &tls.Config{
					Certificates: []tls.Certificate{cluster.Owner},
					RootCAs:      pool,
				},
				DialContext: func(ctx context.Context, _, addr string) (net.Conn, error) {
					return cluster.DialNode(ctx, addr)
				},
			},
		}
		u := url.URL{
			Scheme: "https",
			Host:   net.JoinHostPort(cluster.NodeIDs[1], common.MetricsPort.PortString()),
			Path:   "/metrics/containerd",
		}
		res, err := cl.Get(u.String())
		if err != nil {
			return err
		}
		defer res.Body.Close()
		if res.StatusCode != 200 {
			return fmt.Errorf("status code %d", res.StatusCode)
		}

		body, err := io.ReadAll(res.Body)
		if err != nil {
			return err
		}
		needle := "containerd_build_info_total"
		if !strings.Contains(string(body), needle) {
			return util.Permanent(fmt.Errorf("could not find %q in returned response", needle))
		}
		return nil
	})
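	// The VM smoke test boots a QEMU guest inside a pod and thus needs
	// working nested virtualization; it only runs when the environment
	// advertises it via HAVE_NESTED_KVM.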
	if os.Getenv("HAVE_NESTED_KVM") != "" {
		util.TestEventual(t, "Pod for KVM/QEMU smoke test", ctx, smallTestTimeout, func(ctx context.Context) error {
			runcRuntimeClass := "runc"
			_, err := clientSet.CoreV1().Pods("default").Create(ctx, &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name: "vm-smoketest",
				},
				Spec: corev1.PodSpec{
					Containers: []corev1.Container{{
						Name:            "vm-smoketest",
						ImagePullPolicy: corev1.PullNever,
						Image:           "test.monogon.internal/metropolis/vm/smoketest:smoketest_container",
						Resources: corev1.ResourceRequirements{
							Limits: corev1.ResourceList{
								"devices.monogon.dev/kvm": *resource.NewQuantity(1, ""),
							},
						},
					}},
					RuntimeClassName: &runcRuntimeClass,
					RestartPolicy:    corev1.RestartPolicyNever,
				},
			}, metav1.CreateOptions{})
			return err
		})
		util.TestEventual(t, "KVM/QEMU smoke test completion", ctx, smallTestTimeout, func(ctx context.Context) error {
			pod, err := clientSet.CoreV1().Pods("default").Get(ctx, "vm-smoketest", metav1.GetOptions{})
			if err != nil {
				return fmt.Errorf("failed to get pod: %w", err)
			}
			if pod.Status.Phase == corev1.PodSucceeded {
				return nil
			}
			events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
			if err != nil || len(events.Items) == 0 {
				return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
			} else {
				return fmt.Errorf("pod is not ready: %v", events.Items[len(events.Items)-1].Message)
			}
		})
	}
}