// Copyright The Monogon Project Authors.
// SPDX-License-Identifier: Apache-2.0

package kubernetes

import (
	"context"
	"crypto/tls"
	"crypto/x509"
	"errors"
	"fmt"
	"io"
	"net"
	"net/http"
15 _ "net/http/pprof"
16 "net/url"
17 "os"
18 "strings"
19 "testing"
20 "time"
21
22 "github.com/bazelbuild/rules_go/go/runfiles"
Serge Bazanski1e399142024-10-22 10:58:15 +000023 "google.golang.org/protobuf/types/known/fieldmaskpb"
Serge Bazanski99b02142024-04-17 16:33:28 +020024 corev1 "k8s.io/api/core/v1"
25 kerrors "k8s.io/apimachinery/pkg/api/errors"
Serge Bazanski99b02142024-04-17 16:33:28 +020026 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27 podv1 "k8s.io/kubernetes/pkg/api/v1/pod"
Lorenz Brun2ecccae2024-11-27 22:03:35 +010028 "k8s.io/utils/ptr"
Serge Bazanski99b02142024-04-17 16:33:28 +020029
Lorenz Brun732a8842024-08-26 23:25:37 +020030 common "source.monogon.dev/metropolis/node"
Serge Bazanski6d1ff362024-09-30 15:15:31 +000031 apb "source.monogon.dev/metropolis/proto/api"
Lorenz Brun732a8842024-08-26 23:25:37 +020032 cpb "source.monogon.dev/metropolis/proto/common"
Lorenz Brunde57e6f2025-01-08 16:34:08 +000033 "source.monogon.dev/metropolis/test/e2e/connectivity"
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020034 mlaunch "source.monogon.dev/metropolis/test/launch"
35 "source.monogon.dev/metropolis/test/localregistry"
Serge Bazanski99b02142024-04-17 16:33:28 +020036 "source.monogon.dev/metropolis/test/util"
Serge Bazanski99b02142024-04-17 16:33:28 +020037)
38
var (
	// These are filled in by Bazel at link time with the canonical paths of
	// their corresponding files. Inside the init function they are resolved
	// to real filesystem paths with the rules_go runfiles package.
	xTestImagesManifestPath string
)

func init() {
	var err error
	for _, path := range []*string{
		&xTestImagesManifestPath,
	} {
		*path, err = runfiles.Rlocation(*path)
		if err != nil {
			panic(err)
		}
	}
}

const (
	// Timeout for the global test context.
	//
	// Bazel would eventually time out the test after 900s ("large") if, for
	// some reason, the context cancellation fails to abort it.
	globalTestTimeout = 600 * time.Second

	// Timeouts for individual end-to-end tests of different sizes.
	smallTestTimeout = 60 * time.Second
	largeTestTimeout = 120 * time.Second
)

// TestE2EKubernetesLabels verifies that Kubernetes node labels are being
// updated when the cluster state changes.
func TestE2EKubernetesLabels(t *testing.T) {
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()

	clusterOptions := mlaunch.ClusterOptions{
		NumNodes: 2,
		InitialClusterConfiguration: &cpb.ClusterConfiguration{
			ClusterDomain:         "cluster.test",
			TpmMode:               cpb.ClusterConfiguration_TPM_MODE_DISABLED,
			StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
			Kubernetes: &cpb.ClusterConfiguration_Kubernetes{
				NodeLabelsToSynchronize: []*cpb.ClusterConfiguration_Kubernetes_NodeLabelsToSynchronize{
					{Regexp: `^test\.monogon\.dev/`},
				},
			},
		},
	}
	cluster, err := mlaunch.LaunchCluster(ctx, clusterOptions)
	if err != nil {
		t.Fatalf("LaunchCluster failed: %v", err)
	}
	defer func() {
		err := cluster.Close()
		if err != nil {
			t.Fatalf("cluster Close failed: %v", err)
		}
	}()

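	// The test drives the cluster through the Metropolis management API and
	// observes the results through the Kubernetes API.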
	con, err := cluster.CuratorClient()
	if err != nil {
		t.Fatalf("Could not get curator client: %v", err)
	}
	mgmt := apb.NewManagementClient(con)
	clientSet, _, err := cluster.GetKubeClientSet()
	if err != nil {
		t.Fatal(err)
	}

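	// getLabelsForNode returns the test-relevant labels of a Kubernetes node,
	// or nil if the node does not exist.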
	getLabelsForNode := func(nid string) common.Labels {
		node, err := clientSet.CoreV1().Nodes().Get(ctx, nid, metav1.GetOptions{})
		if kerrors.IsNotFound(err) {
			return nil
		}
		if err != nil {
			t.Fatalf("Could not get node %s: %v", nid, err)
			return nil
		}
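		// Only node-role labels and labels synchronized from Metropolis
		// matter for this test; everything else is filtered out.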
		return common.Labels(node.Labels).Filter(func(k, v string) bool {
			if strings.HasPrefix(k, "node-role.kubernetes.io/") {
				return true
			}
			if strings.HasPrefix(k, "test.monogon.dev/") {
				return true
			}
			return false
		})
	}

	// Nodes should have no labels at first.
	for _, nid := range cluster.NodeIDs {
		if labels := getLabelsForNode(nid); !labels.Equals(nil) {
			t.Errorf("Node %s should have no labels, has %s", nid, labels)
		}
	}
	// Nominate both nodes to be Kubernetes workers.
	for _, nid := range cluster.NodeIDs {
		_, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
			Node: &apb.UpdateNodeRolesRequest_Id{
				Id: nid,
			},
			KubernetesWorker: ptr.To(true),
		})
		if err != nil {
			t.Fatalf("Could not make %s a KubernetesWorker: %v", nid, err)
		}
	}

	util.MustTestEventual(t, "Labels added", ctx, smallTestTimeout, func(ctx context.Context) error {
		// Nodes should have role labels now.
		for _, nid := range cluster.NodeIDs {
			want := common.Labels{
				"node-role.kubernetes.io/KubernetesWorker": "",
			}
			if nid == cluster.NodeIDs[0] {
				want["node-role.kubernetes.io/KubernetesController"] = ""
				want["node-role.kubernetes.io/ConsensusMember"] = ""
			}
			if labels := getLabelsForNode(nid); !want.Equals(labels) {
				return fmt.Errorf("node %s should have labels %s, has %s", nid, want, labels)
			}
		}
		return nil
	})

	// Remove KubernetesWorker from the first node again. It will stay in
	// Kubernetes (arguably, this is a bug) but its role label should be
	// removed.
	_, err = mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
		Node: &apb.UpdateNodeRolesRequest_Id{
			Id: cluster.NodeIDs[0],
		},
		KubernetesWorker: ptr.To(false),
	})
	if err != nil {
		t.Fatalf("Could not remove KubernetesWorker from %s: %v", cluster.NodeIDs[0], err)
	}

	util.MustTestEventual(t, "Labels removed", ctx, smallTestTimeout, func(ctx context.Context) error {
		for _, nid := range cluster.NodeIDs {
			want := make(common.Labels)
			if nid == cluster.NodeIDs[0] {
				want["node-role.kubernetes.io/KubernetesController"] = ""
				want["node-role.kubernetes.io/ConsensusMember"] = ""
			} else {
				want["node-role.kubernetes.io/KubernetesWorker"] = ""
			}
			if labels := getLabelsForNode(nid); !want.Equals(labels) {
				return fmt.Errorf("node %s should have labels %s, has %s", nid, want, labels)
			}
		}
		return nil
	})

	// Add Metropolis node label, ensure it gets reflected on the Kubernetes node.
	_, err = mgmt.UpdateNodeLabels(ctx, &apb.UpdateNodeLabelsRequest{
		Node: &apb.UpdateNodeLabelsRequest_Id{
			Id: cluster.NodeIDs[1],
		},
		Upsert: []*apb.UpdateNodeLabelsRequest_Pair{
			{Key: "test.monogon.dev/foo", Value: "bar"},
		},
	})
	if err != nil {
		t.Fatalf("Could not add label to node: %v", err)
	}

	util.MustTestEventual(t, "Metropolis labels added", ctx, smallTestTimeout, func(ctx context.Context) error {
		want := common.Labels{
			"node-role.kubernetes.io/KubernetesWorker": "",
			"test.monogon.dev/foo":                     "bar",
		}
		if labels := getLabelsForNode(cluster.NodeIDs[1]); !want.Equals(labels) {
			return fmt.Errorf("node %s should have labels %s, has %s", cluster.NodeIDs[1], want, labels)
		}
		return nil
	})

	// Reconfigure node label rules.
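	// Only the field selected by the update mask
	// (kubernetes.node_labels_to_synchronize) is replaced, here with an empty
	// rule list.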
	_, err = mgmt.ConfigureCluster(ctx, &apb.ConfigureClusterRequest{
		BaseConfig: &cpb.ClusterConfiguration{
			Kubernetes: &cpb.ClusterConfiguration_Kubernetes{
				NodeLabelsToSynchronize: []*cpb.ClusterConfiguration_Kubernetes_NodeLabelsToSynchronize{
					{Regexp: `^test\.monogon\.dev/`},
				},
			},
		},
		NewConfig: &cpb.ClusterConfiguration{
			Kubernetes: &cpb.ClusterConfiguration_Kubernetes{},
		},
		UpdateMask: &fieldmaskpb.FieldMask{
			Paths: []string{"kubernetes.node_labels_to_synchronize"},
		},
	})
	if err != nil {
		t.Fatalf("Could not update cluster configuration: %v", err)
	}

	ci, err := mgmt.GetClusterInfo(ctx, &apb.GetClusterInfoRequest{})
	if err != nil {
		t.Fatalf("Could not get cluster info: %v", err)
	}
	// See if the config changed.
	if rules := ci.ClusterConfiguration.Kubernetes.NodeLabelsToSynchronize; len(rules) != 0 {
		t.Fatalf("Wanted 0 label rules in config after reconfiguration, have %d: %v", len(rules), rules)
	}
	// TODO: ensure new rules get applied, but that will require watching the
	// cluster config for changes in the labelmaker.
}

// TestE2EKubernetes exercises the Kubernetes functionality of Metropolis.
//
// The tests are performed against an in-memory cluster.
func TestE2EKubernetes(t *testing.T) {
	// Set a global timeout to make sure this terminates.
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()

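	// Load the Bazel-built test image manifest; the images are served to the
	// cluster from a local registry.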
	df, err := os.ReadFile(xTestImagesManifestPath)
	if err != nil {
		t.Fatalf("Reading registry manifest failed: %v", err)
	}
	lr, err := localregistry.FromBazelManifest(df)
	if err != nil {
		t.Fatalf("Creating test image registry failed: %v", err)
	}

	// Launch cluster.
	clusterOptions := mlaunch.ClusterOptions{
		NumNodes:      2,
		LocalRegistry: lr,
		InitialClusterConfiguration: &cpb.ClusterConfiguration{
			ClusterDomain:         "cluster.test",
			TpmMode:               cpb.ClusterConfiguration_TPM_MODE_DISABLED,
			StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
		},
	}
	cluster, err := mlaunch.LaunchCluster(ctx, clusterOptions)
	if err != nil {
		t.Fatalf("LaunchCluster failed: %v", err)
	}
	defer func() {
		err := cluster.Close()
		if err != nil {
			t.Fatalf("cluster Close failed: %v", err)
		}
	}()

	clientSet, restConfig, err := cluster.GetKubeClientSet()
	if err != nil {
		t.Fatal(err)
	}
	util.TestEventual(t, "Add KubernetesWorker roles", ctx, smallTestTimeout, func(ctx context.Context) error {
		// Make everything but the first node into KubernetesWorkers.
		for i := 1; i < clusterOptions.NumNodes; i++ {
			err := cluster.MakeKubernetesWorker(ctx, cluster.NodeIDs[i])
			if err != nil {
				return util.Permanent(fmt.Errorf("MakeKubernetesWorker: %w", err))
			}
		}
		return nil
	})
	util.TestEventual(t, "Node is registered and ready", ctx, largeTestTimeout, func(ctx context.Context) error {
		nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			return err
		}
		if len(nodes.Items) < 1 {
			return errors.New("node not yet registered")
		}
		node := nodes.Items[0]
		for _, cond := range node.Status.Conditions {
			if cond.Type != corev1.NodeReady {
				continue
			}
			if cond.Status != corev1.ConditionTrue {
				return fmt.Errorf("node not ready: %v", cond.Message)
			}
		}
		return nil
	})
	util.TestEventual(t, "Simple deployment", ctx, largeTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeTestDeploymentSpec("test-deploy-1"), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple deployment is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-1"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		}
		return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
	})
	util.TestEventual(t, "Simple deployment with gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
		deployment := makeTestDeploymentSpec("test-deploy-2")
		gvisorStr := "gvisor"
		deployment.Spec.Template.Spec.RuntimeClassName = &gvisorStr
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, deployment, metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple deployment is running on gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-2"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		}
		var errorMsg strings.Builder
		for _, msg := range events.Items {
			errorMsg.WriteString(" | ")
			errorMsg.WriteString(msg.Message)
		}
		return fmt.Errorf("pod is not ready: %s", errorMsg.String())
	})
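	// Exercise pod-to-pod networking. The topology spread constraint below
	// forces the two pods onto different nodes, so this also covers the
	// inter-node path.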
	t.Run("Connectivity Smoke Tests", func(t *testing.T) {
		ct := connectivity.SetupTest(t, &connectivity.TestSpec{
			Name:       "connectivity-smoke",
			ClientSet:  clientSet,
			RESTConfig: restConfig,
			NumPods:    2,
			ExtraPodConfig: func(i int, pod *corev1.Pod) {
				// Spread pods out over nodes to test inter-node network.
				pod.Labels = make(map[string]string)
				pod.Labels["name"] = "connectivity-smoketest"
				pod.Spec.TopologySpreadConstraints = []corev1.TopologySpreadConstraint{{
					MaxSkew:           1,
					TopologyKey:       "kubernetes.io/hostname",
					WhenUnsatisfiable: corev1.DoNotSchedule,
					LabelSelector:     metav1.SetAsLabelSelector(pod.Labels),
				}}
			},
		})
		ct.TestPodConnectivity(t, 0, 1, 1234, connectivity.ExpectedSuccess)
	})
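	// Run the StatefulSet test workload under both runtime classes. The
	// workload reports its result through its log output.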
	for _, runtimeClass := range []string{"runc", "gvisor"} {
		statefulSetName := fmt.Sprintf("test-statefulset-%s", runtimeClass)
		util.TestEventual(t, fmt.Sprintf("StatefulSet with %s tests", runtimeClass), ctx, smallTestTimeout, func(ctx context.Context) error {
			_, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet(statefulSetName, runtimeClass), metav1.CreateOptions{})
			return err
		})
		util.TestEventual(t, fmt.Sprintf("StatefulSet with %s tests successful", runtimeClass), ctx, smallTestTimeout, func(ctx context.Context) error {
			res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: fmt.Sprintf("name=%s", statefulSetName)})
			if err != nil {
				return err
			}
			if len(res.Items) == 0 {
				return errors.New("pod didn't get created")
			}
			pod := res.Items[0]
			lines, err := getPodLogLines(ctx, clientSet, pod.Name, 50)
			if err != nil {
				return fmt.Errorf("could not get logs: %w", err)
			}
			if len(lines) > 0 {
				switch lines[len(lines)-1] {
				case "[TESTS-PASSED]":
					return nil
				case "[TESTS-FAILED]":
					return util.Permanent(fmt.Errorf("tests failed, log:\n %s", strings.Join(lines, "\n ")))
				}
			}
			return fmt.Errorf("pod is not ready: %v, log:\n %s", pod.Status.Phase, strings.Join(lines, "\n "))
		})
	}
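	// HostUsers=false runs the pod inside a user namespace; the test image is
	// expected to verify the namespace setup behind its /ready_userns probe.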
	util.TestEventual(t, "Deployment in user namespace", ctx, largeTestTimeout, func(ctx context.Context) error {
		deployment := makeTestDeploymentSpec("test-userns-1")
		deployment.Spec.Template.Spec.HostUsers = ptr.To(false)
		deployment.Spec.Template.Spec.Containers[0].ReadinessProbe.HTTPGet.Path = "/ready_userns"
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, deployment, metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Deployment in user namespace is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-userns-1"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		}
		return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
	})
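	// Run the in-cluster self-test job and wait for it to finish, surfacing
	// the last pod log line on failure.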
	util.TestEventual(t, "In-cluster self-test job", ctx, smallTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.BatchV1().Jobs("default").Create(ctx, makeSelftestSpec("selftest"), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "In-cluster self-test job passed", ctx, smallTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.BatchV1().Jobs("default").Get(ctx, "selftest", metav1.GetOptions{})
		if err != nil {
			return err
		}
		if res.Status.Failed > 0 {
			pods, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{
				LabelSelector: "job-name=selftest",
			})
			if err != nil {
				return util.Permanent(fmt.Errorf("job failed but failed to find pod: %w", err))
			}
			if len(pods.Items) < 1 {
				return fmt.Errorf("job failed but pod does not exist")
			}
			lines, err := getPodLogLines(ctx, clientSet, pods.Items[0].Name, 1)
			if err != nil {
				return fmt.Errorf("job failed but could not get logs: %w", err)
			}
			if len(lines) > 0 {
				return util.Permanent(fmt.Errorf("job failed, last log line: %s", lines[0]))
			}
			return util.Permanent(fmt.Errorf("job failed, empty log"))
		}
		if res.Status.Succeeded > 0 {
			return nil
		}
		return fmt.Errorf("job still running")
	})
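	// Expose an HTTP server through a NodePort service, then verify that the
	// port answers on every node's internal IP.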
	util.TestEventual(t, "Start NodePort test setup", ctx, smallTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeHTTPServerDeploymentSpec("nodeport-server"), metav1.CreateOptions{})
		if err != nil && !kerrors.IsAlreadyExists(err) {
			return err
		}
		_, err = clientSet.CoreV1().Services("default").Create(ctx, makeHTTPServerNodePortService("nodeport-server"), metav1.CreateOptions{})
		if err != nil && !kerrors.IsAlreadyExists(err) {
			return err
		}
		return nil
	})
	util.TestEventual(t, "NodePort accessible from all nodes", ctx, smallTestTimeout, func(ctx context.Context) error {
		nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			return err
		}
		// Use a new client for each attempt.
		hc := http.Client{
			Timeout: 2 * time.Second,
			Transport: &http.Transport{
				Dial: cluster.SOCKSDialer.Dial,
			},
		}
		for _, n := range nodes.Items {
			var addr string
			for _, a := range n.Status.Addresses {
				if a.Type == corev1.NodeInternalIP {
					addr = a.Address
				}
			}
			u := url.URL{Scheme: "http", Host: addr, Path: "/"}
			res, err := hc.Get(u.String())
			if err != nil {
				return fmt.Errorf("failed getting from node %q: %w", n.Name, err)
			}
			if res.StatusCode != http.StatusOK {
				return fmt.Errorf("getting from node %q: HTTP %d", n.Name, res.StatusCode)
			}
			t.Logf("Got response from %q", n.Name)
		}
		return nil
	})
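	// Scrape the containerd metrics endpoint over TLS, authenticating with
	// the cluster owner credentials and dialing the node by its ID.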
	util.TestEventual(t, "containerd metrics retrieved", ctx, smallTestTimeout, func(ctx context.Context) error {
		pool := x509.NewCertPool()
		pool.AddCert(cluster.CACertificate)
		cl := http.Client{
			Transport: &http.Transport{
				TLSClientConfig: &tls.Config{
					Certificates: []tls.Certificate{cluster.Owner},
					RootCAs:      pool,
				},
				DialContext: func(ctx context.Context, _, addr string) (net.Conn, error) {
					return cluster.DialNode(ctx, addr)
				},
			},
		}
		u := url.URL{
			Scheme: "https",
			Host:   net.JoinHostPort(cluster.NodeIDs[1], common.MetricsPort.PortString()),
			Path:   "/metrics/containerd",
		}
		res, err := cl.Get(u.String())
		if err != nil {
			return err
		}
		defer res.Body.Close()
		if res.StatusCode != 200 {
			return fmt.Errorf("status code %d", res.StatusCode)
		}

		body, err := io.ReadAll(res.Body)
		if err != nil {
			return err
		}
		needle := "containerd_build_info_total"
		if !strings.Contains(string(body), needle) {
			return util.Permanent(fmt.Errorf("could not find %q in returned response", needle))
		}
		return nil
	})
}