// Copyright The Monogon Project Authors.
// SPDX-License-Identifier: Apache-2.0

package kubernetes

import (
	"context"
	"crypto/tls"
	"crypto/x509"
	"errors"
	"fmt"
	"io"
	"net"
	"net/http"
	_ "net/http/pprof"
	"net/url"
	"os"
	"slices"
	"strings"
	"testing"
	"time"

	"github.com/bazelbuild/rules_go/go/runfiles"
	"google.golang.org/protobuf/types/known/fieldmaskpb"
	corev1 "k8s.io/api/core/v1"
	nwkv1 "k8s.io/api/networking/v1"
	kerrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/intstr"
	podv1 "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/utils/ptr"

	common "source.monogon.dev/metropolis/node"
	"source.monogon.dev/metropolis/node/allocs"
	apb "source.monogon.dev/metropolis/proto/api"
	cpb "source.monogon.dev/metropolis/proto/common"
	"source.monogon.dev/metropolis/test/e2e/connectivity"
	mlaunch "source.monogon.dev/metropolis/test/launch"
	"source.monogon.dev/metropolis/test/localregistry"
	"source.monogon.dev/metropolis/test/util"
)

var (
	// These are filled in by Bazel at link time with the canonical runfile
	// paths of their corresponding files. Inside the init function below they
	// are resolved to real filesystem paths with the rules_go runfiles
	// package.
	xTestImagesManifestPath string
)

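// runfiles.Rlocation maps a path as recorded in the Bazel runfiles manifest
// to an absolute on-disk location, whether runfiles are materialized as a
// directory tree or listed in a manifest file. Panicking here is fine, since
// init runs before any test starts.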
func init() {
	var err error
	for _, path := range []*string{
		&xTestImagesManifestPath,
	} {
		*path, err = runfiles.Rlocation(*path)
		if err != nil {
			panic(err)
		}
	}
}

const (
	// Timeout for the global test context.
	//
	// Bazel would eventually time out the test after 900s ("large") if, for
	// some reason, the context cancellation fails to abort it.
	globalTestTimeout = 600 * time.Second

	// Timeouts for individual end-to-end tests of different sizes.
	smallTestTimeout = 60 * time.Second
	largeTestTimeout = 120 * time.Second
)

// TestE2EKubernetesLabels verifies that Kubernetes node labels are updated
// when the cluster state changes.
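//
// It drives role and label changes through the Metropolis management API and
// then polls the Kubernetes API until the node objects reflect them.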
func TestE2EKubernetesLabels(t *testing.T) {
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()

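	// Two nodes with security relaxed: the qemu-based test nodes presumably
	// provide no TPM, so TPM mode is disabled and storage is allowed to be
	// insecure. NodeLabelsToSynchronize makes the labelmaker copy any
	// Metropolis node label matching the regexp to the Kubernetes node
	// object.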
	clusterOptions := mlaunch.ClusterOptions{
		NumNodes: 2,
		InitialClusterConfiguration: &cpb.ClusterConfiguration{
			ClusterDomain:         "cluster.test",
			TpmMode:               cpb.ClusterConfiguration_TPM_MODE_DISABLED,
			StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
			Kubernetes: &cpb.ClusterConfiguration_Kubernetes{
				NodeLabelsToSynchronize: []*cpb.ClusterConfiguration_Kubernetes_NodeLabelsToSynchronize{
					{Regexp: `^test\.monogon\.dev/`},
				},
			},
		},
	}
	cluster, err := mlaunch.LaunchCluster(ctx, clusterOptions)
	if err != nil {
		t.Fatalf("LaunchCluster failed: %v", err)
	}
	defer func() {
		err := cluster.Close()
		if err != nil {
			t.Fatalf("cluster Close failed: %v", err)
		}
	}()

	con, err := cluster.CuratorClient()
	if err != nil {
		t.Fatalf("Could not get curator client: %v", err)
	}
	mgmt := apb.NewManagementClient(con)
	clientSet, _, err := cluster.GetKubeClientSet()
	if err != nil {
		t.Fatal(err)
	}

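	// getLabelsForNode returns a node's labels, filtered down to the role
	// and test labels this test asserts on, so that labels Kubernetes
	// manages on its own (kubernetes.io/hostname, OS/arch labels, ...) do
	// not make the comparisons below brittle.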
	getLabelsForNode := func(nid string) common.Labels {
		node, err := clientSet.CoreV1().Nodes().Get(ctx, nid, metav1.GetOptions{})
		if kerrors.IsNotFound(err) {
			return nil
		}
		if err != nil {
			t.Fatalf("Could not get node %s: %v", nid, err)
			return nil
		}
		return common.Labels(node.Labels).Filter(func(k, v string) bool {
			if strings.HasPrefix(k, "node-role.kubernetes.io/") {
				return true
			}
			if strings.HasPrefix(k, "test.monogon.dev/") {
				return true
			}
			return false
		})
	}

	// Nodes should have no labels at first.
	for _, nid := range cluster.NodeIDs {
		if labels := getLabelsForNode(nid); !labels.Equals(nil) {
			t.Errorf("Node %s should have no labels, has %s", nid, labels)
		}
	}
	// Nominate both nodes to be Kubernetes workers.
	for _, nid := range cluster.NodeIDs {
		_, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
			Node: &apb.UpdateNodeRolesRequest_Id{
				Id: nid,
			},
			KubernetesWorker: ptr.To(true),
		})
		if err != nil {
			t.Fatalf("Could not make %s a KubernetesWorker: %v", nid, err)
		}
	}

	util.MustTestEventual(t, "Labels added", ctx, smallTestTimeout, func(ctx context.Context) error {
		// Nodes should have role labels now.
		for _, nid := range cluster.NodeIDs {
			want := common.Labels{
				"node-role.kubernetes.io/KubernetesWorker": "",
			}
			if nid == cluster.NodeIDs[0] {
				want["node-role.kubernetes.io/KubernetesController"] = ""
				want["node-role.kubernetes.io/ConsensusMember"] = ""
			}
			if labels := getLabelsForNode(nid); !want.Equals(labels) {
				return fmt.Errorf("node %s should have labels %s, has %s", nid, want, labels)
			}
		}
		return nil
	})

	// Remove KubernetesWorker from first node again. It will stay in k8s (arguably,
	// this is a bug) but its role label should be removed.
	_, err = mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
		Node: &apb.UpdateNodeRolesRequest_Id{
			Id: cluster.NodeIDs[0],
		},
		KubernetesWorker: ptr.To(false),
	})
	if err != nil {
		t.Fatalf("Could not remove KubernetesWorker from %s: %v", cluster.NodeIDs[0], err)
	}

	util.MustTestEventual(t, "Labels removed", ctx, smallTestTimeout, func(ctx context.Context) error {
		for _, nid := range cluster.NodeIDs {
			want := make(common.Labels)
			if nid == cluster.NodeIDs[0] {
				want["node-role.kubernetes.io/KubernetesController"] = ""
				want["node-role.kubernetes.io/ConsensusMember"] = ""
			} else {
				want["node-role.kubernetes.io/KubernetesWorker"] = ""
			}
			if labels := getLabelsForNode(nid); !want.Equals(labels) {
				return fmt.Errorf("node %s should have labels %s, has %s", nid, want, labels)
			}
		}
		return nil
	})

	// Add Metropolis node label, ensure it gets reflected on the Kubernetes node.
	_, err = mgmt.UpdateNodeLabels(ctx, &apb.UpdateNodeLabelsRequest{
		Node: &apb.UpdateNodeLabelsRequest_Id{
			Id: cluster.NodeIDs[1],
		},
		Upsert: []*apb.UpdateNodeLabelsRequest_Pair{
			{Key: "test.monogon.dev/foo", Value: "bar"},
		},
	})
	if err != nil {
		t.Fatalf("Could not add label to node: %v", err)
	}

	util.MustTestEventual(t, "Metropolis labels added", ctx, smallTestTimeout, func(ctx context.Context) error {
		want := common.Labels{
			"node-role.kubernetes.io/KubernetesWorker": "",
			"test.monogon.dev/foo":                     "bar",
		}
		if labels := getLabelsForNode(cluster.NodeIDs[1]); !want.Equals(labels) {
			return fmt.Errorf("node %s should have labels %s, has %s", cluster.NodeIDs[1], want, labels)
		}
		return nil
	})

	// Reconfigure node label rules.
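	// Only the field listed in the update mask is modified; BaseConfig
	// carries the caller's view of that field's current value, presumably so
	// the curator can reject the update if the configuration changed
	// concurrently (compare-and-swap semantics).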
	_, err = mgmt.ConfigureCluster(ctx, &apb.ConfigureClusterRequest{
		BaseConfig: &cpb.ClusterConfiguration{
			Kubernetes: &cpb.ClusterConfiguration_Kubernetes{
				NodeLabelsToSynchronize: []*cpb.ClusterConfiguration_Kubernetes_NodeLabelsToSynchronize{
					{Regexp: `^test\.monogon\.dev/`},
				},
			},
		},
		NewConfig: &cpb.ClusterConfiguration{
			Kubernetes: &cpb.ClusterConfiguration_Kubernetes{},
		},
		UpdateMask: &fieldmaskpb.FieldMask{
			Paths: []string{"kubernetes.node_labels_to_synchronize"},
		},
	})
	if err != nil {
		t.Fatalf("Could not update cluster configuration: %v", err)
	}

	ci, err := mgmt.GetClusterInfo(ctx, &apb.GetClusterInfoRequest{})
	if err != nil {
		t.Fatalf("Could not get cluster info: %v", err)
	}
	// See if the config changed.
	if rules := ci.ClusterConfiguration.Kubernetes.NodeLabelsToSynchronize; len(rules) != 0 {
		t.Fatalf("Wanted 0 label rules in config after reconfiguration, have %d: %v", len(rules), rules)
	}
	// TODO: ensure new rules get applied, but that will require watching the cluster
	// config for changes in the labelmaker.
}

// TestE2EKubernetes exercises the Kubernetes functionality of Metropolis.
//
// The tests are performed against an in-memory cluster.
func TestE2EKubernetes(t *testing.T) {
	// Set a global timeout to make sure this terminates
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()

	df, err := os.ReadFile(xTestImagesManifestPath)
	if err != nil {
		t.Fatalf("Reading registry manifest failed: %v", err)
	}
	lr, err := localregistry.FromBazelManifest(df)
	if err != nil {
		t.Fatalf("Creating test image registry failed: %v", err)
	}

	// Launch cluster.
	clusterOptions := mlaunch.ClusterOptions{
		NumNodes:      2,
		LocalRegistry: lr,
		InitialClusterConfiguration: &cpb.ClusterConfiguration{
			ClusterDomain:         "cluster.test",
			TpmMode:               cpb.ClusterConfiguration_TPM_MODE_DISABLED,
			StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
		},
	}
	cluster, err := mlaunch.LaunchCluster(ctx, clusterOptions)
	if err != nil {
		t.Fatalf("LaunchCluster failed: %v", err)
	}
	defer func() {
		err := cluster.Close()
		if err != nil {
			t.Fatalf("cluster Close failed: %v", err)
		}
	}()

	clientSet, restConfig, err := cluster.GetKubeClientSet()
	if err != nil {
		t.Fatal(err)
	}
	util.TestEventual(t, "Add KubernetesWorker roles", ctx, smallTestTimeout, func(ctx context.Context) error {
		// Make everything but the first node into KubernetesWorkers.
		for i := 1; i < clusterOptions.NumNodes; i++ {
			err := cluster.MakeKubernetesWorker(ctx, cluster.NodeIDs[i])
			if err != nil {
				return util.Permanent(fmt.Errorf("MakeKubernetesWorker: %w", err))
			}
		}
		return nil
	})
	util.TestEventual(t, "Node is registered and ready", ctx, largeTestTimeout, func(ctx context.Context) error {
		nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			return err
		}
		if len(nodes.Items) < 1 {
			return errors.New("node not yet registered")
		}
		node := nodes.Items[0]
		for _, cond := range node.Status.Conditions {
			if cond.Type != corev1.NodeReady {
				continue
			}
			if cond.Status != corev1.ConditionTrue {
				return fmt.Errorf("node not ready: %v", cond.Message)
			}
		}
		return nil
	})
	util.TestEventual(t, "Simple deployment", ctx, largeTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeTestDeploymentSpec("test-deploy-1"), metav1.CreateOptions{})
		return err
	})
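	// podv1.IsPodAvailable reports whether the pod is Ready and has been so
	// for at least minReadySeconds (1s here), the same notion of
	// availability a Deployment uses when counting available replicas.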
	util.TestEventual(t, "Simple deployment is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-1"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
		}
	})
	util.TestEventual(t, "Simple deployment with gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
		deployment := makeTestDeploymentSpec("test-deploy-2")
		deployment.Spec.Template.Spec.RuntimeClassName = ptr.To("gvisor")
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, deployment, metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple deployment is running on gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-2"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			var errorMsg strings.Builder
			for _, msg := range events.Items {
				errorMsg.WriteString(" | ")
				errorMsg.WriteString(msg.Message)
			}
			return fmt.Errorf("pod is not ready: %s", errorMsg.String())
		}
	})
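	// The connectivity tests spread their pods across nodes with topology
	// spread constraints, so a successful dial between pods exercises the
	// inter-node pod network rather than just a single node's local bridge.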
	t.Run("Connectivity Smoke Tests", func(t *testing.T) {
		ct := connectivity.SetupTest(t, &connectivity.TestSpec{
			Name:       "connectivity-smoke",
			ClientSet:  clientSet,
			RESTConfig: restConfig,
			NumPods:    2,
			ExtraPodConfig: func(i int, pod *corev1.Pod) {
				// Spread pods out over nodes to test inter-node network
				pod.Labels = make(map[string]string)
				pod.Labels["name"] = "connectivity-smoketest"
				pod.Spec.TopologySpreadConstraints = []corev1.TopologySpreadConstraint{{
					MaxSkew:           1,
					TopologyKey:       "kubernetes.io/hostname",
					WhenUnsatisfiable: corev1.DoNotSchedule,
					LabelSelector:     metav1.SetAsLabelSelector(pod.Labels),
				}}
			},
		})
		ct.TestPodConnectivity(t, 0, 1, 1234, connectivity.ExpectedSuccess)
	})
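	// NetworkPolicy semantics make the check below meaningful: once any
	// policy selects a pod, ingress that is not explicitly allowed is
	// denied. Allowing only TCP/1234 from same-labeled pods thus implies
	// that port 1235, which worked before, must be rejected once the policy
	// reaches the dataplane.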
	t.Run("Network Policy Smoke Test", func(t *testing.T) {
		ct := connectivity.SetupTest(t, &connectivity.TestSpec{
			Name:       "npc-smoke",
			ClientSet:  clientSet,
			RESTConfig: restConfig,
			NumPods:    2,
			ExtraPodConfig: func(i int, pod *corev1.Pod) {
				// Spread pods out over nodes to test inter-node network
				pod.Labels = make(map[string]string)
				pod.Labels["name"] = "npc-smoke"
				pod.Spec.TopologySpreadConstraints = []corev1.TopologySpreadConstraint{{
					MaxSkew:           1,
					TopologyKey:       "kubernetes.io/hostname",
					WhenUnsatisfiable: corev1.DoNotSchedule,
					LabelSelector:     metav1.SetAsLabelSelector(pod.Labels),
				}}
			},
		})
		// Test connectivity before applying network policy
		ct.TestPodConnectivity(t, 0, 1, 1234, connectivity.ExpectedSuccess)
		ct.TestPodConnectivity(t, 0, 1, 1235, connectivity.ExpectedSuccess)
		nwp := &nwkv1.NetworkPolicy{
			ObjectMeta: metav1.ObjectMeta{
				Name: "npc-smoke",
			},
			Spec: nwkv1.NetworkPolicySpec{
				PodSelector: metav1.LabelSelector{MatchLabels: map[string]string{"name": "npc-smoke"}},
				Ingress: []nwkv1.NetworkPolicyIngressRule{{
					Ports: []nwkv1.NetworkPolicyPort{{
						Protocol: ptr.To(corev1.ProtocolTCP),
						Port:     &intstr.IntOrString{Type: intstr.Int, IntVal: 1234},
					}},
					From: []nwkv1.NetworkPolicyPeer{{
						PodSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"name": "npc-smoke"}},
					}},
				}},
			},
		}
		if _, err := clientSet.NetworkingV1().NetworkPolicies("default").Create(context.Background(), nwp, metav1.CreateOptions{}); err != nil {
			t.Fatal(err)
		}
		// Check if policy is in effect
		ct.TestPodConnectivityEventual(t, 0, 1, 1235, connectivity.ExpectedReject, 30*time.Second)
		ct.TestPodConnectivity(t, 0, 1, 1234, connectivity.ExpectedSuccess)
	})
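	// The StatefulSet tests run once per runtime class. The test container
	// is expected to exercise its persistent volumes and print marker lines
	// such as [INIT-PASSED], [INIT-FAILED] and [RESIZE-PASSED]; the test
	// polls the pod log for these markers instead of inspecting volume state
	// directly.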
	for _, runtimeClass := range []string{"runc", "gvisor"} {
		statefulSetName := fmt.Sprintf("test-statefulset-%s", runtimeClass)
		util.TestEventual(t, fmt.Sprintf("StatefulSet with %s tests", runtimeClass), ctx, smallTestTimeout, func(ctx context.Context) error {
			_, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet(statefulSetName, runtimeClass), metav1.CreateOptions{})
			return err
		})
		var podName string
		util.TestEventual(t, fmt.Sprintf("StatefulSet with %s tests successful", runtimeClass), ctx, smallTestTimeout, func(ctx context.Context) error {
			res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: fmt.Sprintf("name=%s", statefulSetName)})
			if err != nil {
				return err
			}
			if len(res.Items) == 0 {
				return errors.New("pod didn't get created")
			}
			pod := res.Items[0]
			podName = pod.Name
			lines, err := getPodLogLines(ctx, clientSet, podName, 50)
			if err != nil {
				return fmt.Errorf("could not get logs: %w", err)
			}
			if slices.Contains(lines, "[INIT-PASSED]") {
				return nil
			}
			if slices.Contains(lines, "[INIT-FAILED]") {
				return util.Permanent(fmt.Errorf("tests failed, log:\n %s", strings.Join(lines, "\n ")))
			}
			return fmt.Errorf("pod is not ready: %v, log:\n %s", pod.Status.Phase, strings.Join(lines, "\n "))
		})
		util.TestEventual(t, fmt.Sprintf("StatefulSet with %s request resize", runtimeClass), ctx, smallTestTimeout, func(ctx context.Context) error {
			for _, templateName := range []string{"vol-default", "vol-readonly", "vol-block"} {
				name := fmt.Sprintf("%s-%s-0", templateName, statefulSetName)
				patch := `{"spec": {"resources": {"requests": {"storage": "4Mi"}}}}`
				_, err := clientSet.CoreV1().PersistentVolumeClaims("default").Patch(ctx, name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{})
				if err != nil {
					return err
				}
			}
			return nil
		})
		i := 0
		util.TestEventual(t, fmt.Sprintf("StatefulSet with %s resize successful", runtimeClass), ctx, smallTestTimeout, func(ctx context.Context) error {
			// Make a change to the pod to make kubelet look at it and notice that it
			// should call NodeExpandVolume. If we don't do this, it might take up to
			// 1 minute for kubelet to notice, which slows down the test.
			patch := fmt.Sprintf(`{"metadata": {"labels": {"trigger-kubelet-update": "%d"}}}`, i)
			i += 1
			_, err := clientSet.CoreV1().Pods("default").Patch(ctx, podName, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{})
			if err != nil {
				return err
			}

			lines, err := getPodLogLines(ctx, clientSet, podName, 50)
			if err != nil {
				return fmt.Errorf("could not get logs: %w", err)
			}
			if slices.Contains(lines, "[RESIZE-PASSED]") {
				return nil
			}
			return fmt.Errorf("waiting for resize, log:\n %s", strings.Join(lines, "\n "))
		})
	}
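	// HostUsers=false asks the kubelet to run the pod inside a new user
	// namespace, with container UIDs mapped to unprivileged host IDs. The
	// /ready_userns readiness path presumably has the test server check that
	// it is actually running ID-mapped before reporting ready.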
	util.TestEventual(t, "Deployment in user namespace", ctx, largeTestTimeout, func(ctx context.Context) error {
		deployment := makeTestDeploymentSpec("test-userns-1")
		deployment.Spec.Template.Spec.HostUsers = ptr.To(false)
		deployment.Spec.Template.Spec.Containers[0].ReadinessProbe.HTTPGet.Path = "/ready_userns"
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, deployment, metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Deployment in user namespace is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-userns-1"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
		}
	})
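	// The self-test job runs its assertions from inside the cluster. On
	// failure, the job's pod is looked up and its last log line is surfaced
	// as a permanent error, so the e2e run fails with a useful message
	// rather than a bare "job failed".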
	util.TestEventual(t, "In-cluster self-test job", ctx, smallTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.BatchV1().Jobs("default").Create(ctx, makeSelftestSpec("selftest"), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "In-cluster self-test job passed", ctx, smallTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.BatchV1().Jobs("default").Get(ctx, "selftest", metav1.GetOptions{})
		if err != nil {
			return err
		}
		if res.Status.Failed > 0 {
			pods, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{
				LabelSelector: "job-name=selftest",
			})
			if err != nil {
				return util.Permanent(fmt.Errorf("job failed but failed to find pod: %w", err))
			}
			if len(pods.Items) < 1 {
				return fmt.Errorf("job failed but pod does not exist")
			}
			lines, err := getPodLogLines(ctx, clientSet, pods.Items[0].Name, 1)
			if err != nil {
				return fmt.Errorf("job failed but could not get logs: %w", err)
			}
			if len(lines) > 0 {
				return util.Permanent(fmt.Errorf("job failed, last log line: %s", lines[0]))
			}
			return util.Permanent(fmt.Errorf("job failed, empty log"))
		}
		if res.Status.Succeeded > 0 {
			return nil
		}
		return fmt.Errorf("job still running")
	})
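	// A NodePort service exposes the same port on every node, with the
	// dataplane forwarding connections to a backing pod wherever it runs.
	// The check below therefore probes each node's InternalIP, reached via
	// the launch harness's SOCKS proxy into the cluster network.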
	util.TestEventual(t, "Start NodePort test setup", ctx, smallTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeHTTPServerDeploymentSpec("nodeport-server"), metav1.CreateOptions{})
		if err != nil && !kerrors.IsAlreadyExists(err) {
			return err
		}
		_, err = clientSet.CoreV1().Services("default").Create(ctx, makeHTTPServerNodePortService("nodeport-server"), metav1.CreateOptions{})
		if err != nil && !kerrors.IsAlreadyExists(err) {
			return err
		}
		return nil
	})
	util.TestEventual(t, "NodePort accessible from all nodes", ctx, smallTestTimeout, func(ctx context.Context) error {
		nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			return err
		}
		// Use a new client for each attempt
		hc := http.Client{
			Timeout: 2 * time.Second,
			Transport: &http.Transport{
				Dial: cluster.SOCKSDialer.Dial,
			},
		}
		for _, n := range nodes.Items {
			var addr string
			for _, a := range n.Status.Addresses {
				if a.Type == corev1.NodeInternalIP {
					addr = a.Address
				}
			}
			u := url.URL{Scheme: "http", Host: addr, Path: "/"}
			res, err := hc.Get(u.String())
			if err != nil {
				return fmt.Errorf("failed getting from node %q: %w", n.Name, err)
			}
			if res.StatusCode != http.StatusOK {
				return fmt.Errorf("getting from node %q: HTTP %d", n.Name, res.StatusCode)
			}
			t.Logf("Got response from %q", n.Name)
		}
		return nil
	})
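	// The node metrics endpoint is mutually authenticated: the client
	// presents the cluster owner credentials and validates the node against
	// the cluster CA. cluster.DialNode resolves the node ID in the URL host
	// and tunnels the connection into the cluster network.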
	util.TestEventual(t, "containerd metrics retrieved", ctx, smallTestTimeout, func(ctx context.Context) error {
		pool := x509.NewCertPool()
		pool.AddCert(cluster.CACertificate)
		cl := http.Client{
			Transport: &http.Transport{
				TLSClientConfig: &tls.Config{
					Certificates: []tls.Certificate{cluster.Owner},
					RootCAs:      pool,
				},
				DialContext: func(ctx context.Context, _, addr string) (net.Conn, error) {
					return cluster.DialNode(ctx, addr)
				},
			},
		}
		u := url.URL{
			Scheme: "https",
			Host:   net.JoinHostPort(cluster.NodeIDs[1], allocs.PortMetrics.PortString()),
			Path:   "/metrics/containerd",
		}
		res, err := cl.Get(u.String())
		if err != nil {
			return err
		}
		defer res.Body.Close()
		if res.StatusCode != 200 {
			return fmt.Errorf("status code %d", res.StatusCode)
		}

		body, err := io.ReadAll(res.Body)
		if err != nil {
			return err
		}
		needle := "containerd_build_info_total"
		if !strings.Contains(string(body), needle) {
			return util.Permanent(fmt.Errorf("could not find %q in returned response", needle))
		}
		return nil
	})
}