// Copyright 2020 The Monogon Project Authors.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package e2e

import (
	"context"
	"crypto/tls"
	"crypto/x509"
	"errors"
	"fmt"
	"io"
	"net"
	"net/http"
	_ "net/http/pprof"
	"net/url"
	"os"
	"strings"
	"testing"
	"time"

	"github.com/bazelbuild/rules_go/go/runfiles"
	"google.golang.org/grpc"
	corev1 "k8s.io/api/core/v1"
	kerrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	podv1 "k8s.io/kubernetes/pkg/api/v1/pod"

	apb "source.monogon.dev/metropolis/proto/api"

	common "source.monogon.dev/metropolis/node"
	"source.monogon.dev/metropolis/node/core/rpc"
	"source.monogon.dev/metropolis/pkg/localregistry"
	"source.monogon.dev/metropolis/test/launch"
	"source.monogon.dev/metropolis/test/launch/cluster"
	"source.monogon.dev/metropolis/test/util"
)

const (
	// Timeout for the global test context.
	//
	// Bazel would eventually time out the test after 900s ("large") if, for
	// some reason, the context cancellation fails to abort it.
	globalTestTimeout = 600 * time.Second

	// Timeouts for individual end-to-end tests of different sizes.
	smallTestTimeout = 60 * time.Second
	largeTestTimeout = 120 * time.Second
)

// TestE2ECore exercises the core functionality of Metropolis: maintaining a
// control plane, changing node roles, ...
//
// The tests are performed against an in-memory cluster.
func TestE2ECore(t *testing.T) {
	// Set a global timeout to make sure this terminates.
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()

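	// Build a local registry from the Bazel-generated test image manifest; it
	// is handed to the cluster via ClusterOptions.LocalRegistry below so the
	// test workload images are available to the nodes.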
	rPath, err := runfiles.Rlocation("_main/metropolis/test/e2e/testimages_manifest.prototxt")
	if err != nil {
		t.Fatalf("Resolving registry manifest failed: %v", err)
	}
	df, err := os.ReadFile(rPath)
	if err != nil {
		t.Fatalf("Reading registry manifest failed: %v", err)
	}
	lr, err := localregistry.FromBazelManifest(df)
	if err != nil {
		t.Fatalf("Creating test image registry failed: %v", err)
	}
	// Launch cluster.
	clusterOptions := cluster.ClusterOptions{
		NumNodes:      2,
		LocalRegistry: lr,
	}
	cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
	if err != nil {
		t.Fatalf("LaunchCluster failed: %v", err)
	}
	defer func() {
		err := cluster.Close()
		if err != nil {
			t.Fatalf("cluster Close failed: %v", err)
		}
	}()

	launch.Log("E2E: Cluster running, starting tests...")

	// Dial first node's curator.
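	// The connection is authenticated with the cluster owner's credentials and
	// routed through the harness's DialNode dialer, which can reach the
	// in-memory nodes' network.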
	creds := rpc.NewAuthenticatedCredentials(cluster.Owner, rpc.WantInsecure())
	remote := net.JoinHostPort(cluster.NodeIDs[0], common.CuratorServicePort.PortString())
	cl, err := grpc.Dial(remote, grpc.WithContextDialer(cluster.DialNode), grpc.WithTransportCredentials(creds))
	if err != nil {
		t.Fatalf("failed to dial first node's curator: %v", err)
	}
	defer cl.Close()
	mgmt := apb.NewManagementClient(cl)

	util.TestEventual(t, "Retrieving cluster directory successful", ctx, 60*time.Second, func(ctx context.Context) error {
		res, err := mgmt.GetClusterInfo(ctx, &apb.GetClusterInfoRequest{})
		if err != nil {
			return fmt.Errorf("GetClusterInfo: %w", err)
		}

		// Ensure that the expected node count is present.
		nodes := res.ClusterDirectory.Nodes
		if want, got := clusterOptions.NumNodes, len(nodes); want != got {
			return fmt.Errorf("wanted %d nodes in cluster directory, got %d", want, got)
		}

		// Ensure the nodes have the expected addresses.
		addresses := make(map[string]bool)
		for _, n := range nodes {
			if len(n.Addresses) != 1 {
				return fmt.Errorf("node %s must have exactly one address, got %d", n.Id, len(n.Addresses))
			}
			address := n.Addresses[0].Host
			addresses[address] = true
		}

		for _, address := range []string{"10.1.0.2", "10.1.0.3"} {
			if !addresses[address] {
				return fmt.Errorf("address %q not found in directory", address)
			}
		}
		return nil
	})
	util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
	util.TestEventual(t, "Node rejoin successful", ctx, 60*time.Second, func(ctx context.Context) error {
		// Ensure nodes rejoin the cluster after a reboot by rebooting the
		// second node.
		if err := cluster.RebootNode(ctx, 1); err != nil {
			return fmt.Errorf("while rebooting a node: %w", err)
		}
		return nil
	})
	util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
	util.TestEventual(t, "Prometheus node metrics retrieved", ctx, smallTestTimeout, func(ctx context.Context) error {
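		// Build an HTTP client that authenticates with the cluster owner's TLS
		// certificate, trusts the cluster CA, and dials nodes through the test
		// harness instead of the host network.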
		pool := x509.NewCertPool()
		pool.AddCert(cluster.CACertificate)
		cl := http.Client{
			Transport: &http.Transport{
				TLSClientConfig: &tls.Config{
					Certificates: []tls.Certificate{cluster.Owner},
					RootCAs:      pool,
				},
				DialContext: func(ctx context.Context, _, addr string) (net.Conn, error) {
					return cluster.DialNode(ctx, addr)
				},
			},
		}
		u := url.URL{
			Scheme: "https",
			Host:   net.JoinHostPort(cluster.NodeIDs[0], common.MetricsPort.PortString()),
			Path:   "/metrics/node",
		}
		res, err := cl.Get(u.String())
		if err != nil {
			return err
		}
		defer res.Body.Close()
		if res.StatusCode != 200 {
			return fmt.Errorf("status code %d", res.StatusCode)
		}

		body, err := io.ReadAll(res.Body)
		if err != nil {
			return err
		}
		needle := "node_uname_info"
		if !strings.Contains(string(body), needle) {
			return util.Permanent(fmt.Errorf("could not find %q in returned response", needle))
		}
		return nil
	})
}

// TestE2ECoreHA exercises the basics of a high-availability control plane by
// starting up a 3-node cluster, turning all nodes into ConsensusMembers, then
// performing a rolling restart.
func TestE2ECoreHA(t *testing.T) {
	// Set a global timeout to make sure this terminates.
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()

	rPath, err := runfiles.Rlocation("_main/metropolis/test/e2e/testimages_manifest.prototxt")
	if err != nil {
		t.Fatalf("Resolving registry manifest failed: %v", err)
	}
	df, err := os.ReadFile(rPath)
	if err != nil {
		t.Fatalf("Reading registry manifest failed: %v", err)
	}
	lr, err := localregistry.FromBazelManifest(df)
	if err != nil {
		t.Fatalf("Creating test image registry failed: %v", err)
	}
	// Launch cluster.
	clusterOptions := cluster.ClusterOptions{
		NumNodes:        3,
		LocalRegistry:   lr,
		NodeLogsToFiles: true,
	}
	cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
	if err != nil {
		t.Fatalf("LaunchCluster failed: %v", err)
	}
	defer func() {
		err := cluster.Close()
		if err != nil {
			t.Fatalf("cluster Close failed: %v", err)
		}
	}()

	launch.Log("E2E: Cluster running, starting tests...")

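	// Promote every node except the first to ConsensusMember, so that the
	// control plane's consensus runs on all three nodes and can tolerate a
	// single node failing or rebooting.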
	util.MustTestEventual(t, "Add ConsensusMember roles", ctx, smallTestTimeout, func(ctx context.Context) error {
		// Make everything but the first node into ConsensusMembers.
		for i := 1; i < clusterOptions.NumNodes; i++ {
			err := cluster.MakeConsensusMember(ctx, cluster.NodeIDs[i])
			if err != nil {
				return fmt.Errorf("MakeConsensusMember(%d/%s): %w", i, cluster.NodeIDs[i], err)
			}
		}
		return nil
	})
	util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)

	// Perform a rolling restart of all nodes. When a node rejoins it must be
	// able to contact the cluster, so this also exercises that the cluster
	// keeps serving while each node reboots.
	for i := 0; i < clusterOptions.NumNodes; i++ {
		util.MustTestEventual(t, fmt.Sprintf("Node %d rejoin successful", i), ctx, 60*time.Second, func(ctx context.Context) error {
			// Ensure the node rejoins the cluster after its reboot.
			if err := cluster.RebootNode(ctx, i); err != nil {
				return fmt.Errorf("while rebooting a node: %w", err)
			}
			return nil
		})
	}
}

// TestE2EKubernetes exercises the Kubernetes functionality of Metropolis.
//
// The tests are performed against an in-memory cluster.
func TestE2EKubernetes(t *testing.T) {
	// Set a global timeout to make sure this terminates.
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()

	rPath, err := runfiles.Rlocation("_main/metropolis/test/e2e/testimages_manifest.prototxt")
	if err != nil {
		t.Fatalf("Resolving registry manifest failed: %v", err)
	}
	df, err := os.ReadFile(rPath)
	if err != nil {
		t.Fatalf("Reading registry manifest failed: %v", err)
	}
	lr, err := localregistry.FromBazelManifest(df)
	if err != nil {
		t.Fatalf("Creating test image registry failed: %v", err)
	}

	// Launch cluster.
	clusterOptions := cluster.ClusterOptions{
		NumNodes:      2,
		LocalRegistry: lr,
	}
	cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
	if err != nil {
		t.Fatalf("LaunchCluster failed: %v", err)
	}
	defer func() {
		err := cluster.Close()
		if err != nil {
			t.Fatalf("cluster Close failed: %v", err)
		}
	}()

	clientSet, err := cluster.GetKubeClientSet()
	if err != nil {
		t.Fatal(err)
	}
	util.TestEventual(t, "Add KubernetesWorker roles", ctx, smallTestTimeout, func(ctx context.Context) error {
		// Make everything but the first node into KubernetesWorkers.
		for i := 1; i < clusterOptions.NumNodes; i++ {
			err := cluster.MakeKubernetesWorker(ctx, cluster.NodeIDs[i])
			if err != nil {
				return util.Permanent(fmt.Errorf("MakeKubernetesWorker: %w", err))
			}
		}
		return nil
	})
	util.TestEventual(t, "Node is registered and ready", ctx, largeTestTimeout, func(ctx context.Context) error {
		nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			return err
		}
		if len(nodes.Items) < 1 {
			return errors.New("node not yet registered")
		}
		node := nodes.Items[0]
		for _, cond := range node.Status.Conditions {
			if cond.Type != corev1.NodeReady {
				continue
			}
			if cond.Status != corev1.ConditionTrue {
				return fmt.Errorf("node not ready: %v", cond.Message)
			}
		}
		return nil
	})
	util.TestEventual(t, "Simple deployment", ctx, largeTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeTestDeploymentSpec("test-deploy-1"), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple deployment is running", ctx, largeTestTimeout, func(ctx context.Context) error {
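		// Poll for the deployment's pod; if it is not yet available, surface
		// the most recent event for the pod to make failures debuggable.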
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-1"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		}
		return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
	})
	util.TestEventual(t, "Simple deployment with gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
		deployment := makeTestDeploymentSpec("test-deploy-2")
		gvisorStr := "gvisor"
		deployment.Spec.Template.Spec.RuntimeClassName = &gvisorStr
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, deployment, metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple deployment is running on gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-2"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		}
		var errorMsg strings.Builder
		for _, msg := range events.Items {
			errorMsg.WriteString(" | ")
			errorMsg.WriteString(msg.Message)
		}
		return fmt.Errorf("pod is not ready: %v", errorMsg.String())
	})
	util.TestEventual(t, "Simple StatefulSet with PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-1", corev1.PersistentVolumeFilesystem), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple StatefulSet with PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-1"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		}
		return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
	})
	util.TestEventual(t, "Simple StatefulSet with Block PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-2", corev1.PersistentVolumeBlock), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple StatefulSet with Block PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-2"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		}
		return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
	})
	util.TestEventual(t, "In-cluster self-test job", ctx, smallTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.BatchV1().Jobs("default").Create(ctx, makeSelftestSpec("selftest"), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "In-cluster self-test job passed", ctx, smallTestTimeout, func(ctx context.Context) error {
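		// If the job reports failures, fetch the last log line of the failing
		// pod so the failure reason shows up in the test output.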
		res, err := clientSet.BatchV1().Jobs("default").Get(ctx, "selftest", metav1.GetOptions{})
		if err != nil {
			return err
		}
		if res.Status.Failed > 0 {
			pods, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{
				LabelSelector: "job-name=selftest",
			})
			if err != nil {
				return util.Permanent(fmt.Errorf("job failed but failed to find pod: %w", err))
			}
			if len(pods.Items) < 1 {
				return fmt.Errorf("job failed but pod does not exist")
			}
			lines, err := getPodLogLines(ctx, clientSet, pods.Items[0].Name, 1)
			if err != nil {
				return fmt.Errorf("job failed but could not get logs: %w", err)
			}
			if len(lines) > 0 {
				return util.Permanent(fmt.Errorf("job failed, last log line: %s", lines[0]))
			}
			return util.Permanent(fmt.Errorf("job failed, empty log"))
		}
		if res.Status.Succeeded > 0 {
			return nil
		}
		return fmt.Errorf("job still running")
	})
	util.TestEventual(t, "Start NodePort test setup", ctx, smallTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeHTTPServerDeploymentSpec("nodeport-server"), metav1.CreateOptions{})
		if err != nil && !kerrors.IsAlreadyExists(err) {
			return err
		}
		_, err = clientSet.CoreV1().Services("default").Create(ctx, makeHTTPServerNodePortService("nodeport-server"), metav1.CreateOptions{})
		if err != nil && !kerrors.IsAlreadyExists(err) {
			return err
		}
		return nil
	})
	util.TestEventual(t, "NodePort accessible from all nodes", ctx, smallTestTimeout, func(ctx context.Context) error {
		nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			return err
		}
		// Use a new client for each attempt.
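		// Requests are dialed through the cluster's SOCKS proxy, so the nodes'
		// internal IP addresses are reachable from the test process.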
		hc := http.Client{
			Timeout: 2 * time.Second,
			Transport: &http.Transport{
				Dial: cluster.SOCKSDialer.Dial,
			},
		}
		for _, n := range nodes.Items {
			var addr string
			for _, a := range n.Status.Addresses {
				if a.Type == corev1.NodeInternalIP {
					addr = a.Address
				}
			}
			u := url.URL{Scheme: "http", Host: addr, Path: "/"}
			res, err := hc.Get(u.String())
			if err != nil {
				return fmt.Errorf("failed getting from node %q: %w", n.Name, err)
			}
			if res.StatusCode != http.StatusOK {
				return fmt.Errorf("getting from node %q: HTTP %d", n.Name, res.StatusCode)
			}
			t.Logf("Got response from %q", n.Name)
		}
		return nil
	})
	util.TestEventual(t, "containerd metrics retrieved", ctx, smallTestTimeout, func(ctx context.Context) error {
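		// Same authenticated client setup as the node metrics check in
		// TestE2ECore, this time fetching containerd metrics from the second
		// node, which carries the KubernetesWorker role.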
		pool := x509.NewCertPool()
		pool.AddCert(cluster.CACertificate)
		cl := http.Client{
			Transport: &http.Transport{
				TLSClientConfig: &tls.Config{
					Certificates: []tls.Certificate{cluster.Owner},
					RootCAs:      pool,
				},
				DialContext: func(ctx context.Context, _, addr string) (net.Conn, error) {
					return cluster.DialNode(ctx, addr)
				},
			},
		}
		u := url.URL{
			Scheme: "https",
			Host:   net.JoinHostPort(cluster.NodeIDs[1], common.MetricsPort.PortString()),
			Path:   "/metrics/containerd",
		}
		res, err := cl.Get(u.String())
		if err != nil {
			return err
		}
		defer res.Body.Close()
		if res.StatusCode != 200 {
			return fmt.Errorf("status code %d", res.StatusCode)
		}

		body, err := io.ReadAll(res.Body)
		if err != nil {
			return err
		}
		needle := "containerd_build_info_total"
		if !strings.Contains(string(body), needle) {
			return util.Permanent(fmt.Errorf("could not find %q in returned response", needle))
		}
		return nil
	})
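	// The VM smoke test needs working KVM inside the test nodes, so it only
	// runs when the environment advertises nested virtualization support.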
	if os.Getenv("HAVE_NESTED_KVM") != "" {
		util.TestEventual(t, "Pod for KVM/QEMU smoke test", ctx, smallTestTimeout, func(ctx context.Context) error {
			runcRuntimeClass := "runc"
			_, err := clientSet.CoreV1().Pods("default").Create(ctx, &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name: "vm-smoketest",
				},
				Spec: corev1.PodSpec{
					Containers: []corev1.Container{{
						Name:            "vm-smoketest",
						ImagePullPolicy: corev1.PullNever,
						Image:           "test.monogon.internal/metropolis/vm/smoketest:smoketest_container",
						Resources: corev1.ResourceRequirements{
							Limits: corev1.ResourceList{
								"devices.monogon.dev/kvm": *resource.NewQuantity(1, ""),
							},
						},
					}},
					RuntimeClassName: &runcRuntimeClass,
					RestartPolicy:    corev1.RestartPolicyNever,
				},
			}, metav1.CreateOptions{})
			return err
		})
		util.TestEventual(t, "KVM/QEMU smoke test completion", ctx, smallTestTimeout, func(ctx context.Context) error {
			pod, err := clientSet.CoreV1().Pods("default").Get(ctx, "vm-smoketest", metav1.GetOptions{})
			if err != nil {
				return fmt.Errorf("failed to get pod: %w", err)
			}
			if pod.Status.Phase == corev1.PodSucceeded {
				return nil
			}
			events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
			if err != nil || len(events.Items) == 0 {
				return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
			}
			return fmt.Errorf("pod is not ready: %v", events.Items[len(events.Items)-1].Message)
		})
	}
}