blob: 7004f7b94cb937825ce4924649cad4382ec6da92 [file] [log] [blame]
Lorenz Brunfc5dbc62020-05-28 12:18:07 +02001// Copyright 2020 The Monogon Project Authors.
2//
3// SPDX-License-Identifier: Apache-2.0
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
17package e2e
18
19import (
20 "context"
Serge Bazanski54e212a2023-06-14 13:45:11 +020021 "crypto/tls"
22 "crypto/x509"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020023 "errors"
24 "fmt"
Serge Bazanski2cfafc92023-03-21 16:42:47 +010025 "io"
Leopold Schabele28e6d72020-06-03 11:39:25 +020026 "net"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020027 "net/http"
28 _ "net/http"
29 _ "net/http/pprof"
Serge Bazanski54e212a2023-06-14 13:45:11 +020030 "net/url"
Lorenz Brun3ff5af32020-06-24 16:34:11 +020031 "os"
Lorenz Brun5e4fc2d2020-09-22 18:35:15 +020032 "strings"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020033 "testing"
34 "time"
35
Tim Windelschmidt2a1d1b22024-02-06 07:07:42 +010036 "github.com/bazelbuild/rules_go/go/runfiles"
Serge Bazanskibe742842022-04-04 13:18:50 +020037 "google.golang.org/grpc"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020038 corev1 "k8s.io/api/core/v1"
Lorenz Brun276a7462023-07-12 21:28:54 +020039 kerrors "k8s.io/apimachinery/pkg/api/errors"
Lorenz Brun30167f52021-03-17 17:49:01 +010040 "k8s.io/apimachinery/pkg/api/resource"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020041 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
42 podv1 "k8s.io/kubernetes/pkg/api/v1/pod"
43
Tim Windelschmidt2a1d1b22024-02-06 07:07:42 +010044 apb "source.monogon.dev/metropolis/proto/api"
45
Serge Bazanski31370b02021-01-07 16:31:14 +010046 common "source.monogon.dev/metropolis/node"
Serge Bazanski6dff6d62022-01-28 18:15:14 +010047 "source.monogon.dev/metropolis/node/core/identity"
Serge Bazanskibe742842022-04-04 13:18:50 +020048 "source.monogon.dev/metropolis/node/core/rpc"
Lorenz Brun150f24a2023-07-13 20:11:06 +020049 "source.monogon.dev/metropolis/pkg/localregistry"
Serge Bazanski05f813b2023-03-16 17:58:39 +010050 "source.monogon.dev/metropolis/test/launch"
Serge Bazanski66e58952021-10-05 17:06:56 +020051 "source.monogon.dev/metropolis/test/launch/cluster"
Mateusz Zalegaddf19b42022-06-22 12:27:37 +020052 "source.monogon.dev/metropolis/test/util"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020053)
54
const (
	// Timeout for the global test context.
	//
	// Bazel would eventually time out the test after 900s ("large") if, for
	// some reason, the context cancellation fails to abort it.
	globalTestTimeout = 600 * time.Second

	// Timeouts for individual end-to-end tests of different sizes. These are
	// passed to util.TestEventual for each retried test step below.
	smallTestTimeout = 60 * time.Second
	largeTestTimeout = 120 * time.Second
)
66
Serge Bazanskia0bc6d32023-06-28 18:57:40 +020067// TestE2ECore exercisees the core functionality of Metropolis: maintaining a
68// control plane, changing node roles, ...
69//
70// The tests are performed against an in-memory cluster.
71func TestE2ECore(t *testing.T) {
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020072 // Set a global timeout to make sure this terminates
Leopold Schabeld603f842020-06-09 17:48:09 +020073 ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
Serge Bazanski1f9a03b2021-08-17 13:40:53 +020074 defer cancel()
Serge Bazanski66e58952021-10-05 17:06:56 +020075
Tim Windelschmidt2a1d1b22024-02-06 07:07:42 +010076 rPath, err := runfiles.Rlocation("_main/metropolis/test/e2e/testimages_manifest.prototxt")
77 if err != nil {
78 t.Fatalf("Resolving registry manifest failed: %v", err)
79 }
80 df, err := os.ReadFile(rPath)
81 if err != nil {
82 t.Fatalf("Reading registry manifest failed: %v", err)
83 }
84 lr, err := localregistry.FromBazelManifest(df)
Lorenz Brun150f24a2023-07-13 20:11:06 +020085 if err != nil {
86 t.Fatalf("Creating test image registry failed: %v", err)
87 }
Serge Bazanski66e58952021-10-05 17:06:56 +020088 // Launch cluster.
Serge Bazanskie78a0892021-10-07 17:03:49 +020089 clusterOptions := cluster.ClusterOptions{
Lorenz Brun150f24a2023-07-13 20:11:06 +020090 NumNodes: 2,
91 LocalRegistry: lr,
Serge Bazanskie78a0892021-10-07 17:03:49 +020092 }
93 cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020094 if err != nil {
Serge Bazanski66e58952021-10-05 17:06:56 +020095 t.Fatalf("LaunchCluster failed: %v", err)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020096 }
Serge Bazanski66e58952021-10-05 17:06:56 +020097 defer func() {
98 err := cluster.Close()
99 if err != nil {
100 t.Fatalf("cluster Close failed: %v", err)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200101 }
102 }()
Serge Bazanski1f9a03b2021-08-17 13:40:53 +0200103
Serge Bazanski05f813b2023-03-16 17:58:39 +0100104 launch.Log("E2E: Cluster running, starting tests...")
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200105
Serge Bazanskibe742842022-04-04 13:18:50 +0200106 // Dial first node's curator.
Serge Bazanski8535cb52023-03-29 14:15:08 +0200107 creds := rpc.NewAuthenticatedCredentials(cluster.Owner, rpc.WantInsecure())
Serge Bazanskibe742842022-04-04 13:18:50 +0200108 remote := net.JoinHostPort(cluster.NodeIDs[0], common.CuratorServicePort.PortString())
109 cl, err := grpc.Dial(remote, grpc.WithContextDialer(cluster.DialNode), grpc.WithTransportCredentials(creds))
110 if err != nil {
111 t.Fatalf("failed to dial first node's curator: %v", err)
112 }
113 defer cl.Close()
114 mgmt := apb.NewManagementClient(cl)
115
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200116 util.TestEventual(t, "Retrieving cluster directory sucessful", ctx, 60*time.Second, func(ctx context.Context) error {
117 res, err := mgmt.GetClusterInfo(ctx, &apb.GetClusterInfoRequest{})
118 if err != nil {
119 return fmt.Errorf("GetClusterInfo: %w", err)
120 }
Serge Bazanskibf68fa92021-10-05 17:53:58 +0200121
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200122 // Ensure that the expected node count is present.
123 nodes := res.ClusterDirectory.Nodes
124 if want, got := clusterOptions.NumNodes, len(nodes); want != got {
125 return fmt.Errorf("wanted %d nodes in cluster directory, got %d", want, got)
126 }
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100127
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200128 // Ensure the nodes have the expected addresses.
129 addresses := make(map[string]bool)
130 for _, n := range nodes {
131 if len(n.Addresses) != 1 {
132 return fmt.Errorf("node %s has no addresss", identity.NodeID(n.PublicKey))
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200133 }
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200134 address := n.Addresses[0].Host
135 addresses[address] = true
136 }
Serge Bazanski2cfafc92023-03-21 16:42:47 +0100137
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200138 for _, address := range []string{"10.1.0.2", "10.1.0.3"} {
139 if !addresses[address] {
140 return fmt.Errorf("address %q not found in directory", address)
Lorenz Brun30167f52021-03-17 17:49:01 +0100141 }
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200142 }
143 return nil
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200144 })
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200145 util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
146 util.TestEventual(t, "Node rejoin successful", ctx, 60*time.Second, func(ctx context.Context) error {
147 // Ensure nodes rejoin the cluster after a reboot by reboting the 1st node.
148 if err := cluster.RebootNode(ctx, 1); err != nil {
149 return fmt.Errorf("while rebooting a node: %w", err)
150 }
151 return nil
152 })
153 util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
154 util.TestEventual(t, "Prometheus node metrics retrieved", ctx, smallTestTimeout, func(ctx context.Context) error {
155 pool := x509.NewCertPool()
156 pool.AddCert(cluster.CACertificate)
157 cl := http.Client{
158 Transport: &http.Transport{
159 TLSClientConfig: &tls.Config{
160 Certificates: []tls.Certificate{cluster.Owner},
161 RootCAs: pool,
162 },
163 DialContext: func(ctx context.Context, _, addr string) (net.Conn, error) {
164 return cluster.DialNode(ctx, addr)
165 },
166 },
167 }
168 u := url.URL{
169 Scheme: "https",
170 Host: net.JoinHostPort(cluster.NodeIDs[0], common.MetricsPort.PortString()),
171 Path: "/metrics/node",
172 }
173 res, err := cl.Get(u.String())
174 if err != nil {
175 return err
176 }
177 defer res.Body.Close()
178 if res.StatusCode != 200 {
179 return fmt.Errorf("status code %d", res.StatusCode)
180 }
181
182 body, err := io.ReadAll(res.Body)
183 if err != nil {
184 return err
185 }
186 needle := "node_uname_info"
187 if !strings.Contains(string(body), needle) {
188 return util.Permanent(fmt.Errorf("could not find %q in returned response", needle))
189 }
190 return nil
191 })
192}
193
// TestE2ECoreHA exercises the basics of a high-availability control plane by
// starting up a 3-node cluster, turning all nodes into ConsensusMembers, then
// performing a rolling restart.
func TestE2ECoreHA(t *testing.T) {
	// Set a global timeout to make sure this terminates
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()

	// Resolve and load the test image registry manifest from runfiles so the
	// in-memory cluster can serve test images locally.
	rPath, err := runfiles.Rlocation("_main/metropolis/test/e2e/testimages_manifest.prototxt")
	if err != nil {
		t.Fatalf("Resolving registry manifest failed: %v", err)
	}
	df, err := os.ReadFile(rPath)
	if err != nil {
		t.Fatalf("Reading registry manifest failed: %v", err)
	}
	lr, err := localregistry.FromBazelManifest(df)
	if err != nil {
		t.Fatalf("Creating test image registry failed: %v", err)
	}
	// Launch cluster.
	clusterOptions := cluster.ClusterOptions{
		NumNodes:        3,
		LocalRegistry:   lr,
		NodeLogsToFiles: true,
	}
	cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
	if err != nil {
		t.Fatalf("LaunchCluster failed: %v", err)
	}
	defer func() {
		err := cluster.Close()
		if err != nil {
			t.Fatalf("cluster Close failed: %v", err)
		}
	}()

	launch.Log("E2E: Cluster running, starting tests...")

	util.MustTestEventual(t, "Add ConsensusMember roles", ctx, smallTestTimeout, func(ctx context.Context) error {
		// Make everything but the first node into ConsensusMember.
		for i := 1; i < clusterOptions.NumNodes; i++ {
			err := cluster.MakeConsensusMember(ctx, cluster.NodeIDs[i])
			if err != nil {
				return fmt.Errorf("MakeConsensusMember(%d/%s): %w", i, cluster.NodeIDs[i], err)
			}
		}
		return nil
	})
	util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)

	// Perform rolling restart of all nodes. When a node rejoins it must be able to
	// contact the cluster, so this also exercises that the cluster is serving even
	// with the node having rebooted.
	for i := 0; i < clusterOptions.NumNodes; i++ {
		util.MustTestEventual(t, fmt.Sprintf("Node %d rejoin successful", i), ctx, 60*time.Second, func(ctx context.Context) error {
			// Ensure nodes rejoin the cluster after a reboot by rebooting node i.
			if err := cluster.RebootNode(ctx, i); err != nil {
				return fmt.Errorf("while rebooting a node: %w", err)
			}
			return nil
		})
	}
}
258
259// TestE2EKubernetes exercises the Kubernetes functionality of Metropolis.
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200260//
261// The tests are performed against an in-memory cluster.
262func TestE2EKubernetes(t *testing.T) {
263 // Set a global timeout to make sure this terminates
264 ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
265 defer cancel()
266
Tim Windelschmidt2a1d1b22024-02-06 07:07:42 +0100267 rPath, err := runfiles.Rlocation("_main/metropolis/test/e2e/testimages_manifest.prototxt")
268 if err != nil {
269 t.Fatalf("Resolving registry manifest failed: %v", err)
270 }
271 df, err := os.ReadFile(rPath)
272 if err != nil {
273 t.Fatalf("Reading registry manifest failed: %v", err)
274 }
275 lr, err := localregistry.FromBazelManifest(df)
Lorenz Brun150f24a2023-07-13 20:11:06 +0200276 if err != nil {
277 t.Fatalf("Creating test image registry failed: %v", err)
278 }
279
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200280 // Launch cluster.
281 clusterOptions := cluster.ClusterOptions{
Lorenz Brun150f24a2023-07-13 20:11:06 +0200282 NumNodes: 2,
283 LocalRegistry: lr,
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200284 }
285 cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
286 if err != nil {
287 t.Fatalf("LaunchCluster failed: %v", err)
288 }
289 defer func() {
290 err := cluster.Close()
291 if err != nil {
292 t.Fatalf("cluster Close failed: %v", err)
293 }
294 }()
295
296 clientSet, err := cluster.GetKubeClientSet()
297 if err != nil {
298 t.Fatal(err)
299 }
300 util.TestEventual(t, "Add KubernetesWorker roles", ctx, smallTestTimeout, func(ctx context.Context) error {
301 // Make everything but the first node into KubernetesWorkers.
302 for i := 1; i < clusterOptions.NumNodes; i++ {
303 err := cluster.MakeKubernetesWorker(ctx, cluster.NodeIDs[i])
304 if err != nil {
305 return util.Permanent(fmt.Errorf("MakeKubernetesWorker: %w", err))
306 }
307 }
308 return nil
309 })
310 util.TestEventual(t, "Node is registered and ready", ctx, largeTestTimeout, func(ctx context.Context) error {
311 nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
312 if err != nil {
313 return err
314 }
315 if len(nodes.Items) < 1 {
316 return errors.New("node not yet registered")
317 }
318 node := nodes.Items[0]
319 for _, cond := range node.Status.Conditions {
320 if cond.Type != corev1.NodeReady {
321 continue
322 }
323 if cond.Status != corev1.ConditionTrue {
324 return fmt.Errorf("node not ready: %v", cond.Message)
325 }
326 }
327 return nil
328 })
329 util.TestEventual(t, "Simple deployment", ctx, largeTestTimeout, func(ctx context.Context) error {
330 _, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeTestDeploymentSpec("test-deploy-1"), metav1.CreateOptions{})
331 return err
332 })
333 util.TestEventual(t, "Simple deployment is running", ctx, largeTestTimeout, func(ctx context.Context) error {
334 res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-1"})
335 if err != nil {
336 return err
337 }
338 if len(res.Items) == 0 {
339 return errors.New("pod didn't get created")
340 }
341 pod := res.Items[0]
342 if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
343 return nil
344 }
345 events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
346 if err != nil || len(events.Items) == 0 {
347 return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
348 } else {
349 return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
350 }
351 })
352 util.TestEventual(t, "Simple deployment with gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
353 deployment := makeTestDeploymentSpec("test-deploy-2")
354 gvisorStr := "gvisor"
355 deployment.Spec.Template.Spec.RuntimeClassName = &gvisorStr
356 _, err := clientSet.AppsV1().Deployments("default").Create(ctx, deployment, metav1.CreateOptions{})
357 return err
358 })
359 util.TestEventual(t, "Simple deployment is running on gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
360 res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-2"})
361 if err != nil {
362 return err
363 }
364 if len(res.Items) == 0 {
365 return errors.New("pod didn't get created")
366 }
367 pod := res.Items[0]
368 if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
369 return nil
370 }
371 events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
372 if err != nil || len(events.Items) == 0 {
373 return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
374 } else {
375 var errorMsg strings.Builder
376 for _, msg := range events.Items {
377 errorMsg.WriteString(" | ")
378 errorMsg.WriteString(msg.Message)
379 }
380 return fmt.Errorf("pod is not ready: %v", errorMsg.String())
381 }
382 })
383 util.TestEventual(t, "Simple StatefulSet with PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
384 _, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-1", corev1.PersistentVolumeFilesystem), metav1.CreateOptions{})
385 return err
386 })
387 util.TestEventual(t, "Simple StatefulSet with PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
388 res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-1"})
389 if err != nil {
390 return err
391 }
392 if len(res.Items) == 0 {
393 return errors.New("pod didn't get created")
394 }
395 pod := res.Items[0]
396 if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
397 return nil
398 }
399 events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
400 if err != nil || len(events.Items) == 0 {
401 return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
402 } else {
403 return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
404 }
405 })
406 util.TestEventual(t, "Simple StatefulSet with Block PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
407 _, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-2", corev1.PersistentVolumeBlock), metav1.CreateOptions{})
408 return err
409 })
410 util.TestEventual(t, "Simple StatefulSet with Block PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
411 res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-2"})
412 if err != nil {
413 return err
414 }
415 if len(res.Items) == 0 {
416 return errors.New("pod didn't get created")
417 }
418 pod := res.Items[0]
419 if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
420 return nil
421 }
422 events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
423 if err != nil || len(events.Items) == 0 {
424 return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
425 } else {
426 return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
427 }
428 })
429 util.TestEventual(t, "In-cluster self-test job", ctx, smallTestTimeout, func(ctx context.Context) error {
430 _, err := clientSet.BatchV1().Jobs("default").Create(ctx, makeSelftestSpec("selftest"), metav1.CreateOptions{})
431 return err
432 })
433 util.TestEventual(t, "In-cluster self-test job passed", ctx, smallTestTimeout, func(ctx context.Context) error {
434 res, err := clientSet.BatchV1().Jobs("default").Get(ctx, "selftest", metav1.GetOptions{})
435 if err != nil {
436 return err
437 }
438 if res.Status.Failed > 0 {
439 pods, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{
440 LabelSelector: "job-name=selftest",
441 })
442 if err != nil {
443 return util.Permanent(fmt.Errorf("job failed but failed to find pod: %w", err))
444 }
445 if len(pods.Items) < 1 {
446 return fmt.Errorf("job failed but pod does not exist")
447 }
448 lines, err := getPodLogLines(ctx, clientSet, pods.Items[0].Name, 1)
449 if err != nil {
450 return fmt.Errorf("job failed but could not get logs: %w", err)
451 }
452 if len(lines) > 0 {
453 return util.Permanent(fmt.Errorf("job failed, last log line: %s", lines[0]))
454 }
455 return util.Permanent(fmt.Errorf("job failed, empty log"))
456 }
457 if res.Status.Succeeded > 0 {
458 return nil
459 }
460 return fmt.Errorf("job still running")
461 })
Lorenz Brun276a7462023-07-12 21:28:54 +0200462 util.TestEventual(t, "Start NodePort test setup", ctx, smallTestTimeout, func(ctx context.Context) error {
463 _, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeHTTPServerDeploymentSpec("nodeport-server"), metav1.CreateOptions{})
464 if err != nil && !kerrors.IsAlreadyExists(err) {
465 return err
466 }
467 _, err = clientSet.CoreV1().Services("default").Create(ctx, makeHTTPServerNodePortService("nodeport-server"), metav1.CreateOptions{})
468 if err != nil && !kerrors.IsAlreadyExists(err) {
469 return err
470 }
471 return nil
472 })
473 util.TestEventual(t, "NodePort accessible from all nodes", ctx, smallTestTimeout, func(ctx context.Context) error {
474 nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
475 if err != nil {
476 return err
477 }
478 // Use a new client for each attempt
479 hc := http.Client{
480 Timeout: 2 * time.Second,
481 Transport: &http.Transport{
482 Dial: cluster.SOCKSDialer.Dial,
483 },
484 }
485 for _, n := range nodes.Items {
486 var addr string
487 for _, a := range n.Status.Addresses {
488 if a.Type == corev1.NodeInternalIP {
489 addr = a.Address
490 }
491 }
492 u := url.URL{Scheme: "http", Host: addr, Path: "/"}
493 res, err := hc.Get(u.String())
494 if err != nil {
495 return fmt.Errorf("failed getting from node %q: %w", n.Name, err)
496 }
497 if res.StatusCode != http.StatusOK {
498 return fmt.Errorf("getting from node %q: HTTP %d", n.Name, res.StatusCode)
499 }
500 t.Logf("Got response from %q", n.Name)
501 }
502 return nil
503 })
Tim Windelschmidt3bdb5fc2024-03-14 18:47:35 +0100504 util.TestEventual(t, "containerd metrics retrieved", ctx, smallTestTimeout, func(ctx context.Context) error {
505 pool := x509.NewCertPool()
506 pool.AddCert(cluster.CACertificate)
507 cl := http.Client{
508 Transport: &http.Transport{
509 TLSClientConfig: &tls.Config{
510 Certificates: []tls.Certificate{cluster.Owner},
511 RootCAs: pool,
512 },
513 DialContext: func(ctx context.Context, _, addr string) (net.Conn, error) {
514 return cluster.DialNode(ctx, addr)
515 },
516 },
517 }
518 u := url.URL{
519 Scheme: "https",
520 Host: net.JoinHostPort(cluster.NodeIDs[1], common.MetricsPort.PortString()),
521 Path: "/metrics/containerd",
522 }
523 res, err := cl.Get(u.String())
524 if err != nil {
525 return err
526 }
527 defer res.Body.Close()
528 if res.StatusCode != 200 {
529 return fmt.Errorf("status code %d", res.StatusCode)
530 }
531
532 body, err := io.ReadAll(res.Body)
533 if err != nil {
534 return err
535 }
536 needle := "containerd_build_info_total"
537 if !strings.Contains(string(body), needle) {
538 return util.Permanent(fmt.Errorf("could not find %q in returned response", needle))
539 }
540 return nil
541 })
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200542 if os.Getenv("HAVE_NESTED_KVM") != "" {
543 util.TestEventual(t, "Pod for KVM/QEMU smoke test", ctx, smallTestTimeout, func(ctx context.Context) error {
544 runcRuntimeClass := "runc"
545 _, err := clientSet.CoreV1().Pods("default").Create(ctx, &corev1.Pod{
546 ObjectMeta: metav1.ObjectMeta{
547 Name: "vm-smoketest",
548 },
549 Spec: corev1.PodSpec{
550 Containers: []corev1.Container{{
551 Name: "vm-smoketest",
552 ImagePullPolicy: corev1.PullNever,
Lorenz Brun150f24a2023-07-13 20:11:06 +0200553 Image: "test.monogon.internal/metropolis/vm/smoketest:smoketest_container",
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200554 Resources: corev1.ResourceRequirements{
555 Limits: corev1.ResourceList{
556 "devices.monogon.dev/kvm": *resource.NewQuantity(1, ""),
557 },
558 },
559 }},
560 RuntimeClassName: &runcRuntimeClass,
561 RestartPolicy: corev1.RestartPolicyNever,
562 },
563 }, metav1.CreateOptions{})
564 return err
565 })
566 util.TestEventual(t, "KVM/QEMU smoke test completion", ctx, smallTestTimeout, func(ctx context.Context) error {
567 pod, err := clientSet.CoreV1().Pods("default").Get(ctx, "vm-smoketest", metav1.GetOptions{})
568 if err != nil {
569 return fmt.Errorf("failed to get pod: %w", err)
570 }
571 if pod.Status.Phase == corev1.PodSucceeded {
572 return nil
573 }
574 events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
575 if err != nil || len(events.Items) == 0 {
576 return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
577 } else {
578 return fmt.Errorf("pod is not ready: %v", events.Items[len(events.Items)-1].Message)
579 }
580 })
581 }
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200582}