blob: 306d29eed8ca5b47b89d467507273c67873fab97 [file] [log] [blame]
Lorenz Brunfc5dbc62020-05-28 12:18:07 +02001// Copyright 2020 The Monogon Project Authors.
2//
3// SPDX-License-Identifier: Apache-2.0
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
17package e2e
18
19import (
20 "context"
Serge Bazanski54e212a2023-06-14 13:45:11 +020021 "crypto/tls"
22 "crypto/x509"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020023 "errors"
24 "fmt"
Serge Bazanski2cfafc92023-03-21 16:42:47 +010025 "io"
Leopold Schabele28e6d72020-06-03 11:39:25 +020026 "net"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020027 "net/http"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020028 _ "net/http/pprof"
Serge Bazanski54e212a2023-06-14 13:45:11 +020029 "net/url"
Lorenz Brun3ff5af32020-06-24 16:34:11 +020030 "os"
Lorenz Brun5e4fc2d2020-09-22 18:35:15 +020031 "strings"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020032 "testing"
33 "time"
34
Tim Windelschmidt2a1d1b22024-02-06 07:07:42 +010035 "github.com/bazelbuild/rules_go/go/runfiles"
Serge Bazanskibe742842022-04-04 13:18:50 +020036 "google.golang.org/grpc"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020037 corev1 "k8s.io/api/core/v1"
Lorenz Brun276a7462023-07-12 21:28:54 +020038 kerrors "k8s.io/apimachinery/pkg/api/errors"
Lorenz Brun30167f52021-03-17 17:49:01 +010039 "k8s.io/apimachinery/pkg/api/resource"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020040 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
41 podv1 "k8s.io/kubernetes/pkg/api/v1/pod"
42
Tim Windelschmidt2a1d1b22024-02-06 07:07:42 +010043 apb "source.monogon.dev/metropolis/proto/api"
Serge Bazanski7be54aa2024-04-09 12:07:10 +020044 cpb "source.monogon.dev/metropolis/proto/common"
Tim Windelschmidt2a1d1b22024-02-06 07:07:42 +010045
Serge Bazanski31370b02021-01-07 16:31:14 +010046 common "source.monogon.dev/metropolis/node"
Serge Bazanskibe742842022-04-04 13:18:50 +020047 "source.monogon.dev/metropolis/node/core/rpc"
Lorenz Brun150f24a2023-07-13 20:11:06 +020048 "source.monogon.dev/metropolis/pkg/localregistry"
Serge Bazanski05f813b2023-03-16 17:58:39 +010049 "source.monogon.dev/metropolis/test/launch"
Serge Bazanski66e58952021-10-05 17:06:56 +020050 "source.monogon.dev/metropolis/test/launch/cluster"
Mateusz Zalegaddf19b42022-06-22 12:27:37 +020051 "source.monogon.dev/metropolis/test/util"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020052)
53
const (
	// Timeout for the global test context.
	//
	// Bazel would eventually time out the test after 900s ("large") if, for
	// some reason, the context cancellation fails to abort it.
	globalTestTimeout = 600 * time.Second

	// Timeouts for individual end-to-end tests of different sizes.
	// smallTestTimeout bounds quick control-plane operations; largeTestTimeout
	// bounds steps that wait for Kubernetes workloads to converge.
	smallTestTimeout = 60 * time.Second
	largeTestTimeout = 120 * time.Second
)
65
Serge Bazanskia0bc6d32023-06-28 18:57:40 +020066// TestE2ECore exercisees the core functionality of Metropolis: maintaining a
67// control plane, changing node roles, ...
68//
69// The tests are performed against an in-memory cluster.
70func TestE2ECore(t *testing.T) {
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020071 // Set a global timeout to make sure this terminates
Leopold Schabeld603f842020-06-09 17:48:09 +020072 ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
Serge Bazanski1f9a03b2021-08-17 13:40:53 +020073 defer cancel()
Serge Bazanski66e58952021-10-05 17:06:56 +020074
Tim Windelschmidt2a1d1b22024-02-06 07:07:42 +010075 rPath, err := runfiles.Rlocation("_main/metropolis/test/e2e/testimages_manifest.prototxt")
76 if err != nil {
77 t.Fatalf("Resolving registry manifest failed: %v", err)
78 }
79 df, err := os.ReadFile(rPath)
80 if err != nil {
81 t.Fatalf("Reading registry manifest failed: %v", err)
82 }
83 lr, err := localregistry.FromBazelManifest(df)
Lorenz Brun150f24a2023-07-13 20:11:06 +020084 if err != nil {
85 t.Fatalf("Creating test image registry failed: %v", err)
86 }
Serge Bazanski66e58952021-10-05 17:06:56 +020087 // Launch cluster.
Serge Bazanskie78a0892021-10-07 17:03:49 +020088 clusterOptions := cluster.ClusterOptions{
Lorenz Brun150f24a2023-07-13 20:11:06 +020089 NumNodes: 2,
90 LocalRegistry: lr,
Serge Bazanskie78a0892021-10-07 17:03:49 +020091 }
92 cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020093 if err != nil {
Serge Bazanski66e58952021-10-05 17:06:56 +020094 t.Fatalf("LaunchCluster failed: %v", err)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020095 }
Serge Bazanski66e58952021-10-05 17:06:56 +020096 defer func() {
97 err := cluster.Close()
98 if err != nil {
99 t.Fatalf("cluster Close failed: %v", err)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200100 }
101 }()
Serge Bazanski1f9a03b2021-08-17 13:40:53 +0200102
Serge Bazanski05f813b2023-03-16 17:58:39 +0100103 launch.Log("E2E: Cluster running, starting tests...")
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200104
Serge Bazanskibe742842022-04-04 13:18:50 +0200105 // Dial first node's curator.
Serge Bazanski8535cb52023-03-29 14:15:08 +0200106 creds := rpc.NewAuthenticatedCredentials(cluster.Owner, rpc.WantInsecure())
Serge Bazanskibe742842022-04-04 13:18:50 +0200107 remote := net.JoinHostPort(cluster.NodeIDs[0], common.CuratorServicePort.PortString())
108 cl, err := grpc.Dial(remote, grpc.WithContextDialer(cluster.DialNode), grpc.WithTransportCredentials(creds))
109 if err != nil {
110 t.Fatalf("failed to dial first node's curator: %v", err)
111 }
112 defer cl.Close()
113 mgmt := apb.NewManagementClient(cl)
114
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200115 util.TestEventual(t, "Retrieving cluster directory sucessful", ctx, 60*time.Second, func(ctx context.Context) error {
116 res, err := mgmt.GetClusterInfo(ctx, &apb.GetClusterInfoRequest{})
117 if err != nil {
118 return fmt.Errorf("GetClusterInfo: %w", err)
119 }
Serge Bazanskibf68fa92021-10-05 17:53:58 +0200120
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200121 // Ensure that the expected node count is present.
122 nodes := res.ClusterDirectory.Nodes
123 if want, got := clusterOptions.NumNodes, len(nodes); want != got {
124 return fmt.Errorf("wanted %d nodes in cluster directory, got %d", want, got)
125 }
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100126
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200127 // Ensure the nodes have the expected addresses.
128 addresses := make(map[string]bool)
129 for _, n := range nodes {
130 if len(n.Addresses) != 1 {
Serge Bazanski538292d2024-04-17 14:50:02 +0200131 return fmt.Errorf("node %s has no addresss", n.Id)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200132 }
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200133 address := n.Addresses[0].Host
134 addresses[address] = true
135 }
Serge Bazanski2cfafc92023-03-21 16:42:47 +0100136
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200137 for _, address := range []string{"10.1.0.2", "10.1.0.3"} {
138 if !addresses[address] {
139 return fmt.Errorf("address %q not found in directory", address)
Lorenz Brun30167f52021-03-17 17:49:01 +0100140 }
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200141 }
142 return nil
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200143 })
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200144 util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
145 util.TestEventual(t, "Node rejoin successful", ctx, 60*time.Second, func(ctx context.Context) error {
146 // Ensure nodes rejoin the cluster after a reboot by reboting the 1st node.
147 if err := cluster.RebootNode(ctx, 1); err != nil {
148 return fmt.Errorf("while rebooting a node: %w", err)
149 }
150 return nil
151 })
152 util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
153 util.TestEventual(t, "Prometheus node metrics retrieved", ctx, smallTestTimeout, func(ctx context.Context) error {
154 pool := x509.NewCertPool()
155 pool.AddCert(cluster.CACertificate)
156 cl := http.Client{
157 Transport: &http.Transport{
158 TLSClientConfig: &tls.Config{
159 Certificates: []tls.Certificate{cluster.Owner},
160 RootCAs: pool,
161 },
162 DialContext: func(ctx context.Context, _, addr string) (net.Conn, error) {
163 return cluster.DialNode(ctx, addr)
164 },
165 },
166 }
167 u := url.URL{
168 Scheme: "https",
169 Host: net.JoinHostPort(cluster.NodeIDs[0], common.MetricsPort.PortString()),
170 Path: "/metrics/node",
171 }
172 res, err := cl.Get(u.String())
173 if err != nil {
174 return err
175 }
176 defer res.Body.Close()
177 if res.StatusCode != 200 {
178 return fmt.Errorf("status code %d", res.StatusCode)
179 }
180
181 body, err := io.ReadAll(res.Body)
182 if err != nil {
183 return err
184 }
185 needle := "node_uname_info"
186 if !strings.Contains(string(body), needle) {
187 return util.Permanent(fmt.Errorf("could not find %q in returned response", needle))
188 }
189 return nil
190 })
191}
192
// TestE2ECoreHA exercises the basics of a high-availability control plane by
// starting up a 3-node cluster, turning all nodes into ConsensusMembers, then
// performing a rolling restart.
func TestE2ECoreHA(t *testing.T) {
	// Set a global timeout to make sure this terminates
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()

	// Resolve and load the test image registry manifest from runfiles.
	rPath, err := runfiles.Rlocation("_main/metropolis/test/e2e/testimages_manifest.prototxt")
	if err != nil {
		t.Fatalf("Resolving registry manifest failed: %v", err)
	}
	df, err := os.ReadFile(rPath)
	if err != nil {
		t.Fatalf("Reading registry manifest failed: %v", err)
	}
	lr, err := localregistry.FromBazelManifest(df)
	if err != nil {
		t.Fatalf("Creating test image registry failed: %v", err)
	}
	// Launch cluster.
	clusterOptions := cluster.ClusterOptions{
		NumNodes:        3,
		LocalRegistry:   lr,
		NodeLogsToFiles: true,
	}
	cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
	if err != nil {
		t.Fatalf("LaunchCluster failed: %v", err)
	}
	defer func() {
		err := cluster.Close()
		if err != nil {
			t.Fatalf("cluster Close failed: %v", err)
		}
	}()

	launch.Log("E2E: Cluster running, starting tests...")

	util.MustTestEventual(t, "Add ConsensusMember roles", ctx, smallTestTimeout, func(ctx context.Context) error {
		// Make everything but the first node into ConsensusMember.
		// (The first node already runs the consensus as the bootstrap node.)
		for i := 1; i < clusterOptions.NumNodes; i++ {
			err := cluster.MakeConsensusMember(ctx, cluster.NodeIDs[i])
			if err != nil {
				return fmt.Errorf("MakeConsensusMember(%d/%s): %w", i, cluster.NodeIDs[i], err)
			}
		}
		return nil
	})
	util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)

	// Perform rolling restart of all nodes. When a node rejoins it must be able to
	// contact the cluster, so this also exercises that the cluster is serving even
	// with the node having rebooted.
	for i := 0; i < clusterOptions.NumNodes; i++ {
		util.MustTestEventual(t, fmt.Sprintf("Node %d rejoin successful", i), ctx, 60*time.Second, func(ctx context.Context) error {
			// Reboot node i and ensure it rejoins the cluster.
			// NOTE(review): the closure captures the loop variable i; this
			// relies on per-iteration loop variables (Go 1.22+) or on
			// MustTestEventual invoking the closure synchronously — confirm.
			if err := cluster.RebootNode(ctx, i); err != nil {
				return fmt.Errorf("while rebooting a node: %w", err)
			}
			return nil
		})
	}
}
257
Serge Bazanski7be54aa2024-04-09 12:07:10 +0200258// TestE2EColdStartHA exercises an HA cluster being fully shut down then
259// restarted again.
260//
261// Metropolis currently doesn't support cold startups from TPM/Secure clusters,
262// so we test a non-TPM/Insecure cluster.
263func TestE2EColdStartHA(t *testing.T) {
264 // Set a global timeout to make sure this terminates
265 ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
266 defer cancel()
267
268 // Launch cluster.
269 clusterOptions := cluster.ClusterOptions{
270 NumNodes: 3,
271 NodeLogsToFiles: true,
272 InitialClusterConfiguration: &cpb.ClusterConfiguration{
273 TpmMode: cpb.ClusterConfiguration_TPM_MODE_DISABLED,
274 StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
275 },
276 }
277 cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
278 if err != nil {
279 t.Fatalf("LaunchCluster failed: %v", err)
280 }
281 defer func() {
282 err := cluster.Close()
283 if err != nil {
284 t.Fatalf("cluster Close failed: %v", err)
285 }
286 }()
287
288 launch.Log("E2E: Cluster running, starting tests...")
289
290 util.MustTestEventual(t, "Add ConsensusMember roles", ctx, smallTestTimeout, func(ctx context.Context) error {
291 // Make everything but the first node into ConsensusMember.
292 for i := 1; i < clusterOptions.NumNodes; i++ {
293 err := cluster.MakeConsensusMember(ctx, cluster.NodeIDs[i])
294 if err != nil {
295 return util.Permanent(fmt.Errorf("MakeConsensusMember(%d/%s): %w", i, cluster.NodeIDs[i], err))
296 }
297 }
298 return nil
299 })
300 util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
301
302 // Shut every node down.
303 for i := 0; i < clusterOptions.NumNodes; i++ {
304 if err := cluster.ShutdownNode(i); err != nil {
305 t.Fatalf("Could not shutdown node %d", i)
306 }
307 }
308 // Start every node back up.
309 for i := 0; i < clusterOptions.NumNodes; i++ {
310 if err := cluster.StartNode(i); err != nil {
311 t.Fatalf("Could not shutdown node %d", i)
312 }
313 }
314 // Check if the cluster comes back up.
315 util.TestEventual(t, "Heartbeat test successful", ctx, 60*time.Second, cluster.AllNodesHealthy)
316}
317
// TestE2EKubernetes exercises the Kubernetes functionality of Metropolis.
//
// The tests are performed against an in-memory cluster: workloads (plain
// deployments, gVisor deployments, StatefulSets with filesystem and block
// PVCs, a self-test Job, a NodePort service) are created through the
// Kubernetes API and polled until they converge.
func TestE2EKubernetes(t *testing.T) {
	// Set a global timeout to make sure this terminates
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()

	// Resolve and load the test image registry manifest from runfiles.
	rPath, err := runfiles.Rlocation("_main/metropolis/test/e2e/testimages_manifest.prototxt")
	if err != nil {
		t.Fatalf("Resolving registry manifest failed: %v", err)
	}
	df, err := os.ReadFile(rPath)
	if err != nil {
		t.Fatalf("Reading registry manifest failed: %v", err)
	}
	lr, err := localregistry.FromBazelManifest(df)
	if err != nil {
		t.Fatalf("Creating test image registry failed: %v", err)
	}

	// Launch cluster.
	clusterOptions := cluster.ClusterOptions{
		NumNodes:      2,
		LocalRegistry: lr,
	}
	cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
	if err != nil {
		t.Fatalf("LaunchCluster failed: %v", err)
	}
	defer func() {
		err := cluster.Close()
		if err != nil {
			t.Fatalf("cluster Close failed: %v", err)
		}
	}()

	clientSet, err := cluster.GetKubeClientSet()
	if err != nil {
		t.Fatal(err)
	}
	util.TestEventual(t, "Add KubernetesWorker roles", ctx, smallTestTimeout, func(ctx context.Context) error {
		// Make everything but the first node into KubernetesWorkers.
		for i := 1; i < clusterOptions.NumNodes; i++ {
			err := cluster.MakeKubernetesWorker(ctx, cluster.NodeIDs[i])
			if err != nil {
				return util.Permanent(fmt.Errorf("MakeKubernetesWorker: %w", err))
			}
		}
		return nil
	})
	util.TestEventual(t, "Node is registered and ready", ctx, largeTestTimeout, func(ctx context.Context) error {
		nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			return err
		}
		if len(nodes.Items) < 1 {
			return errors.New("node not yet registered")
		}
		node := nodes.Items[0]
		// Only the NodeReady condition is checked; other conditions are
		// ignored.
		for _, cond := range node.Status.Conditions {
			if cond.Type != corev1.NodeReady {
				continue
			}
			if cond.Status != corev1.ConditionTrue {
				return fmt.Errorf("node not ready: %v", cond.Message)
			}
		}
		return nil
	})
	util.TestEventual(t, "Simple deployment", ctx, largeTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeTestDeploymentSpec("test-deploy-1"), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple deployment is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-1"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		// Pod not yet available: surface the latest events (or just the
		// phase) in the retry error for easier debugging.
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
		}
	})
	util.TestEventual(t, "Simple deployment with gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
		// Same deployment as above, but forced onto the gVisor runtime class.
		deployment := makeTestDeploymentSpec("test-deploy-2")
		gvisorStr := "gvisor"
		deployment.Spec.Template.Spec.RuntimeClassName = &gvisorStr
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, deployment, metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple deployment is running on gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-2"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			// Concatenate all event messages, as gVisor failures often only
			// show up in later events.
			var errorMsg strings.Builder
			for _, msg := range events.Items {
				errorMsg.WriteString(" | ")
				errorMsg.WriteString(msg.Message)
			}
			return fmt.Errorf("pod is not ready: %v", errorMsg.String())
		}
	})
	util.TestEventual(t, "Simple StatefulSet with PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-1", corev1.PersistentVolumeFilesystem), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple StatefulSet with PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-1"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
		}
	})
	util.TestEventual(t, "Simple StatefulSet with Block PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
		// Same as above, but with a raw block volume instead of a filesystem.
		_, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-2", corev1.PersistentVolumeBlock), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple StatefulSet with Block PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-2"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
		}
	})
	util.TestEventual(t, "In-cluster self-test job", ctx, smallTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.BatchV1().Jobs("default").Create(ctx, makeSelftestSpec("selftest"), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "In-cluster self-test job passed", ctx, smallTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.BatchV1().Jobs("default").Get(ctx, "selftest", metav1.GetOptions{})
		if err != nil {
			return err
		}
		if res.Status.Failed > 0 {
			// The job failed: try to extract the last log line of its pod to
			// report a useful, permanent error.
			pods, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{
				LabelSelector: "job-name=selftest",
			})
			if err != nil {
				return util.Permanent(fmt.Errorf("job failed but failed to find pod: %w", err))
			}
			if len(pods.Items) < 1 {
				return fmt.Errorf("job failed but pod does not exist")
			}
			lines, err := getPodLogLines(ctx, clientSet, pods.Items[0].Name, 1)
			if err != nil {
				return fmt.Errorf("job failed but could not get logs: %w", err)
			}
			if len(lines) > 0 {
				return util.Permanent(fmt.Errorf("job failed, last log line: %s", lines[0]))
			}
			return util.Permanent(fmt.Errorf("job failed, empty log"))
		}
		if res.Status.Succeeded > 0 {
			return nil
		}
		return fmt.Errorf("job still running")
	})
	util.TestEventual(t, "Start NodePort test setup", ctx, smallTestTimeout, func(ctx context.Context) error {
		// AlreadyExists is tolerated so this step is idempotent across
		// retries.
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeHTTPServerDeploymentSpec("nodeport-server"), metav1.CreateOptions{})
		if err != nil && !kerrors.IsAlreadyExists(err) {
			return err
		}
		_, err = clientSet.CoreV1().Services("default").Create(ctx, makeHTTPServerNodePortService("nodeport-server"), metav1.CreateOptions{})
		if err != nil && !kerrors.IsAlreadyExists(err) {
			return err
		}
		return nil
	})
	util.TestEventual(t, "NodePort accessible from all nodes", ctx, smallTestTimeout, func(ctx context.Context) error {
		nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			return err
		}
		// Use a new client for each attempt
		hc := http.Client{
			Timeout: 2 * time.Second,
			Transport: &http.Transport{
				// Dial node IPs through the test harness' SOCKS proxy.
				Dial: cluster.SOCKSDialer.Dial,
			},
		}
		for _, n := range nodes.Items {
			// Pick the node's (last listed) internal IP address.
			var addr string
			for _, a := range n.Status.Addresses {
				if a.Type == corev1.NodeInternalIP {
					addr = a.Address
				}
			}
			u := url.URL{Scheme: "http", Host: addr, Path: "/"}
			res, err := hc.Get(u.String())
			if err != nil {
				return fmt.Errorf("failed getting from node %q: %w", n.Name, err)
			}
			if res.StatusCode != http.StatusOK {
				return fmt.Errorf("getting from node %q: HTTP %d", n.Name, res.StatusCode)
			}
			t.Logf("Got response from %q", n.Name)
		}
		return nil
	})
	util.TestEventual(t, "containerd metrics retrieved", ctx, smallTestTimeout, func(ctx context.Context) error {
		// Build an HTTPS client authenticated with the cluster owner
		// certificate, trusting the cluster CA, dialing through the harness.
		pool := x509.NewCertPool()
		pool.AddCert(cluster.CACertificate)
		cl := http.Client{
			Transport: &http.Transport{
				TLSClientConfig: &tls.Config{
					Certificates: []tls.Certificate{cluster.Owner},
					RootCAs:      pool,
				},
				DialContext: func(ctx context.Context, _, addr string) (net.Conn, error) {
					return cluster.DialNode(ctx, addr)
				},
			},
		}
		// Query node 1, the node that was made a KubernetesWorker above.
		u := url.URL{
			Scheme: "https",
			Host:   net.JoinHostPort(cluster.NodeIDs[1], common.MetricsPort.PortString()),
			Path:   "/metrics/containerd",
		}
		res, err := cl.Get(u.String())
		if err != nil {
			return err
		}
		defer res.Body.Close()
		if res.StatusCode != 200 {
			return fmt.Errorf("status code %d", res.StatusCode)
		}

		body, err := io.ReadAll(res.Body)
		if err != nil {
			return err
		}
		needle := "containerd_build_info_total"
		if !strings.Contains(string(body), needle) {
			return util.Permanent(fmt.Errorf("could not find %q in returned response", needle))
		}
		return nil
	})
	// The KVM/QEMU smoke test needs nested virtualization; only run it when
	// the environment advertises it.
	if os.Getenv("HAVE_NESTED_KVM") != "" {
		util.TestEventual(t, "Pod for KVM/QEMU smoke test", ctx, smallTestTimeout, func(ctx context.Context) error {
			runcRuntimeClass := "runc"
			_, err := clientSet.CoreV1().Pods("default").Create(ctx, &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name: "vm-smoketest",
				},
				Spec: corev1.PodSpec{
					Containers: []corev1.Container{{
						Name:            "vm-smoketest",
						ImagePullPolicy: corev1.PullNever,
						Image:           "test.monogon.internal/metropolis/vm/smoketest:smoketest_container",
						Resources: corev1.ResourceRequirements{
							Limits: corev1.ResourceList{
								// Request one KVM device from the device
								// plugin so the pod can access /dev/kvm.
								"devices.monogon.dev/kvm": *resource.NewQuantity(1, ""),
							},
						},
					}},
					RuntimeClassName: &runcRuntimeClass,
					RestartPolicy:    corev1.RestartPolicyNever,
				},
			}, metav1.CreateOptions{})
			return err
		})
		util.TestEventual(t, "KVM/QEMU smoke test completion", ctx, smallTestTimeout, func(ctx context.Context) error {
			pod, err := clientSet.CoreV1().Pods("default").Get(ctx, "vm-smoketest", metav1.GetOptions{})
			if err != nil {
				return fmt.Errorf("failed to get pod: %w", err)
			}
			if pod.Status.Phase == corev1.PodSucceeded {
				return nil
			}
			events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
			if err != nil || len(events.Items) == 0 {
				return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
			} else {
				return fmt.Errorf("pod is not ready: %v", events.Items[len(events.Items)-1].Message)
			}
		})
	}
}