// Copyright 2020 The Monogon Project Authors.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package e2e

import (
	"context"
	"crypto/tls"
	"crypto/x509"
	"errors"
	"fmt"
	"io"
	"net"
	"net/http"
	_ "net/http"
	_ "net/http/pprof"
	"net/url"
	"os"
	"strings"
	"testing"
	"time"

	"google.golang.org/grpc"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	podv1 "k8s.io/kubernetes/pkg/api/v1/pod"

	"source.monogon.dev/metropolis/cli/pkg/datafile"
	common "source.monogon.dev/metropolis/node"
	"source.monogon.dev/metropolis/node/core/identity"
	"source.monogon.dev/metropolis/node/core/rpc"
	"source.monogon.dev/metropolis/pkg/localregistry"
	apb "source.monogon.dev/metropolis/proto/api"
	"source.monogon.dev/metropolis/test/launch"
	"source.monogon.dev/metropolis/test/launch/cluster"
	"source.monogon.dev/metropolis/test/util"
)

const (
	// Timeout for the global test context.
	//
	// Bazel would eventually time out the test after 900s ("large") if, for
	// some reason, the context cancellation fails to abort it.
	globalTestTimeout = 600 * time.Second

	// Timeouts for individual end-to-end tests of different sizes.
	smallTestTimeout = 60 * time.Second
	largeTestTimeout = 120 * time.Second
)

// TestE2ECore exercises the core functionality of Metropolis: maintaining a
// control plane, changing node roles, ...
//
// The tests are performed against an in-memory cluster.
func TestE2ECore(t *testing.T) {
	// Set a global timeout to make sure this terminates.
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()

	lr, err := localregistry.FromBazelManifest(datafile.MustGet("metropolis/test/e2e/testimages_manifest.prototxt"))
	if err != nil {
		t.Fatalf("Creating test image registry failed: %v", err)
	}

	// Launch cluster.
	clusterOptions := cluster.ClusterOptions{
		NumNodes:      2,
		LocalRegistry: lr,
	}
	cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
	if err != nil {
		t.Fatalf("LaunchCluster failed: %v", err)
	}
	defer func() {
		err := cluster.Close()
		if err != nil {
			t.Fatalf("cluster Close failed: %v", err)
		}
	}()

	launch.Log("E2E: Cluster running, starting tests...")

	// Dial first node's curator.
	creds := rpc.NewAuthenticatedCredentials(cluster.Owner, rpc.WantInsecure())
	remote := net.JoinHostPort(cluster.NodeIDs[0], common.CuratorServicePort.PortString())
	cl, err := grpc.Dial(remote, grpc.WithContextDialer(cluster.DialNode), grpc.WithTransportCredentials(creds))
	if err != nil {
		t.Fatalf("failed to dial first node's curator: %v", err)
	}
	defer cl.Close()
	mgmt := apb.NewManagementClient(cl)

	util.TestEventual(t, "Retrieving cluster directory successful", ctx, 60*time.Second, func(ctx context.Context) error {
		res, err := mgmt.GetClusterInfo(ctx, &apb.GetClusterInfoRequest{})
		if err != nil {
			return fmt.Errorf("GetClusterInfo: %w", err)
		}

		// Ensure that the expected node count is present.
		nodes := res.ClusterDirectory.Nodes
		if want, got := clusterOptions.NumNodes, len(nodes); want != got {
			return fmt.Errorf("wanted %d nodes in cluster directory, got %d", want, got)
		}

		// Ensure the nodes have the expected addresses.
		addresses := make(map[string]bool)
		for _, n := range nodes {
			if len(n.Addresses) != 1 {
				return fmt.Errorf("node %s has no address", identity.NodeID(n.PublicKey))
			}
			address := n.Addresses[0].Host
			addresses[address] = true
		}

		for _, address := range []string{"10.1.0.2", "10.1.0.3"} {
			if !addresses[address] {
				return fmt.Errorf("address %q not found in directory", address)
			}
		}
		return nil
	})
	util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
	util.TestEventual(t, "Node rejoin successful", ctx, 60*time.Second, func(ctx context.Context) error {
		// Ensure nodes rejoin the cluster after a reboot by rebooting the 1st node.
		if err := cluster.RebootNode(ctx, 1); err != nil {
			return fmt.Errorf("while rebooting a node: %w", err)
		}
		return nil
	})
	util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
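	// The following check scrapes the first node's metrics endpoint over HTTPS,
	// authenticating with the cluster owner certificate and trusting the cluster
	// CA, then looks for a well-known node exporter metric in the response.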
	util.TestEventual(t, "Prometheus node metrics retrieved", ctx, smallTestTimeout, func(ctx context.Context) error {
		pool := x509.NewCertPool()
		pool.AddCert(cluster.CACertificate)
		cl := http.Client{
			Transport: &http.Transport{
				TLSClientConfig: &tls.Config{
					Certificates: []tls.Certificate{cluster.Owner},
					RootCAs:      pool,
				},
				DialContext: func(ctx context.Context, _, addr string) (net.Conn, error) {
					return cluster.DialNode(ctx, addr)
				},
			},
		}
		u := url.URL{
			Scheme: "https",
			Host:   net.JoinHostPort(cluster.NodeIDs[0], common.MetricsPort.PortString()),
			Path:   "/metrics/node",
		}
		res, err := cl.Get(u.String())
		if err != nil {
			return err
		}
		defer res.Body.Close()
		if res.StatusCode != 200 {
			return fmt.Errorf("status code %d", res.StatusCode)
		}

		body, err := io.ReadAll(res.Body)
		if err != nil {
			return err
		}
		needle := "node_uname_info"
		if !strings.Contains(string(body), needle) {
			return util.Permanent(fmt.Errorf("could not find %q in returned response", needle))
		}
		return nil
	})
}

// TestE2EKubernetes exercises the Kubernetes functionality of Metropolis.
//
// The tests are performed against an in-memory cluster.
func TestE2EKubernetes(t *testing.T) {
	// Set a global timeout to make sure this terminates.
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()

	lr, err := localregistry.FromBazelManifest(datafile.MustGet("metropolis/test/e2e/testimages_manifest.prototxt"))
	if err != nil {
		t.Fatalf("Creating test image registry failed: %v", err)
	}

	// Launch cluster.
	clusterOptions := cluster.ClusterOptions{
		NumNodes:      2,
		LocalRegistry: lr,
	}
	cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
	if err != nil {
		t.Fatalf("LaunchCluster failed: %v", err)
	}
	defer func() {
		err := cluster.Close()
		if err != nil {
			t.Fatalf("cluster Close failed: %v", err)
		}
	}()

	clientSet, err := cluster.GetKubeClientSet()
	if err != nil {
		t.Fatal(err)
	}
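	// From here on the test drives the cluster through the Kubernetes API using
	// the client set obtained above.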
	util.TestEventual(t, "Add KubernetesWorker roles", ctx, smallTestTimeout, func(ctx context.Context) error {
		// Make everything but the first node into KubernetesWorkers.
		for i := 1; i < clusterOptions.NumNodes; i++ {
			err := cluster.MakeKubernetesWorker(ctx, cluster.NodeIDs[i])
			if err != nil {
				return util.Permanent(fmt.Errorf("MakeKubernetesWorker: %w", err))
			}
		}
		return nil
	})
	util.TestEventual(t, "Node is registered and ready", ctx, largeTestTimeout, func(ctx context.Context) error {
		nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			return err
		}
		if len(nodes.Items) < 1 {
			return errors.New("node not yet registered")
		}
		node := nodes.Items[0]
		for _, cond := range node.Status.Conditions {
			if cond.Type != corev1.NodeReady {
				continue
			}
			if cond.Status != corev1.ConditionTrue {
				return fmt.Errorf("node not ready: %v", cond.Message)
			}
		}
		return nil
	})
	util.TestEventual(t, "Simple deployment", ctx, largeTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeTestDeploymentSpec("test-deploy-1"), metav1.CreateOptions{})
		return err
	})
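	// The workload checks below share a pattern: list pods by the workload's
	// label selector, succeed once a pod reports available, and otherwise return
	// the pod's recent events so failures are easier to diagnose.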
	util.TestEventual(t, "Simple deployment is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-1"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
		}
	})
	util.TestEventual(t, "Simple deployment with gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
		deployment := makeTestDeploymentSpec("test-deploy-2")
		gvisorStr := "gvisor"
		deployment.Spec.Template.Spec.RuntimeClassName = &gvisorStr
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, deployment, metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple deployment is running on gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-2"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			var errorMsg strings.Builder
			for _, msg := range events.Items {
				errorMsg.WriteString(" | ")
				errorMsg.WriteString(msg.Message)
			}
			return fmt.Errorf("pod is not ready: %v", errorMsg.String())
		}
	})
	util.TestEventual(t, "Simple StatefulSet with PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-1", corev1.PersistentVolumeFilesystem), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple StatefulSet with PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-1"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
		}
	})
	util.TestEventual(t, "Simple StatefulSet with Block PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-2", corev1.PersistentVolumeBlock), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple StatefulSet with Block PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-2"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
		}
	})
	util.TestEventual(t, "In-cluster self-test job", ctx, smallTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.BatchV1().Jobs("default").Create(ctx, makeSelftestSpec("selftest"), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "In-cluster self-test job passed", ctx, smallTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.BatchV1().Jobs("default").Get(ctx, "selftest", metav1.GetOptions{})
		if err != nil {
			return err
		}
		if res.Status.Failed > 0 {
			pods, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{
				LabelSelector: "job-name=selftest",
			})
			if err != nil {
				return util.Permanent(fmt.Errorf("job failed but failed to find pod: %w", err))
			}
			if len(pods.Items) < 1 {
				return fmt.Errorf("job failed but pod does not exist")
			}
			lines, err := getPodLogLines(ctx, clientSet, pods.Items[0].Name, 1)
			if err != nil {
				return fmt.Errorf("job failed but could not get logs: %w", err)
			}
			if len(lines) > 0 {
				return util.Permanent(fmt.Errorf("job failed, last log line: %s", lines[0]))
			}
			return util.Permanent(fmt.Errorf("job failed, empty log"))
		}
		if res.Status.Succeeded > 0 {
			return nil
		}
		return fmt.Errorf("job still running")
	})
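	// The KVM/QEMU smoke test requires nested virtualization, so it only runs
	// when the test environment advertises it through HAVE_NESTED_KVM.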
	if os.Getenv("HAVE_NESTED_KVM") != "" {
		util.TestEventual(t, "Pod for KVM/QEMU smoke test", ctx, smallTestTimeout, func(ctx context.Context) error {
			runcRuntimeClass := "runc"
			_, err := clientSet.CoreV1().Pods("default").Create(ctx, &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name: "vm-smoketest",
				},
				Spec: corev1.PodSpec{
					Containers: []corev1.Container{{
						Name:            "vm-smoketest",
						ImagePullPolicy: corev1.PullNever,
						Image:           "test.monogon.internal/metropolis/vm/smoketest:smoketest_container",
						Resources: corev1.ResourceRequirements{
							Limits: corev1.ResourceList{
								"devices.monogon.dev/kvm": *resource.NewQuantity(1, ""),
							},
						},
					}},
					RuntimeClassName: &runcRuntimeClass,
					RestartPolicy:    corev1.RestartPolicyNever,
				},
			}, metav1.CreateOptions{})
			return err
		})
		util.TestEventual(t, "KVM/QEMU smoke test completion", ctx, smallTestTimeout, func(ctx context.Context) error {
			pod, err := clientSet.CoreV1().Pods("default").Get(ctx, "vm-smoketest", metav1.GetOptions{})
			if err != nil {
				return fmt.Errorf("failed to get pod: %w", err)
			}
			if pod.Status.Phase == corev1.PodSucceeded {
				return nil
			}
			events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
			if err != nil || len(events.Items) == 0 {
				return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
			} else {
				return fmt.Errorf("pod is not ready: %v", events.Items[len(events.Items)-1].Message)
			}
		})
	}
}