blob: a2c9eaf1eb40849a638154c9aa42a8866bb950bc [file] [log] [blame]
Lorenz Brunfc5dbc62020-05-28 12:18:07 +02001// Copyright 2020 The Monogon Project Authors.
2//
3// SPDX-License-Identifier: Apache-2.0
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
17package e2e
18
19import (
20 "context"
Serge Bazanski54e212a2023-06-14 13:45:11 +020021 "crypto/tls"
22 "crypto/x509"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020023 "errors"
24 "fmt"
Serge Bazanski2cfafc92023-03-21 16:42:47 +010025 "io"
Leopold Schabele28e6d72020-06-03 11:39:25 +020026 "net"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020027 "net/http"
28 _ "net/http"
29 _ "net/http/pprof"
Serge Bazanski54e212a2023-06-14 13:45:11 +020030 "net/url"
Lorenz Brun3ff5af32020-06-24 16:34:11 +020031 "os"
Lorenz Brun5e4fc2d2020-09-22 18:35:15 +020032 "strings"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020033 "testing"
34 "time"
35
Serge Bazanskibe742842022-04-04 13:18:50 +020036 "google.golang.org/grpc"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020037 corev1 "k8s.io/api/core/v1"
Lorenz Brun30167f52021-03-17 17:49:01 +010038 "k8s.io/apimachinery/pkg/api/resource"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020039 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
40 podv1 "k8s.io/kubernetes/pkg/api/v1/pod"
41
Serge Bazanski31370b02021-01-07 16:31:14 +010042 common "source.monogon.dev/metropolis/node"
Serge Bazanski6dff6d62022-01-28 18:15:14 +010043 "source.monogon.dev/metropolis/node/core/identity"
Serge Bazanskibe742842022-04-04 13:18:50 +020044 "source.monogon.dev/metropolis/node/core/rpc"
Serge Bazanski31370b02021-01-07 16:31:14 +010045 apb "source.monogon.dev/metropolis/proto/api"
Serge Bazanski05f813b2023-03-16 17:58:39 +010046 "source.monogon.dev/metropolis/test/launch"
Serge Bazanski66e58952021-10-05 17:06:56 +020047 "source.monogon.dev/metropolis/test/launch/cluster"
Mateusz Zalegaddf19b42022-06-22 12:27:37 +020048 "source.monogon.dev/metropolis/test/util"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020049)
50
const (
	// globalTestTimeout bounds the context shared by a whole test function.
	//
	// Bazel would eventually time out the test after 900s ("large") if, for
	// some reason, the context cancellation fails to abort it, so this is
	// kept well below that limit.
	globalTestTimeout = 600 * time.Second

	// Timeouts for individual end-to-end tests of different sizes, passed to
	// util.TestEventual as the per-step deadline.
	smallTestTimeout = 60 * time.Second
	largeTestTimeout = 120 * time.Second
)
62
Serge Bazanskia0bc6d32023-06-28 18:57:40 +020063// TestE2ECore exercisees the core functionality of Metropolis: maintaining a
64// control plane, changing node roles, ...
65//
66// The tests are performed against an in-memory cluster.
67func TestE2ECore(t *testing.T) {
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020068 // Set a global timeout to make sure this terminates
Leopold Schabeld603f842020-06-09 17:48:09 +020069 ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
Serge Bazanski1f9a03b2021-08-17 13:40:53 +020070 defer cancel()
Serge Bazanski66e58952021-10-05 17:06:56 +020071
72 // Launch cluster.
Serge Bazanskie78a0892021-10-07 17:03:49 +020073 clusterOptions := cluster.ClusterOptions{
74 NumNodes: 2,
75 }
76 cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020077 if err != nil {
Serge Bazanski66e58952021-10-05 17:06:56 +020078 t.Fatalf("LaunchCluster failed: %v", err)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020079 }
Serge Bazanski66e58952021-10-05 17:06:56 +020080 defer func() {
81 err := cluster.Close()
82 if err != nil {
83 t.Fatalf("cluster Close failed: %v", err)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020084 }
85 }()
Serge Bazanski1f9a03b2021-08-17 13:40:53 +020086
Serge Bazanski05f813b2023-03-16 17:58:39 +010087 launch.Log("E2E: Cluster running, starting tests...")
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020088
Serge Bazanskibe742842022-04-04 13:18:50 +020089 // Dial first node's curator.
Serge Bazanski8535cb52023-03-29 14:15:08 +020090 creds := rpc.NewAuthenticatedCredentials(cluster.Owner, rpc.WantInsecure())
Serge Bazanskibe742842022-04-04 13:18:50 +020091 remote := net.JoinHostPort(cluster.NodeIDs[0], common.CuratorServicePort.PortString())
92 cl, err := grpc.Dial(remote, grpc.WithContextDialer(cluster.DialNode), grpc.WithTransportCredentials(creds))
93 if err != nil {
94 t.Fatalf("failed to dial first node's curator: %v", err)
95 }
96 defer cl.Close()
97 mgmt := apb.NewManagementClient(cl)
98
Serge Bazanskia0bc6d32023-06-28 18:57:40 +020099 util.TestEventual(t, "Retrieving cluster directory sucessful", ctx, 60*time.Second, func(ctx context.Context) error {
100 res, err := mgmt.GetClusterInfo(ctx, &apb.GetClusterInfoRequest{})
101 if err != nil {
102 return fmt.Errorf("GetClusterInfo: %w", err)
103 }
Serge Bazanskibf68fa92021-10-05 17:53:58 +0200104
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200105 // Ensure that the expected node count is present.
106 nodes := res.ClusterDirectory.Nodes
107 if want, got := clusterOptions.NumNodes, len(nodes); want != got {
108 return fmt.Errorf("wanted %d nodes in cluster directory, got %d", want, got)
109 }
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100110
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200111 // Ensure the nodes have the expected addresses.
112 addresses := make(map[string]bool)
113 for _, n := range nodes {
114 if len(n.Addresses) != 1 {
115 return fmt.Errorf("node %s has no addresss", identity.NodeID(n.PublicKey))
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200116 }
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200117 address := n.Addresses[0].Host
118 addresses[address] = true
119 }
Serge Bazanski2cfafc92023-03-21 16:42:47 +0100120
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200121 for _, address := range []string{"10.1.0.2", "10.1.0.3"} {
122 if !addresses[address] {
123 return fmt.Errorf("address %q not found in directory", address)
Lorenz Brun30167f52021-03-17 17:49:01 +0100124 }
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200125 }
126 return nil
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200127 })
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200128 util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
129 util.TestEventual(t, "Node rejoin successful", ctx, 60*time.Second, func(ctx context.Context) error {
130 // Ensure nodes rejoin the cluster after a reboot by reboting the 1st node.
131 if err := cluster.RebootNode(ctx, 1); err != nil {
132 return fmt.Errorf("while rebooting a node: %w", err)
133 }
134 return nil
135 })
136 util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
137 util.TestEventual(t, "Prometheus node metrics retrieved", ctx, smallTestTimeout, func(ctx context.Context) error {
138 pool := x509.NewCertPool()
139 pool.AddCert(cluster.CACertificate)
140 cl := http.Client{
141 Transport: &http.Transport{
142 TLSClientConfig: &tls.Config{
143 Certificates: []tls.Certificate{cluster.Owner},
144 RootCAs: pool,
145 },
146 DialContext: func(ctx context.Context, _, addr string) (net.Conn, error) {
147 return cluster.DialNode(ctx, addr)
148 },
149 },
150 }
151 u := url.URL{
152 Scheme: "https",
153 Host: net.JoinHostPort(cluster.NodeIDs[0], common.MetricsPort.PortString()),
154 Path: "/metrics/node",
155 }
156 res, err := cl.Get(u.String())
157 if err != nil {
158 return err
159 }
160 defer res.Body.Close()
161 if res.StatusCode != 200 {
162 return fmt.Errorf("status code %d", res.StatusCode)
163 }
164
165 body, err := io.ReadAll(res.Body)
166 if err != nil {
167 return err
168 }
169 needle := "node_uname_info"
170 if !strings.Contains(string(body), needle) {
171 return util.Permanent(fmt.Errorf("could not find %q in returned response", needle))
172 }
173 return nil
174 })
175}
176
// TestE2EKubernetes exercises the Kubernetes functionality of Metropolis:
// adding worker roles, scheduling deployments (including under gvisor),
// StatefulSets with filesystem and block PVCs, an in-cluster self-test job,
// and (optionally) KVM device access.
//
// The tests are performed against an in-memory cluster.
func TestE2EKubernetes(t *testing.T) {
	// Set a global timeout to make sure this terminates
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()

	// Launch cluster.
	clusterOptions := cluster.ClusterOptions{
		NumNodes: 2,
	}
	cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
	if err != nil {
		t.Fatalf("LaunchCluster failed: %v", err)
	}
	defer func() {
		err := cluster.Close()
		if err != nil {
			t.Fatalf("cluster Close failed: %v", err)
		}
	}()

	clientSet, err := cluster.GetKubeClientSet()
	if err != nil {
		t.Fatal(err)
	}
	util.TestEventual(t, "Add KubernetesWorker roles", ctx, smallTestTimeout, func(ctx context.Context) error {
		// Make everything but the first node into KubernetesWorkers.
		for i := 1; i < clusterOptions.NumNodes; i++ {
			err := cluster.MakeKubernetesWorker(ctx, cluster.NodeIDs[i])
			if err != nil {
				// Role changes are not expected to fail transiently; abort
				// retries immediately.
				return util.Permanent(fmt.Errorf("MakeKubernetesWorker: %w", err))
			}
		}
		return nil
	})
	util.TestEventual(t, "Node is registered and ready", ctx, largeTestTimeout, func(ctx context.Context) error {
		nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			return err
		}
		if len(nodes.Items) < 1 {
			return errors.New("node not yet registered")
		}
		// Only the first listed node's Ready condition is checked here.
		node := nodes.Items[0]
		for _, cond := range node.Status.Conditions {
			if cond.Type != corev1.NodeReady {
				continue
			}
			if cond.Status != corev1.ConditionTrue {
				return fmt.Errorf("node not ready: %v", cond.Message)
			}
		}
		return nil
	})
	util.TestEventual(t, "Simple deployment", ctx, largeTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeTestDeploymentSpec("test-deploy-1"), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple deployment is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-1"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		// Not available yet: surface the first pod event if any, otherwise
		// the pod phase, to aid debugging while retries continue.
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
		}
	})
	util.TestEventual(t, "Simple deployment with gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
		// Same deployment spec as above, but scheduled onto the gvisor
		// runtime class.
		deployment := makeTestDeploymentSpec("test-deploy-2")
		gvisorStr := "gvisor"
		deployment.Spec.Template.Spec.RuntimeClassName = &gvisorStr
		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, deployment, metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple deployment is running on gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-2"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		// Unlike the other checks, concatenate all pod events into the
		// error, since gvisor failures often span multiple events.
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			var errorMsg strings.Builder
			for _, msg := range events.Items {
				errorMsg.WriteString(" | ")
				errorMsg.WriteString(msg.Message)
			}
			return fmt.Errorf("pod is not ready: %v", errorMsg.String())
		}
	})
	util.TestEventual(t, "Simple StatefulSet with PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-1", corev1.PersistentVolumeFilesystem), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple StatefulSet with PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-1"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
		}
	})
	util.TestEventual(t, "Simple StatefulSet with Block PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-2", corev1.PersistentVolumeBlock), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "Simple StatefulSet with Block PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-2"})
		if err != nil {
			return err
		}
		if len(res.Items) == 0 {
			return errors.New("pod didn't get created")
		}
		pod := res.Items[0]
		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
			return nil
		}
		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
		if err != nil || len(events.Items) == 0 {
			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
		} else {
			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
		}
	})
	util.TestEventual(t, "In-cluster self-test job", ctx, smallTestTimeout, func(ctx context.Context) error {
		_, err := clientSet.BatchV1().Jobs("default").Create(ctx, makeSelftestSpec("selftest"), metav1.CreateOptions{})
		return err
	})
	util.TestEventual(t, "In-cluster self-test job passed", ctx, smallTestTimeout, func(ctx context.Context) error {
		res, err := clientSet.BatchV1().Jobs("default").Get(ctx, "selftest", metav1.GetOptions{})
		if err != nil {
			return err
		}
		if res.Status.Failed > 0 {
			// The job failed; try to recover its last log line for a useful
			// permanent error, since a failed job will not recover.
			pods, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{
				LabelSelector: "job-name=selftest",
			})
			if err != nil {
				return util.Permanent(fmt.Errorf("job failed but failed to find pod: %w", err))
			}
			if len(pods.Items) < 1 {
				return fmt.Errorf("job failed but pod does not exist")
			}
			lines, err := getPodLogLines(ctx, clientSet, pods.Items[0].Name, 1)
			if err != nil {
				return fmt.Errorf("job failed but could not get logs: %w", err)
			}
			if len(lines) > 0 {
				return util.Permanent(fmt.Errorf("job failed, last log line: %s", lines[0]))
			}
			return util.Permanent(fmt.Errorf("job failed, empty log"))
		}
		if res.Status.Succeeded > 0 {
			return nil
		}
		return fmt.Errorf("job still running")
	})
	// The KVM smoke test only works when the test host exposes nested
	// virtualization; gated on an environment variable set by the harness.
	if os.Getenv("HAVE_NESTED_KVM") != "" {
		util.TestEventual(t, "Pod for KVM/QEMU smoke test", ctx, smallTestTimeout, func(ctx context.Context) error {
			runcRuntimeClass := "runc"
			_, err := clientSet.CoreV1().Pods("default").Create(ctx, &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name: "vm-smoketest",
				},
				Spec: corev1.PodSpec{
					Containers: []corev1.Container{{
						Name:            "vm-smoketest",
						ImagePullPolicy: corev1.PullNever,
						Image:           "bazel/metropolis/vm/smoketest:smoketest_container",
						Resources: corev1.ResourceRequirements{
							// Request one KVM device from the Metropolis
							// device plugin.
							Limits: corev1.ResourceList{
								"devices.monogon.dev/kvm": *resource.NewQuantity(1, ""),
							},
						},
					}},
					RuntimeClassName: &runcRuntimeClass,
					RestartPolicy:    corev1.RestartPolicyNever,
				},
			}, metav1.CreateOptions{})
			return err
		})
		util.TestEventual(t, "KVM/QEMU smoke test completion", ctx, smallTestTimeout, func(ctx context.Context) error {
			pod, err := clientSet.CoreV1().Pods("default").Get(ctx, "vm-smoketest", metav1.GetOptions{})
			if err != nil {
				return fmt.Errorf("failed to get pod: %w", err)
			}
			if pod.Status.Phase == corev1.PodSucceeded {
				return nil
			}
			events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
			if err != nil || len(events.Items) == 0 {
				return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
			} else {
				return fmt.Errorf("pod is not ready: %v", events.Items[len(events.Items)-1].Message)
			}
		})
	}
}