blob: b57f3d3c02e31119a9610a5a013872b008163ed2 [file] [log] [blame]
Lorenz Brunfc5dbc62020-05-28 12:18:07 +02001// Copyright 2020 The Monogon Project Authors.
2//
3// SPDX-License-Identifier: Apache-2.0
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
17package e2e
18
19import (
20 "context"
21 "errors"
22 "fmt"
Mateusz Zalega32b19292022-05-17 13:26:55 +020023 "io"
Leopold Schabele28e6d72020-06-03 11:39:25 +020024 "net"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020025 "net/http"
26 _ "net/http"
27 _ "net/http/pprof"
Lorenz Brun3ff5af32020-06-24 16:34:11 +020028 "os"
Lorenz Brun5e4fc2d2020-09-22 18:35:15 +020029 "strings"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020030 "testing"
31 "time"
32
Serge Bazanskibe742842022-04-04 13:18:50 +020033 "google.golang.org/grpc"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020034 corev1 "k8s.io/api/core/v1"
Lorenz Brun30167f52021-03-17 17:49:01 +010035 "k8s.io/apimachinery/pkg/api/resource"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020036 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
37 podv1 "k8s.io/kubernetes/pkg/api/v1/pod"
38
Serge Bazanski31370b02021-01-07 16:31:14 +010039 common "source.monogon.dev/metropolis/node"
Serge Bazanski6dff6d62022-01-28 18:15:14 +010040 "source.monogon.dev/metropolis/node/core/identity"
Serge Bazanskibe742842022-04-04 13:18:50 +020041 "source.monogon.dev/metropolis/node/core/rpc"
Serge Bazanski31370b02021-01-07 16:31:14 +010042 apb "source.monogon.dev/metropolis/proto/api"
Serge Bazanski05f813b2023-03-16 17:58:39 +010043 "source.monogon.dev/metropolis/test/launch"
Serge Bazanski66e58952021-10-05 17:06:56 +020044 "source.monogon.dev/metropolis/test/launch/cluster"
Mateusz Zalegaddf19b42022-06-22 12:27:37 +020045 "source.monogon.dev/metropolis/test/util"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020046)
47
// Timeouts applied to the end-to-end test run.
const (
	// globalTestTimeout bounds the context shared by the whole test.
	//
	// Bazel would eventually time out the test after 900s ("large") if, for
	// some reason, the context cancellation fails to abort it.
	globalTestTimeout = 10 * time.Minute

	// smallTestTimeout and largeTestTimeout bound individual end-to-end
	// checks, depending on how long the checked condition is expected to take.
	smallTestTimeout = time.Minute
	largeTestTimeout = 2 * time.Minute
)
59
Serge Bazanski216fe7b2021-05-21 18:36:16 +020060// TestE2E is the main E2E test entrypoint for single-node freshly-bootstrapped
61// E2E tests. It starts a full Metropolis node in bootstrap mode and then runs
62// tests against it. The actual tests it performs are located in the RunGroup
63// subtest.
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020064func TestE2E(t *testing.T) {
Leopold Schabele28e6d72020-06-03 11:39:25 +020065 // Run pprof server for debugging
Serge Bazanski66e58952021-10-05 17:06:56 +020066 addr, err := net.ResolveTCPAddr("tcp", "localhost:0")
67 if err != nil {
68 panic(err)
69 }
70
71 pprofListen, err := net.ListenTCP("tcp", addr)
72 if err != nil {
Serge Bazanski05f813b2023-03-16 17:58:39 +010073 launch.Fatal("Failed to listen on pprof port: %s", pprofListen.Addr())
Serge Bazanski66e58952021-10-05 17:06:56 +020074 }
75
Serge Bazanski05f813b2023-03-16 17:58:39 +010076 launch.Log("E2E: pprof server listening on %s", pprofListen.Addr())
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020077 go func() {
Serge Bazanski05f813b2023-03-16 17:58:39 +010078 launch.Log("E2E: pprof server returned an error: %v", http.Serve(pprofListen, nil))
Serge Bazanski66e58952021-10-05 17:06:56 +020079 pprofListen.Close()
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020080 }()
Leopold Schabele28e6d72020-06-03 11:39:25 +020081
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020082 // Set a global timeout to make sure this terminates
Leopold Schabeld603f842020-06-09 17:48:09 +020083 ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
Serge Bazanski1f9a03b2021-08-17 13:40:53 +020084 defer cancel()
Serge Bazanski66e58952021-10-05 17:06:56 +020085
86 // Launch cluster.
Serge Bazanskie78a0892021-10-07 17:03:49 +020087 clusterOptions := cluster.ClusterOptions{
88 NumNodes: 2,
89 }
90 cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020091 if err != nil {
Serge Bazanski66e58952021-10-05 17:06:56 +020092 t.Fatalf("LaunchCluster failed: %v", err)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020093 }
Serge Bazanski66e58952021-10-05 17:06:56 +020094 defer func() {
95 err := cluster.Close()
96 if err != nil {
97 t.Fatalf("cluster Close failed: %v", err)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020098 }
99 }()
Serge Bazanski1f9a03b2021-08-17 13:40:53 +0200100
Serge Bazanski05f813b2023-03-16 17:58:39 +0100101 launch.Log("E2E: Cluster running, starting tests...")
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200102
Serge Bazanskibe742842022-04-04 13:18:50 +0200103 // Dial first node's curator.
Serge Bazanski8535cb52023-03-29 14:15:08 +0200104 creds := rpc.NewAuthenticatedCredentials(cluster.Owner, rpc.WantInsecure())
Serge Bazanskibe742842022-04-04 13:18:50 +0200105 remote := net.JoinHostPort(cluster.NodeIDs[0], common.CuratorServicePort.PortString())
106 cl, err := grpc.Dial(remote, grpc.WithContextDialer(cluster.DialNode), grpc.WithTransportCredentials(creds))
107 if err != nil {
108 t.Fatalf("failed to dial first node's curator: %v", err)
109 }
110 defer cl.Close()
111 mgmt := apb.NewManagementClient(cl)
112
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200113 // This exists to keep the parent around while all the children race.
114 // It currently tests both a set of OS-level conditions and Kubernetes
115 // Deployments and StatefulSets
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200116 t.Run("RunGroup", func(t *testing.T) {
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100117 t.Run("Cluster", func(t *testing.T) {
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200118 util.TestEventual(t, "Retrieving cluster directory sucessful", ctx, 60*time.Second, func(ctx context.Context) error {
Serge Bazanskibe742842022-04-04 13:18:50 +0200119 res, err := mgmt.GetClusterInfo(ctx, &apb.GetClusterInfoRequest{})
Serge Bazanskibf68fa92021-10-05 17:53:58 +0200120 if err != nil {
121 return fmt.Errorf("GetClusterInfo: %w", err)
122 }
123
Serge Bazanskie78a0892021-10-07 17:03:49 +0200124 // Ensure that the expected node count is present.
Serge Bazanskibf68fa92021-10-05 17:53:58 +0200125 nodes := res.ClusterDirectory.Nodes
Serge Bazanskie78a0892021-10-07 17:03:49 +0200126 if want, got := clusterOptions.NumNodes, len(nodes); want != got {
Serge Bazanskibf68fa92021-10-05 17:53:58 +0200127 return fmt.Errorf("wanted %d nodes in cluster directory, got %d", want, got)
128 }
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100129
130 // Ensure the nodes have the expected addresses.
131 addresses := make(map[string]bool)
132 for _, n := range nodes {
133 if len(n.Addresses) != 1 {
134 return fmt.Errorf("node %s has no addresss", identity.NodeID(n.PublicKey))
135 }
136 address := n.Addresses[0].Host
137 addresses[address] = true
138 }
139
140 for _, address := range []string{"10.1.0.2", "10.1.0.3"} {
141 if !addresses[address] {
142 return fmt.Errorf("address %q not found in directory", address)
143 }
144 }
Serge Bazanski1f9a03b2021-08-17 13:40:53 +0200145 return nil
146 })
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200147 util.TestEventual(t, "Node rejoin successful", ctx, 60*time.Second, func(ctx context.Context) error {
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200148 // Ensure nodes rejoin the cluster after a reboot by reboting the 1st node.
149 if err := cluster.RebootNode(ctx, 1); err != nil {
150 return fmt.Errorf("while rebooting a node: %w", err)
151 }
152 return nil
153 })
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200154 util.TestEventual(t, "Heartbeat test successful", ctx, 60*time.Second, func(ctx context.Context) error {
Mateusz Zalega32b19292022-05-17 13:26:55 +0200155 // Ensure all cluster nodes are capable of sending heartbeat updates.
156 // This test assumes the expected count of nodes is already present in
157 // the cluster.
158 for {
159 srvN, err := mgmt.GetNodes(ctx, &apb.GetNodesRequest{})
160 if err != nil {
161 return fmt.Errorf("GetNodes: %w", err)
162 }
163
164 // Count the unhealthy nodes.
165 var unhealthy int
166 for {
167 node, err := srvN.Recv()
168 if err == io.EOF {
169 break
170 }
171 if err != nil {
172 return fmt.Errorf("GetNodes.Recv: %w", err)
173 }
174
175 if node.Health != apb.Node_HEALTHY {
176 unhealthy++
177 }
178 }
179
180 // If all nodes tested in this iteration are healthy, the test has
181 // been passed.
182 if unhealthy == 0 {
183 break
184 }
185 }
186 return nil
187 })
Serge Bazanski1f9a03b2021-08-17 13:40:53 +0200188 })
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100189 t.Run("Kubernetes", func(t *testing.T) {
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200190 t.Parallel()
Serge Bazanskibe742842022-04-04 13:18:50 +0200191 // TODO(q3k): use SOCKS proxy.
192 clientSet, err := GetKubeClientSet(cluster, cluster.Ports[uint16(common.KubernetesAPIWrappedPort)])
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200193 if err != nil {
194 t.Fatal(err)
195 }
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200196 util.TestEventual(t, "Nodes are registered and ready", ctx, largeTestTimeout, func(ctx context.Context) error {
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200197 nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
198 if err != nil {
199 return err
200 }
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100201 if len(nodes.Items) < 2 {
202 return errors.New("nodes not yet registered")
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200203 }
204 node := nodes.Items[0]
205 for _, cond := range node.Status.Conditions {
206 if cond.Type != corev1.NodeReady {
207 continue
208 }
209 if cond.Status != corev1.ConditionTrue {
210 return fmt.Errorf("node not ready: %v", cond.Message)
211 }
212 }
213 return nil
214 })
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200215 util.TestEventual(t, "Simple deployment", ctx, largeTestTimeout, func(ctx context.Context) error {
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200216 _, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeTestDeploymentSpec("test-deploy-1"), metav1.CreateOptions{})
217 return err
218 })
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200219 util.TestEventual(t, "Simple deployment is running", ctx, largeTestTimeout, func(ctx context.Context) error {
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200220 res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-1"})
221 if err != nil {
222 return err
223 }
224 if len(res.Items) == 0 {
225 return errors.New("pod didn't get created")
226 }
227 pod := res.Items[0]
228 if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
229 return nil
230 }
231 events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
232 if err != nil || len(events.Items) == 0 {
233 return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
234 } else {
235 return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
236 }
237 })
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200238 util.TestEventual(t, "Simple deployment with runc", ctx, largeTestTimeout, func(ctx context.Context) error {
Lorenz Brun5e4fc2d2020-09-22 18:35:15 +0200239 deployment := makeTestDeploymentSpec("test-deploy-2")
240 var runcStr = "runc"
241 deployment.Spec.Template.Spec.RuntimeClassName = &runcStr
242 _, err := clientSet.AppsV1().Deployments("default").Create(ctx, deployment, metav1.CreateOptions{})
243 return err
244 })
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200245 util.TestEventual(t, "Simple deployment is running on runc", ctx, largeTestTimeout, func(ctx context.Context) error {
Lorenz Brun5e4fc2d2020-09-22 18:35:15 +0200246 res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-2"})
247 if err != nil {
248 return err
249 }
250 if len(res.Items) == 0 {
251 return errors.New("pod didn't get created")
252 }
253 pod := res.Items[0]
254 if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
255 return nil
256 }
257 events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
258 if err != nil || len(events.Items) == 0 {
259 return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
260 } else {
261 var errorMsg strings.Builder
262 for _, msg := range events.Items {
263 errorMsg.WriteString(" | ")
264 errorMsg.WriteString(msg.Message)
265 }
266 return fmt.Errorf("pod is not ready: %v", errorMsg.String())
267 }
268 })
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200269 util.TestEventual(t, "Simple StatefulSet with PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
Lorenz Brun37050122021-03-30 14:00:27 +0200270 _, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-1", corev1.PersistentVolumeFilesystem), metav1.CreateOptions{})
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200271 return err
272 })
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200273 util.TestEventual(t, "Simple StatefulSet with PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200274 res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-1"})
275 if err != nil {
276 return err
277 }
278 if len(res.Items) == 0 {
279 return errors.New("pod didn't get created")
280 }
281 pod := res.Items[0]
282 if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
283 return nil
284 }
285 events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
286 if err != nil || len(events.Items) == 0 {
287 return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
288 } else {
289 return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
290 }
291 })
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200292 util.TestEventual(t, "Simple StatefulSet with Block PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
Lorenz Brun37050122021-03-30 14:00:27 +0200293 _, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-2", corev1.PersistentVolumeBlock), metav1.CreateOptions{})
294 return err
295 })
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200296 util.TestEventual(t, "Simple StatefulSet with Block PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
Lorenz Brun37050122021-03-30 14:00:27 +0200297 res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-2"})
298 if err != nil {
299 return err
300 }
301 if len(res.Items) == 0 {
302 return errors.New("pod didn't get created")
303 }
304 pod := res.Items[0]
305 if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
306 return nil
307 }
308 events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
309 if err != nil || len(events.Items) == 0 {
310 return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
311 } else {
312 return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
313 }
314 })
Lorenz Brun30167f52021-03-17 17:49:01 +0100315 if os.Getenv("HAVE_NESTED_KVM") != "" {
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200316 util.TestEventual(t, "Pod for KVM/QEMU smoke test", ctx, smallTestTimeout, func(ctx context.Context) error {
Lorenz Brun30167f52021-03-17 17:49:01 +0100317 runcRuntimeClass := "runc"
318 _, err := clientSet.CoreV1().Pods("default").Create(ctx, &corev1.Pod{
319 ObjectMeta: metav1.ObjectMeta{
320 Name: "vm-smoketest",
321 },
322 Spec: corev1.PodSpec{
323 Containers: []corev1.Container{{
324 Name: "vm-smoketest",
325 ImagePullPolicy: corev1.PullNever,
326 Image: "bazel/metropolis/vm/smoketest:smoketest_container",
327 Resources: corev1.ResourceRequirements{
328 Limits: corev1.ResourceList{
329 "devices.monogon.dev/kvm": *resource.NewQuantity(1, ""),
330 },
331 },
332 }},
333 RuntimeClassName: &runcRuntimeClass,
334 RestartPolicy: corev1.RestartPolicyNever,
335 },
336 }, metav1.CreateOptions{})
337 return err
338 })
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200339 util.TestEventual(t, "KVM/QEMU smoke test completion", ctx, smallTestTimeout, func(ctx context.Context) error {
Lorenz Brun30167f52021-03-17 17:49:01 +0100340 pod, err := clientSet.CoreV1().Pods("default").Get(ctx, "vm-smoketest", metav1.GetOptions{})
341 if err != nil {
342 return fmt.Errorf("failed to get pod: %w", err)
343 }
344 if pod.Status.Phase == corev1.PodSucceeded {
345 return nil
346 }
347 events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
348 if err != nil || len(events.Items) == 0 {
349 return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
350 } else {
351 return fmt.Errorf("pod is not ready: %v", events.Items[len(events.Items)-1].Message)
352 }
353 })
354 }
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200355 })
356 })
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200357}