blob: 4e2ceb7de838b75537708b33a706ee45f4a58bc1 [file] [log] [blame]
Lorenz Brunfc5dbc62020-05-28 12:18:07 +02001// Copyright 2020 The Monogon Project Authors.
2//
3// SPDX-License-Identifier: Apache-2.0
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
17package e2e
18
19import (
20 "context"
21 "errors"
22 "fmt"
Leopold Schabele28e6d72020-06-03 11:39:25 +020023 "net"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020024 "net/http"
25 _ "net/http"
26 _ "net/http/pprof"
Lorenz Brun3ff5af32020-06-24 16:34:11 +020027 "os"
Lorenz Brun5e4fc2d2020-09-22 18:35:15 +020028 "strings"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020029 "testing"
30 "time"
31
Serge Bazanskibe742842022-04-04 13:18:50 +020032 "google.golang.org/grpc"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020033 corev1 "k8s.io/api/core/v1"
Lorenz Brun30167f52021-03-17 17:49:01 +010034 "k8s.io/apimachinery/pkg/api/resource"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020035 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
36 podv1 "k8s.io/kubernetes/pkg/api/v1/pod"
37
Serge Bazanski31370b02021-01-07 16:31:14 +010038 common "source.monogon.dev/metropolis/node"
Serge Bazanski6dff6d62022-01-28 18:15:14 +010039 "source.monogon.dev/metropolis/node/core/identity"
Serge Bazanskibe742842022-04-04 13:18:50 +020040 "source.monogon.dev/metropolis/node/core/rpc"
Serge Bazanski31370b02021-01-07 16:31:14 +010041 apb "source.monogon.dev/metropolis/proto/api"
Serge Bazanski05f813b2023-03-16 17:58:39 +010042 "source.monogon.dev/metropolis/test/launch"
Serge Bazanski66e58952021-10-05 17:06:56 +020043 "source.monogon.dev/metropolis/test/launch/cluster"
Mateusz Zalegaddf19b42022-06-22 12:27:37 +020044 "source.monogon.dev/metropolis/test/util"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020045)
46
// Timeouts applied to the end-to-end test run.
const (
	// globalTestTimeout is the timeout for the global test context.
	//
	// Bazel would eventually time out the test after 900s ("large") if, for
	// some reason, the context cancellation fails to abort it.
	globalTestTimeout = 10 * time.Minute

	// smallTestTimeout and largeTestTimeout are the timeouts for individual
	// end-to-end tests of different sizes.
	smallTestTimeout = time.Minute
	largeTestTimeout = 2 * time.Minute
)
58
Serge Bazanski216fe7b2021-05-21 18:36:16 +020059// TestE2E is the main E2E test entrypoint for single-node freshly-bootstrapped
60// E2E tests. It starts a full Metropolis node in bootstrap mode and then runs
61// tests against it. The actual tests it performs are located in the RunGroup
62// subtest.
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020063func TestE2E(t *testing.T) {
Leopold Schabele28e6d72020-06-03 11:39:25 +020064 // Run pprof server for debugging
Serge Bazanski66e58952021-10-05 17:06:56 +020065 addr, err := net.ResolveTCPAddr("tcp", "localhost:0")
66 if err != nil {
67 panic(err)
68 }
69
70 pprofListen, err := net.ListenTCP("tcp", addr)
71 if err != nil {
Serge Bazanski05f813b2023-03-16 17:58:39 +010072 launch.Fatal("Failed to listen on pprof port: %s", pprofListen.Addr())
Serge Bazanski66e58952021-10-05 17:06:56 +020073 }
74
Serge Bazanski05f813b2023-03-16 17:58:39 +010075 launch.Log("E2E: pprof server listening on %s", pprofListen.Addr())
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020076 go func() {
Serge Bazanski05f813b2023-03-16 17:58:39 +010077 launch.Log("E2E: pprof server returned an error: %v", http.Serve(pprofListen, nil))
Serge Bazanski66e58952021-10-05 17:06:56 +020078 pprofListen.Close()
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020079 }()
Leopold Schabele28e6d72020-06-03 11:39:25 +020080
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020081 // Set a global timeout to make sure this terminates
Leopold Schabeld603f842020-06-09 17:48:09 +020082 ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
Serge Bazanski1f9a03b2021-08-17 13:40:53 +020083 defer cancel()
Serge Bazanski66e58952021-10-05 17:06:56 +020084
85 // Launch cluster.
Serge Bazanskie78a0892021-10-07 17:03:49 +020086 clusterOptions := cluster.ClusterOptions{
87 NumNodes: 2,
88 }
89 cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020090 if err != nil {
Serge Bazanski66e58952021-10-05 17:06:56 +020091 t.Fatalf("LaunchCluster failed: %v", err)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020092 }
Serge Bazanski66e58952021-10-05 17:06:56 +020093 defer func() {
94 err := cluster.Close()
95 if err != nil {
96 t.Fatalf("cluster Close failed: %v", err)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020097 }
98 }()
Serge Bazanski1f9a03b2021-08-17 13:40:53 +020099
Serge Bazanski05f813b2023-03-16 17:58:39 +0100100 launch.Log("E2E: Cluster running, starting tests...")
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200101
Serge Bazanskibe742842022-04-04 13:18:50 +0200102 // Dial first node's curator.
Serge Bazanski8535cb52023-03-29 14:15:08 +0200103 creds := rpc.NewAuthenticatedCredentials(cluster.Owner, rpc.WantInsecure())
Serge Bazanskibe742842022-04-04 13:18:50 +0200104 remote := net.JoinHostPort(cluster.NodeIDs[0], common.CuratorServicePort.PortString())
105 cl, err := grpc.Dial(remote, grpc.WithContextDialer(cluster.DialNode), grpc.WithTransportCredentials(creds))
106 if err != nil {
107 t.Fatalf("failed to dial first node's curator: %v", err)
108 }
109 defer cl.Close()
110 mgmt := apb.NewManagementClient(cl)
111
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200112 // This exists to keep the parent around while all the children race.
113 // It currently tests both a set of OS-level conditions and Kubernetes
114 // Deployments and StatefulSets
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200115 t.Run("RunGroup", func(t *testing.T) {
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100116 t.Run("Cluster", func(t *testing.T) {
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200117 util.TestEventual(t, "Retrieving cluster directory sucessful", ctx, 60*time.Second, func(ctx context.Context) error {
Serge Bazanskibe742842022-04-04 13:18:50 +0200118 res, err := mgmt.GetClusterInfo(ctx, &apb.GetClusterInfoRequest{})
Serge Bazanskibf68fa92021-10-05 17:53:58 +0200119 if err != nil {
120 return fmt.Errorf("GetClusterInfo: %w", err)
121 }
122
Serge Bazanskie78a0892021-10-07 17:03:49 +0200123 // Ensure that the expected node count is present.
Serge Bazanskibf68fa92021-10-05 17:53:58 +0200124 nodes := res.ClusterDirectory.Nodes
Serge Bazanskie78a0892021-10-07 17:03:49 +0200125 if want, got := clusterOptions.NumNodes, len(nodes); want != got {
Serge Bazanskibf68fa92021-10-05 17:53:58 +0200126 return fmt.Errorf("wanted %d nodes in cluster directory, got %d", want, got)
127 }
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100128
129 // Ensure the nodes have the expected addresses.
130 addresses := make(map[string]bool)
131 for _, n := range nodes {
132 if len(n.Addresses) != 1 {
133 return fmt.Errorf("node %s has no addresss", identity.NodeID(n.PublicKey))
134 }
135 address := n.Addresses[0].Host
136 addresses[address] = true
137 }
138
139 for _, address := range []string{"10.1.0.2", "10.1.0.3"} {
140 if !addresses[address] {
141 return fmt.Errorf("address %q not found in directory", address)
142 }
143 }
Serge Bazanski1f9a03b2021-08-17 13:40:53 +0200144 return nil
145 })
Serge Bazanski630fb5c2023-04-06 10:50:24 +0200146 util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200147 util.TestEventual(t, "Node rejoin successful", ctx, 60*time.Second, func(ctx context.Context) error {
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200148 // Ensure nodes rejoin the cluster after a reboot by reboting the 1st node.
149 if err := cluster.RebootNode(ctx, 1); err != nil {
150 return fmt.Errorf("while rebooting a node: %w", err)
151 }
152 return nil
153 })
Serge Bazanski630fb5c2023-04-06 10:50:24 +0200154 util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
Serge Bazanski1f9a03b2021-08-17 13:40:53 +0200155 })
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100156 t.Run("Kubernetes", func(t *testing.T) {
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200157 t.Parallel()
Serge Bazanskibe742842022-04-04 13:18:50 +0200158 // TODO(q3k): use SOCKS proxy.
159 clientSet, err := GetKubeClientSet(cluster, cluster.Ports[uint16(common.KubernetesAPIWrappedPort)])
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200160 if err != nil {
161 t.Fatal(err)
162 }
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200163 util.TestEventual(t, "Nodes are registered and ready", ctx, largeTestTimeout, func(ctx context.Context) error {
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200164 nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
165 if err != nil {
166 return err
167 }
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100168 if len(nodes.Items) < 2 {
169 return errors.New("nodes not yet registered")
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200170 }
171 node := nodes.Items[0]
172 for _, cond := range node.Status.Conditions {
173 if cond.Type != corev1.NodeReady {
174 continue
175 }
176 if cond.Status != corev1.ConditionTrue {
177 return fmt.Errorf("node not ready: %v", cond.Message)
178 }
179 }
180 return nil
181 })
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200182 util.TestEventual(t, "Simple deployment", ctx, largeTestTimeout, func(ctx context.Context) error {
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200183 _, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeTestDeploymentSpec("test-deploy-1"), metav1.CreateOptions{})
184 return err
185 })
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200186 util.TestEventual(t, "Simple deployment is running", ctx, largeTestTimeout, func(ctx context.Context) error {
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200187 res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-1"})
188 if err != nil {
189 return err
190 }
191 if len(res.Items) == 0 {
192 return errors.New("pod didn't get created")
193 }
194 pod := res.Items[0]
195 if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
196 return nil
197 }
198 events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
199 if err != nil || len(events.Items) == 0 {
200 return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
201 } else {
202 return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
203 }
204 })
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200205 util.TestEventual(t, "Simple deployment with runc", ctx, largeTestTimeout, func(ctx context.Context) error {
Lorenz Brun5e4fc2d2020-09-22 18:35:15 +0200206 deployment := makeTestDeploymentSpec("test-deploy-2")
207 var runcStr = "runc"
208 deployment.Spec.Template.Spec.RuntimeClassName = &runcStr
209 _, err := clientSet.AppsV1().Deployments("default").Create(ctx, deployment, metav1.CreateOptions{})
210 return err
211 })
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200212 util.TestEventual(t, "Simple deployment is running on runc", ctx, largeTestTimeout, func(ctx context.Context) error {
Lorenz Brun5e4fc2d2020-09-22 18:35:15 +0200213 res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-2"})
214 if err != nil {
215 return err
216 }
217 if len(res.Items) == 0 {
218 return errors.New("pod didn't get created")
219 }
220 pod := res.Items[0]
221 if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
222 return nil
223 }
224 events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
225 if err != nil || len(events.Items) == 0 {
226 return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
227 } else {
228 var errorMsg strings.Builder
229 for _, msg := range events.Items {
230 errorMsg.WriteString(" | ")
231 errorMsg.WriteString(msg.Message)
232 }
233 return fmt.Errorf("pod is not ready: %v", errorMsg.String())
234 }
235 })
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200236 util.TestEventual(t, "Simple StatefulSet with PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
Lorenz Brun37050122021-03-30 14:00:27 +0200237 _, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-1", corev1.PersistentVolumeFilesystem), metav1.CreateOptions{})
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200238 return err
239 })
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200240 util.TestEventual(t, "Simple StatefulSet with PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200241 res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-1"})
242 if err != nil {
243 return err
244 }
245 if len(res.Items) == 0 {
246 return errors.New("pod didn't get created")
247 }
248 pod := res.Items[0]
249 if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
250 return nil
251 }
252 events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
253 if err != nil || len(events.Items) == 0 {
254 return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
255 } else {
256 return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
257 }
258 })
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200259 util.TestEventual(t, "Simple StatefulSet with Block PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
Lorenz Brun37050122021-03-30 14:00:27 +0200260 _, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-2", corev1.PersistentVolumeBlock), metav1.CreateOptions{})
261 return err
262 })
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200263 util.TestEventual(t, "Simple StatefulSet with Block PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
Lorenz Brun37050122021-03-30 14:00:27 +0200264 res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-2"})
265 if err != nil {
266 return err
267 }
268 if len(res.Items) == 0 {
269 return errors.New("pod didn't get created")
270 }
271 pod := res.Items[0]
272 if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
273 return nil
274 }
275 events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
276 if err != nil || len(events.Items) == 0 {
277 return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
278 } else {
279 return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
280 }
281 })
Serge Bazanski9104e382023-04-04 20:08:21 +0200282 util.TestEventual(t, "In-cluster self-test job", ctx, smallTestTimeout, func(ctx context.Context) error {
283 _, err := clientSet.BatchV1().Jobs("default").Create(ctx, makeSelftestSpec("selftest"), metav1.CreateOptions{})
284 return err
285 })
286 util.TestEventual(t, "In-cluster self-test job passed", ctx, smallTestTimeout, func(ctx context.Context) error {
287 res, err := clientSet.BatchV1().Jobs("default").Get(ctx, "selftest", metav1.GetOptions{})
288 if err != nil {
289 return err
290 }
291 if res.Status.Failed > 0 {
292 pods, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{
293 LabelSelector: "job-name=selftest",
294 })
295 if err != nil {
296 return util.Permanent(fmt.Errorf("job failed but failed to find pod: %w", err))
297 }
298 if len(pods.Items) < 1 {
299 return fmt.Errorf("job failed but pod does not exist")
300 }
301 lines, err := getPodLogLines(ctx, clientSet, pods.Items[0].Name, 1)
302 if err != nil {
303 return fmt.Errorf("job failed but could not get logs: %w", err)
304 }
305 if len(lines) > 0 {
306 return util.Permanent(fmt.Errorf("job failed, last log line: %s", lines[0]))
307 }
308 return util.Permanent(fmt.Errorf("job failed, empty log"))
309 }
310 if res.Status.Succeeded > 0 {
311 return nil
312 }
313 return fmt.Errorf("job still running")
314 })
Lorenz Brun30167f52021-03-17 17:49:01 +0100315 if os.Getenv("HAVE_NESTED_KVM") != "" {
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200316 util.TestEventual(t, "Pod for KVM/QEMU smoke test", ctx, smallTestTimeout, func(ctx context.Context) error {
Lorenz Brun30167f52021-03-17 17:49:01 +0100317 runcRuntimeClass := "runc"
318 _, err := clientSet.CoreV1().Pods("default").Create(ctx, &corev1.Pod{
319 ObjectMeta: metav1.ObjectMeta{
320 Name: "vm-smoketest",
321 },
322 Spec: corev1.PodSpec{
323 Containers: []corev1.Container{{
324 Name: "vm-smoketest",
325 ImagePullPolicy: corev1.PullNever,
326 Image: "bazel/metropolis/vm/smoketest:smoketest_container",
327 Resources: corev1.ResourceRequirements{
328 Limits: corev1.ResourceList{
329 "devices.monogon.dev/kvm": *resource.NewQuantity(1, ""),
330 },
331 },
332 }},
333 RuntimeClassName: &runcRuntimeClass,
334 RestartPolicy: corev1.RestartPolicyNever,
335 },
336 }, metav1.CreateOptions{})
337 return err
338 })
Mateusz Zalegaddf19b42022-06-22 12:27:37 +0200339 util.TestEventual(t, "KVM/QEMU smoke test completion", ctx, smallTestTimeout, func(ctx context.Context) error {
Lorenz Brun30167f52021-03-17 17:49:01 +0100340 pod, err := clientSet.CoreV1().Pods("default").Get(ctx, "vm-smoketest", metav1.GetOptions{})
341 if err != nil {
342 return fmt.Errorf("failed to get pod: %w", err)
343 }
344 if pod.Status.Phase == corev1.PodSucceeded {
345 return nil
346 }
347 events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
348 if err != nil || len(events.Items) == 0 {
349 return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
350 } else {
351 return fmt.Errorf("pod is not ready: %v", events.Items[len(events.Items)-1].Message)
352 }
353 })
354 }
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200355 })
356 })
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200357}