// Copyright The Monogon Project Authors.
// SPDX-License-Identifier: Apache-2.0

package ha_cold

import (
	"context"
	"fmt"
	"testing"
	"time"

	"k8s.io/utils/ptr"

	mlaunch "source.monogon.dev/metropolis/test/launch"
	"source.monogon.dev/metropolis/test/util"
	"source.monogon.dev/osbase/test/launch"

	ipb "source.monogon.dev/metropolis/node/core/curator/proto/api"
	apb "source.monogon.dev/metropolis/proto/api"
	cpb "source.monogon.dev/metropolis/proto/common"
)

const (
	// Timeout for the global test context.
	//
	// Bazel would eventually time out the test after 900s ("large") if, for
	// some reason, the context cancellation fails to abort it.
	globalTestTimeout = 600 * time.Second

	// Timeouts for individual end-to-end tests of different sizes.
	smallTestTimeout = 60 * time.Second
	largeTestTimeout = 120 * time.Second
)

// TestE2EColdStartHA exercises an HA cluster being fully shut down then
// restarted again.
//
// Metropolis currently doesn't support cold startups from TPM/Secure clusters,
// so we test a non-TPM/Insecure cluster.
func TestE2EColdStartHA(t *testing.T) {
	// Set a global timeout to make sure this test terminates.
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()

	// Launch cluster.
	clusterOptions := mlaunch.ClusterOptions{
		NumNodes:        3,
		NodeLogsToFiles: true,
		InitialClusterConfiguration: &cpb.ClusterConfiguration{
			ClusterDomain:         "cluster.test",
			TpmMode:               cpb.ClusterConfiguration_TPM_MODE_DISABLED,
			StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
		},
	}
	cluster, err := mlaunch.LaunchCluster(ctx, clusterOptions)
	if err != nil {
		t.Fatalf("LaunchCluster failed: %v", err)
	}
	defer func() {
		err := cluster.Close()
		if err != nil {
			t.Fatalf("cluster Close failed: %v", err)
		}
	}()

	launch.Log("E2E: Cluster running, starting tests...")

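	// The first node bootstrapped the cluster and should already be running
	// consensus, which is why the loop below starts at index 1.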
	util.MustTestEventual(t, "Add ConsensusMember roles", ctx, smallTestTimeout, func(ctx context.Context) error {
		// Make everything but the first node into ConsensusMember.
		for i := 1; i < clusterOptions.NumNodes; i++ {
			err := cluster.MakeConsensusMember(ctx, cluster.NodeIDs[i])
			if err != nil {
				return util.Permanent(fmt.Errorf("MakeConsensusMember(%d/%s): %w", i, cluster.NodeIDs[i], err))
			}
		}
		return nil
	})
	util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)

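	// This is the cold-start path under test: with no nodes left running,
	// consensus must recover quorum purely from on-disk state when the nodes
	// boot again, as there are no already-running peers to join.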
	// Shut every node down.
	for i := 0; i < clusterOptions.NumNodes; i++ {
		if err := cluster.ShutdownNode(i); err != nil {
			t.Fatalf("Could not shutdown node %d: %v", i, err)
		}
	}
	// Start every node back up.
	for i := 0; i < clusterOptions.NumNodes; i++ {
		if err := cluster.StartNode(i); err != nil {
			t.Fatalf("Could not start node %d: %v", i, err)
		}
	}
	// Check if the cluster comes back up.
	util.TestEventual(t, "Heartbeat test successful after cold start", ctx, 60*time.Second, cluster.AllNodesHealthy)

	// Test node role removal.
	curC, err := cluster.CuratorClient()
	if err != nil {
		t.Fatalf("Could not get CuratorClient: %v", err)
	}
	mgmt := apb.NewManagementClient(curC)
	cur := ipb.NewCuratorClient(curC)
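	// Role changes and node deletion go through the Management service, while
	// consensus status is reported by the cluster-internal Curator service.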

	util.MustTestEventual(t, "Remove KubernetesController role", ctx, 10*time.Second, func(ctx context.Context) error {
		_, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
			Node: &apb.UpdateNodeRolesRequest_Id{
				Id: cluster.NodeIDs[0],
			},
			KubernetesController: ptr.To(false),
		})
		return err
	})
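	// Removing ConsensusMember gets a more generous timeout, as it also
	// involves removing the node's member from the etcd cluster.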
	util.MustTestEventual(t, "Remove ConsensusMember role", ctx, time.Minute, func(ctx context.Context) error {
		_, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
			Node: &apb.UpdateNodeRolesRequest_Id{
				Id: cluster.NodeIDs[0],
			},
			ConsensusMember: ptr.To(false),
		})
		return err
	})

	// Test that removing the ConsensusMember role from a node removed the
	// corresponding etcd member from the cluster.
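	// The member IDs reported in the consensus status match Metropolis node
	// IDs, which is what the scan below relies on.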
	var st *ipb.GetConsensusStatusResponse
	util.MustTestEventual(t, "Get ConsensusStatus", ctx, time.Minute, func(ctx context.Context) error {
		st, err = cur.GetConsensusStatus(ctx, &ipb.GetConsensusStatusRequest{})
		return err
	})

	for _, member := range st.EtcdMember {
		if member.Id == cluster.NodeIDs[0] {
			t.Errorf("etcd member %s still present after role removal", member.Id)
		}
	}

	// Test that the cluster still works after deleting the first node and
	// restarting the remaining nodes.
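	// The node was never decommissioned, so the request carries the explicit
	// not-decommissioned safety bypass.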
	util.MustTestEventual(t, "Delete first node", ctx, 10*time.Second, func(ctx context.Context) error {
		_, err := mgmt.DeleteNode(ctx, &apb.DeleteNodeRequest{
			Node: &apb.DeleteNodeRequest_Id{
				Id: cluster.NodeIDs[0],
			},
			SafetyBypassNotDecommissioned: &apb.DeleteNodeRequest_SafetyBypassNotDecommissioned{},
		})
		return err
	})

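	// Nodes 1 and 2 are now the only consensus members, so both must come
	// back for the two-member etcd cluster to regain quorum after this
	// second cold boot.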
	// Shut every remaining node down.
	for i := 1; i < clusterOptions.NumNodes; i++ {
		if err := cluster.ShutdownNode(i); err != nil {
			t.Fatalf("Could not shutdown node %d: %v", i, err)
		}
	}
	// Start every remaining node back up.
	for i := 1; i < clusterOptions.NumNodes; i++ {
		if err := cluster.StartNode(i); err != nil {
			t.Fatalf("Could not start node %d: %v", i, err)
		}
	}
	// Check if the cluster comes back up.
	util.TestEventual(t, "Heartbeat test successful after node deletion", ctx, 60*time.Second, cluster.AllNodesHealthy)
}