blob: ae6a04327ccb85e9f8ed3c95a0d5e589ada7db4a [file] [log] [blame]
Serge Bazanski99b02142024-04-17 16:33:28 +02001package ha_cold
2
3import (
4 "context"
5 "fmt"
6 "testing"
7 "time"
8
Jan Schärd1a8b642024-12-03 17:40:41 +01009 "k8s.io/utils/ptr"
10
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020011 mlaunch "source.monogon.dev/metropolis/test/launch"
Serge Bazanski99b02142024-04-17 16:33:28 +020012 "source.monogon.dev/metropolis/test/util"
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020013 "source.monogon.dev/osbase/test/launch"
Serge Bazanski99b02142024-04-17 16:33:28 +020014
Jan Schärd5538b52024-09-25 13:16:49 +020015 ipb "source.monogon.dev/metropolis/node/core/curator/proto/api"
16 apb "source.monogon.dev/metropolis/proto/api"
Serge Bazanski99b02142024-04-17 16:33:28 +020017 cpb "source.monogon.dev/metropolis/proto/common"
18)
19
const (
	// Timeout for the global test context.
	//
	// Bazel would eventually time out the test after 900s ("large") if, for
	// some reason, the context cancellation fails to abort it.
	globalTestTimeout = 600 * time.Second

	// Timeouts for individual end-to-end tests of different sizes.
	// NOTE(review): largeTestTimeout is not referenced in this file — confirm
	// it is used by another file in this package before removing it.
	smallTestTimeout = 60 * time.Second
	largeTestTimeout = 120 * time.Second
)
31
32// TestE2EColdStartHA exercises an HA cluster being fully shut down then
33// restarted again.
34//
35// Metropolis currently doesn't support cold startups from TPM/Secure clusters,
36// so we test a non-TPM/Insecure cluster.
37func TestE2EColdStartHA(t *testing.T) {
38 // Set a global timeout to make sure this terminates
39 ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
40 defer cancel()
41
42 // Launch cluster.
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020043 clusterOptions := mlaunch.ClusterOptions{
Serge Bazanski99b02142024-04-17 16:33:28 +020044 NumNodes: 3,
45 NodeLogsToFiles: true,
46 InitialClusterConfiguration: &cpb.ClusterConfiguration{
Jan Schär39f4f5c2024-10-29 09:41:50 +010047 ClusterDomain: "cluster.test",
Serge Bazanski99b02142024-04-17 16:33:28 +020048 TpmMode: cpb.ClusterConfiguration_TPM_MODE_DISABLED,
49 StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
50 },
51 }
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020052 cluster, err := mlaunch.LaunchCluster(ctx, clusterOptions)
Serge Bazanski99b02142024-04-17 16:33:28 +020053 if err != nil {
54 t.Fatalf("LaunchCluster failed: %v", err)
55 }
56 defer func() {
57 err := cluster.Close()
58 if err != nil {
59 t.Fatalf("cluster Close failed: %v", err)
60 }
61 }()
62
63 launch.Log("E2E: Cluster running, starting tests...")
64
65 util.MustTestEventual(t, "Add ConsensusMember roles", ctx, smallTestTimeout, func(ctx context.Context) error {
66 // Make everything but the first node into ConsensusMember.
67 for i := 1; i < clusterOptions.NumNodes; i++ {
68 err := cluster.MakeConsensusMember(ctx, cluster.NodeIDs[i])
69 if err != nil {
70 return util.Permanent(fmt.Errorf("MakeConsensusMember(%d/%s): %w", i, cluster.NodeIDs[i], err))
71 }
72 }
73 return nil
74 })
75 util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
76
77 // Shut every node down.
78 for i := 0; i < clusterOptions.NumNodes; i++ {
79 if err := cluster.ShutdownNode(i); err != nil {
80 t.Fatalf("Could not shutdown node %d", i)
81 }
82 }
83 // Start every node back up.
84 for i := 0; i < clusterOptions.NumNodes; i++ {
85 if err := cluster.StartNode(i); err != nil {
86 t.Fatalf("Could not shutdown node %d", i)
87 }
88 }
89 // Check if the cluster comes back up.
90 util.TestEventual(t, "Heartbeat test successful", ctx, 60*time.Second, cluster.AllNodesHealthy)
Jan Schärd5538b52024-09-25 13:16:49 +020091
92 // Test node role removal.
93 curC, err := cluster.CuratorClient()
94 if err != nil {
95 t.Fatalf("Could not get CuratorClient: %v", err)
96 }
97 mgmt := apb.NewManagementClient(curC)
98 cur := ipb.NewCuratorClient(curC)
99
100 util.MustTestEventual(t, "Remove KubernetesController role", ctx, 10*time.Second, func(ctx context.Context) error {
Jan Schärd5538b52024-09-25 13:16:49 +0200101 _, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
102 Node: &apb.UpdateNodeRolesRequest_Id{
103 Id: cluster.NodeIDs[0],
104 },
Jan Schärd1a8b642024-12-03 17:40:41 +0100105 KubernetesController: ptr.To(false),
Jan Schärd5538b52024-09-25 13:16:49 +0200106 })
107 return err
108 })
109 util.MustTestEventual(t, "Remove ConsensusMember role", ctx, time.Minute, func(ctx context.Context) error {
Jan Schärd5538b52024-09-25 13:16:49 +0200110 _, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
111 Node: &apb.UpdateNodeRolesRequest_Id{
112 Id: cluster.NodeIDs[0],
113 },
Jan Schärd1a8b642024-12-03 17:40:41 +0100114 ConsensusMember: ptr.To(false),
Jan Schärd5538b52024-09-25 13:16:49 +0200115 })
116 return err
117 })
118
119 // Test that removing the ConsensusMember role from a node removed the
120 // corresponding etcd member from the cluster.
121 var st *ipb.GetConsensusStatusResponse
122 util.MustTestEventual(t, "Get ConsensusStatus", ctx, time.Minute, func(ctx context.Context) error {
123 st, err = cur.GetConsensusStatus(ctx, &ipb.GetConsensusStatusRequest{})
124 return err
125 })
126
127 for _, member := range st.EtcdMember {
128 if member.Id == cluster.NodeIDs[0] {
129 t.Errorf("member still present in etcd")
130 }
131 }
132
133 // Test that that the cluster still works after deleting the first node and
134 // restarting the remaining nodes.
135 util.MustTestEventual(t, "Delete first node", ctx, 10*time.Second, func(ctx context.Context) error {
136 _, err := mgmt.DeleteNode(ctx, &apb.DeleteNodeRequest{
137 Node: &apb.DeleteNodeRequest_Id{
138 Id: cluster.NodeIDs[0],
139 },
140 SafetyBypassNotDecommissioned: &apb.DeleteNodeRequest_SafetyBypassNotDecommissioned{},
141 })
142 return err
143 })
144
145 // Shut every remaining node down.
146 for i := 1; i < clusterOptions.NumNodes; i++ {
147 if err := cluster.ShutdownNode(i); err != nil {
148 t.Fatalf("Could not shutdown node %d", i)
149 }
150 }
151 // Start every remaining node back up.
152 for i := 1; i < clusterOptions.NumNodes; i++ {
153 if err := cluster.StartNode(i); err != nil {
154 t.Fatalf("Could not shutdown node %d", i)
155 }
156 }
157 // Check if the cluster comes back up.
158 util.TestEventual(t, "Heartbeat test successful", ctx, 60*time.Second, cluster.AllNodesHealthy)
Serge Bazanski99b02142024-04-17 16:33:28 +0200159}