blob: 6a5c6e8e7f9bb3f08c49de252a50c9e7d1e14d2d [file] [log] [blame]
Serge Bazanski99b02142024-04-17 16:33:28 +02001package ha_cold
2
3import (
4 "context"
5 "fmt"
6 "testing"
7 "time"
8
Tim Windelschmidt9f21f532024-05-07 15:14:20 +02009 mlaunch "source.monogon.dev/metropolis/test/launch"
Serge Bazanski99b02142024-04-17 16:33:28 +020010 "source.monogon.dev/metropolis/test/util"
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020011 "source.monogon.dev/osbase/test/launch"
Serge Bazanski99b02142024-04-17 16:33:28 +020012
Jan Schärd5538b52024-09-25 13:16:49 +020013 ipb "source.monogon.dev/metropolis/node/core/curator/proto/api"
14 apb "source.monogon.dev/metropolis/proto/api"
Serge Bazanski99b02142024-04-17 16:33:28 +020015 cpb "source.monogon.dev/metropolis/proto/common"
16)
17
const (
	// Timeout for the global test context.
	//
	// Bazel would eventually time out the test after 900s ("large") if, for
	// some reason, the context cancellation fails to abort it.
	globalTestTimeout = 600 * time.Second

	// Timeouts for individual end-to-end tests of different sizes.
	smallTestTimeout = 60 * time.Second
	// NOTE(review): largeTestTimeout is not referenced in this file;
	// presumably kept for parity with sibling e2e tests — confirm before
	// removing.
	largeTestTimeout = 120 * time.Second
)
29
30// TestE2EColdStartHA exercises an HA cluster being fully shut down then
31// restarted again.
32//
33// Metropolis currently doesn't support cold startups from TPM/Secure clusters,
34// so we test a non-TPM/Insecure cluster.
35func TestE2EColdStartHA(t *testing.T) {
36 // Set a global timeout to make sure this terminates
37 ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
38 defer cancel()
39
40 // Launch cluster.
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020041 clusterOptions := mlaunch.ClusterOptions{
Serge Bazanski99b02142024-04-17 16:33:28 +020042 NumNodes: 3,
43 NodeLogsToFiles: true,
44 InitialClusterConfiguration: &cpb.ClusterConfiguration{
45 TpmMode: cpb.ClusterConfiguration_TPM_MODE_DISABLED,
46 StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
47 },
48 }
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020049 cluster, err := mlaunch.LaunchCluster(ctx, clusterOptions)
Serge Bazanski99b02142024-04-17 16:33:28 +020050 if err != nil {
51 t.Fatalf("LaunchCluster failed: %v", err)
52 }
53 defer func() {
54 err := cluster.Close()
55 if err != nil {
56 t.Fatalf("cluster Close failed: %v", err)
57 }
58 }()
59
60 launch.Log("E2E: Cluster running, starting tests...")
61
62 util.MustTestEventual(t, "Add ConsensusMember roles", ctx, smallTestTimeout, func(ctx context.Context) error {
63 // Make everything but the first node into ConsensusMember.
64 for i := 1; i < clusterOptions.NumNodes; i++ {
65 err := cluster.MakeConsensusMember(ctx, cluster.NodeIDs[i])
66 if err != nil {
67 return util.Permanent(fmt.Errorf("MakeConsensusMember(%d/%s): %w", i, cluster.NodeIDs[i], err))
68 }
69 }
70 return nil
71 })
72 util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
73
74 // Shut every node down.
75 for i := 0; i < clusterOptions.NumNodes; i++ {
76 if err := cluster.ShutdownNode(i); err != nil {
77 t.Fatalf("Could not shutdown node %d", i)
78 }
79 }
80 // Start every node back up.
81 for i := 0; i < clusterOptions.NumNodes; i++ {
82 if err := cluster.StartNode(i); err != nil {
83 t.Fatalf("Could not shutdown node %d", i)
84 }
85 }
86 // Check if the cluster comes back up.
87 util.TestEventual(t, "Heartbeat test successful", ctx, 60*time.Second, cluster.AllNodesHealthy)
Jan Schärd5538b52024-09-25 13:16:49 +020088
89 // Test node role removal.
90 curC, err := cluster.CuratorClient()
91 if err != nil {
92 t.Fatalf("Could not get CuratorClient: %v", err)
93 }
94 mgmt := apb.NewManagementClient(curC)
95 cur := ipb.NewCuratorClient(curC)
96
97 util.MustTestEventual(t, "Remove KubernetesController role", ctx, 10*time.Second, func(ctx context.Context) error {
98 fa := false
99 _, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
100 Node: &apb.UpdateNodeRolesRequest_Id{
101 Id: cluster.NodeIDs[0],
102 },
103 KubernetesController: &fa,
104 })
105 return err
106 })
107 util.MustTestEventual(t, "Remove ConsensusMember role", ctx, time.Minute, func(ctx context.Context) error {
108 fa := false
109 _, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
110 Node: &apb.UpdateNodeRolesRequest_Id{
111 Id: cluster.NodeIDs[0],
112 },
113 ConsensusMember: &fa,
114 })
115 return err
116 })
117
118 // Test that removing the ConsensusMember role from a node removed the
119 // corresponding etcd member from the cluster.
120 var st *ipb.GetConsensusStatusResponse
121 util.MustTestEventual(t, "Get ConsensusStatus", ctx, time.Minute, func(ctx context.Context) error {
122 st, err = cur.GetConsensusStatus(ctx, &ipb.GetConsensusStatusRequest{})
123 return err
124 })
125
126 for _, member := range st.EtcdMember {
127 if member.Id == cluster.NodeIDs[0] {
128 t.Errorf("member still present in etcd")
129 }
130 }
131
132 // Test that that the cluster still works after deleting the first node and
133 // restarting the remaining nodes.
134 util.MustTestEventual(t, "Delete first node", ctx, 10*time.Second, func(ctx context.Context) error {
135 _, err := mgmt.DeleteNode(ctx, &apb.DeleteNodeRequest{
136 Node: &apb.DeleteNodeRequest_Id{
137 Id: cluster.NodeIDs[0],
138 },
139 SafetyBypassNotDecommissioned: &apb.DeleteNodeRequest_SafetyBypassNotDecommissioned{},
140 })
141 return err
142 })
143
144 // Shut every remaining node down.
145 for i := 1; i < clusterOptions.NumNodes; i++ {
146 if err := cluster.ShutdownNode(i); err != nil {
147 t.Fatalf("Could not shutdown node %d", i)
148 }
149 }
150 // Start every remaining node back up.
151 for i := 1; i < clusterOptions.NumNodes; i++ {
152 if err := cluster.StartNode(i); err != nil {
153 t.Fatalf("Could not shutdown node %d", i)
154 }
155 }
156 // Check if the cluster comes back up.
157 util.TestEventual(t, "Heartbeat test successful", ctx, 60*time.Second, cluster.AllNodesHealthy)
Serge Bazanski99b02142024-04-17 16:33:28 +0200158}