package ha_cold

import (
	"context"
	"fmt"
	"testing"
	"time"

	mlaunch "source.monogon.dev/metropolis/test/launch"
	"source.monogon.dev/metropolis/test/util"
	"source.monogon.dev/osbase/test/launch"

	ipb "source.monogon.dev/metropolis/node/core/curator/proto/api"
	apb "source.monogon.dev/metropolis/proto/api"
	cpb "source.monogon.dev/metropolis/proto/common"
)

const (
	// Timeout for the global test context.
	//
	// Bazel would eventually time out the test after 900s ("large") if, for
	// some reason, the context cancellation fails to abort it.
	globalTestTimeout = 600 * time.Second

	// Timeouts for individual end-to-end tests of different sizes.
	smallTestTimeout = 60 * time.Second
	largeTestTimeout = 120 * time.Second
)

// TestE2EColdStartHA exercises an HA cluster being fully shut down then
// restarted again.
//
// Metropolis currently doesn't support cold startups from TPM/Secure clusters,
// so we test a non-TPM/Insecure cluster.
func TestE2EColdStartHA(t *testing.T) {
	// Set a global timeout to make sure this terminates.
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()

	// Launch cluster.
	clusterOptions := mlaunch.ClusterOptions{
		NumNodes:        3,
		NodeLogsToFiles: true,
		InitialClusterConfiguration: &cpb.ClusterConfiguration{
			ClusterDomain:         "cluster.test",
			TpmMode:               cpb.ClusterConfiguration_TPM_MODE_DISABLED,
			StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
		},
	}
	cluster, err := mlaunch.LaunchCluster(ctx, clusterOptions)
	if err != nil {
		t.Fatalf("LaunchCluster failed: %v", err)
	}
	defer func() {
		err := cluster.Close()
		if err != nil {
			t.Fatalf("cluster Close failed: %v", err)
		}
	}()

	launch.Log("E2E: Cluster running, starting tests...")

	util.MustTestEventual(t, "Add ConsensusMember roles", ctx, smallTestTimeout, func(ctx context.Context) error {
		// Make everything but the first node into ConsensusMember.
		for i := 1; i < clusterOptions.NumNodes; i++ {
			err := cluster.MakeConsensusMember(ctx, cluster.NodeIDs[i])
			if err != nil {
				return util.Permanent(fmt.Errorf("MakeConsensusMember(%d/%s): %w", i, cluster.NodeIDs[i], err))
			}
		}
		return nil
	})
	util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)

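	// With the roles above applied, all nodes should now be running consensus
	// (assuming the bootstrapping node already carries the ConsensusMember
	// role), so the full shutdown below forces etcd quorum to be recovered
	// from a complete outage.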
	// Shut every node down.
	for i := 0; i < clusterOptions.NumNodes; i++ {
		if err := cluster.ShutdownNode(i); err != nil {
			t.Fatalf("Could not shut down node %d: %v", i, err)
		}
	}
	// Start every node back up.
	for i := 0; i < clusterOptions.NumNodes; i++ {
		if err := cluster.StartNode(i); err != nil {
			t.Fatalf("Could not start node %d: %v", i, err)
		}
	}
	// Check if the cluster comes back up.
	util.TestEventual(t, "Heartbeat test successful", ctx, 60*time.Second, cluster.AllNodesHealthy)

	// Test node role removal.
	curC, err := cluster.CuratorClient()
	if err != nil {
		t.Fatalf("Could not get CuratorClient: %v", err)
	}
	mgmt := apb.NewManagementClient(curC)
	cur := ipb.NewCuratorClient(curC)

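	// The role fields on UpdateNodeRolesRequest are optional booleans; passing
	// a pointer to an explicit false clears the corresponding role from the
	// node.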
	util.MustTestEventual(t, "Remove KubernetesController role", ctx, 10*time.Second, func(ctx context.Context) error {
		fa := false
		_, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
			Node: &apb.UpdateNodeRolesRequest_Id{
				Id: cluster.NodeIDs[0],
			},
			KubernetesController: &fa,
		})
		return err
	})
	util.MustTestEventual(t, "Remove ConsensusMember role", ctx, time.Minute, func(ctx context.Context) error {
		fa := false
		_, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
			Node: &apb.UpdateNodeRolesRequest_Id{
				Id: cluster.NodeIDs[0],
			},
			ConsensusMember: &fa,
		})
		return err
	})

	// Test that removing the ConsensusMember role from a node removed the
	// corresponding etcd member from the cluster.
	var st *ipb.GetConsensusStatusResponse
	util.MustTestEventual(t, "Get ConsensusStatus", ctx, time.Minute, func(ctx context.Context) error {
		st, err = cur.GetConsensusStatus(ctx, &ipb.GetConsensusStatusRequest{})
		return err
	})

	for _, member := range st.EtcdMember {
		if member.Id == cluster.NodeIDs[0] {
			t.Errorf("member still present in etcd")
		}
	}

	// Test that the cluster still works after deleting the first node and
	// restarting the remaining nodes.
	util.MustTestEventual(t, "Delete first node", ctx, 10*time.Second, func(ctx context.Context) error {
		_, err := mgmt.DeleteNode(ctx, &apb.DeleteNodeRequest{
			Node: &apb.DeleteNodeRequest_Id{
				Id: cluster.NodeIDs[0],
			},
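			// The node was never decommissioned, so its deletion has to
			// explicitly bypass the not-decommissioned safety check.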
			SafetyBypassNotDecommissioned: &apb.DeleteNodeRequest_SafetyBypassNotDecommissioned{},
		})
		return err
	})

	// Shut every remaining node down.
	for i := 1; i < clusterOptions.NumNodes; i++ {
		if err := cluster.ShutdownNode(i); err != nil {
			t.Fatalf("Could not shut down node %d: %v", i, err)
		}
	}
	// Start every remaining node back up.
	for i := 1; i < clusterOptions.NumNodes; i++ {
		if err := cluster.StartNode(i); err != nil {
			t.Fatalf("Could not start node %d: %v", i, err)
		}
	}
	// Check if the cluster comes back up.
	util.TestEventual(t, "Heartbeat test successful", ctx, 60*time.Second, cluster.AllNodesHealthy)
}