blob: 7dcd228dda478debb21a866bb97a7b18f110137d [file] [log] [blame]
Serge Bazanski99b02142024-04-17 16:33:28 +02001package ha
2
3import (
4 "context"
5 "fmt"
6 "os"
7 "testing"
8 "time"
9
10 "github.com/bazelbuild/rules_go/go/runfiles"
11
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020012 mlaunch "source.monogon.dev/metropolis/test/launch"
13 "source.monogon.dev/metropolis/test/localregistry"
Serge Bazanski99b02142024-04-17 16:33:28 +020014 "source.monogon.dev/metropolis/test/util"
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020015 "source.monogon.dev/osbase/test/launch"
Jan Schärad8982f2024-09-17 13:56:34 +020016
17 cpb "source.monogon.dev/metropolis/node/core/curator/proto/api"
18 apb "source.monogon.dev/metropolis/proto/api"
Serge Bazanski99b02142024-04-17 16:33:28 +020019)
20
// xTestImagesManifestPath is filled by bazel at linking time with the
// canonical path of the test images manifest. The init function resolves it
// to the real path using the rules_go runfiles package.
var xTestImagesManifestPath string
27
28func init() {
29 var err error
30 for _, path := range []*string{
31 &xTestImagesManifestPath,
32 } {
33 *path, err = runfiles.Rlocation(*path)
34 if err != nil {
35 panic(err)
36 }
37 }
38}
39
// Timeout values for the end-to-end tests.
const (
	// globalTestTimeout bounds the global test context.
	//
	// Bazel would eventually time out the test after 900s ("large") if, for
	// some reason, the context cancellation fails to abort it.
	globalTestTimeout = 10 * time.Minute

	// smallTestTimeout and largeTestTimeout bound individual end-to-end
	// tests of different sizes.
	smallTestTimeout = time.Minute
	largeTestTimeout = 2 * time.Minute
)
51
52// TestE2ECoreHA exercises the basics of a high-availability control plane by
53// starting up a 3-node cluster, turning all nodes into ConsensusMembers, then
54// performing a rolling restart.
55func TestE2ECoreHA(t *testing.T) {
56 // Set a global timeout to make sure this terminates
57 ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
58 defer cancel()
59
Tim Windelschmidt82e6af72024-07-23 00:05:42 +000060 df, err := os.ReadFile(xTestImagesManifestPath)
Serge Bazanski99b02142024-04-17 16:33:28 +020061 if err != nil {
62 t.Fatalf("Reading registry manifest failed: %v", err)
63 }
64 lr, err := localregistry.FromBazelManifest(df)
65 if err != nil {
66 t.Fatalf("Creating test image registry failed: %v", err)
67 }
68 // Launch cluster.
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020069 clusterOptions := mlaunch.ClusterOptions{
Serge Bazanski99b02142024-04-17 16:33:28 +020070 NumNodes: 3,
71 LocalRegistry: lr,
72 NodeLogsToFiles: true,
73 }
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020074 cluster, err := mlaunch.LaunchCluster(ctx, clusterOptions)
Serge Bazanski99b02142024-04-17 16:33:28 +020075 if err != nil {
76 t.Fatalf("LaunchCluster failed: %v", err)
77 }
78 defer func() {
79 err := cluster.Close()
80 if err != nil {
81 t.Fatalf("cluster Close failed: %v", err)
82 }
83 }()
84
85 launch.Log("E2E: Cluster running, starting tests...")
86
87 util.MustTestEventual(t, "Add ConsensusMember roles", ctx, smallTestTimeout, func(ctx context.Context) error {
88 // Make everything but the first node into ConsensusMember.
89 for i := 1; i < clusterOptions.NumNodes; i++ {
90 err := cluster.MakeConsensusMember(ctx, cluster.NodeIDs[i])
91 if err != nil {
92 return fmt.Errorf("MakeConsensusMember(%d/%s): %w", i, cluster.NodeIDs[i], err)
93 }
94 }
95 return nil
96 })
97 util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
98
99 // Perform rolling restart of all nodes. When a node rejoins it must be able to
100 // contact the cluster, so this also exercises that the cluster is serving even
101 // with the node having rebooted.
102 for i := 0; i < clusterOptions.NumNodes; i++ {
103 util.MustTestEventual(t, fmt.Sprintf("Node %d rejoin successful", i), ctx, 60*time.Second, func(ctx context.Context) error {
104 // Ensure nodes rejoin the cluster after a reboot by reboting the 1st node.
105 if err := cluster.RebootNode(ctx, i); err != nil {
106 return fmt.Errorf("while rebooting a node: %w", err)
107 }
108 return nil
109 })
110 }
Jan Schärad8982f2024-09-17 13:56:34 +0200111
112 // Test node role removal.
113 curC, err := cluster.CuratorClient()
114 if err != nil {
115 t.Fatalf("Could not get CuratorClient: %v", err)
116 }
117 mgmt := apb.NewManagementClient(curC)
118 cur := cpb.NewCuratorClient(curC)
119
120 util.MustTestEventual(t, "Remove KubernetesController role", ctx, 10*time.Second, func(ctx context.Context) error {
121 fa := false
122 _, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
123 Node: &apb.UpdateNodeRolesRequest_Id{
124 Id: cluster.NodeIDs[0],
125 },
126 KubernetesController: &fa,
127 })
128 return err
129 })
130 util.MustTestEventual(t, "Remove ConsensusMember role", ctx, time.Minute, func(ctx context.Context) error {
131 fa := false
132 _, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
133 Node: &apb.UpdateNodeRolesRequest_Id{
134 Id: cluster.NodeIDs[0],
135 },
136 ConsensusMember: &fa,
137 })
138 return err
139 })
140
141 // Test that removing the ConsensusMember role from a node removed the
142 // corresponding etcd member from the cluster.
143 var st *cpb.GetConsensusStatusResponse
144 util.MustTestEventual(t, "Get ConsensusStatus", ctx, time.Minute, func(ctx context.Context) error {
145 st, err = cur.GetConsensusStatus(ctx, &cpb.GetConsensusStatusRequest{})
146 return err
147 })
148
149 for _, member := range st.EtcdMember {
150 if member.Id == cluster.NodeIDs[0] {
151 t.Errorf("member still present in etcd")
152 }
153 }
Serge Bazanski99b02142024-04-17 16:33:28 +0200154}