blob: 05175c15148784166beae6ba57ff80c5eb34b7ba [file] [log] [blame]
// Copyright The Monogon Project Authors.
// SPDX-License-Identifier: Apache-2.0
3
Serge Bazanski99b02142024-04-17 16:33:28 +02004package ha
5
6import (
7 "context"
8 "fmt"
9 "os"
10 "testing"
11 "time"
12
13 "github.com/bazelbuild/rules_go/go/runfiles"
14
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020015 mlaunch "source.monogon.dev/metropolis/test/launch"
16 "source.monogon.dev/metropolis/test/localregistry"
Serge Bazanski99b02142024-04-17 16:33:28 +020017 "source.monogon.dev/metropolis/test/util"
18)
19
// xTestImagesManifestPath is set by Bazel at link time to the canonical
// runfiles path of the test images manifest. The init function then replaces
// it with the real path, resolved through the rules_go runfiles package.
var xTestImagesManifestPath string
26
27func init() {
28 var err error
29 for _, path := range []*string{
30 &xTestImagesManifestPath,
31 } {
32 *path, err = runfiles.Rlocation(*path)
33 if err != nil {
34 panic(err)
35 }
36 }
37}
38
const (
	// globalTestTimeout bounds the context shared by the whole test.
	//
	// Should context cancellation somehow fail to abort the test, Bazel would
	// still time it out eventually, after 900s (the "large" test size).
	globalTestTimeout = 10 * time.Minute

	// Per-step timeouts for individual end-to-end tests of different sizes.
	smallTestTimeout = time.Minute
	largeTestTimeout = 2 * time.Minute
)
50
51// TestE2ECoreHA exercises the basics of a high-availability control plane by
52// starting up a 3-node cluster, turning all nodes into ConsensusMembers, then
53// performing a rolling restart.
54func TestE2ECoreHA(t *testing.T) {
55 // Set a global timeout to make sure this terminates
56 ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
57 defer cancel()
58
Tim Windelschmidt82e6af72024-07-23 00:05:42 +000059 df, err := os.ReadFile(xTestImagesManifestPath)
Serge Bazanski99b02142024-04-17 16:33:28 +020060 if err != nil {
61 t.Fatalf("Reading registry manifest failed: %v", err)
62 }
63 lr, err := localregistry.FromBazelManifest(df)
64 if err != nil {
65 t.Fatalf("Creating test image registry failed: %v", err)
66 }
67 // Launch cluster.
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020068 clusterOptions := mlaunch.ClusterOptions{
Serge Bazanski99b02142024-04-17 16:33:28 +020069 NumNodes: 3,
70 LocalRegistry: lr,
71 NodeLogsToFiles: true,
Jan Schär3cc51632025-06-23 14:37:50 +000072 Node: mlaunch.NodeOptions{
73 // ESP, 2 system partitions, and data partition.
74 DiskBytes: (128 + 2*1024 + 512) * 1024 * 1024,
75 },
Serge Bazanski99b02142024-04-17 16:33:28 +020076 }
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020077 cluster, err := mlaunch.LaunchCluster(ctx, clusterOptions)
Serge Bazanski99b02142024-04-17 16:33:28 +020078 if err != nil {
79 t.Fatalf("LaunchCluster failed: %v", err)
80 }
81 defer func() {
82 err := cluster.Close()
83 if err != nil {
84 t.Fatalf("cluster Close failed: %v", err)
85 }
86 }()
87
Serge Bazanski99b02142024-04-17 16:33:28 +020088 util.MustTestEventual(t, "Add ConsensusMember roles", ctx, smallTestTimeout, func(ctx context.Context) error {
89 // Make everything but the first node into ConsensusMember.
90 for i := 1; i < clusterOptions.NumNodes; i++ {
91 err := cluster.MakeConsensusMember(ctx, cluster.NodeIDs[i])
92 if err != nil {
93 return fmt.Errorf("MakeConsensusMember(%d/%s): %w", i, cluster.NodeIDs[i], err)
94 }
95 }
96 return nil
97 })
98 util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
99
100 // Perform rolling restart of all nodes. When a node rejoins it must be able to
101 // contact the cluster, so this also exercises that the cluster is serving even
102 // with the node having rebooted.
103 for i := 0; i < clusterOptions.NumNodes; i++ {
104 util.MustTestEventual(t, fmt.Sprintf("Node %d rejoin successful", i), ctx, 60*time.Second, func(ctx context.Context) error {
105 // Ensure nodes rejoin the cluster after a reboot by reboting the 1st node.
106 if err := cluster.RebootNode(ctx, i); err != nil {
107 return fmt.Errorf("while rebooting a node: %w", err)
108 }
109 return nil
110 })
111 }
Serge Bazanski99b02142024-04-17 16:33:28 +0200112}