blob: 63a2acd6bfd96f3ff374146eba3a9f626615083a [file] [log] [blame]
Serge Bazanski99b02142024-04-17 16:33:28 +02001package ha
2
3import (
4 "context"
5 "fmt"
6 "os"
7 "testing"
8 "time"
9
10 "github.com/bazelbuild/rules_go/go/runfiles"
11
12 "source.monogon.dev/metropolis/pkg/localregistry"
13 "source.monogon.dev/metropolis/test/launch"
14 "source.monogon.dev/metropolis/test/launch/cluster"
15 "source.monogon.dev/metropolis/test/util"
16)
17
const (
	// globalTestTimeout is the upper bound on the whole test, enforced through
	// the test context.
	//
	// Should context cancellation fail to abort the test for some reason,
	// Bazel would still time it out after 900s (the limit for "large" tests).
	globalTestTimeout = 10 * time.Minute

	// Per-step timeouts for individual end-to-end tests, by size.
	smallTestTimeout = time.Minute
	largeTestTimeout = 2 * time.Minute
)
29
30// TestE2ECoreHA exercises the basics of a high-availability control plane by
31// starting up a 3-node cluster, turning all nodes into ConsensusMembers, then
32// performing a rolling restart.
33func TestE2ECoreHA(t *testing.T) {
34 // Set a global timeout to make sure this terminates
35 ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
36 defer cancel()
37
38 rPath, err := runfiles.Rlocation("_main/metropolis/test/e2e/testimages_manifest.prototxt")
39 if err != nil {
40 t.Fatalf("Resolving registry manifest failed: %v", err)
41 }
42 df, err := os.ReadFile(rPath)
43 if err != nil {
44 t.Fatalf("Reading registry manifest failed: %v", err)
45 }
46 lr, err := localregistry.FromBazelManifest(df)
47 if err != nil {
48 t.Fatalf("Creating test image registry failed: %v", err)
49 }
50 // Launch cluster.
51 clusterOptions := cluster.ClusterOptions{
52 NumNodes: 3,
53 LocalRegistry: lr,
54 NodeLogsToFiles: true,
55 }
56 cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
57 if err != nil {
58 t.Fatalf("LaunchCluster failed: %v", err)
59 }
60 defer func() {
61 err := cluster.Close()
62 if err != nil {
63 t.Fatalf("cluster Close failed: %v", err)
64 }
65 }()
66
67 launch.Log("E2E: Cluster running, starting tests...")
68
69 util.MustTestEventual(t, "Add ConsensusMember roles", ctx, smallTestTimeout, func(ctx context.Context) error {
70 // Make everything but the first node into ConsensusMember.
71 for i := 1; i < clusterOptions.NumNodes; i++ {
72 err := cluster.MakeConsensusMember(ctx, cluster.NodeIDs[i])
73 if err != nil {
74 return fmt.Errorf("MakeConsensusMember(%d/%s): %w", i, cluster.NodeIDs[i], err)
75 }
76 }
77 return nil
78 })
79 util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
80
81 // Perform rolling restart of all nodes. When a node rejoins it must be able to
82 // contact the cluster, so this also exercises that the cluster is serving even
83 // with the node having rebooted.
84 for i := 0; i < clusterOptions.NumNodes; i++ {
85 util.MustTestEventual(t, fmt.Sprintf("Node %d rejoin successful", i), ctx, 60*time.Second, func(ctx context.Context) error {
86 // Ensure nodes rejoin the cluster after a reboot by reboting the 1st node.
87 if err := cluster.RebootNode(ctx, i); err != nil {
88 return fmt.Errorf("while rebooting a node: %w", err)
89 }
90 return nil
91 })
92 }
93}