blob: b56e40c150d32de2f8731051f8306cf24078b744 [file] [log] [blame]
// Copyright The Monogon Project Authors.
// SPDX-License-Identifier: Apache-2.0
3
Serge Bazanski99b02142024-04-17 16:33:28 +02004package ha
5
6import (
7 "context"
8 "fmt"
9 "os"
10 "testing"
11 "time"
12
13 "github.com/bazelbuild/rules_go/go/runfiles"
14
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020015 mlaunch "source.monogon.dev/metropolis/test/launch"
16 "source.monogon.dev/metropolis/test/localregistry"
Serge Bazanski99b02142024-04-17 16:33:28 +020017 "source.monogon.dev/metropolis/test/util"
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020018 "source.monogon.dev/osbase/test/launch"
Serge Bazanski99b02142024-04-17 16:33:28 +020019)
20
var (
	// Bazel fills the variables below at link time with the canonical
	// runfiles path of the file each one refers to. The init function then
	// resolves them to real paths using the rules_go runfiles package.
	xTestImagesManifestPath string
)
27
28func init() {
29 var err error
30 for _, path := range []*string{
31 &xTestImagesManifestPath,
32 } {
33 *path, err = runfiles.Rlocation(*path)
34 if err != nil {
35 panic(err)
36 }
37 }
38}
39
const (
	// globalTestTimeout bounds the context shared by the whole test.
	//
	// Should cancelling that context fail to abort the test for some reason,
	// Bazel would eventually time it out itself after 900s ("large").
	globalTestTimeout = 10 * time.Minute

	// Per-step timeouts for individual end-to-end tests of different sizes.
	smallTestTimeout = 1 * time.Minute
	largeTestTimeout = 2 * time.Minute
)
51
52// TestE2ECoreHA exercises the basics of a high-availability control plane by
53// starting up a 3-node cluster, turning all nodes into ConsensusMembers, then
54// performing a rolling restart.
55func TestE2ECoreHA(t *testing.T) {
56 // Set a global timeout to make sure this terminates
57 ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
58 defer cancel()
59
Tim Windelschmidt82e6af72024-07-23 00:05:42 +000060 df, err := os.ReadFile(xTestImagesManifestPath)
Serge Bazanski99b02142024-04-17 16:33:28 +020061 if err != nil {
62 t.Fatalf("Reading registry manifest failed: %v", err)
63 }
64 lr, err := localregistry.FromBazelManifest(df)
65 if err != nil {
66 t.Fatalf("Creating test image registry failed: %v", err)
67 }
68 // Launch cluster.
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020069 clusterOptions := mlaunch.ClusterOptions{
Serge Bazanski99b02142024-04-17 16:33:28 +020070 NumNodes: 3,
71 LocalRegistry: lr,
72 NodeLogsToFiles: true,
73 }
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020074 cluster, err := mlaunch.LaunchCluster(ctx, clusterOptions)
Serge Bazanski99b02142024-04-17 16:33:28 +020075 if err != nil {
76 t.Fatalf("LaunchCluster failed: %v", err)
77 }
78 defer func() {
79 err := cluster.Close()
80 if err != nil {
81 t.Fatalf("cluster Close failed: %v", err)
82 }
83 }()
84
85 launch.Log("E2E: Cluster running, starting tests...")
86
87 util.MustTestEventual(t, "Add ConsensusMember roles", ctx, smallTestTimeout, func(ctx context.Context) error {
88 // Make everything but the first node into ConsensusMember.
89 for i := 1; i < clusterOptions.NumNodes; i++ {
90 err := cluster.MakeConsensusMember(ctx, cluster.NodeIDs[i])
91 if err != nil {
92 return fmt.Errorf("MakeConsensusMember(%d/%s): %w", i, cluster.NodeIDs[i], err)
93 }
94 }
95 return nil
96 })
97 util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
98
99 // Perform rolling restart of all nodes. When a node rejoins it must be able to
100 // contact the cluster, so this also exercises that the cluster is serving even
101 // with the node having rebooted.
102 for i := 0; i < clusterOptions.NumNodes; i++ {
103 util.MustTestEventual(t, fmt.Sprintf("Node %d rejoin successful", i), ctx, 60*time.Second, func(ctx context.Context) error {
104 // Ensure nodes rejoin the cluster after a reboot by reboting the 1st node.
105 if err := cluster.RebootNode(ctx, i); err != nil {
106 return fmt.Errorf("while rebooting a node: %w", err)
107 }
108 return nil
109 })
110 }
Serge Bazanski99b02142024-04-17 16:33:28 +0200111}