blob: 6670b8f45592e15331a9d5f161c707889dfa2c97 [file] [log] [blame]
Serge Bazanski99b02142024-04-17 16:33:28 +02001package ha_cold
2
3import (
4 "context"
5 "fmt"
6 "testing"
7 "time"
8
9 "source.monogon.dev/metropolis/test/launch"
10 "source.monogon.dev/metropolis/test/launch/cluster"
11 "source.monogon.dev/metropolis/test/util"
12
13 cpb "source.monogon.dev/metropolis/proto/common"
14)
15
// Timeouts used by the end-to-end tests in this package.
const (
	// globalTestTimeout bounds the global test context.
	//
	// Bazel would eventually time out the test after 900s ("large") if, for
	// some reason, the context cancellation fails to abort it.
	globalTestTimeout = 10 * time.Minute

	// smallTestTimeout and largeTestTimeout bound individual end-to-end
	// test steps of different sizes.
	smallTestTimeout = time.Minute
	largeTestTimeout = 2 * time.Minute
)
27
28// TestE2EColdStartHA exercises an HA cluster being fully shut down then
29// restarted again.
30//
31// Metropolis currently doesn't support cold startups from TPM/Secure clusters,
32// so we test a non-TPM/Insecure cluster.
33func TestE2EColdStartHA(t *testing.T) {
34 // Set a global timeout to make sure this terminates
35 ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
36 defer cancel()
37
38 // Launch cluster.
39 clusterOptions := cluster.ClusterOptions{
40 NumNodes: 3,
41 NodeLogsToFiles: true,
42 InitialClusterConfiguration: &cpb.ClusterConfiguration{
43 TpmMode: cpb.ClusterConfiguration_TPM_MODE_DISABLED,
44 StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
45 },
46 }
47 cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
48 if err != nil {
49 t.Fatalf("LaunchCluster failed: %v", err)
50 }
51 defer func() {
52 err := cluster.Close()
53 if err != nil {
54 t.Fatalf("cluster Close failed: %v", err)
55 }
56 }()
57
58 launch.Log("E2E: Cluster running, starting tests...")
59
60 util.MustTestEventual(t, "Add ConsensusMember roles", ctx, smallTestTimeout, func(ctx context.Context) error {
61 // Make everything but the first node into ConsensusMember.
62 for i := 1; i < clusterOptions.NumNodes; i++ {
63 err := cluster.MakeConsensusMember(ctx, cluster.NodeIDs[i])
64 if err != nil {
65 return util.Permanent(fmt.Errorf("MakeConsensusMember(%d/%s): %w", i, cluster.NodeIDs[i], err))
66 }
67 }
68 return nil
69 })
70 util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
71
72 // Shut every node down.
73 for i := 0; i < clusterOptions.NumNodes; i++ {
74 if err := cluster.ShutdownNode(i); err != nil {
75 t.Fatalf("Could not shutdown node %d", i)
76 }
77 }
78 // Start every node back up.
79 for i := 0; i < clusterOptions.NumNodes; i++ {
80 if err := cluster.StartNode(i); err != nil {
81 t.Fatalf("Could not shutdown node %d", i)
82 }
83 }
84 // Check if the cluster comes back up.
85 util.TestEventual(t, "Heartbeat test successful", ctx, 60*time.Second, cluster.AllNodesHealthy)
86}