package ha_cold

import (
	"context"
	"fmt"
	"testing"
	"time"

	"source.monogon.dev/metropolis/test/launch"
	"source.monogon.dev/metropolis/test/launch/cluster"
	"source.monogon.dev/metropolis/test/util"

	cpb "source.monogon.dev/metropolis/proto/common"
)

const (
	// Timeout for the global test context.
	//
	// Bazel would eventually time out the test after 900s ("large") if, for
	// some reason, the context cancellation fails to abort it.
	globalTestTimeout = 600 * time.Second

	// Timeouts for individual end-to-end tests of different sizes.
	smallTestTimeout = 60 * time.Second
	largeTestTimeout = 120 * time.Second
)

// TestE2EColdStartHA exercises an HA cluster being fully shut down, then
// restarted again.
//
// Metropolis currently doesn't support cold startups from TPM/Secure clusters,
// so we test a non-TPM/Insecure cluster.
func TestE2EColdStartHA(t *testing.T) {
	// Set a global timeout to make sure this terminates.
	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
	defer cancel()

	// Launch cluster.
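	// Three nodes form the smallest fault-tolerant etcd cluster (quorum of
	// two out of three voting members), so the cold start below has to
	// re-establish a real multi-node quorum.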
	clusterOptions := cluster.ClusterOptions{
		NumNodes:        3,
		NodeLogsToFiles: true,
		InitialClusterConfiguration: &cpb.ClusterConfiguration{
			TpmMode:               cpb.ClusterConfiguration_TPM_MODE_DISABLED,
			StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
		},
	}
	cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
	if err != nil {
		t.Fatalf("LaunchCluster failed: %v", err)
	}
	defer func() {
		err := cluster.Close()
		if err != nil {
			t.Fatalf("cluster Close failed: %v", err)
		}
	}()

	launch.Log("E2E: Cluster running, starting tests...")

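	// Grow the control plane to all three nodes. The first node already runs
	// consensus, having bootstrapped the cluster, so only the remaining two
	// need to be promoted.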
	util.MustTestEventual(t, "Add ConsensusMember roles", ctx, smallTestTimeout, func(ctx context.Context) error {
		// Make everything but the first node into ConsensusMember.
		for i := 1; i < clusterOptions.NumNodes; i++ {
			err := cluster.MakeConsensusMember(ctx, cluster.NodeIDs[i])
			if err != nil {
				return util.Permanent(fmt.Errorf("MakeConsensusMember(%d/%s): %w", i, cluster.NodeIDs[i], err))
			}
		}
		return nil
	})
	util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
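
	// The actual cold start: shut down all nodes before starting any of them
	// back up, so recovery cannot rely on any surviving peer.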
	// Shut every node down.
	for i := 0; i < clusterOptions.NumNodes; i++ {
		if err := cluster.ShutdownNode(i); err != nil {
			t.Fatalf("Could not shut down node %d: %v", i, err)
		}
	}
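	// With every node stopped, nothing about the cluster survives in memory;
	// the nodes below must re-form the control plane purely from their
	// persistent storage.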
	// Start every node back up.
	for i := 0; i < clusterOptions.NumNodes; i++ {
		if err := cluster.StartNode(i); err != nil {
			t.Fatalf("Could not start node %d: %v", i, err)
		}
	}
	// Check that the cluster comes back up.
	util.TestEventual(t, "Heartbeat test successful after restart", ctx, 60*time.Second, cluster.AllNodesHealthy)
}