m/test/e2e: add TestE2EColdStartHA

This exercises full cluster shutdown and restart.

Change-Id: I546a46c7c8d34da23466b8b959076135c503b077
Reviewed-on: https://review.monogon.dev/c/monogon/+/2943
Tested-by: Jenkins CI
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
diff --git a/metropolis/test/e2e/BUILD.bazel b/metropolis/test/e2e/BUILD.bazel
index 812bc93..ca65e19 100644
--- a/metropolis/test/e2e/BUILD.bazel
+++ b/metropolis/test/e2e/BUILD.bazel
@@ -43,6 +43,7 @@
         "//metropolis/node/core/rpc",
         "//metropolis/pkg/localregistry",
         "//metropolis/proto/api",
+        "//metropolis/proto/common",
         "//metropolis/test/launch",
         "//metropolis/test/launch/cluster",
         "//metropolis/test/util",
diff --git a/metropolis/test/e2e/main_test.go b/metropolis/test/e2e/main_test.go
index e0c347f..306d29e 100644
--- a/metropolis/test/e2e/main_test.go
+++ b/metropolis/test/e2e/main_test.go
@@ -41,6 +41,7 @@
 	podv1 "k8s.io/kubernetes/pkg/api/v1/pod"
 
 	apb "source.monogon.dev/metropolis/proto/api"
+	cpb "source.monogon.dev/metropolis/proto/common"
 
 	common "source.monogon.dev/metropolis/node"
 	"source.monogon.dev/metropolis/node/core/rpc"
@@ -254,6 +255,66 @@
 	}
 }
 
+// TestE2EColdStartHA exercises an HA cluster being fully shut down then
+// restarted again.
+//
+// Metropolis currently doesn't support cold startups from TPM/Secure clusters,
+// so we test a non-TPM/Insecure cluster.
+func TestE2EColdStartHA(t *testing.T) {
+	// Set a global timeout to make sure this terminates.
+	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
+	defer cancel()
+
+	// Launch a three-node cluster with TPM disabled and insecure storage.
+	clusterOptions := cluster.ClusterOptions{
+		NumNodes:        3,
+		NodeLogsToFiles: true,
+		InitialClusterConfiguration: &cpb.ClusterConfiguration{
+			TpmMode:               cpb.ClusterConfiguration_TPM_MODE_DISABLED,
+			StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
+		},
+	}
+	cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
+	if err != nil {
+		t.Fatalf("LaunchCluster failed: %v", err)
+	}
+	defer func() {
+		err := cluster.Close()
+		if err != nil {
+			t.Fatalf("cluster Close failed: %v", err)
+		}
+	}()
+
+	launch.Log("E2E: Cluster running, starting tests...")
+
+	util.MustTestEventual(t, "Add ConsensusMember roles", ctx, smallTestTimeout, func(ctx context.Context) error {
+		// Make everything but the first node into ConsensusMember.
+		for i := 1; i < clusterOptions.NumNodes; i++ {
+			err := cluster.MakeConsensusMember(ctx, cluster.NodeIDs[i])
+			if err != nil {
+				return util.Permanent(fmt.Errorf("MakeConsensusMember(%d/%s): %w", i, cluster.NodeIDs[i], err))
+			}
+		}
+		return nil
+	})
+	util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
+
+	// Shut every node down, reporting the underlying error on failure.
+	for i := 0; i < clusterOptions.NumNodes; i++ {
+		if err := cluster.ShutdownNode(i); err != nil {
+			t.Fatalf("Could not shutdown node %d: %v", i, err)
+		}
+	}
+	// Start every node back up, reporting the underlying error on failure.
+	for i := 0; i < clusterOptions.NumNodes; i++ {
+		if err := cluster.StartNode(i); err != nil {
+			t.Fatalf("Could not start node %d: %v", i, err)
+		}
+	}
+	// Check if the cluster comes back up.
+	util.TestEventual(t, "Heartbeat test successful", ctx, 60*time.Second, cluster.AllNodesHealthy)
+}
+
 // TestE2EKubernetes exercises the Kubernetes functionality of Metropolis.
 //
 // The tests are performed against an in-memory cluster.