m/test/e2e: split out tests into subpackages
The end-to-end tests have grown large enough that they merit their own
test targets. To make this more Go-idiomatic, we split the tests not
just into separate Bazel targets, but also into separate Go packages.
We also add per-test resource requests for Bazel, including a new
resource kind (iops). This makes the tests more deterministic and
allows us to, e.g., use --runs_per_test=10 to deflake test logic
without hitting resource contention issues.
//metropolis/test/e2e/suites/core:core_test PASSED in 35.1s
Stats over 10 runs: max = 35.1s, min = 26.6s, avg = 31.9s, dev = 2.6s
//metropolis/test/e2e/suites/ha:ha_test PASSED in 114.6s
Stats over 10 runs: max = 114.6s, min = 90.1s, avg = 100.9s, dev = 7.6s
//metropolis/test/e2e/suites/ha_cold:ha_cold_test PASSED in 67.8s
Stats over 10 runs: max = 67.8s, min = 55.5s, avg = 62.0s, dev = 4.1s
//metropolis/test/e2e/suites/kubernetes:kubernetes_test PASSED in 80.9s
Stats over 10 runs: max = 80.9s, min = 58.8s, avg = 68.6s, dev = 6.0s
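For illustration, a suite's BUILD file can request such resources via
Bazel's test tags, roughly like this (a sketch following rules_go
conventions; the amounts below are made up, not the values from this
change):

  load("@io_bazel_rules_go//go:def.bzl", "go_test")

  go_test(
      name = "ha_test",
      size = "large",
      srcs = ["run_test.go"],
      # Ask the local test scheduler for CPU and for the custom "iops"
      # resource kind; a run only starts once both are available.
      tags = [
          "cpu:4",
          "resources:iops:100",
      ],
  )

The custom resource pool itself is declared on the Bazel command line,
e.g. via --local_extra_resources=iops=1000 (amount again illustrative).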
Change-Id: I8f31e09f599fd90c9941e2b69f36789817fa90ce
Reviewed-on: https://review.monogon.dev/c/monogon/+/3086
Reviewed-by: Jan Schär <jan@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/test/e2e/suites/ha_cold/run_test.go b/metropolis/test/e2e/suites/ha_cold/run_test.go
new file mode 100644
index 0000000..6670b8f
--- /dev/null
+++ b/metropolis/test/e2e/suites/ha_cold/run_test.go
@@ -0,0 +1,86 @@
+package ha_cold
+
+import (
+ "context"
+ "fmt"
+ "testing"
+ "time"
+
+ "source.monogon.dev/metropolis/test/launch"
+ "source.monogon.dev/metropolis/test/launch/cluster"
+ "source.monogon.dev/metropolis/test/util"
+
+ cpb "source.monogon.dev/metropolis/proto/common"
+)
+
+const (
+ // Timeout for the global test context.
+ //
+ // Bazel would eventually time out the test after 900s ("large") if, for
+ // some reason, the context cancellation fails to abort it.
+ globalTestTimeout = 600 * time.Second
+
+ // Timeouts for individual end-to-end tests of different sizes.
+ smallTestTimeout = 60 * time.Second
+ largeTestTimeout = 120 * time.Second
+)
+
+// TestE2EColdStartHA exercises an HA cluster being fully shut down and
+// then restarted.
+//
+// Metropolis currently doesn't support cold startup of TPM/Secure
+// clusters, so we test a non-TPM/Insecure cluster instead.
+func TestE2EColdStartHA(t *testing.T) {
+ // Set a global timeout to make sure this test terminates.
+ ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
+ defer cancel()
+
+ // Launch cluster.
+ clusterOptions := cluster.ClusterOptions{
+ NumNodes: 3,
+ NodeLogsToFiles: true,
+ InitialClusterConfiguration: &cpb.ClusterConfiguration{
+ TpmMode: cpb.ClusterConfiguration_TPM_MODE_DISABLED,
+ StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
+ },
+ }
+ cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
+ if err != nil {
+ t.Fatalf("LaunchCluster failed: %v", err)
+ }
+ defer func() {
+ err := cluster.Close()
+ if err != nil {
+ t.Fatalf("cluster Close failed: %v", err)
+ }
+ }()
+
+ launch.Log("E2E: Cluster running, starting tests...")
+
+ util.MustTestEventual(t, "Add ConsensusMember roles", ctx, smallTestTimeout, func(ctx context.Context) error {
+ // Make everything but the first node into ConsensusMember.
+ for i := 1; i < clusterOptions.NumNodes; i++ {
+ err := cluster.MakeConsensusMember(ctx, cluster.NodeIDs[i])
+ if err != nil {
+ return util.Permanent(fmt.Errorf("MakeConsensusMember(%d/%s): %w", i, cluster.NodeIDs[i], err))
+ }
+ }
+ return nil
+ })
+ util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
+
+ // Shut every node down.
+ for i := 0; i < clusterOptions.NumNodes; i++ {
+ if err := cluster.ShutdownNode(i); err != nil {
+ t.Fatalf("Could not shutdown node %d", i)
+ }
+ }
+ // Start every node back up.
+ for i := 0; i < clusterOptions.NumNodes; i++ {
+ if err := cluster.StartNode(i); err != nil {
+ t.Fatalf("Could not shutdown node %d", i)
+ }
+ }
+ // Check that the cluster comes back up.
+ util.TestEventual(t, "Heartbeat test successful after restart", ctx, 60*time.Second, cluster.AllNodesHealthy)
+}