m/test/e2e: split out tests into subpackages

The end-to-end tests have grown large enough that they merit their own
test targets. To make this more Go-idiomatic, we split the tests not
just into separate Bazel targets, but also into separate Go packages.

We also add per-test resource requests for Bazel, including a new
resource kind (iops). This makes the tests more deterministic and
allows us to e.g. use --runs_per_test=10 to deflake test logic without
hitting resource contention issues.
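
For illustration, a per-test resource request could look roughly like
the following. The target layout and the amounts are examples rather
than the exact BUILD contents of this change, and assume Bazel's
"resources:<name>:<amount>" test tags together with a matching
--local_extra_resources=iops=<total> flag on the test invocation:

  load("@io_bazel_rules_go//go:def.bzl", "go_test")

  # Sketch only: amounts are made up. "cpu:<n>" and
  # "resources:<name>:<amount>" tags tell Bazel how much of each local
  # resource a single run of this test reserves.
  go_test(
      name = "ha_test",
      size = "large",
      srcs = ["run_test.go"],
      tags = [
          "cpu:4",              # reserve four local CPU cores per run
          "resources:iops:50",  # reserve 50 units of the custom iops resource
      ],
  )

With tags like these, parallel runs (e.g. --runs_per_test=10) are only
scheduled while their combined requests fit into the declared local
resources, instead of all starting at once and contending for I/O.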

//metropolis/test/e2e/suites/core:core_test                              PASSED in 35.1s
  Stats over 10 runs: max = 35.1s, min = 26.6s, avg = 31.9s, dev = 2.6s
//metropolis/test/e2e/suites/ha:ha_test                                  PASSED in 114.6s
  Stats over 10 runs: max = 114.6s, min = 90.1s, avg = 100.9s, dev = 7.6s
//metropolis/test/e2e/suites/ha_cold:ha_cold_test                        PASSED in 67.8s
  Stats over 10 runs: max = 67.8s, min = 55.5s, avg = 62.0s, dev = 4.1s
//metropolis/test/e2e/suites/kubernetes:kubernetes_test                  PASSED in 80.9s
  Stats over 10 runs: max = 80.9s, min = 58.8s, avg = 68.6s, dev = 6.0s

Change-Id: I8f31e09f599fd90c9941e2b69f36789817fa90ce
Reviewed-on: https://review.monogon.dev/c/monogon/+/3086
Reviewed-by: Jan Schär <jan@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/test/e2e/suites/ha/run_test.go b/metropolis/test/e2e/suites/ha/run_test.go
new file mode 100644
index 0000000..63a2acd
--- /dev/null
+++ b/metropolis/test/e2e/suites/ha/run_test.go
@@ -0,0 +1,93 @@
+package ha
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"testing"
+	"time"
+
+	"github.com/bazelbuild/rules_go/go/runfiles"
+
+	"source.monogon.dev/metropolis/pkg/localregistry"
+	"source.monogon.dev/metropolis/test/launch"
+	"source.monogon.dev/metropolis/test/launch/cluster"
+	"source.monogon.dev/metropolis/test/util"
+)
+
+const (
+	// Timeout for the global test context.
+	//
+	// Bazel would eventually time out the test after 900s ("large") if, for
+	// some reason, the context cancellation fails to abort it.
+	globalTestTimeout = 600 * time.Second
+
+	// Timeouts for individual end-to-end tests of different sizes.
+	smallTestTimeout = 60 * time.Second
+	largeTestTimeout = 120 * time.Second
+)
+
+// TestE2ECoreHA exercises the basics of a high-availability control plane by
+// starting up a 3-node cluster, turning all nodes into ConsensusMembers, then
+// performing a rolling restart.
+func TestE2ECoreHA(t *testing.T) {
+	// Set a global timeout to make sure this terminates
+	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
+	defer cancel()
+
+	rPath, err := runfiles.Rlocation("_main/metropolis/test/e2e/testimages_manifest.prototxt")
+	if err != nil {
+		t.Fatalf("Resolving registry manifest failed: %v", err)
+	}
+	df, err := os.ReadFile(rPath)
+	if err != nil {
+		t.Fatalf("Reading registry manifest failed: %v", err)
+	}
+	lr, err := localregistry.FromBazelManifest(df)
+	if err != nil {
+		t.Fatalf("Creating test image registry failed: %v", err)
+	}
+	// Launch cluster.
+	clusterOptions := cluster.ClusterOptions{
+		NumNodes:        3,
+		LocalRegistry:   lr,
+		NodeLogsToFiles: true,
+	}
+	cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
+	if err != nil {
+		t.Fatalf("LaunchCluster failed: %v", err)
+	}
+	defer func() {
+		err := cluster.Close()
+		if err != nil {
+			t.Fatalf("cluster Close failed: %v", err)
+		}
+	}()
+
+	launch.Log("E2E: Cluster running, starting tests...")
+
+	util.MustTestEventual(t, "Add ConsensusMember roles", ctx, smallTestTimeout, func(ctx context.Context) error {
+		// Make every node but the first into a ConsensusMember.
+		for i := 1; i < clusterOptions.NumNodes; i++ {
+			err := cluster.MakeConsensusMember(ctx, cluster.NodeIDs[i])
+			if err != nil {
+				return fmt.Errorf("MakeConsensusMember(%d/%s): %w", i, cluster.NodeIDs[i], err)
+			}
+		}
+		return nil
+	})
+	util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
+
+	// Perform a rolling restart of all nodes. A rebooted node must be able to
+	// contact the cluster in order to rejoin it, so this also exercises that the
+	// cluster keeps serving while each node is down.
+	for i := 0; i < clusterOptions.NumNodes; i++ {
+		util.MustTestEventual(t, fmt.Sprintf("Node %d rejoin successful", i), ctx, 60*time.Second, func(ctx context.Context) error {
+			// Ensure the node rejoins the cluster after a reboot by rebooting node i.
+			if err := cluster.RebootNode(ctx, i); err != nil {
+				return fmt.Errorf("while rebooting a node: %w", err)
+			}
+			return nil
+		})
+	}
+}