m/test/e2e: split out tests into subpackages

The end-to-end tests have grown large enough that they merit their own
test targets. To make this more Go-idiomatic, we split the tests not
just into separate Bazel targets, but also into separate Go packages.

We also add per-test resource requests for Bazel, including a new
resource kind (iops). This makes the tests more deterministic and allows
us to e.g. use --runs_per_test=10 to deflake tests without hitting
resource contention issues.
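
For example, the HA suite now declares its requests as Bazel tags (see
the BUILD files below):

    tags = [
        "resources:iops:5000",
        "resources:cpu:3",
        # 3x2048 for nodes plus some extra.
        "resources:ram:7000",
    ],

and can then be deflaked without oversubscribing the machine via e.g.:

    bazel test //metropolis/test/e2e/suites/ha:ha_test --runs_per_test=10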

//metropolis/test/e2e/suites/core:core_test                              PASSED in 35.1s
  Stats over 10 runs: max = 35.1s, min = 26.6s, avg = 31.9s, dev = 2.6s
//metropolis/test/e2e/suites/ha:ha_test                                  PASSED in 114.6s
  Stats over 10 runs: max = 114.6s, min = 90.1s, avg = 100.9s, dev = 7.6s
//metropolis/test/e2e/suites/ha_cold:ha_cold_test                        PASSED in 67.8s
  Stats over 10 runs: max = 67.8s, min = 55.5s, avg = 62.0s, dev = 4.1s
//metropolis/test/e2e/suites/kubernetes:kubernetes_test                  PASSED in 80.9s
  Stats over 10 runs: max = 80.9s, min = 58.8s, avg = 68.6s, dev = 6.0s

Change-Id: I8f31e09f599fd90c9941e2b69f36789817fa90ce
Reviewed-on: https://review.monogon.dev/c/monogon/+/3086
Reviewed-by: Jan Schär <jan@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/test/e2e/BUILD.bazel b/metropolis/test/e2e/BUILD.bazel
index ca65e19..cb8645b 100644
--- a/metropolis/test/e2e/BUILD.bazel
+++ b/metropolis/test/e2e/BUILD.bazel
@@ -1,22 +1,5 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
 load("//metropolis/pkg/localregistry:def.bzl", "localregistry_manifest")
 
-go_library(
-    name = "e2e",
-    srcs = ["kubernetes_helpers.go"],
-    importpath = "source.monogon.dev/metropolis/test/e2e",
-    visibility = ["//metropolis/test:__subpackages__"],
-    deps = [
-        "@io_k8s_api//apps/v1:apps",
-        "@io_k8s_api//batch/v1:batch",
-        "@io_k8s_api//core/v1:core",
-        "@io_k8s_apimachinery//pkg/api/resource",
-        "@io_k8s_apimachinery//pkg/apis/meta/v1:meta",
-        "@io_k8s_apimachinery//pkg/util/intstr",
-        "@io_k8s_client_go//kubernetes",
-    ],
-)
-
 localregistry_manifest(
     name = "testimages_manifest",
     images = [
@@ -24,35 +7,7 @@
         "//metropolis/test/e2e/httpserver:httpserver_image",
         "//metropolis/vm/smoketest:smoketest_image",
     ],
-)
-
-go_test(
-    name = "e2e_test",
-    size = "large",
-    srcs = ["main_test.go"],
-    data = [
-        ":testimages_manifest",
-        "//metropolis/node:image",
-        "//metropolis/node:swtpm_data",
-        "//third_party/edk2:firmware",
-    ],
-    embed = [":e2e"],
-    rundir = ".",
-    deps = [
-        "//metropolis/node",
-        "//metropolis/node/core/rpc",
-        "//metropolis/pkg/localregistry",
-        "//metropolis/proto/api",
-        "//metropolis/proto/common",
-        "//metropolis/test/launch",
-        "//metropolis/test/launch/cluster",
-        "//metropolis/test/util",
-        "@io_bazel_rules_go//go/runfiles:go_default_library",
-        "@io_k8s_api//core/v1:core",
-        "@io_k8s_apimachinery//pkg/api/errors",
-        "@io_k8s_apimachinery//pkg/api/resource",
-        "@io_k8s_apimachinery//pkg/apis/meta/v1:meta",
-        "@io_k8s_kubernetes//pkg/api/v1/pod",
-        "@org_golang_google_grpc//:go_default_library",
+    visibility = [
+        "//metropolis/test/e2e/suites:__subpackages__",
     ],
 )
diff --git a/metropolis/test/e2e/main_test.go b/metropolis/test/e2e/main_test.go
deleted file mode 100644
index 306d29e..0000000
--- a/metropolis/test/e2e/main_test.go
+++ /dev/null
@@ -1,641 +0,0 @@
-// Copyright 2020 The Monogon Project Authors.
-//
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package e2e
-
-import (
-	"context"
-	"crypto/tls"
-	"crypto/x509"
-	"errors"
-	"fmt"
-	"io"
-	"net"
-	"net/http"
-	_ "net/http/pprof"
-	"net/url"
-	"os"
-	"strings"
-	"testing"
-	"time"
-
-	"github.com/bazelbuild/rules_go/go/runfiles"
-	"google.golang.org/grpc"
-	corev1 "k8s.io/api/core/v1"
-	kerrors "k8s.io/apimachinery/pkg/api/errors"
-	"k8s.io/apimachinery/pkg/api/resource"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	podv1 "k8s.io/kubernetes/pkg/api/v1/pod"
-
-	apb "source.monogon.dev/metropolis/proto/api"
-	cpb "source.monogon.dev/metropolis/proto/common"
-
-	common "source.monogon.dev/metropolis/node"
-	"source.monogon.dev/metropolis/node/core/rpc"
-	"source.monogon.dev/metropolis/pkg/localregistry"
-	"source.monogon.dev/metropolis/test/launch"
-	"source.monogon.dev/metropolis/test/launch/cluster"
-	"source.monogon.dev/metropolis/test/util"
-)
-
-const (
-	// Timeout for the global test context.
-	//
-	// Bazel would eventually time out the test after 900s ("large") if, for
-	// some reason, the context cancellation fails to abort it.
-	globalTestTimeout = 600 * time.Second
-
-	// Timeouts for individual end-to-end tests of different sizes.
-	smallTestTimeout = 60 * time.Second
-	largeTestTimeout = 120 * time.Second
-)
-
-// TestE2ECore exercisees the core functionality of Metropolis: maintaining a
-// control plane, changing node roles, ...
-//
-// The tests are performed against an in-memory cluster.
-func TestE2ECore(t *testing.T) {
-	// Set a global timeout to make sure this terminates
-	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
-	defer cancel()
-
-	rPath, err := runfiles.Rlocation("_main/metropolis/test/e2e/testimages_manifest.prototxt")
-	if err != nil {
-		t.Fatalf("Resolving registry manifest failed: %v", err)
-	}
-	df, err := os.ReadFile(rPath)
-	if err != nil {
-		t.Fatalf("Reading registry manifest failed: %v", err)
-	}
-	lr, err := localregistry.FromBazelManifest(df)
-	if err != nil {
-		t.Fatalf("Creating test image registry failed: %v", err)
-	}
-	// Launch cluster.
-	clusterOptions := cluster.ClusterOptions{
-		NumNodes:      2,
-		LocalRegistry: lr,
-	}
-	cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
-	if err != nil {
-		t.Fatalf("LaunchCluster failed: %v", err)
-	}
-	defer func() {
-		err := cluster.Close()
-		if err != nil {
-			t.Fatalf("cluster Close failed: %v", err)
-		}
-	}()
-
-	launch.Log("E2E: Cluster running, starting tests...")
-
-	// Dial first node's curator.
-	creds := rpc.NewAuthenticatedCredentials(cluster.Owner, rpc.WantInsecure())
-	remote := net.JoinHostPort(cluster.NodeIDs[0], common.CuratorServicePort.PortString())
-	cl, err := grpc.Dial(remote, grpc.WithContextDialer(cluster.DialNode), grpc.WithTransportCredentials(creds))
-	if err != nil {
-		t.Fatalf("failed to dial first node's curator: %v", err)
-	}
-	defer cl.Close()
-	mgmt := apb.NewManagementClient(cl)
-
-	util.TestEventual(t, "Retrieving cluster directory sucessful", ctx, 60*time.Second, func(ctx context.Context) error {
-		res, err := mgmt.GetClusterInfo(ctx, &apb.GetClusterInfoRequest{})
-		if err != nil {
-			return fmt.Errorf("GetClusterInfo: %w", err)
-		}
-
-		// Ensure that the expected node count is present.
-		nodes := res.ClusterDirectory.Nodes
-		if want, got := clusterOptions.NumNodes, len(nodes); want != got {
-			return fmt.Errorf("wanted %d nodes in cluster directory, got %d", want, got)
-		}
-
-		// Ensure the nodes have the expected addresses.
-		addresses := make(map[string]bool)
-		for _, n := range nodes {
-			if len(n.Addresses) != 1 {
-				return fmt.Errorf("node %s has no addresss", n.Id)
-			}
-			address := n.Addresses[0].Host
-			addresses[address] = true
-		}
-
-		for _, address := range []string{"10.1.0.2", "10.1.0.3"} {
-			if !addresses[address] {
-				return fmt.Errorf("address %q not found in directory", address)
-			}
-		}
-		return nil
-	})
-	util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
-	util.TestEventual(t, "Node rejoin successful", ctx, 60*time.Second, func(ctx context.Context) error {
-		// Ensure nodes rejoin the cluster after a reboot by reboting the 1st node.
-		if err := cluster.RebootNode(ctx, 1); err != nil {
-			return fmt.Errorf("while rebooting a node: %w", err)
-		}
-		return nil
-	})
-	util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
-	util.TestEventual(t, "Prometheus node metrics retrieved", ctx, smallTestTimeout, func(ctx context.Context) error {
-		pool := x509.NewCertPool()
-		pool.AddCert(cluster.CACertificate)
-		cl := http.Client{
-			Transport: &http.Transport{
-				TLSClientConfig: &tls.Config{
-					Certificates: []tls.Certificate{cluster.Owner},
-					RootCAs:      pool,
-				},
-				DialContext: func(ctx context.Context, _, addr string) (net.Conn, error) {
-					return cluster.DialNode(ctx, addr)
-				},
-			},
-		}
-		u := url.URL{
-			Scheme: "https",
-			Host:   net.JoinHostPort(cluster.NodeIDs[0], common.MetricsPort.PortString()),
-			Path:   "/metrics/node",
-		}
-		res, err := cl.Get(u.String())
-		if err != nil {
-			return err
-		}
-		defer res.Body.Close()
-		if res.StatusCode != 200 {
-			return fmt.Errorf("status code %d", res.StatusCode)
-		}
-
-		body, err := io.ReadAll(res.Body)
-		if err != nil {
-			return err
-		}
-		needle := "node_uname_info"
-		if !strings.Contains(string(body), needle) {
-			return util.Permanent(fmt.Errorf("could not find %q in returned response", needle))
-		}
-		return nil
-	})
-}
-
-// TestE2ECoreHA exercises the basics of a high-availability control plane by
-// starting up a 3-node cluster, turning all nodes into ConsensusMembers, then
-// performing a rolling restart.
-func TestE2ECoreHA(t *testing.T) {
-	// Set a global timeout to make sure this terminates
-	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
-	defer cancel()
-
-	rPath, err := runfiles.Rlocation("_main/metropolis/test/e2e/testimages_manifest.prototxt")
-	if err != nil {
-		t.Fatalf("Resolving registry manifest failed: %v", err)
-	}
-	df, err := os.ReadFile(rPath)
-	if err != nil {
-		t.Fatalf("Reading registry manifest failed: %v", err)
-	}
-	lr, err := localregistry.FromBazelManifest(df)
-	if err != nil {
-		t.Fatalf("Creating test image registry failed: %v", err)
-	}
-	// Launch cluster.
-	clusterOptions := cluster.ClusterOptions{
-		NumNodes:        3,
-		LocalRegistry:   lr,
-		NodeLogsToFiles: true,
-	}
-	cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
-	if err != nil {
-		t.Fatalf("LaunchCluster failed: %v", err)
-	}
-	defer func() {
-		err := cluster.Close()
-		if err != nil {
-			t.Fatalf("cluster Close failed: %v", err)
-		}
-	}()
-
-	launch.Log("E2E: Cluster running, starting tests...")
-
-	util.MustTestEventual(t, "Add ConsensusMember roles", ctx, smallTestTimeout, func(ctx context.Context) error {
-		// Make everything but the first node into ConsensusMember.
-		for i := 1; i < clusterOptions.NumNodes; i++ {
-			err := cluster.MakeConsensusMember(ctx, cluster.NodeIDs[i])
-			if err != nil {
-				return fmt.Errorf("MakeConsensusMember(%d/%s): %w", i, cluster.NodeIDs[i], err)
-			}
-		}
-		return nil
-	})
-	util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
-
-	// Perform rolling restart of all nodes. When a node rejoins it must be able to
-	// contact the cluster, so this also exercises that the cluster is serving even
-	// with the node having rebooted.
-	for i := 0; i < clusterOptions.NumNodes; i++ {
-		util.MustTestEventual(t, fmt.Sprintf("Node %d rejoin successful", i), ctx, 60*time.Second, func(ctx context.Context) error {
-			// Ensure nodes rejoin the cluster after a reboot by reboting the 1st node.
-			if err := cluster.RebootNode(ctx, i); err != nil {
-				return fmt.Errorf("while rebooting a node: %w", err)
-			}
-			return nil
-		})
-	}
-}
-
-// TestE2EColdStartHA exercises an HA cluster being fully shut down then
-// restarted again.
-//
-// Metropolis currently doesn't support cold startups from TPM/Secure clusters,
-// so we test a non-TPM/Insecure cluster.
-func TestE2EColdStartHA(t *testing.T) {
-	// Set a global timeout to make sure this terminates
-	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
-	defer cancel()
-
-	// Launch cluster.
-	clusterOptions := cluster.ClusterOptions{
-		NumNodes:        3,
-		NodeLogsToFiles: true,
-		InitialClusterConfiguration: &cpb.ClusterConfiguration{
-			TpmMode:               cpb.ClusterConfiguration_TPM_MODE_DISABLED,
-			StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
-		},
-	}
-	cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
-	if err != nil {
-		t.Fatalf("LaunchCluster failed: %v", err)
-	}
-	defer func() {
-		err := cluster.Close()
-		if err != nil {
-			t.Fatalf("cluster Close failed: %v", err)
-		}
-	}()
-
-	launch.Log("E2E: Cluster running, starting tests...")
-
-	util.MustTestEventual(t, "Add ConsensusMember roles", ctx, smallTestTimeout, func(ctx context.Context) error {
-		// Make everything but the first node into ConsensusMember.
-		for i := 1; i < clusterOptions.NumNodes; i++ {
-			err := cluster.MakeConsensusMember(ctx, cluster.NodeIDs[i])
-			if err != nil {
-				return util.Permanent(fmt.Errorf("MakeConsensusMember(%d/%s): %w", i, cluster.NodeIDs[i], err))
-			}
-		}
-		return nil
-	})
-	util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
-
-	// Shut every node down.
-	for i := 0; i < clusterOptions.NumNodes; i++ {
-		if err := cluster.ShutdownNode(i); err != nil {
-			t.Fatalf("Could not shutdown node %d", i)
-		}
-	}
-	// Start every node back up.
-	for i := 0; i < clusterOptions.NumNodes; i++ {
-		if err := cluster.StartNode(i); err != nil {
-			t.Fatalf("Could not shutdown node %d", i)
-		}
-	}
-	// Check if the cluster comes back up.
-	util.TestEventual(t, "Heartbeat test successful", ctx, 60*time.Second, cluster.AllNodesHealthy)
-}
-
-// TestE2EKubernetes exercises the Kubernetes functionality of Metropolis.
-//
-// The tests are performed against an in-memory cluster.
-func TestE2EKubernetes(t *testing.T) {
-	// Set a global timeout to make sure this terminates
-	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
-	defer cancel()
-
-	rPath, err := runfiles.Rlocation("_main/metropolis/test/e2e/testimages_manifest.prototxt")
-	if err != nil {
-		t.Fatalf("Resolving registry manifest failed: %v", err)
-	}
-	df, err := os.ReadFile(rPath)
-	if err != nil {
-		t.Fatalf("Reading registry manifest failed: %v", err)
-	}
-	lr, err := localregistry.FromBazelManifest(df)
-	if err != nil {
-		t.Fatalf("Creating test image registry failed: %v", err)
-	}
-
-	// Launch cluster.
-	clusterOptions := cluster.ClusterOptions{
-		NumNodes:      2,
-		LocalRegistry: lr,
-	}
-	cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
-	if err != nil {
-		t.Fatalf("LaunchCluster failed: %v", err)
-	}
-	defer func() {
-		err := cluster.Close()
-		if err != nil {
-			t.Fatalf("cluster Close failed: %v", err)
-		}
-	}()
-
-	clientSet, err := cluster.GetKubeClientSet()
-	if err != nil {
-		t.Fatal(err)
-	}
-	util.TestEventual(t, "Add KubernetesWorker roles", ctx, smallTestTimeout, func(ctx context.Context) error {
-		// Make everything but the first node into KubernetesWorkers.
-		for i := 1; i < clusterOptions.NumNodes; i++ {
-			err := cluster.MakeKubernetesWorker(ctx, cluster.NodeIDs[i])
-			if err != nil {
-				return util.Permanent(fmt.Errorf("MakeKubernetesWorker: %w", err))
-			}
-		}
-		return nil
-	})
-	util.TestEventual(t, "Node is registered and ready", ctx, largeTestTimeout, func(ctx context.Context) error {
-		nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
-		if err != nil {
-			return err
-		}
-		if len(nodes.Items) < 1 {
-			return errors.New("node not yet registered")
-		}
-		node := nodes.Items[0]
-		for _, cond := range node.Status.Conditions {
-			if cond.Type != corev1.NodeReady {
-				continue
-			}
-			if cond.Status != corev1.ConditionTrue {
-				return fmt.Errorf("node not ready: %v", cond.Message)
-			}
-		}
-		return nil
-	})
-	util.TestEventual(t, "Simple deployment", ctx, largeTestTimeout, func(ctx context.Context) error {
-		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeTestDeploymentSpec("test-deploy-1"), metav1.CreateOptions{})
-		return err
-	})
-	util.TestEventual(t, "Simple deployment is running", ctx, largeTestTimeout, func(ctx context.Context) error {
-		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-1"})
-		if err != nil {
-			return err
-		}
-		if len(res.Items) == 0 {
-			return errors.New("pod didn't get created")
-		}
-		pod := res.Items[0]
-		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
-			return nil
-		}
-		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
-		if err != nil || len(events.Items) == 0 {
-			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
-		} else {
-			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
-		}
-	})
-	util.TestEventual(t, "Simple deployment with gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
-		deployment := makeTestDeploymentSpec("test-deploy-2")
-		gvisorStr := "gvisor"
-		deployment.Spec.Template.Spec.RuntimeClassName = &gvisorStr
-		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, deployment, metav1.CreateOptions{})
-		return err
-	})
-	util.TestEventual(t, "Simple deployment is running on gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
-		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-2"})
-		if err != nil {
-			return err
-		}
-		if len(res.Items) == 0 {
-			return errors.New("pod didn't get created")
-		}
-		pod := res.Items[0]
-		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
-			return nil
-		}
-		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
-		if err != nil || len(events.Items) == 0 {
-			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
-		} else {
-			var errorMsg strings.Builder
-			for _, msg := range events.Items {
-				errorMsg.WriteString(" | ")
-				errorMsg.WriteString(msg.Message)
-			}
-			return fmt.Errorf("pod is not ready: %v", errorMsg.String())
-		}
-	})
-	util.TestEventual(t, "Simple StatefulSet with PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
-		_, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-1", corev1.PersistentVolumeFilesystem), metav1.CreateOptions{})
-		return err
-	})
-	util.TestEventual(t, "Simple StatefulSet with PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
-		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-1"})
-		if err != nil {
-			return err
-		}
-		if len(res.Items) == 0 {
-			return errors.New("pod didn't get created")
-		}
-		pod := res.Items[0]
-		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
-			return nil
-		}
-		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
-		if err != nil || len(events.Items) == 0 {
-			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
-		} else {
-			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
-		}
-	})
-	util.TestEventual(t, "Simple StatefulSet with Block PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
-		_, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-2", corev1.PersistentVolumeBlock), metav1.CreateOptions{})
-		return err
-	})
-	util.TestEventual(t, "Simple StatefulSet with Block PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
-		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-2"})
-		if err != nil {
-			return err
-		}
-		if len(res.Items) == 0 {
-			return errors.New("pod didn't get created")
-		}
-		pod := res.Items[0]
-		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
-			return nil
-		}
-		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
-		if err != nil || len(events.Items) == 0 {
-			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
-		} else {
-			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
-		}
-	})
-	util.TestEventual(t, "In-cluster self-test job", ctx, smallTestTimeout, func(ctx context.Context) error {
-		_, err := clientSet.BatchV1().Jobs("default").Create(ctx, makeSelftestSpec("selftest"), metav1.CreateOptions{})
-		return err
-	})
-	util.TestEventual(t, "In-cluster self-test job passed", ctx, smallTestTimeout, func(ctx context.Context) error {
-		res, err := clientSet.BatchV1().Jobs("default").Get(ctx, "selftest", metav1.GetOptions{})
-		if err != nil {
-			return err
-		}
-		if res.Status.Failed > 0 {
-			pods, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{
-				LabelSelector: "job-name=selftest",
-			})
-			if err != nil {
-				return util.Permanent(fmt.Errorf("job failed but failed to find pod: %w", err))
-			}
-			if len(pods.Items) < 1 {
-				return fmt.Errorf("job failed but pod does not exist")
-			}
-			lines, err := getPodLogLines(ctx, clientSet, pods.Items[0].Name, 1)
-			if err != nil {
-				return fmt.Errorf("job failed but could not get logs: %w", err)
-			}
-			if len(lines) > 0 {
-				return util.Permanent(fmt.Errorf("job failed, last log line: %s", lines[0]))
-			}
-			return util.Permanent(fmt.Errorf("job failed, empty log"))
-		}
-		if res.Status.Succeeded > 0 {
-			return nil
-		}
-		return fmt.Errorf("job still running")
-	})
-	util.TestEventual(t, "Start NodePort test setup", ctx, smallTestTimeout, func(ctx context.Context) error {
-		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeHTTPServerDeploymentSpec("nodeport-server"), metav1.CreateOptions{})
-		if err != nil && !kerrors.IsAlreadyExists(err) {
-			return err
-		}
-		_, err = clientSet.CoreV1().Services("default").Create(ctx, makeHTTPServerNodePortService("nodeport-server"), metav1.CreateOptions{})
-		if err != nil && !kerrors.IsAlreadyExists(err) {
-			return err
-		}
-		return nil
-	})
-	util.TestEventual(t, "NodePort accessible from all nodes", ctx, smallTestTimeout, func(ctx context.Context) error {
-		nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
-		if err != nil {
-			return err
-		}
-		// Use a new client for each attempt
-		hc := http.Client{
-			Timeout: 2 * time.Second,
-			Transport: &http.Transport{
-				Dial: cluster.SOCKSDialer.Dial,
-			},
-		}
-		for _, n := range nodes.Items {
-			var addr string
-			for _, a := range n.Status.Addresses {
-				if a.Type == corev1.NodeInternalIP {
-					addr = a.Address
-				}
-			}
-			u := url.URL{Scheme: "http", Host: addr, Path: "/"}
-			res, err := hc.Get(u.String())
-			if err != nil {
-				return fmt.Errorf("failed getting from node %q: %w", n.Name, err)
-			}
-			if res.StatusCode != http.StatusOK {
-				return fmt.Errorf("getting from node %q: HTTP %d", n.Name, res.StatusCode)
-			}
-			t.Logf("Got response from %q", n.Name)
-		}
-		return nil
-	})
-	util.TestEventual(t, "containerd metrics retrieved", ctx, smallTestTimeout, func(ctx context.Context) error {
-		pool := x509.NewCertPool()
-		pool.AddCert(cluster.CACertificate)
-		cl := http.Client{
-			Transport: &http.Transport{
-				TLSClientConfig: &tls.Config{
-					Certificates: []tls.Certificate{cluster.Owner},
-					RootCAs:      pool,
-				},
-				DialContext: func(ctx context.Context, _, addr string) (net.Conn, error) {
-					return cluster.DialNode(ctx, addr)
-				},
-			},
-		}
-		u := url.URL{
-			Scheme: "https",
-			Host:   net.JoinHostPort(cluster.NodeIDs[1], common.MetricsPort.PortString()),
-			Path:   "/metrics/containerd",
-		}
-		res, err := cl.Get(u.String())
-		if err != nil {
-			return err
-		}
-		defer res.Body.Close()
-		if res.StatusCode != 200 {
-			return fmt.Errorf("status code %d", res.StatusCode)
-		}
-
-		body, err := io.ReadAll(res.Body)
-		if err != nil {
-			return err
-		}
-		needle := "containerd_build_info_total"
-		if !strings.Contains(string(body), needle) {
-			return util.Permanent(fmt.Errorf("could not find %q in returned response", needle))
-		}
-		return nil
-	})
-	if os.Getenv("HAVE_NESTED_KVM") != "" {
-		util.TestEventual(t, "Pod for KVM/QEMU smoke test", ctx, smallTestTimeout, func(ctx context.Context) error {
-			runcRuntimeClass := "runc"
-			_, err := clientSet.CoreV1().Pods("default").Create(ctx, &corev1.Pod{
-				ObjectMeta: metav1.ObjectMeta{
-					Name: "vm-smoketest",
-				},
-				Spec: corev1.PodSpec{
-					Containers: []corev1.Container{{
-						Name:            "vm-smoketest",
-						ImagePullPolicy: corev1.PullNever,
-						Image:           "test.monogon.internal/metropolis/vm/smoketest:smoketest_container",
-						Resources: corev1.ResourceRequirements{
-							Limits: corev1.ResourceList{
-								"devices.monogon.dev/kvm": *resource.NewQuantity(1, ""),
-							},
-						},
-					}},
-					RuntimeClassName: &runcRuntimeClass,
-					RestartPolicy:    corev1.RestartPolicyNever,
-				},
-			}, metav1.CreateOptions{})
-			return err
-		})
-		util.TestEventual(t, "KVM/QEMU smoke test completion", ctx, smallTestTimeout, func(ctx context.Context) error {
-			pod, err := clientSet.CoreV1().Pods("default").Get(ctx, "vm-smoketest", metav1.GetOptions{})
-			if err != nil {
-				return fmt.Errorf("failed to get pod: %w", err)
-			}
-			if pod.Status.Phase == corev1.PodSucceeded {
-				return nil
-			}
-			events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
-			if err != nil || len(events.Items) == 0 {
-				return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
-			} else {
-				return fmt.Errorf("pod is not ready: %v", events.Items[len(events.Items)-1].Message)
-			}
-		})
-	}
-}
diff --git a/metropolis/test/e2e/suites/core/BUILD.bazel b/metropolis/test/e2e/suites/core/BUILD.bazel
new file mode 100644
index 0000000..6cf7765
--- /dev/null
+++ b/metropolis/test/e2e/suites/core/BUILD.bazel
@@ -0,0 +1,30 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
+go_test(
+    name = "core_test",
+    srcs = ["run_test.go"],
+    data = [
+        "//metropolis/node:image",
+        "//metropolis/node:swtpm_data",
+        "//metropolis/test/e2e:testimages_manifest",
+        "//third_party/edk2:firmware",
+    ],
+    tags = [
+        "resources:iops:5000",
+        "resources:cpu:3",
+        # 2x2048 for nodes plus some extra.
+        "resources:ram:4500",
+    ],
+    deps = [
+        "//metropolis/node",
+        "//metropolis/node/core/rpc",
+        "//metropolis/pkg/localregistry",
+        "//metropolis/proto/api",
+        "//metropolis/proto/common",
+        "//metropolis/test/launch",
+        "//metropolis/test/launch/cluster",
+        "//metropolis/test/util",
+        "@io_bazel_rules_go//go/runfiles:go_default_library",
+        "@org_golang_google_grpc//:go_default_library",
+    ],
+)
diff --git a/metropolis/test/e2e/suites/core/run_test.go b/metropolis/test/e2e/suites/core/run_test.go
new file mode 100644
index 0000000..8bbbf52
--- /dev/null
+++ b/metropolis/test/e2e/suites/core/run_test.go
@@ -0,0 +1,172 @@
+package test_core
+
+import (
+	"context"
+	"crypto/tls"
+	"crypto/x509"
+	"fmt"
+	"io"
+	"net"
+	"net/http"
+	"net/url"
+	"os"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/bazelbuild/rules_go/go/runfiles"
+	"google.golang.org/grpc"
+
+	common "source.monogon.dev/metropolis/node"
+	"source.monogon.dev/metropolis/node/core/rpc"
+	"source.monogon.dev/metropolis/pkg/localregistry"
+	"source.monogon.dev/metropolis/test/launch"
+	"source.monogon.dev/metropolis/test/launch/cluster"
+	"source.monogon.dev/metropolis/test/util"
+
+	apb "source.monogon.dev/metropolis/proto/api"
+	cpb "source.monogon.dev/metropolis/proto/common"
+)
+
+const (
+	// Timeout for the global test context.
+	//
+	// Bazel would eventually time out the test after 900s ("large") if, for
+	// some reason, the context cancellation fails to abort it.
+	globalTestTimeout = 600 * time.Second
+
+	// Timeouts for individual end-to-end tests of different sizes.
+	smallTestTimeout = 60 * time.Second
+	largeTestTimeout = 120 * time.Second
+)
+
+// TestE2ECore exercises the core functionality of Metropolis: maintaining a
+// control plane, changing node roles, ...
+//
+// The tests are performed against an in-memory cluster.
+func TestE2ECore(t *testing.T) {
+	// Set a global timeout to make sure this terminates
+	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
+	defer cancel()
+
+	rPath, err := runfiles.Rlocation("_main/metropolis/test/e2e/testimages_manifest.prototxt")
+	if err != nil {
+		t.Fatalf("Resolving registry manifest failed: %v", err)
+	}
+	df, err := os.ReadFile(rPath)
+	if err != nil {
+		t.Fatalf("Reading registry manifest failed: %v", err)
+	}
+	lr, err := localregistry.FromBazelManifest(df)
+	if err != nil {
+		t.Fatalf("Creating test image registry failed: %v", err)
+	}
+	// Launch cluster.
+	clusterOptions := cluster.ClusterOptions{
+		NumNodes:      2,
+		LocalRegistry: lr,
+		InitialClusterConfiguration: &cpb.ClusterConfiguration{
+			TpmMode:               cpb.ClusterConfiguration_TPM_MODE_DISABLED,
+			StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
+		},
+	}
+	cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
+	if err != nil {
+		t.Fatalf("LaunchCluster failed: %v", err)
+	}
+	defer func() {
+		err := cluster.Close()
+		if err != nil {
+			t.Fatalf("cluster Close failed: %v", err)
+		}
+	}()
+
+	launch.Log("E2E: Cluster running, starting tests...")
+
+	// Dial first node's curator.
+	creds := rpc.NewAuthenticatedCredentials(cluster.Owner, rpc.WantInsecure())
+	remote := net.JoinHostPort(cluster.NodeIDs[0], common.CuratorServicePort.PortString())
+	cl, err := grpc.Dial(remote, grpc.WithContextDialer(cluster.DialNode), grpc.WithTransportCredentials(creds))
+	if err != nil {
+		t.Fatalf("failed to dial first node's curator: %v", err)
+	}
+	defer cl.Close()
+	mgmt := apb.NewManagementClient(cl)
+
+	util.TestEventual(t, "Retrieving cluster directory sucessful", ctx, 60*time.Second, func(ctx context.Context) error {
+		res, err := mgmt.GetClusterInfo(ctx, &apb.GetClusterInfoRequest{})
+		if err != nil {
+			return fmt.Errorf("GetClusterInfo: %w", err)
+		}
+
+		// Ensure that the expected node count is present.
+		nodes := res.ClusterDirectory.Nodes
+		if want, got := clusterOptions.NumNodes, len(nodes); want != got {
+			return fmt.Errorf("wanted %d nodes in cluster directory, got %d", want, got)
+		}
+
+		// Ensure the nodes have the expected addresses.
+		addresses := make(map[string]bool)
+		for _, n := range nodes {
+			if len(n.Addresses) != 1 {
+				return fmt.Errorf("node %s has no addresss", n.Id)
+			}
+			address := n.Addresses[0].Host
+			addresses[address] = true
+		}
+
+		for _, address := range []string{"10.1.0.2", "10.1.0.3"} {
+			if !addresses[address] {
+				return fmt.Errorf("address %q not found in directory", address)
+			}
+		}
+		return nil
+	})
+	util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
+	util.TestEventual(t, "Node rejoin successful", ctx, 60*time.Second, func(ctx context.Context) error {
+		// Ensure nodes rejoin the cluster after a reboot by rebooting node 1.
+		if err := cluster.RebootNode(ctx, 1); err != nil {
+			return fmt.Errorf("while rebooting a node: %w", err)
+		}
+		return nil
+	})
+	util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
+	util.TestEventual(t, "Prometheus node metrics retrieved", ctx, smallTestTimeout, func(ctx context.Context) error {
+		pool := x509.NewCertPool()
+		pool.AddCert(cluster.CACertificate)
+		cl := http.Client{
+			Transport: &http.Transport{
+				TLSClientConfig: &tls.Config{
+					Certificates: []tls.Certificate{cluster.Owner},
+					RootCAs:      pool,
+				},
+				DialContext: func(ctx context.Context, _, addr string) (net.Conn, error) {
+					return cluster.DialNode(ctx, addr)
+				},
+			},
+		}
+		u := url.URL{
+			Scheme: "https",
+			Host:   net.JoinHostPort(cluster.NodeIDs[0], common.MetricsPort.PortString()),
+			Path:   "/metrics/node",
+		}
+		res, err := cl.Get(u.String())
+		if err != nil {
+			return err
+		}
+		defer res.Body.Close()
+		if res.StatusCode != 200 {
+			return fmt.Errorf("status code %d", res.StatusCode)
+		}
+
+		body, err := io.ReadAll(res.Body)
+		if err != nil {
+			return err
+		}
+		needle := "node_uname_info"
+		if !strings.Contains(string(body), needle) {
+			return util.Permanent(fmt.Errorf("could not find %q in returned response", needle))
+		}
+		return nil
+	})
+}
diff --git a/metropolis/test/e2e/suites/ha/BUILD.bazel b/metropolis/test/e2e/suites/ha/BUILD.bazel
new file mode 100644
index 0000000..f9aea3f
--- /dev/null
+++ b/metropolis/test/e2e/suites/ha/BUILD.bazel
@@ -0,0 +1,25 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
+go_test(
+    name = "ha_test",
+    srcs = ["run_test.go"],
+    data = [
+        "//metropolis/node:image",
+        "//metropolis/node:swtpm_data",
+        "//metropolis/test/e2e:testimages_manifest",
+        "//third_party/edk2:firmware",
+    ],
+    tags = [
+        "resources:iops:5000",
+        "resources:cpu:3",
+        # 3x2048 for nodes plus some extra.
+        "resources:ram:7000",
+    ],
+    deps = [
+        "//metropolis/pkg/localregistry",
+        "//metropolis/test/launch",
+        "//metropolis/test/launch/cluster",
+        "//metropolis/test/util",
+        "@io_bazel_rules_go//go/runfiles:go_default_library",
+    ],
+)
diff --git a/metropolis/test/e2e/suites/ha/run_test.go b/metropolis/test/e2e/suites/ha/run_test.go
new file mode 100644
index 0000000..63a2acd
--- /dev/null
+++ b/metropolis/test/e2e/suites/ha/run_test.go
@@ -0,0 +1,93 @@
+package ha
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"testing"
+	"time"
+
+	"github.com/bazelbuild/rules_go/go/runfiles"
+
+	"source.monogon.dev/metropolis/pkg/localregistry"
+	"source.monogon.dev/metropolis/test/launch"
+	"source.monogon.dev/metropolis/test/launch/cluster"
+	"source.monogon.dev/metropolis/test/util"
+)
+
+const (
+	// Timeout for the global test context.
+	//
+	// Bazel would eventually time out the test after 900s ("large") if, for
+	// some reason, the context cancellation fails to abort it.
+	globalTestTimeout = 600 * time.Second
+
+	// Timeouts for individual end-to-end tests of different sizes.
+	smallTestTimeout = 60 * time.Second
+	largeTestTimeout = 120 * time.Second
+)
+
+// TestE2ECoreHA exercises the basics of a high-availability control plane by
+// starting up a 3-node cluster, turning all nodes into ConsensusMembers, then
+// performing a rolling restart.
+func TestE2ECoreHA(t *testing.T) {
+	// Set a global timeout to make sure this terminates
+	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
+	defer cancel()
+
+	rPath, err := runfiles.Rlocation("_main/metropolis/test/e2e/testimages_manifest.prototxt")
+	if err != nil {
+		t.Fatalf("Resolving registry manifest failed: %v", err)
+	}
+	df, err := os.ReadFile(rPath)
+	if err != nil {
+		t.Fatalf("Reading registry manifest failed: %v", err)
+	}
+	lr, err := localregistry.FromBazelManifest(df)
+	if err != nil {
+		t.Fatalf("Creating test image registry failed: %v", err)
+	}
+	// Launch cluster.
+	clusterOptions := cluster.ClusterOptions{
+		NumNodes:        3,
+		LocalRegistry:   lr,
+		NodeLogsToFiles: true,
+	}
+	cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
+	if err != nil {
+		t.Fatalf("LaunchCluster failed: %v", err)
+	}
+	defer func() {
+		err := cluster.Close()
+		if err != nil {
+			t.Fatalf("cluster Close failed: %v", err)
+		}
+	}()
+
+	launch.Log("E2E: Cluster running, starting tests...")
+
+	util.MustTestEventual(t, "Add ConsensusMember roles", ctx, smallTestTimeout, func(ctx context.Context) error {
+		// Make everything but the first node into ConsensusMember.
+		for i := 1; i < clusterOptions.NumNodes; i++ {
+			err := cluster.MakeConsensusMember(ctx, cluster.NodeIDs[i])
+			if err != nil {
+				return fmt.Errorf("MakeConsensusMember(%d/%s): %w", i, cluster.NodeIDs[i], err)
+			}
+		}
+		return nil
+	})
+	util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
+
+	// Perform rolling restart of all nodes. When a node rejoins it must be able to
+	// contact the cluster, so this also exercises that the cluster is serving even
+	// with the node having rebooted.
+	for i := 0; i < clusterOptions.NumNodes; i++ {
+		util.MustTestEventual(t, fmt.Sprintf("Node %d rejoin successful", i), ctx, 60*time.Second, func(ctx context.Context) error {
+			// Ensure the node rejoins the cluster after a reboot.
+			if err := cluster.RebootNode(ctx, i); err != nil {
+				return fmt.Errorf("while rebooting a node: %w", err)
+			}
+			return nil
+		})
+	}
+}
diff --git a/metropolis/test/e2e/suites/ha_cold/BUILD.bazel b/metropolis/test/e2e/suites/ha_cold/BUILD.bazel
new file mode 100644
index 0000000..dc738a1
--- /dev/null
+++ b/metropolis/test/e2e/suites/ha_cold/BUILD.bazel
@@ -0,0 +1,24 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
+go_test(
+    name = "ha_cold_test",
+    srcs = ["run_test.go"],
+    data = [
+        "//metropolis/node:image",
+        "//metropolis/node:swtpm_data",
+        "//metropolis/test/e2e:testimages_manifest",
+        "//third_party/edk2:firmware",
+    ],
+    tags = [
+        "resources:iops:5000",
+        "resources:cpu:3",
+        # 3x2048 for nodes plus some extra.
+        "resources:ram:7000",
+    ],
+    deps = [
+        "//metropolis/proto/common",
+        "//metropolis/test/launch",
+        "//metropolis/test/launch/cluster",
+        "//metropolis/test/util",
+    ],
+)
diff --git a/metropolis/test/e2e/suites/ha_cold/run_test.go b/metropolis/test/e2e/suites/ha_cold/run_test.go
new file mode 100644
index 0000000..6670b8f
--- /dev/null
+++ b/metropolis/test/e2e/suites/ha_cold/run_test.go
@@ -0,0 +1,86 @@
+package ha_cold
+
+import (
+	"context"
+	"fmt"
+	"testing"
+	"time"
+
+	"source.monogon.dev/metropolis/test/launch"
+	"source.monogon.dev/metropolis/test/launch/cluster"
+	"source.monogon.dev/metropolis/test/util"
+
+	cpb "source.monogon.dev/metropolis/proto/common"
+)
+
+const (
+	// Timeout for the global test context.
+	//
+	// Bazel would eventually time out the test after 900s ("large") if, for
+	// some reason, the context cancellation fails to abort it.
+	globalTestTimeout = 600 * time.Second
+
+	// Timeouts for individual end-to-end tests of different sizes.
+	smallTestTimeout = 60 * time.Second
+	largeTestTimeout = 120 * time.Second
+)
+
+// TestE2EColdStartHA exercises an HA cluster being fully shut down then
+// restarted again.
+//
+// Metropolis currently doesn't support cold startups from TPM/Secure clusters,
+// so we test a non-TPM/Insecure cluster.
+func TestE2EColdStartHA(t *testing.T) {
+	// Set a global timeout to make sure this terminates
+	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
+	defer cancel()
+
+	// Launch cluster.
+	clusterOptions := cluster.ClusterOptions{
+		NumNodes:        3,
+		NodeLogsToFiles: true,
+		InitialClusterConfiguration: &cpb.ClusterConfiguration{
+			TpmMode:               cpb.ClusterConfiguration_TPM_MODE_DISABLED,
+			StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
+		},
+	}
+	cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
+	if err != nil {
+		t.Fatalf("LaunchCluster failed: %v", err)
+	}
+	defer func() {
+		err := cluster.Close()
+		if err != nil {
+			t.Fatalf("cluster Close failed: %v", err)
+		}
+	}()
+
+	launch.Log("E2E: Cluster running, starting tests...")
+
+	util.MustTestEventual(t, "Add ConsensusMember roles", ctx, smallTestTimeout, func(ctx context.Context) error {
+		// Make everything but the first node into ConsensusMember.
+		for i := 1; i < clusterOptions.NumNodes; i++ {
+			err := cluster.MakeConsensusMember(ctx, cluster.NodeIDs[i])
+			if err != nil {
+				return util.Permanent(fmt.Errorf("MakeConsensusMember(%d/%s): %w", i, cluster.NodeIDs[i], err))
+			}
+		}
+		return nil
+	})
+	util.TestEventual(t, "Heartbeat test successful", ctx, 20*time.Second, cluster.AllNodesHealthy)
+
+	// Shut every node down.
+	for i := 0; i < clusterOptions.NumNodes; i++ {
+		if err := cluster.ShutdownNode(i); err != nil {
+			t.Fatalf("Could not shutdown node %d", i)
+		}
+	}
+	// Start every node back up.
+	for i := 0; i < clusterOptions.NumNodes; i++ {
+		if err := cluster.StartNode(i); err != nil {
+			t.Fatalf("Could not shutdown node %d", i)
+		}
+	}
+	// Check if the cluster comes back up.
+	util.TestEventual(t, "Heartbeat test successful", ctx, 60*time.Second, cluster.AllNodesHealthy)
+}
diff --git a/metropolis/test/e2e/suites/kubernetes/BUILD.bazel b/metropolis/test/e2e/suites/kubernetes/BUILD.bazel
new file mode 100644
index 0000000..bb5d009
--- /dev/null
+++ b/metropolis/test/e2e/suites/kubernetes/BUILD.bazel
@@ -0,0 +1,47 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+go_library(
+    name = "kubernetes",
+    srcs = ["kubernetes_helpers.go"],
+    importpath = "source.monogon.dev/metropolis/test/e2e/suites/kubernetes",
+    visibility = ["//visibility:public"],
+    deps = [
+        "@io_k8s_api//apps/v1:apps",
+        "@io_k8s_api//batch/v1:batch",
+        "@io_k8s_api//core/v1:core",
+        "@io_k8s_apimachinery//pkg/api/resource",
+        "@io_k8s_apimachinery//pkg/apis/meta/v1:meta",
+        "@io_k8s_apimachinery//pkg/util/intstr",
+        "@io_k8s_client_go//kubernetes",
+    ],
+)
+
+go_test(
+    name = "kubernetes_test",
+    srcs = ["run_test.go"],
+    data = [
+        "//metropolis/node:image",
+        "//metropolis/node:swtpm_data",
+        "//metropolis/test/e2e:testimages_manifest",
+        "//third_party/edk2:firmware",
+    ],
+    embed = [":kubernetes"],
+    tags = [
+        "resources:iops:5000",
+        "resources:cpu:3",
+        # 2x2048 for nodes plus some extra.
+        "resources:ram:4500",
+    ],
+    deps = [
+        "//metropolis/node",
+        "//metropolis/pkg/localregistry",
+        "//metropolis/test/launch/cluster",
+        "//metropolis/test/util",
+        "@io_bazel_rules_go//go/runfiles:go_default_library",
+        "@io_k8s_api//core/v1:core",
+        "@io_k8s_apimachinery//pkg/api/errors",
+        "@io_k8s_apimachinery//pkg/api/resource",
+        "@io_k8s_apimachinery//pkg/apis/meta/v1:meta",
+        "@io_k8s_kubernetes//pkg/api/v1/pod",
+    ],
+)
diff --git a/metropolis/test/e2e/kubernetes_helpers.go b/metropolis/test/e2e/suites/kubernetes/kubernetes_helpers.go
similarity index 99%
rename from metropolis/test/e2e/kubernetes_helpers.go
rename to metropolis/test/e2e/suites/kubernetes/kubernetes_helpers.go
index 4c0ec28..60d611b 100644
--- a/metropolis/test/e2e/kubernetes_helpers.go
+++ b/metropolis/test/e2e/suites/kubernetes/kubernetes_helpers.go
@@ -14,7 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package e2e
+package kubernetes
 
 import (
 	"bytes"
diff --git a/metropolis/test/e2e/suites/kubernetes/run_test.go b/metropolis/test/e2e/suites/kubernetes/run_test.go
new file mode 100644
index 0000000..ec38aa3
--- /dev/null
+++ b/metropolis/test/e2e/suites/kubernetes/run_test.go
@@ -0,0 +1,368 @@
+package kubernetes
+
+import (
+	"context"
+	"crypto/tls"
+	"crypto/x509"
+	"errors"
+	"fmt"
+	"io"
+	"net"
+	"net/http"
+	_ "net/http/pprof"
+	"net/url"
+	"os"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/bazelbuild/rules_go/go/runfiles"
+	corev1 "k8s.io/api/core/v1"
+	kerrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	podv1 "k8s.io/kubernetes/pkg/api/v1/pod"
+
+	"source.monogon.dev/metropolis/pkg/localregistry"
+	"source.monogon.dev/metropolis/test/launch/cluster"
+	"source.monogon.dev/metropolis/test/util"
+
+	common "source.monogon.dev/metropolis/node"
+)
+
+const (
+	// Timeout for the global test context.
+	//
+	// Bazel would eventually time out the test after 900s ("large") if, for
+	// some reason, the context cancellation fails to abort it.
+	globalTestTimeout = 600 * time.Second
+
+	// Timeouts for individual end-to-end tests of different sizes.
+	smallTestTimeout = 60 * time.Second
+	largeTestTimeout = 120 * time.Second
+)
+
+// TestE2EKubernetes exercises the Kubernetes functionality of Metropolis.
+//
+// The tests are performed against an in-memory cluster.
+func TestE2EKubernetes(t *testing.T) {
+	// Set a global timeout to make sure this terminates
+	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
+	defer cancel()
+
+	rPath, err := runfiles.Rlocation("_main/metropolis/test/e2e/testimages_manifest.prototxt")
+	if err != nil {
+		t.Fatalf("Resolving registry manifest failed: %v", err)
+	}
+	df, err := os.ReadFile(rPath)
+	if err != nil {
+		t.Fatalf("Reading registry manifest failed: %v", err)
+	}
+	lr, err := localregistry.FromBazelManifest(df)
+	if err != nil {
+		t.Fatalf("Creating test image registry failed: %v", err)
+	}
+
+	// Launch cluster.
+	clusterOptions := cluster.ClusterOptions{
+		NumNodes:      2,
+		LocalRegistry: lr,
+	}
+	cluster, err := cluster.LaunchCluster(ctx, clusterOptions)
+	if err != nil {
+		t.Fatalf("LaunchCluster failed: %v", err)
+	}
+	defer func() {
+		err := cluster.Close()
+		if err != nil {
+			t.Fatalf("cluster Close failed: %v", err)
+		}
+	}()
+
+	clientSet, err := cluster.GetKubeClientSet()
+	if err != nil {
+		t.Fatal(err)
+	}
+	util.TestEventual(t, "Add KubernetesWorker roles", ctx, smallTestTimeout, func(ctx context.Context) error {
+		// Make everything but the first node into KubernetesWorkers.
+		for i := 1; i < clusterOptions.NumNodes; i++ {
+			err := cluster.MakeKubernetesWorker(ctx, cluster.NodeIDs[i])
+			if err != nil {
+				return util.Permanent(fmt.Errorf("MakeKubernetesWorker: %w", err))
+			}
+		}
+		return nil
+	})
+	util.TestEventual(t, "Node is registered and ready", ctx, largeTestTimeout, func(ctx context.Context) error {
+		nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+		if err != nil {
+			return err
+		}
+		if len(nodes.Items) < 1 {
+			return errors.New("node not yet registered")
+		}
+		node := nodes.Items[0]
+		for _, cond := range node.Status.Conditions {
+			if cond.Type != corev1.NodeReady {
+				continue
+			}
+			if cond.Status != corev1.ConditionTrue {
+				return fmt.Errorf("node not ready: %v", cond.Message)
+			}
+		}
+		return nil
+	})
+	util.TestEventual(t, "Simple deployment", ctx, largeTestTimeout, func(ctx context.Context) error {
+		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeTestDeploymentSpec("test-deploy-1"), metav1.CreateOptions{})
+		return err
+	})
+	util.TestEventual(t, "Simple deployment is running", ctx, largeTestTimeout, func(ctx context.Context) error {
+		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-1"})
+		if err != nil {
+			return err
+		}
+		if len(res.Items) == 0 {
+			return errors.New("pod didn't get created")
+		}
+		pod := res.Items[0]
+		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
+			return nil
+		}
+		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
+		if err != nil || len(events.Items) == 0 {
+			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
+		} else {
+			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
+		}
+	})
+	util.TestEventual(t, "Simple deployment with gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
+		deployment := makeTestDeploymentSpec("test-deploy-2")
+		gvisorStr := "gvisor"
+		deployment.Spec.Template.Spec.RuntimeClassName = &gvisorStr
+		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, deployment, metav1.CreateOptions{})
+		return err
+	})
+	util.TestEventual(t, "Simple deployment is running on gvisor", ctx, largeTestTimeout, func(ctx context.Context) error {
+		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-deploy-2"})
+		if err != nil {
+			return err
+		}
+		if len(res.Items) == 0 {
+			return errors.New("pod didn't get created")
+		}
+		pod := res.Items[0]
+		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
+			return nil
+		}
+		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
+		if err != nil || len(events.Items) == 0 {
+			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
+		} else {
+			var errorMsg strings.Builder
+			for _, msg := range events.Items {
+				errorMsg.WriteString(" | ")
+				errorMsg.WriteString(msg.Message)
+			}
+			return fmt.Errorf("pod is not ready: %v", errorMsg.String())
+		}
+	})
+	util.TestEventual(t, "Simple StatefulSet with PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
+		_, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-1", corev1.PersistentVolumeFilesystem), metav1.CreateOptions{})
+		return err
+	})
+	util.TestEventual(t, "Simple StatefulSet with PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
+		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-1"})
+		if err != nil {
+			return err
+		}
+		if len(res.Items) == 0 {
+			return errors.New("pod didn't get created")
+		}
+		pod := res.Items[0]
+		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
+			return nil
+		}
+		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
+		if err != nil || len(events.Items) == 0 {
+			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
+		} else {
+			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
+		}
+	})
+	util.TestEventual(t, "Simple StatefulSet with Block PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
+		_, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-2", corev1.PersistentVolumeBlock), metav1.CreateOptions{})
+		return err
+	})
+	util.TestEventual(t, "Simple StatefulSet with Block PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
+		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-2"})
+		if err != nil {
+			return err
+		}
+		if len(res.Items) == 0 {
+			return errors.New("pod didn't get created")
+		}
+		pod := res.Items[0]
+		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
+			return nil
+		}
+		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
+		if err != nil || len(events.Items) == 0 {
+			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
+		} else {
+			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
+		}
+	})
+	util.TestEventual(t, "In-cluster self-test job", ctx, smallTestTimeout, func(ctx context.Context) error {
+		_, err := clientSet.BatchV1().Jobs("default").Create(ctx, makeSelftestSpec("selftest"), metav1.CreateOptions{})
+		return err
+	})
+	util.TestEventual(t, "In-cluster self-test job passed", ctx, smallTestTimeout, func(ctx context.Context) error {
+		res, err := clientSet.BatchV1().Jobs("default").Get(ctx, "selftest", metav1.GetOptions{})
+		if err != nil {
+			return err
+		}
+		if res.Status.Failed > 0 {
+			pods, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{
+				LabelSelector: "job-name=selftest",
+			})
+			if err != nil {
+				return util.Permanent(fmt.Errorf("job failed but failed to find pod: %w", err))
+			}
+			if len(pods.Items) < 1 {
+				return fmt.Errorf("job failed but pod does not exist")
+			}
+			lines, err := getPodLogLines(ctx, clientSet, pods.Items[0].Name, 1)
+			if err != nil {
+				return fmt.Errorf("job failed but could not get logs: %w", err)
+			}
+			if len(lines) > 0 {
+				return util.Permanent(fmt.Errorf("job failed, last log line: %s", lines[0]))
+			}
+			return util.Permanent(fmt.Errorf("job failed, empty log"))
+		}
+		if res.Status.Succeeded > 0 {
+			return nil
+		}
+		return fmt.Errorf("job still running")
+	})
+	util.TestEventual(t, "Start NodePort test setup", ctx, smallTestTimeout, func(ctx context.Context) error {
+		_, err := clientSet.AppsV1().Deployments("default").Create(ctx, makeHTTPServerDeploymentSpec("nodeport-server"), metav1.CreateOptions{})
+		if err != nil && !kerrors.IsAlreadyExists(err) {
+			return err
+		}
+		_, err = clientSet.CoreV1().Services("default").Create(ctx, makeHTTPServerNodePortService("nodeport-server"), metav1.CreateOptions{})
+		if err != nil && !kerrors.IsAlreadyExists(err) {
+			return err
+		}
+		return nil
+	})
+	util.TestEventual(t, "NodePort accessible from all nodes", ctx, smallTestTimeout, func(ctx context.Context) error {
+		nodes, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+		if err != nil {
+			return err
+		}
+		// Use a new client for each attempt
+		hc := http.Client{
+			Timeout: 2 * time.Second,
+			Transport: &http.Transport{
+				Dial: cluster.SOCKSDialer.Dial,
+			},
+		}
+		for _, n := range nodes.Items {
+			var addr string
+			for _, a := range n.Status.Addresses {
+				if a.Type == corev1.NodeInternalIP {
+					addr = a.Address
+				}
+			}
+			u := url.URL{Scheme: "http", Host: addr, Path: "/"}
+			res, err := hc.Get(u.String())
+			if err != nil {
+				return fmt.Errorf("failed getting from node %q: %w", n.Name, err)
+			}
+			if res.StatusCode != http.StatusOK {
+				return fmt.Errorf("getting from node %q: HTTP %d", n.Name, res.StatusCode)
+			}
+			t.Logf("Got response from %q", n.Name)
+		}
+		return nil
+	})
+	util.TestEventual(t, "containerd metrics retrieved", ctx, smallTestTimeout, func(ctx context.Context) error {
+		pool := x509.NewCertPool()
+		pool.AddCert(cluster.CACertificate)
+		cl := http.Client{
+			Transport: &http.Transport{
+				TLSClientConfig: &tls.Config{
+					Certificates: []tls.Certificate{cluster.Owner},
+					RootCAs:      pool,
+				},
+				DialContext: func(ctx context.Context, _, addr string) (net.Conn, error) {
+					return cluster.DialNode(ctx, addr)
+				},
+			},
+		}
+		u := url.URL{
+			Scheme: "https",
+			Host:   net.JoinHostPort(cluster.NodeIDs[1], common.MetricsPort.PortString()),
+			Path:   "/metrics/containerd",
+		}
+		res, err := cl.Get(u.String())
+		if err != nil {
+			return err
+		}
+		defer res.Body.Close()
+		if res.StatusCode != 200 {
+			return fmt.Errorf("status code %d", res.StatusCode)
+		}
+
+		body, err := io.ReadAll(res.Body)
+		if err != nil {
+			return err
+		}
+		needle := "containerd_build_info_total"
+		if !strings.Contains(string(body), needle) {
+			return util.Permanent(fmt.Errorf("could not find %q in returned response", needle))
+		}
+		return nil
+	})
+	if os.Getenv("HAVE_NESTED_KVM") != "" {
+		util.TestEventual(t, "Pod for KVM/QEMU smoke test", ctx, smallTestTimeout, func(ctx context.Context) error {
+			runcRuntimeClass := "runc"
+			_, err := clientSet.CoreV1().Pods("default").Create(ctx, &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "vm-smoketest",
+				},
+				Spec: corev1.PodSpec{
+					Containers: []corev1.Container{{
+						Name:            "vm-smoketest",
+						ImagePullPolicy: corev1.PullNever,
+						Image:           "test.monogon.internal/metropolis/vm/smoketest:smoketest_container",
+						Resources: corev1.ResourceRequirements{
+							Limits: corev1.ResourceList{
+								"devices.monogon.dev/kvm": *resource.NewQuantity(1, ""),
+							},
+						},
+					}},
+					RuntimeClassName: &runcRuntimeClass,
+					RestartPolicy:    corev1.RestartPolicyNever,
+				},
+			}, metav1.CreateOptions{})
+			return err
+		})
+		util.TestEventual(t, "KVM/QEMU smoke test completion", ctx, smallTestTimeout, func(ctx context.Context) error {
+			pod, err := clientSet.CoreV1().Pods("default").Get(ctx, "vm-smoketest", metav1.GetOptions{})
+			if err != nil {
+				return fmt.Errorf("failed to get pod: %w", err)
+			}
+			if pod.Status.Phase == corev1.PodSucceeded {
+				return nil
+			}
+			events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
+			if err != nil || len(events.Items) == 0 {
+				return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
+			} else {
+				return fmt.Errorf("pod is not ready: %v", events.Items[len(events.Items)-1].Message)
+			}
+		})
+	}
+}