Add Kubernetes CTS

This adds patches and build specifications for the Kubernetes Conformance Test Suite (CTS). This involves
gating various cloud-specific tests behind the providerless flag (otherwise we'd gain a ton of additional dependencies
and an additional 60MiB in test binary size).
Since the CTS, for weird reasons, requires kubectl to be available in the PATH, we first build a kubectl go_image and
then stack the CTS on top of it. The output bundle is then preseeded for use.

Test Plan: `bazel run //core/tests/e2e/k8s_cts`

Bug: T836

X-Origin-Diff: phab/D615
GitOrigin-RevId: 7d2cd780a3ffb63b217591c5854b4aec4031d83d
diff --git a/core/BUILD b/core/BUILD
index 33ec07d..5d628f1 100644
--- a/core/BUILD
+++ b/core/BUILD
@@ -43,6 +43,7 @@
 
         # Containerd preseed bundles
         "//core/tests/e2e/preseedtest:preseedtest.tar": "/containerd/preseed/k8s.io/preseedtest.tar",
+        "//core/tests/e2e/k8s_cts:k8s_cts_image.tar": "/containerd/preseed/k8s.io/k8s_cts.tar",
 
         # CNI Plugins
         "@com_github_containernetworking_plugins//plugins/main/loopback": "/containerd/bin/cni/loopback",
diff --git a/core/cmd/launch/main.go b/core/cmd/launch/main.go
index 9bb4732..aba3dda 100644
--- a/core/cmd/launch/main.go
+++ b/core/cmd/launch/main.go
@@ -34,7 +34,7 @@
 		<-sigs
 		cancel()
 	}()
-	if err := launch.Launch(ctx, launch.Options{Ports: launch.IdentityPortMap(), SerialPort: os.Stdout}); err != nil {
+	if err := launch.Launch(ctx, launch.Options{Ports: launch.IdentityPortMap(launch.NodePorts), SerialPort: os.Stdout}); err != nil {
 		if err == ctx.Err() {
 			return
 		}
diff --git a/core/internal/launch/BUILD.bazel b/core/internal/launch/BUILD.bazel
index b57c016..1979ec6 100644
--- a/core/internal/launch/BUILD.bazel
+++ b/core/internal/launch/BUILD.bazel
@@ -10,6 +10,7 @@
         "//core/proto/api:go_default_library",
         "//golibs/common:go_default_library",
         "@com_github_golang_protobuf//proto:go_default_library",
+        "@com_github_grpc_ecosystem_go_grpc_middleware//retry:go_default_library",
         "@org_golang_google_grpc//:go_default_library",
         "@org_golang_x_sys//unix:go_default_library",
     ],
diff --git a/core/internal/launch/launch.go b/core/internal/launch/launch.go
index f20d721..f456852 100644
--- a/core/internal/launch/launch.go
+++ b/core/internal/launch/launch.go
@@ -32,6 +32,9 @@
 	"strconv"
 	"strings"
 	"syscall"
+	"time"
+
+	grpcretry "github.com/grpc-ecosystem/go-grpc-middleware/retry"
 
 	"github.com/golang/protobuf/proto"
 	"golang.org/x/sys/unix"
@@ -133,26 +136,27 @@
 	EnrolmentConfig *apb.EnrolmentConfig
 }
 
-var requiredPorts = []uint16{common.ConsensusPort, common.NodeServicePort, common.MasterServicePort,
+// NodePorts is the list of ports a fully operational Smalltown node listens on
+var NodePorts = []uint16{common.ConsensusPort, common.NodeServicePort, common.MasterServicePort,
 	common.ExternalServicePort, common.DebugServicePort, common.KubernetesAPIPort, common.DebuggerPort}
 
-// IdentityPortMap returns a port map where each VM port is mapped onto itself on the host. This is mainly useful
+// IdentityPortMap returns a port map where each given port is mapped onto itself on the host. This is mainly useful
 // for development against Smalltown. The dbg command requires this mapping.
-func IdentityPortMap() PortMap {
+func IdentityPortMap(ports []uint16) PortMap {
 	portMap := make(PortMap)
-	for _, port := range requiredPorts {
+	for _, port := range ports {
 		portMap[port] = port
 	}
 	return portMap
 }
 
-// ConflictFreePortMap returns a port map where each VM port is mapped onto a random free port on the host. This is
+// ConflictFreePortMap returns a port map where each given port is mapped onto a random free port on the host. This is
 // intended for automated testing where multiple instances of Smalltown might be running. Please call this function for
 // each Launch command separately and as close to it as possible since it cannot guarantee that the ports will remain
 // free.
-func ConflictFreePortMap() (PortMap, error) {
+func ConflictFreePortMap(ports []uint16) (PortMap, error) {
 	portMap := make(PortMap)
-	for _, port := range requiredPorts {
+	for _, port := range ports {
 		mappedPort, listenCloser, err := freeport.AllocateTCPPort()
 		if err != nil {
 			return portMap, fmt.Errorf("failed to get free host port: %w", err)
@@ -460,3 +464,95 @@
 func (e *QEMUError) Error() string {
 	return fmt.Sprintf("%v: %v", e.String(), string(e.Stderr))
 }
+
+// NanoswitchPorts contains all ports forwarded by Nanoswitch to the first VM. LaunchCluster maps them onto host ports.
+var NanoswitchPorts = []uint16{
+	common.ExternalServicePort,
+	common.DebugServicePort,
+	common.KubernetesAPIPort,
+}
+
+// ClusterOptions contains all options for launching a Smalltown cluster.
+type ClusterOptions struct {
+	// NumNodes is the number of nodes the cluster is initially started with. LaunchCluster rejects 0 and more than 2.
+	NumNodes int
+}
+
+// LaunchCluster launches a cluster of Smalltown VMs together with a Nanoswitch instance to network them all together.
+// It returns a debug service client plus the host port map for NanoswitchPorts; the client's connection stays open.
+func LaunchCluster(ctx context.Context, opts ClusterOptions) (apb.NodeDebugServiceClient, PortMap, error) {
+	// Validate the node count first so that a rejected request neither allocates socket pairs nor indexes vmPorts.
+	if opts.NumNodes < 1 {
+		return nil, nil, errors.New("refusing to start cluster with less than one node")
+	}
+	if opts.NumNodes > 2 {
+		return nil, nil, errors.New("launching more than 2 nodes is unsupported pending replacement of golden tickets")
+	}
+
+	var switchPorts []*os.File
+	var vmPorts []*os.File
+	for i := 0; i < opts.NumNodes; i++ {
+		switchPort, vmPort, err := NewSocketPair()
+		if err != nil {
+			return nil, nil, fmt.Errorf("failed to get socketpair: %w", err)
+		}
+		switchPorts = append(switchPorts, switchPort)
+		vmPorts = append(vmPorts, vmPort)
+	}
+
+	go func() {
+		if err := Launch(ctx, Options{ConnectToSocket: vmPorts[0]}); err != nil {
+			// Launch() only terminates when QEMU has terminated. At that point our function probably doesn't run anymore
+			// so we have no way of communicating the error back up, so let's just log it. Also a failure in launching
+			// VMs should be very visible by the unavailability of the clients we return.
+			log.Printf("Failed to launch vm0: %v", err)
+		}
+	}()
+
+	portMap, err := ConflictFreePortMap(NanoswitchPorts)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to allocate ephemeral ports: %w", err)
+	}
+
+	go func() {
+		if err := RunMicroVM(ctx, &MicroVMOptions{
+			KernelPath:             "core/tools/ktest/linux-testing.elf",
+			InitramfsPath:          "core/cmd/nanoswitch/initramfs.lz4",
+			ExtraNetworkInterfaces: switchPorts,
+			PortMap:                portMap,
+		}); err != nil {
+			log.Printf("Failed to launch nanoswitch: %v", err)
+		}
+	}()
+	copts := []grpcretry.CallOption{
+		grpcretry.WithBackoff(grpcretry.BackoffExponential(100 * time.Millisecond)),
+	}
+	conn, err := portMap.DialGRPC(common.DebugServicePort, grpc.WithInsecure(),
+		grpc.WithUnaryInterceptor(grpcretry.UnaryClientInterceptor(copts...)))
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to dial debug service: %w", err)
+	}
+	// conn is not closed on success: the returned client keeps using it for as long as the caller holds on to it.
+	debug := apb.NewNodeDebugServiceClient(conn)
+
+	if opts.NumNodes == 2 {
+		res, err := debug.GetGoldenTicket(ctx, &apb.GetGoldenTicketRequest{
+			// HACK: this is assigned by DHCP, and we assume that everything goes well.
+			ExternalIp: "10.1.0.3",
+		}, grpcretry.WithMax(10))
+		if err != nil {
+			conn.Close() // Nothing is handed back to the caller on this path, so release the connection.
+			return nil, nil, fmt.Errorf("failed to get golden ticket: %w", err)
+		}
+
+		ec := &apb.EnrolmentConfig{GoldenTicket: res.Ticket}
+
+		go func() {
+			if err := Launch(ctx, Options{ConnectToSocket: vmPorts[1], EnrolmentConfig: ec}); err != nil {
+				log.Printf("Failed to launch vm1: %v", err)
+			}
+		}()
+	}
+
+	return debug, portMap, nil
+}
diff --git a/core/tests/e2e/BUILD.bazel b/core/tests/e2e/BUILD.bazel
index 974bcdd..8e74be4 100644
--- a/core/tests/e2e/BUILD.bazel
+++ b/core/tests/e2e/BUILD.bazel
@@ -7,7 +7,7 @@
         "utils.go",
     ],
     importpath = "git.monogon.dev/source/nexantic.git/core/tests/e2e",
-    visibility = ["//visibility:private"],
+    visibility = ["//core/tests:__subpackages__"],
     deps = [
         "//core/proto/api:go_default_library",
         "@io_k8s_api//apps/v1:go_default_library",
diff --git a/core/tests/e2e/k8s_cts/BUILD.bazel b/core/tests/e2e/k8s_cts/BUILD.bazel
new file mode 100644
index 0000000..648e1c5
--- /dev/null
+++ b/core/tests/e2e/k8s_cts/BUILD.bazel
@@ -0,0 +1,55 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library")
+load("@io_bazel_rules_docker//go:image.bzl", "go_image")
+load("@io_bazel_rules_docker//container:container.bzl", "container_image")
+
+go_image(
+    name = "kubectl",
+    binary = "@io_k8s_kubernetes//cmd/kubectl",
+    pure = "on",
+)
+
+container_image(
+    name = "kubectl_in_path",
+    base = ":kubectl",
+    env = {
+        # The CTS requires kubectl to be in PATH. FHS paths are left out since they aren't available anyway.
+        "PATH": "/app/cmd/kubectl",
+    },
+)
+
+go_image(
+    name = "k8s_cts_image",
+    base = ":kubectl_in_path",
+    binary = "@io_k8s_kubernetes//test/e2e:_go_default_test-pure",
+    pure = "on",
+    visibility = ["//visibility:public"],
+)
+
+go_library(
+    name = "go_default_library",
+    srcs = ["main.go"],
+    importpath = "git.monogon.dev/source/nexantic.git/core/tests/e2e/k8s_cts",
+    visibility = ["//visibility:private"],
+    deps = [
+        "//core/internal/common:go_default_library",
+        "//core/internal/launch:go_default_library",
+        "//core/tests/e2e:go_default_library",
+        "@io_k8s_api//core/v1:go_default_library",
+        "@io_k8s_api//rbac/v1:go_default_library",
+        "@io_k8s_apimachinery//pkg/apis/meta/v1:go_default_library",
+    ],
+)
+
+go_binary(
+    name = "k8s_cts",
+    data = [
+        "//core:image",
+        "//core:swtpm_data",
+        "//core/cmd/nanoswitch:initramfs",
+        "//core/tools/ktest:linux-testing",
+        "//third_party/edk2:firmware",
+        "@com_github_bonzini_qboot//:qboot-bin",
+    ],
+    embed = [":go_default_library"],
+    visibility = ["//visibility:public"],
+)
diff --git a/core/tests/e2e/k8s_cts/main.go b/core/tests/e2e/k8s_cts/main.go
new file mode 100644
index 0000000..412ae7c
--- /dev/null
+++ b/core/tests/e2e/k8s_cts/main.go
@@ -0,0 +1,176 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This package launches a Smalltown cluster with two nodes and spawns the CTS container in it. It then streams the CTS
+// output to the console. When the CTS has finished, it exits with the appropriate exit code.
+package main
+
+import (
+	"context"
+	"io"
+	"log"
+	"os"
+	"os/signal"
+	"strings"
+	"syscall"
+	"time"
+
+	"git.monogon.dev/source/nexantic.git/core/internal/common"
+	"git.monogon.dev/source/nexantic.git/core/tests/e2e"
+
+	corev1 "k8s.io/api/core/v1"
+	rbacv1 "k8s.io/api/rbac/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	"git.monogon.dev/source/nexantic.git/core/internal/launch"
+)
+
+// makeCTSPodSpec builds the Pod object for a standalone run of the Kubernetes CTS. Its container arguments restrict
+// the Kubernetes E2E test suite to tests tagged [Conformance] and skip a list of tests known to be broken here.
+func makeCTSPodSpec(name string, saName string) *corev1.Pod {
+	// All skip patterns are joined with "|" so that they fit into the single -ginkgo.skip argument.
+	knownBroken := strings.Join([]string{
+		// hostNetworking cannot be supported since we run different network stacks for the host and containers
+		"should function for node-pod communication",
+		// gVisor misreports statfs() syscalls: https://github.com/google/gvisor/issues/3339
+		`should support \((non-)?root,`,
+		"volume on tmpfs should have the correct mode",
+		"volume on default medium should have the correct mode",
+		// gVisor doesn't support the full Linux privilege machinery including SUID and NewPrivs
+		// https://github.com/google/gvisor/issues/189#issuecomment-481064000
+		"should run the container as unprivileged when false",
+	}, "|")
+
+	ctsContainer := corev1.Container{
+		Name:  "cts",
+		Image: "bazel/core/tests/e2e/k8s_cts:k8s_cts_image",
+		Args: []string{
+			"-cluster-ip-range=10.0.0.0/17",
+			"-dump-systemd-journal=false",
+			"-ginkgo.focus=\\[Conformance\\]",
+			"-ginkgo.skip=" + knownBroken,
+			"-test.parallel=8",
+		},
+		// Never pull: the image is expected to come from the containerd preseed bundle rather than a registry.
+		ImagePullPolicy: corev1.PullNever,
+	}
+
+	pod := &corev1.Pod{}
+	pod.Name = name
+	pod.Labels = map[string]string{
+		"name": name,
+	}
+	pod.Spec = corev1.PodSpec{
+		Containers: []corev1.Container{ctsContainer},
+		// Tolerate all taints, otherwise the CTS likes to self-evict
+		Tolerations:        []corev1.Toleration{{Operator: "Exists"}},
+		PriorityClassName:  "system-cluster-critical", // Don't evict the CTS pod
+		RestartPolicy:      corev1.RestartPolicyNever,
+		ServiceAccountName: saName,
+	}
+	return pod
+}
+
+// main launches a two-node cluster, runs the CTS pod in it with cluster-admin rights and copies the pod's logs to
+// stdout. It returns once the pod has succeeded and exits fatally if the pod failed or the context was canceled.
+func main() {
+	sigs := make(chan os.Signal, 1)
+	signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
+	ctx, cancel := context.WithCancel(context.Background())
+	go func() {
+		<-sigs
+		cancel()
+	}()
+	// retryWait paces the retry loops below. Once ctx has been canceled (SIGINT/SIGTERM) retrying is pointless, so the
+	// process is terminated instead of spinning forever on "context canceled" errors.
+	retryWait := func() {
+		if ctx.Err() != nil {
+			log.Fatalf("Interrupted: %v", ctx.Err())
+		}
+		time.Sleep(1 * time.Second)
+	}
+
+	debugClient, portMap, err := launch.LaunchCluster(ctx, launch.ClusterOptions{NumNodes: 2})
+	if err != nil {
+		log.Fatalf("Failed to launch cluster: %v", err)
+	}
+	log.Println("Cluster initialized")
+
+	clientSet, err := e2e.GetKubeClientSet(ctx, debugClient, portMap[common.KubernetesAPIPort])
+	if err != nil {
+		log.Fatalf("Failed to get clientSet: %v", err)
+	}
+	log.Println("Credentials available")
+
+	saName := "cts"
+	ctsSA := &corev1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Name: saName}}
+	for {
+		if _, err := clientSet.CoreV1().ServiceAccounts("default").Create(ctx, ctsSA, metav1.CreateOptions{}); err != nil {
+			log.Printf("Failed to create ServiceAccount: %v", err)
+			retryWait()
+			continue
+		}
+		break
+	}
+	// Grant the CTS ServiceAccount the cluster-admin ClusterRole.
+	ctsRoleBinding := &rbacv1.ClusterRoleBinding{
+		ObjectMeta: metav1.ObjectMeta{Name: saName},
+		Subjects:   []rbacv1.Subject{{Namespace: "default", Name: saName, Kind: rbacv1.ServiceAccountKind}},
+		RoleRef:    rbacv1.RoleRef{Kind: "ClusterRole", Name: "cluster-admin"},
+	}
+	podName := "cts"
+	if _, err := clientSet.RbacV1().ClusterRoleBindings().Create(ctx, ctsRoleBinding, metav1.CreateOptions{}); err != nil {
+		log.Fatalf("Failed to create ClusterRoleBinding: %v", err)
+	}
+	for {
+		if _, err := clientSet.CoreV1().Pods("default").Create(ctx, makeCTSPodSpec(podName, saName), metav1.CreateOptions{}); err != nil {
+			log.Printf("Failed to create Pod: %v", err)
+			retryWait()
+			continue
+		}
+		break
+	}
+	go func() {
+		// This loops the whole .Stream()/io.Copy process because the API sometimes returns streams that immediately return EOF
+		for ctx.Err() == nil {
+			logs, err := clientSet.CoreV1().Pods("default").GetLogs(podName, &corev1.PodLogOptions{Follow: true}).Stream(ctx)
+			if err == nil {
+				if _, err := io.Copy(os.Stdout, logs); err != nil {
+					log.Printf("Log pump error: %v", err)
+				}
+				logs.Close()
+			} else {
+				log.Printf("Pod logs not ready yet: %v", err)
+			}
+			time.Sleep(1 * time.Second)
+		}
+	}()
+	// Poll the pod phase until the CTS run has reached a terminal state.
+	for {
+		retryWait()
+		pod, err := clientSet.CoreV1().Pods("default").Get(ctx, podName, metav1.GetOptions{})
+		if err != nil {
+			log.Printf("Failed to get CTS pod: %v", err)
+			continue
+		}
+		if pod.Status.Phase == corev1.PodSucceeded {
+			return
+		}
+		if pod.Status.Phase == corev1.PodFailed {
+			log.Fatalf("CTS failed")
+		}
+	}
+}
diff --git a/core/tests/e2e/kubernetes_helpers.go b/core/tests/e2e/kubernetes_helpers.go
index d0337e6..4f9ba81 100644
--- a/core/tests/e2e/kubernetes_helpers.go
+++ b/core/tests/e2e/kubernetes_helpers.go
@@ -33,9 +33,9 @@
 	apb "git.monogon.dev/source/nexantic.git/core/proto/api"
 )
 
-// getKubeClientSet gets a Kubeconfig from the debug API and creates a K8s ClientSet using it. The identity used has
+// GetKubeClientSet gets a Kubeconfig from the debug API and creates a K8s ClientSet using it. The identity used has
 // the system:masters group and thus has RBAC access to everything.
-func getKubeClientSet(ctx context.Context, client apb.NodeDebugServiceClient, port uint16) (kubernetes.Interface, error) {
+func GetKubeClientSet(ctx context.Context, client apb.NodeDebugServiceClient, port uint16) (kubernetes.Interface, error) {
 	var lastErr = errors.New("context canceled before any operation completed")
 	for {
 		reqT, cancel := context.WithTimeout(ctx, 5*time.Second)
diff --git a/core/tests/e2e/main_test.go b/core/tests/e2e/main_test.go
index 99cfdff..c50263c 100644
--- a/core/tests/e2e/main_test.go
+++ b/core/tests/e2e/main_test.go
@@ -73,7 +73,7 @@
 
 	// Set a global timeout to make sure this terminates
 	ctx, cancel := context.WithTimeout(context.Background(), globalTestTimeout)
-	portMap, err := launch.ConflictFreePortMap()
+	portMap, err := launch.ConflictFreePortMap(launch.NodePorts)
 	if err != nil {
 		t.Fatalf("Failed to acquire ports for e2e test: %v", err)
 	}
@@ -99,7 +99,7 @@
 			t.Parallel()
 			selfCtx, cancel := context.WithTimeout(ctx, largeTestTimeout)
 			defer cancel()
-			clientSet, err := getKubeClientSet(selfCtx, debugClient, portMap[common.KubernetesAPIPort])
+			clientSet, err := GetKubeClientSet(selfCtx, debugClient, portMap[common.KubernetesAPIPort])
 			if err != nil {
 				t.Fatal(err)
 			}