Add KVM device plugin

This adds a KVM device plugin for Kubernetes. The plugin lets pods use KVM without being privileged and
allows granular control over KVM access.

Test Plan: Tested in subsequent revision

X-Origin-Diff: phab/D739
GitOrigin-RevId: 5cd738a47d24e7bfdc29bbd1a31537209e1ebf46
diff --git a/metropolis/node/core/localstorage/storage.go b/metropolis/node/core/localstorage/storage.go
index a1f567b..d728661 100644
--- a/metropolis/node/core/localstorage/storage.go
+++ b/metropolis/node/core/localstorage/storage.go
@@ -133,11 +133,13 @@
 	Plugins struct {
 		declarative.Directory
 		VFS declarative.File `file:"dev.monogon.metropolis.vfs.sock"`
+		KVM declarative.File `file:"devices.monogon.dev_kvm.sock"`
 	} `dir:"plugins"`
 
 	PluginsRegistry struct {
 		declarative.Directory
 		VFSReg declarative.File `file:"dev.monogon.metropolis.vfs-reg.sock"`
+		KVMReg declarative.File `file:"devices.monogon.dev_kvm-reg.sock"`
 	} `dir:"plugins_registry"`
 }
 
diff --git a/metropolis/node/kubernetes/BUILD.bazel b/metropolis/node/kubernetes/BUILD.bazel
index ada30c9..1ab4c52 100644
--- a/metropolis/node/kubernetes/BUILD.bazel
+++ b/metropolis/node/kubernetes/BUILD.bazel
@@ -20,6 +20,7 @@
         "//metropolis/node/kubernetes/clusternet:go_default_library",
         "//metropolis/node/kubernetes/nfproxy:go_default_library",
         "//metropolis/node/kubernetes/pki:go_default_library",
+        "//metropolis/node/kubernetes/plugins/kvmdevice:go_default_library",
         "//metropolis/node/kubernetes/reconciler:go_default_library",
         "//metropolis/pkg/fileargs:go_default_library",
         "//metropolis/pkg/fsquota:go_default_library",
diff --git a/metropolis/node/kubernetes/csi.go b/metropolis/node/kubernetes/csi.go
index 03091c1..3f88d6f 100644
--- a/metropolis/node/kubernetes/csi.go
+++ b/metropolis/node/kubernetes/csi.go
@@ -33,8 +33,8 @@
 	pluginregistration "k8s.io/kubelet/pkg/apis/pluginregistration/v1"
 
 	"source.monogon.dev/metropolis/node/core/localstorage"
-	"source.monogon.dev/metropolis/pkg/logtree"
 	"source.monogon.dev/metropolis/pkg/fsquota"
+	"source.monogon.dev/metropolis/pkg/logtree"
 	"source.monogon.dev/metropolis/pkg/supervisor"
 )
 
@@ -231,7 +231,7 @@
 // Registration endpoints
 func (s *csiPluginServer) GetInfo(ctx context.Context, req *pluginregistration.InfoRequest) (*pluginregistration.PluginInfo, error) {
 	return &pluginregistration.PluginInfo{
-		Type:              "CSIPlugin",
+		Type:              pluginregistration.CSIPlugin,
 		Name:              "dev.monogon.metropolis.vfs",
 		Endpoint:          s.KubeletDirectory.Plugins.VFS.FullPath(),
 		SupportedVersions: []string{"1.2"}, // Keep in sync with container-storage-interface/spec package version
diff --git a/metropolis/node/kubernetes/plugins/kvmdevice/BUILD.bazel b/metropolis/node/kubernetes/plugins/kvmdevice/BUILD.bazel
new file mode 100644
index 0000000..d666f41
--- /dev/null
+++ b/metropolis/node/kubernetes/plugins/kvmdevice/BUILD.bazel
@@ -0,0 +1,17 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+    name = "go_default_library",
+    srcs = ["kvmdevice.go"],
+    importpath = "source.monogon.dev/metropolis/node/kubernetes/plugins/kvmdevice",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//metropolis/node/core/localstorage:go_default_library",
+        "//metropolis/pkg/logtree:go_default_library",
+        "//metropolis/pkg/supervisor:go_default_library",
+        "@io_k8s_kubelet//pkg/apis/deviceplugin/v1beta1:go_default_library",
+        "@io_k8s_kubelet//pkg/apis/pluginregistration/v1:go_default_library",
+        "@org_golang_google_grpc//:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/metropolis/node/kubernetes/plugins/kvmdevice/kvmdevice.go b/metropolis/node/kubernetes/plugins/kvmdevice/kvmdevice.go
new file mode 100644
index 0000000..e887d1d
--- /dev/null
+++ b/metropolis/node/kubernetes/plugins/kvmdevice/kvmdevice.go
@@ -0,0 +1,215 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package kvmdevice implements a Kubernetes device plugin for the virtual KVM device. Using the device plugin API
+// allows us to take advantage of the scheduler to locate pods on machines eligible for KVM and also allows granular
+// access control to KVM using quotas instead of needing privileged access.
+// Since KVM devices are virtual, this plugin emulates a huge number of them so that we never run out.
+package kvmdevice
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"io/ioutil"
+	"net"
+	"os"
+	"strconv"
+	"strings"
+
+	"golang.org/x/sys/unix"
+	"google.golang.org/grpc"
+	deviceplugin "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
+	"k8s.io/kubelet/pkg/apis/pluginregistration/v1"
+
+	"source.monogon.dev/metropolis/node/core/localstorage"
+	"source.monogon.dev/metropolis/pkg/logtree"
+	"source.monogon.dev/metropolis/pkg/supervisor"
+)
+
+const (
+	// kvmDevicePath is the path of the KVM device node, on the host as well as inside of containers.
+	kvmDevicePath = "/dev/kvm"
+
+	// virtualDeviceCount is the number of virtual KVM devices announced by ListAndWatch.
+	// TODO(T963): Get this value from Kubelet configuration (or something higher-level?)
+	virtualDeviceCount = 256
+)
+
+// Plugin is a Kubernetes device plugin exposing the host's KVM device. It serves both the plugin registration API
+// and the device plugin API. KubeletDirectory has to be set before Run is called.
+type Plugin struct {
+	*deviceplugin.UnimplementedDevicePluginServer
+	// KubeletDirectory is the Kubelet directory in which the plugin and registration sockets get created.
+	KubeletDirectory *localstorage.DataKubernetesKubeletDirectory
+
+	// logger is populated by Run.
+	logger logtree.LeveledLogger
+}
+
+// GetInfo implements the plugin registration API. It identifies this plugin as a device plugin and points to the
+// socket on which the device plugin API is served.
+func (k *Plugin) GetInfo(context.Context, *pluginregistration.InfoRequest) (*pluginregistration.PluginInfo, error) {
+	return &pluginregistration.PluginInfo{
+		Type:              pluginregistration.DevicePlugin,
+		Name:              "devices.monogon.dev/kvm",
+		Endpoint:          k.KubeletDirectory.Plugins.KVM.FullPath(),
+		SupportedVersions: []string{"v1beta1"},
+	}, nil
+}
+
+// NotifyRegistrationStatus implements the plugin registration API. A failed registration is only logged.
+func (k *Plugin) NotifyRegistrationStatus(ctx context.Context, req *pluginregistration.RegistrationStatus) (*pluginregistration.RegistrationStatusResponse, error) {
+	if !req.PluginRegistered {
+		k.logger.Errorf("KVM plugin failed to register: %v", req.Error)
+	}
+	return &pluginregistration.RegistrationStatusResponse{}, nil
+}
+
+// GetDevicePluginOptions implements the device plugin API. Neither of the optional calls is requested.
+func (k *Plugin) GetDevicePluginOptions(context.Context, *deviceplugin.Empty) (*deviceplugin.DevicePluginOptions, error) {
+	return &deviceplugin.DevicePluginOptions{
+		GetPreferredAllocationAvailable: false,
+		PreStartRequired:                false,
+	}, nil
+}
+
+// ListAndWatch implements the device plugin API. It sends a single, static list of healthy virtual KVM devices and
+// then blocks until the stream's context is done, as the list never changes afterwards.
+func (k *Plugin) ListAndWatch(req *deviceplugin.Empty, s deviceplugin.DevicePlugin_ListAndWatchServer) error {
+	devs := make([]*deviceplugin.Device, 0, virtualDeviceCount)
+	for i := 0; i < virtualDeviceCount; i++ {
+		devs = append(devs, &deviceplugin.Device{
+			ID:     fmt.Sprintf("kvm%v", i),
+			Health: deviceplugin.Healthy,
+		})
+	}
+
+	// Fail the call if the list could not be delivered instead of idling on a broken stream.
+	if err := s.Send(&deviceplugin.ListAndWatchResponse{Devices: devs}); err != nil {
+		return err
+	}
+
+	<-s.Context().Done()
+	return nil
+}
+
+// Allocate implements the device plugin API. For every requested virtual device a read-write mapping of the KVM
+// device node is returned; all virtual devices of a container map to the same node.
+func (k *Plugin) Allocate(ctx context.Context, req *deviceplugin.AllocateRequest) (*deviceplugin.AllocateResponse, error) {
+	var response deviceplugin.AllocateResponse
+
+	for _, containerReq := range req.ContainerRequests {
+		var devices []*deviceplugin.DeviceSpec
+		for range containerReq.DevicesIDs {
+			devices = append(devices, &deviceplugin.DeviceSpec{
+				HostPath:      kvmDevicePath,
+				ContainerPath: kvmDevicePath,
+				Permissions:   "rw",
+			})
+		}
+		response.ContainerResponses = append(response.ContainerResponses, &deviceplugin.ContainerAllocateResponse{
+			Devices: devices})
+	}
+
+	return &response, nil
+}
+
+// deviceNumberFromString gets a Linux device number from a string containing two decimal numbers representing the major
+// and minor device numbers separated by a colon. Whitespace around either number is ignored.
+func deviceNumberFromString(s string) (uint64, error) {
+	kvmDevParts := strings.Split(s, ":")
+	if len(kvmDevParts) != 2 {
+		return 0, fmt.Errorf("device file spec contains an invalid number of colons: `%v`", s)
+	}
+	major, err := strconv.ParseUint(strings.TrimSpace(kvmDevParts[0]), 10, 32)
+	if err != nil {
+		return 0, fmt.Errorf("failed to convert major number to an integer: %w", err)
+	}
+	minor, err := strconv.ParseUint(strings.TrimSpace(kvmDevParts[1]), 10, 32)
+	if err != nil {
+		return 0, fmt.Errorf("failed to convert minor number to an integer: %w", err)
+	}
+
+	return unix.Mkdev(uint32(major), uint32(minor)), nil
+}
+
+// Run is the supervisor runnable of the plugin. It makes sure that the KVM device node exists and then serves the
+// device plugin and registration APIs on their sockets inside of KubeletDirectory. If the CPU is vulnerable to L1TF
+// or KVM is not available, a warning is logged, nothing is served and the runnable finishes without an error.
+func (k *Plugin) Run(ctx context.Context) error {
+	k.logger = supervisor.Logger(ctx)
+
+	// A missing vulnerability file is tolerated: l1tfStatus is then empty and the check below does not trigger.
+	l1tfStatus, err := ioutil.ReadFile("/sys/devices/system/cpu/vulnerabilities/l1tf")
+	if err != nil && !os.IsNotExist(err) {
+		return fmt.Errorf("failed to query for CPU vulnerabilities: %w", err)
+	}
+
+	if bytes.Contains(l1tfStatus, []byte("vulnerable")) {
+		k.logger.Warning("CPU is vulnerable to L1TF, not exposing KVM.")
+		supervisor.Signal(ctx, supervisor.SignalHealthy)
+		supervisor.Signal(ctx, supervisor.SignalDone)
+		return nil
+	}
+
+	kvmDevRaw, err := ioutil.ReadFile("/sys/devices/virtual/misc/kvm/dev")
+	if err != nil {
+		k.logger.Warning("KVM is not available. Check firmware settings and CPU.")
+		supervisor.Signal(ctx, supervisor.SignalHealthy)
+		supervisor.Signal(ctx, supervisor.SignalDone)
+		return nil
+	}
+
+	kvmDevNode, err := deviceNumberFromString(string(kvmDevRaw))
+	if err != nil {
+		return fmt.Errorf("failed to parse KVM device number: %w", err)
+	}
+
+	// The character device type has to be given explicitly, as mknod(2) treats a mode without a file type as a
+	// request for a regular file. An already existing node is left untouched.
+	err = unix.Mknod(kvmDevicePath, unix.S_IFCHR|0660, int(kvmDevNode))
+	if err != nil && err != unix.EEXIST {
+		return fmt.Errorf("failed to create KVM device node: %w", err)
+	}
+
+	pluginListener, err := net.ListenUnix("unix", &net.UnixAddr{Name: k.KubeletDirectory.Plugins.KVM.FullPath(), Net: "unix"})
+	if err != nil {
+		return fmt.Errorf("failed to listen on device plugin socket: %w", err)
+	}
+	pluginListener.SetUnlinkOnClose(true)
+
+	pluginServer := grpc.NewServer()
+	deviceplugin.RegisterDevicePluginServer(pluginServer, k)
+	if err := supervisor.Run(ctx, "kvm-device", supervisor.GRPCServer(pluginServer, pluginListener, false)); err != nil {
+		return err
+	}
+
+	registrationListener, err := net.ListenUnix("unix", &net.UnixAddr{Name: k.KubeletDirectory.PluginsRegistry.KVMReg.FullPath(), Net: "unix"})
+	if err != nil {
+		return fmt.Errorf("failed to listen on registration socket: %w", err)
+	}
+	registrationListener.SetUnlinkOnClose(true)
+
+	registrationServer := grpc.NewServer()
+	pluginregistration.RegisterRegistrationServer(registrationServer, k)
+	if err := supervisor.Run(ctx, "registration", supervisor.GRPCServer(registrationServer, registrationListener, true)); err != nil {
+		return err
+	}
+	supervisor.Signal(ctx, supervisor.SignalHealthy)
+	supervisor.Signal(ctx, supervisor.SignalDone)
+	return nil
+}
diff --git a/metropolis/node/kubernetes/service.go b/metropolis/node/kubernetes/service.go
index 8d0c795..bd0d211 100644
--- a/metropolis/node/kubernetes/service.go
+++ b/metropolis/node/kubernetes/service.go
@@ -34,6 +34,7 @@
 	"source.monogon.dev/metropolis/node/kubernetes/clusternet"
 	"source.monogon.dev/metropolis/node/kubernetes/nfproxy"
 	"source.monogon.dev/metropolis/node/kubernetes/pki"
+	"source.monogon.dev/metropolis/node/kubernetes/plugins/kvmdevice"
 	"source.monogon.dev/metropolis/node/kubernetes/reconciler"
 	"source.monogon.dev/metropolis/pkg/supervisor"
 	apb "source.monogon.dev/metropolis/proto/api"
@@ -136,6 +137,10 @@
 		ClientSet:   clientSet,
 	}
 
+	kvmDevicePlugin := kvmdevice.Plugin{
+		KubeletDirectory: &s.c.Root.Data.Kubernetes.Kubelet,
+	}
+
 	for _, sub := range []struct {
 		name     string
 		runnable supervisor.Runnable
@@ -149,6 +154,7 @@
 		{"csi-provisioner", csiProvisioner.Run},
 		{"clusternet", clusternet.Run},
 		{"nfproxy", nfproxy.Run},
+		{"kvmdeviceplugin", kvmDevicePlugin.Run},
 	} {
 		err := supervisor.Run(ctx, sub.name, sub.runnable)
 		if err != nil {
diff --git a/third_party/linux/linux-metropolis.config b/third_party/linux/linux-metropolis.config
index a6faa09..f03e66b 100644
--- a/third_party/linux/linux-metropolis.config
+++ b/third_party/linux/linux-metropolis.config
@@ -86,11 +86,12 @@
 #
 # Timers subsystem
 #
+CONFIG_TICK_ONESHOT=y
 CONFIG_HZ_PERIODIC=y
 # CONFIG_NO_HZ_IDLE is not set
 # CONFIG_NO_HZ_FULL is not set
 # CONFIG_NO_HZ is not set
-# CONFIG_HIGH_RES_TIMERS is not set
+CONFIG_HIGH_RES_TIMERS=y
 # end of Timers subsystem
 
 CONFIG_PREEMPT_NONE=y
@@ -105,7 +106,7 @@
 # CONFIG_IRQ_TIME_ACCOUNTING is not set
 # CONFIG_BSD_PROCESS_ACCT is not set
 CONFIG_TASKSTATS=y
-# CONFIG_TASK_DELAY_ACCT is not set
+CONFIG_TASK_DELAY_ACCT=y
 # CONFIG_TASK_XACCT is not set
 CONFIG_PSI=y
 # CONFIG_PSI_DEFAULT_DISABLED is not set
@@ -425,6 +426,7 @@
 # CONFIG_HZ_300 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=250
+CONFIG_SCHED_HRTICK=y
 # CONFIG_KEXEC is not set
 # CONFIG_KEXEC_FILE is not set
 # CONFIG_CRASH_DUMP is not set
@@ -632,7 +634,24 @@
 # end of Firmware Drivers
 
 CONFIG_HAVE_KVM=y
+CONFIG_HAVE_KVM_IRQCHIP=y
+CONFIG_HAVE_KVM_IRQFD=y
+CONFIG_HAVE_KVM_IRQ_ROUTING=y
+CONFIG_HAVE_KVM_EVENTFD=y
+CONFIG_KVM_MMIO=y
+CONFIG_KVM_ASYNC_PF=y
+CONFIG_HAVE_KVM_MSI=y
+CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y
+CONFIG_KVM_VFIO=y
+CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y
+CONFIG_HAVE_KVM_IRQ_BYPASS=y
+CONFIG_HAVE_KVM_NO_POLL=y
+CONFIG_KVM_XFER_TO_GUEST_WORK=y
 CONFIG_VIRTUALIZATION=y
+CONFIG_KVM=y
+CONFIG_KVM_INTEL=y
+CONFIG_KVM_AMD=y
+CONFIG_KVM_AMD_SEV=y
 CONFIG_AS_AVX512=y
 CONFIG_AS_SHA1_NI=y
 CONFIG_AS_SHA256_NI=y
@@ -650,6 +669,7 @@
 # CONFIG_STATIC_CALL_SELFTEST is not set
 CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y
 CONFIG_ARCH_USE_BUILTIN_BSWAP=y
+CONFIG_USER_RETURN_NOTIFIER=y
 CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
@@ -791,6 +811,7 @@
 # CONFIG_IOSCHED_BFQ is not set
 # end of IO Schedulers
 
+CONFIG_PREEMPT_NOTIFIERS=y
 CONFIG_PADATA=y
 CONFIG_ASN1=y
 CONFIG_INLINE_SPIN_UNLOCK_IRQ=y
@@ -2634,6 +2655,7 @@
 # CONFIG_UIO_MF624 is not set
 # CONFIG_UIO_HV_GENERIC is not set
 # CONFIG_VFIO is not set
+CONFIG_IRQ_BYPASS_MANAGER=y
 # CONFIG_VIRT_DRIVERS is not set
 CONFIG_VIRTIO=y
 CONFIG_VIRTIO_MENU=y
@@ -3391,7 +3413,9 @@
 # CONFIG_CRYPTO_DEV_PADLOCK is not set
 # CONFIG_CRYPTO_DEV_ATMEL_ECC is not set
 # CONFIG_CRYPTO_DEV_ATMEL_SHA204A is not set
-# CONFIG_CRYPTO_DEV_CCP is not set
+CONFIG_CRYPTO_DEV_CCP=y
+CONFIG_CRYPTO_DEV_CCP_DD=y
+CONFIG_CRYPTO_DEV_SP_PSP=y
 # CONFIG_CRYPTO_DEV_QAT_DH895xCC is not set
 # CONFIG_CRYPTO_DEV_QAT_C3XXX is not set
 # CONFIG_CRYPTO_DEV_QAT_C62X is not set
@@ -3633,6 +3657,7 @@
 # Scheduler Debugging
 #
 CONFIG_SCHED_DEBUG=y
+CONFIG_SCHED_INFO=y
 # CONFIG_SCHEDSTATS is not set
 # end of Scheduler Debugging