Add KVM device plugin

This adds a KVM device plugin for Kubernetes. This plugin allows for unprivileged access and granular
control of KVM access.

Test Plan: Tested in subsequent revision

X-Origin-Diff: phab/D739
GitOrigin-RevId: 5cd738a47d24e7bfdc29bbd1a31537209e1ebf46
diff --git a/metropolis/node/kubernetes/plugins/kvmdevice/kvmdevice.go b/metropolis/node/kubernetes/plugins/kvmdevice/kvmdevice.go
new file mode 100644
index 0000000..e887d1d
--- /dev/null
+++ b/metropolis/node/kubernetes/plugins/kvmdevice/kvmdevice.go
@@ -0,0 +1,184 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package kvmdevice implements a Kubernetes device plugin for the virtual KVM device. Using the device plugin API
+// allows us to take advantage of the scheduler to locate pods on machines eligible for KVM and also allows granular
+// access control to KVM using quotas instead of needing privileged access.
+// Since KVM devices are virtual, this plugin emulates a huge number of them so that we never run out.
+package kvmdevice
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"io/ioutil"
+	"net"
+	"os"
+	"strconv"
+	"strings"
+
+	"golang.org/x/sys/unix"
+	"google.golang.org/grpc"
+	deviceplugin "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
+	"k8s.io/kubelet/pkg/apis/pluginregistration/v1"
+
+	"source.monogon.dev/metropolis/node/core/localstorage"
+	"source.monogon.dev/metropolis/pkg/logtree"
+	"source.monogon.dev/metropolis/pkg/supervisor"
+)
+
+type Plugin struct {
+	*deviceplugin.UnimplementedDevicePluginServer
+	KubeletDirectory *localstorage.DataKubernetesKubeletDirectory
+
+	logger logtree.LeveledLogger
+}
+
+func (k *Plugin) GetInfo(context.Context, *pluginregistration.InfoRequest) (*pluginregistration.PluginInfo, error) {
+	return &pluginregistration.PluginInfo{
+		Type:              pluginregistration.DevicePlugin,
+		Name:              "devices.monogon.dev/kvm",
+		Endpoint:          k.KubeletDirectory.Plugins.KVM.FullPath(),
+		SupportedVersions: []string{"v1beta1"},
+	}, nil
+}
+
+func (k *Plugin) NotifyRegistrationStatus(ctx context.Context, req *pluginregistration.RegistrationStatus) (*pluginregistration.RegistrationStatusResponse, error) {
+	if !req.PluginRegistered {
+		k.logger.Errorf("KVM plugin failed to register: %v", req.Error)
+	}
+	return &pluginregistration.RegistrationStatusResponse{}, nil
+}
+
+func (k *Plugin) GetDevicePluginOptions(context.Context, *deviceplugin.Empty) (*deviceplugin.DevicePluginOptions, error) {
+	return &deviceplugin.DevicePluginOptions{
+		GetPreferredAllocationAvailable: false,
+		PreStartRequired:                false,
+	}, nil
+}
+
+func (k *Plugin) ListAndWatch(req *deviceplugin.Empty, s deviceplugin.DevicePlugin_ListAndWatchServer) error {
+	var devs []*deviceplugin.Device
+
+	// TODO(T963): Get this value from Kubelet configuration (or something higher-level?)
+	for i := 0; i < 256; i++ {
+		devs = append(devs, &deviceplugin.Device{
+			ID:     fmt.Sprintf("kvm%v", i),
+			Health: deviceplugin.Healthy,
+		})
+	}
+
+	s.Send(&deviceplugin.ListAndWatchResponse{Devices: devs})
+
+	<-s.Context().Done()
+	return nil
+}
+
+func (k *Plugin) Allocate(ctx context.Context, req *deviceplugin.AllocateRequest) (*deviceplugin.AllocateResponse, error) {
+	var response deviceplugin.AllocateResponse
+
+	for _, req := range req.ContainerRequests {
+		var devices []*deviceplugin.DeviceSpec
+		for range req.DevicesIDs {
+			dev := new(deviceplugin.DeviceSpec)
+			dev.HostPath = "/dev/kvm"
+			dev.ContainerPath = "/dev/kvm"
+			dev.Permissions = "rw"
+			devices = append(devices, dev)
+		}
+		response.ContainerResponses = append(response.ContainerResponses, &deviceplugin.ContainerAllocateResponse{
+			Devices: devices})
+	}
+
+	return &response, nil
+}
+
+// deviceNumberFromString gets a Linux device number from a string containing two decimal numbers representing the major
+// and minor device numbers separated by a colon. Whitespace is ignored.
+func deviceNumberFromString(s string) (uint64, error) {
+	kvmDevParts := strings.Split(s, ":")
+	if len(kvmDevParts) != 2 {
+		return 0, fmt.Errorf("device file spec contains an invalid number of colons: `%v`", s)
+	}
+	major, err := strconv.ParseUint(strings.TrimSpace(kvmDevParts[0]), 10, 32)
+	if err != nil {
+		return 0, fmt.Errorf("failed to convert major number to an integer: %w", err)
+	}
+	minor, err := strconv.ParseUint(strings.TrimSpace(kvmDevParts[1]), 10, 32)
+	if err != nil {
+		return 0, fmt.Errorf("failed to convert minor number to an integer: %w", err)
+	}
+
+	return unix.Mkdev(uint32(major), uint32(minor)), nil
+}
+
+func (k *Plugin) Run(ctx context.Context) error {
+	k.logger = supervisor.Logger(ctx)
+
+	l1tfStatus, err := ioutil.ReadFile("/sys/devices/system/cpu/vulnerabilities/l1tf")
+	if err != nil && !os.IsNotExist(err) {
+		return fmt.Errorf("failed to query for CPU vulnerabilities: %v", err)
+	}
+
+	if bytes.Contains(l1tfStatus, []byte("vulnerable")) {
+		k.logger.Warning("CPU is vulnerable to L1TF, not exposing KVM.")
+		supervisor.Signal(ctx, supervisor.SignalHealthy)
+		supervisor.Signal(ctx, supervisor.SignalDone)
+		return nil
+	}
+
+	kvmDevRaw, err := ioutil.ReadFile("/sys/devices/virtual/misc/kvm/dev")
+	if err != nil {
+		k.logger.Warning("KVM is not available. Check firmware settings and CPU.")
+		supervisor.Signal(ctx, supervisor.SignalHealthy)
+		supervisor.Signal(ctx, supervisor.SignalDone)
+		return nil
+	}
+
+	kvmDevNode, err := deviceNumberFromString(string(kvmDevRaw))
+
+	err = unix.Mknod("/dev/kvm", 0660, int(kvmDevNode))
+	if err != nil && err != unix.EEXIST {
+		return fmt.Errorf("failed to create KVM device node: %v", err)
+	}
+
+	pluginListener, err := net.ListenUnix("unix", &net.UnixAddr{Name: k.KubeletDirectory.Plugins.KVM.FullPath(), Net: "unix"})
+	if err != nil {
+		return fmt.Errorf("failed to listen on device plugin socket: %w", err)
+	}
+	pluginListener.SetUnlinkOnClose(true)
+
+	pluginServer := grpc.NewServer()
+	deviceplugin.RegisterDevicePluginServer(pluginServer, k)
+	if err := supervisor.Run(ctx, "kvm-device", supervisor.GRPCServer(pluginServer, pluginListener, false)); err != nil {
+		return err
+	}
+
+	registrationListener, err := net.ListenUnix("unix", &net.UnixAddr{Name: k.KubeletDirectory.PluginsRegistry.KVMReg.FullPath(), Net: "unix"})
+	if err != nil {
+		return fmt.Errorf("failed to listen on registration socket: %w", err)
+	}
+	registrationListener.SetUnlinkOnClose(true)
+
+	registrationServer := grpc.NewServer()
+	pluginregistration.RegisterRegistrationServer(registrationServer, k)
+	if err := supervisor.Run(ctx, "registration", supervisor.GRPCServer(registrationServer, registrationListener, true)); err != nil {
+		return err
+	}
+	supervisor.Signal(ctx, supervisor.SignalHealthy)
+	supervisor.Signal(ctx, supervisor.SignalDone)
+	return nil
+}