c/takeover: init

Add takeover, a tool which is used to take over machines running pretty
much any Linux distribution, as long as it has kexec support and one can
run the takeover binary on it.

After takeover it launches the cloud agent which can then perform
further work on the machine.

Change-Id: If26015f626f439c44be473221c98b5e9a8fa9adc
Reviewed-on: https://review.monogon.dev/c/monogon/+/1143
Reviewed-by: Serge Bazanski <serge@monogon.tech>
Tested-by: Leopold Schabel <leo@monogon.tech>
diff --git a/cloud/takeover/BUILD.bazel b/cloud/takeover/BUILD.bazel
new file mode 100644
index 0000000..1eb34e2
--- /dev/null
+++ b/cloud/takeover/BUILD.bazel
@@ -0,0 +1,38 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library")
+load("//metropolis/node/build/mkucode:def.bzl", "cpio_ucode")
+
+go_library(
+    name = "takeover_lib",
+    srcs = ["takeover.go"],
+    embedsrcs = [
+        "//third_party/linux",  #keep
+        ":ucode",  #keep
+        "//cloud/agent:initramfs",  #keep
+    ],
+    importpath = "source.monogon.dev/cloud/takeover",
+    visibility = ["//visibility:private"],
+    deps = [
+        "//cloud/agent/api",
+        "//metropolis/pkg/bootparam",
+        "//metropolis/pkg/kexec",
+        "//net/dump",
+        "@com_github_cavaliergopher_cpio//:cpio",
+        "@com_github_pierrec_lz4_v4//:lz4",
+        "@org_golang_google_protobuf//proto",
+        "@org_golang_x_sys//unix",
+    ],
+)
+
+go_binary(
+    name = "takeover",
+    embed = [":takeover_lib"],
+    visibility = ["//visibility:public"],
+)
+
+cpio_ucode(
+    name = "ucode",
+    ucode = {
+        "@linux-firmware//:amd_ucode": "AuthenticAMD",
+        "@intel_ucode//:fam6h": "GenuineIntel",
+    },
+)
diff --git a/cloud/takeover/takeover.go b/cloud/takeover/takeover.go
new file mode 100644
index 0000000..e340c56
--- /dev/null
+++ b/cloud/takeover/takeover.go
@@ -0,0 +1,226 @@
+// takeover is a self-contained executable which when executed loads the BMaaS
+// agent via kexec. It is intended to be called over SSH, given a binary
+// TakeoverInit message over standard input and (if all preparation work
+// completed successfully) will respond with a TakeoverResponse on standard
+// output. At that point the new kernel and agent initramfs are fully staged
+// by the current kernel.
+// The second stage which is also part of this binary, selected by an
+// environment variable, is then executed in detached mode and the main
+// takeover binary called over SSH terminates.
+// The second stage waits for 5 seconds for the main binary to exit, the SSH
+// session to be torn down and various other things before issuing the final
+// non-returning syscall which jumps into the new kernel.
+
+package main
+
+import (
+	"bytes"
+	"crypto/ed25519"
+	"crypto/rand"
+	_ "embed"
+	"errors"
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"os/exec"
+	"time"
+
+	"github.com/cavaliergopher/cpio"
+	"github.com/pierrec/lz4/v4"
+	"golang.org/x/sys/unix"
+	"google.golang.org/protobuf/proto"
+
+	"source.monogon.dev/cloud/agent/api"
+	"source.monogon.dev/metropolis/pkg/bootparam"
+	"source.monogon.dev/metropolis/pkg/kexec"
+	netdump "source.monogon.dev/net/dump"
+)
+
+//go:embed third_party/linux/bzImage
+var kernel []byte
+
+//go:embed ucode.cpio
+var ucode []byte
+
+//go:embed cloud/agent/initramfs.cpio.lz4
+var initramfs []byte
+
+// newMemfile creates a new file which is not located on a specific filesystem,
+// but is instead backed by anonymous memory.
+func newMemfile(name string, flags int) (*os.File, error) {
+	fd, err := unix.MemfdCreate(name, flags)
+	if err != nil {
+		return nil, fmt.Errorf("memfd_create failed: %w", err)
+	}
+	return os.NewFile(uintptr(fd), name), nil
+}
+
+func setupTakeover() (*api.TakeoverSuccess, error) {
+	// Read init specification from stdin.
+	initRaw, err := io.ReadAll(os.Stdin)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read TakeoverInit message from stdin: %w", err)
+	}
+	var takeoverInit api.TakeoverInit
+	if err := proto.Unmarshal(initRaw, &takeoverInit); err != nil {
+		return nil, fmt.Errorf("failed to parse TakeoverInit messag from stdin: %w", err)
+	}
+
+	// Sanity check for empty TakeoverInit messages
+	if takeoverInit.BmaasEndpoint == "" {
+		return nil, errors.New("BMaaS endpoint is empty, check that a proper TakeoverInit message has been provided")
+	}
+
+	// Load data from embedded files into memfiles as the kexec load syscall
+	// requires file descriptors.
+	kernelFile, err := newMemfile("kernel", 0)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create kernel memfile: %w", err)
+	}
+	initramfsFile, err := newMemfile("initramfs", 0)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create initramfs memfile: %w", err)
+	}
+	if _, err := kernelFile.ReadFrom(bytes.NewReader(kernel)); err != nil {
+		return nil, fmt.Errorf("failed to read kernel into memory-backed file: %w", err)
+	}
+	if _, err := initramfsFile.ReadFrom(bytes.NewReader(ucode)); err != nil {
+		return nil, fmt.Errorf("failed to read ucode into memory-backed file: %w", err)
+	}
+	if _, err := initramfsFile.ReadFrom(bytes.NewReader(initramfs)); err != nil {
+		return nil, fmt.Errorf("failed to read initramfs into memory-backed file: %w", err)
+	}
+
+	// Dump the current network configuration
+	netconf, warnings, err := netdump.Dump()
+	if err != nil {
+		return nil, fmt.Errorf("failed to dump network configuration: %w", err)
+	}
+
+	// Generate agent private key
+	pubKey, privKey, err := ed25519.GenerateKey(rand.Reader)
+	if err != nil {
+		return nil, fmt.Errorf("unable to generate Ed25519 key: %w", err)
+	}
+
+	agentInit := api.AgentInit{
+		TakeoverInit:  &takeoverInit,
+		PrivateKey:    privKey,
+		NetworkConfig: netconf,
+	}
+	agentInitRaw, err := proto.Marshal(&agentInit)
+	if err != nil {
+		return nil, fmt.Errorf("unable to marshal AgentInit message: %v", err)
+	}
+
+	// Append AgentInit spec to initramfs
+	compressedOut := lz4.NewWriter(initramfsFile)
+	compressedOut.Apply(lz4.LegacyOption(true))
+	cpioW := cpio.NewWriter(compressedOut)
+	cpioW.WriteHeader(&cpio.Header{
+		Name: "/init.pb",
+		Size: int64(len(agentInitRaw)),
+		Mode: cpio.TypeReg | 0o644,
+	})
+	cpioW.Write(agentInitRaw)
+	cpioW.Close()
+	compressedOut.Close()
+
+	agentParams := bootparam.Params{
+		bootparam.Param{Param: "quiet"},
+		bootparam.Param{Param: "init", Value: "/init"},
+		// Always add "default" console on x86
+		bootparam.Param{Param: "console", Value: "ttyS0,115200"},
+	}
+
+	cmdline, err := os.ReadFile("/proc/cmdline")
+	if err != nil {
+		warnings = append(warnings, fmt.Errorf("unable to read current kernel command line: %w", err))
+	} else {
+		params, _, err := bootparam.Unmarshal(string(cmdline))
+		// If the existing command line is well-formed, add all existing console
+		// parameters to the console for the agent
+		if err == nil {
+			for _, p := range params {
+				if p.Param == "console" {
+					agentParams = append(agentParams, p)
+				}
+			}
+		}
+	}
+	agentCmdline, err := bootparam.Marshal(agentParams, "")
+	// Stage agent payload into kernel memory
+	if err := kexec.FileLoad(kernelFile, initramfsFile, agentCmdline); err != nil {
+		return nil, fmt.Errorf("failed to load kexec payload: %w", err)
+	}
+	var warningsStrs []string
+	for _, w := range warnings {
+		warningsStrs = append(warningsStrs, w.Error())
+	}
+	return &api.TakeoverSuccess{
+		InitMessage: &takeoverInit,
+		Key:         pubKey,
+		Warning:     warningsStrs,
+	}, nil
+}
+
+// Environment variable which tells the takeover binary to run the second stage
+const detachedLaunchEnv = "TAKEOVER_DETACHED_LAUNCH"
+
+func main() {
+	// Check if the second stage should be executed
+	if os.Getenv(detachedLaunchEnv) == "1" {
+		// Wait 5 seconds for data to be sent, connections to be closed and
+		// syncs to be executed
+		time.Sleep(5 * time.Second)
+		// Perform kexec, this will not return unless it fails
+		err := unix.Reboot(unix.LINUX_REBOOT_CMD_KEXEC)
+		var msg string = "takeover: reboot succeeded, but we're still runing??"
+		if err != nil {
+			msg = err.Error()
+		}
+		// We have no standard output/error anymore, if this fails it's
+		// just borked. Attempt to dump the error into kmsg for manual
+		// debugging.
+		kmsg, err := os.OpenFile("/dev/kmsg", os.O_WRONLY, 0)
+		if err != nil {
+			os.Exit(2)
+		}
+		kmsg.WriteString(msg)
+		kmsg.Close()
+		os.Exit(1)
+	}
+
+	var takeoverResp api.TakeoverResponse
+	res, err := setupTakeover()
+	if err != nil {
+		takeoverResp.Result = &api.TakeoverResponse_Error{Error: &api.TakeoverError{
+			Message: err.Error(),
+		}}
+	} else {
+		takeoverResp.Result = &api.TakeoverResponse_Success{Success: res}
+	}
+	// Respond to stdout
+	takeoverRespRaw, err := proto.Marshal(&takeoverResp)
+	if err != nil {
+		log.Fatalf("failed to marshal response: %v", err)
+	}
+	if _, err := os.Stdout.Write(takeoverRespRaw); err != nil {
+		log.Fatalf("failed to write response to stdout: %v", err)
+	}
+	// Close stdout, we're done responding
+	os.Stdout.Close()
+
+	// Start second stage which waits for 5 seconds while performing
+	// final cleanup.
+	detachedCmd := exec.Command("/proc/self/exe")
+	detachedCmd.Env = []string{detachedLaunchEnv + "=1"}
+	if err := detachedCmd.Start(); err != nil {
+		log.Fatalf("failed to launch final stage: %v", err)
+	}
+	// Release the second stage so that the first stage can cleanly terminate.
+	if err := detachedCmd.Process.Release(); err != nil {
+		log.Fatalf("error releasing final stage process: %v", err)
+	}
+}