m/n/c/update: implement kexec-based activation

As we've had some issues with EFI-based slot activation and enterprise
server firmware is extremely slow, this implements kexec-based
activation. This just kexecs into the freshly-installed slot instead of
rebooting. It still updates the BootOrder on successful boot to allow
cold-boots if the server crashes or loses power, but no longer uses the
BootNext mechanism to boot into the new slot once (this is taken care of
by kexec).

Change-Id: I6092c47d988634ba39fb6bdd7fd7ccd41ceb02ef
Reviewed-on: https://review.monogon.dev/c/monogon/+/2021
Reviewed-by: Serge Bazanski <serge@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/node/core/mgmt/mgmt.go b/metropolis/node/core/mgmt/mgmt.go
index a9f5973..0c7cb54 100644
--- a/metropolis/node/core/mgmt/mgmt.go
+++ b/metropolis/node/core/mgmt/mgmt.go
@@ -6,6 +6,7 @@
 	"context"
 	"fmt"
 	"net"
+	"sync"
 
 	"google.golang.org/grpc"
 
@@ -27,6 +28,8 @@
 	LogTree *logtree.LogTree
 	// Update service handle for performing updates via the API.
 	UpdateService *update.Service
+	// Serializes UpdateNode RPCs so that only one runs at a time.
+	updateMutex sync.Mutex
 
 	// Automatically populated on Run.
 	LogService
diff --git a/metropolis/node/core/mgmt/update.go b/metropolis/node/core/mgmt/update.go
index 28a2a0a..ce8b26b 100644
--- a/metropolis/node/core/mgmt/update.go
+++ b/metropolis/node/core/mgmt/update.go
@@ -12,16 +12,30 @@
 )
 
 func (s *Service) UpdateNode(ctx context.Context, req *apb.UpdateNodeRequest) (*apb.UpdateNodeResponse, error) {
-	if err := s.UpdateService.InstallBundle(ctx, req.BundleUrl); err != nil {
+	ok := s.updateMutex.TryLock()
+	if ok {
+		defer s.updateMutex.Unlock()
+	} else {
+		return nil, status.Error(codes.Aborted, "another UpdateNode RPC is in progress on this node")
+	}
+	if req.ActivationMode == apb.ActivationMode_ACTIVATION_INVALID {
+		return nil, status.Errorf(codes.InvalidArgument, "activation_mode needs to be explicitly specified")
+	}
+	if err := s.UpdateService.InstallBundle(ctx, req.BundleUrl, req.ActivationMode == apb.ActivationMode_ACTIVATION_KEXEC); err != nil {
 		return nil, status.Errorf(codes.Unavailable, "error installing update: %v", err)
 	}
-	if !req.NoReboot {
+	if req.ActivationMode != apb.ActivationMode_ACTIVATION_NONE {
 		// TODO(#253): Tell Supervisor to shut down gracefully and reboot
 		go func() {
 			time.Sleep(10 * time.Second)
 			unix.Sync()
-			unix.Reboot(unix.LINUX_REBOOT_CMD_RESTART)
+			if req.ActivationMode == apb.ActivationMode_ACTIVATION_KEXEC {
+				unix.Reboot(unix.LINUX_REBOOT_CMD_KEXEC)
+			} else {
+				unix.Reboot(unix.LINUX_REBOOT_CMD_RESTART)
+			}
 		}()
 	}
+
 	return &apb.UpdateNodeResponse{}, nil
 }
diff --git a/metropolis/node/core/update/BUILD.bazel b/metropolis/node/core/update/BUILD.bazel
index 143231b..e3cdcd1 100644
--- a/metropolis/node/core/update/BUILD.bazel
+++ b/metropolis/node/core/update/BUILD.bazel
@@ -10,9 +10,11 @@
         "//metropolis/pkg/blockdev",
         "//metropolis/pkg/efivarfs",
         "//metropolis/pkg/gpt",
+        "//metropolis/pkg/kexec",
         "//metropolis/pkg/logtree",
         "@com_github_cenkalti_backoff_v4//:backoff",
         "@org_golang_google_grpc//codes",
         "@org_golang_google_grpc//status",
+        "@org_golang_x_sys//unix",
     ],
 )
diff --git a/metropolis/node/core/update/e2e/e2e_test.go b/metropolis/node/core/update/e2e/e2e_test.go
index 7524d2a..491259a 100644
--- a/metropolis/node/core/update/e2e/e2e_test.go
+++ b/metropolis/node/core/update/e2e/e2e_test.go
@@ -26,29 +26,20 @@
 
 var variantRegexp = regexp.MustCompile(`TESTOS_VARIANT=([A-Z])`)
 
-func runAndCheckVariant(t *testing.T, expectedVariant string, qemuArgs []string) {
-	t.Helper()
-	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
-	defer cancel()
-	qemuCmdLaunch := exec.CommandContext(ctx, "qemu-system-x86_64", qemuArgs...)
-	stdoutPipe, err := qemuCmdLaunch.StdoutPipe()
+func stdoutHandler(t *testing.T, cmd *exec.Cmd, cancel context.CancelFunc, testosStarted chan string) {
+	stdoutPipe, err := cmd.StdoutPipe()
 	if err != nil {
 		t.Fatal(err)
 	}
-	stderrPipe, err := qemuCmdLaunch.StderrPipe()
-	if err != nil {
-		t.Fatal(err)
-	}
-	testosStarted := make(chan string, 1)
+	s := bufio.NewScanner(stdoutPipe)
 	go func() {
-		s := bufio.NewScanner(stdoutPipe)
 		for s.Scan() {
 			if strings.HasPrefix(s.Text(), "[") {
 				continue
 			}
 			errIdx := strings.Index(s.Text(), "Error installing new bundle")
 			if errIdx != -1 {
-				t.Error(s.Text()[errIdx:])
+				cancel()
 			}
 			t.Log("vm: " + s.Text())
 			if m := variantRegexp.FindStringSubmatch(s.Text()); len(m) == 2 {
@@ -59,8 +50,15 @@
 			}
 		}
 	}()
+}
+
+func stderrHandler(t *testing.T, cmd *exec.Cmd) {
+	stderrPipe, err := cmd.StderrPipe()
+	if err != nil {
+		t.Fatal(err)
+	}
+	s := bufio.NewScanner(stderrPipe)
 	go func() {
-		s := bufio.NewScanner(stderrPipe)
 		for s.Scan() {
 			if strings.HasPrefix(s.Text(), "[") {
 				continue
@@ -68,6 +66,16 @@
 			t.Log("qemu: " + s.Text())
 		}
 	}()
+}
+
+func runAndCheckVariant(t *testing.T, expectedVariant string, qemuArgs []string) {
+	t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
+	defer cancel()
+	qemuCmdLaunch := exec.CommandContext(ctx, "qemu-system-x86_64", qemuArgs...)
+	testosStarted := make(chan string, 1)
+	stdoutHandler(t, qemuCmdLaunch, cancel, testosStarted)
+	stderrHandler(t, qemuCmdLaunch)
 	if err := qemuCmdLaunch.Start(); err != nil {
 		t.Fatal(err)
 	}
@@ -85,7 +93,7 @@
 		case <-procExit:
 			return
 		case <-ctx.Done():
-			t.Log("Canceled VM")
+			t.Error("Timed out waiting for VM to exit")
 			cancel()
 			<-procExit
 			return
@@ -98,31 +106,60 @@
 	}
 }
 
-func TestABUpdateSequence(t *testing.T) {
+type bundleServing struct {
+	t              *testing.T
+	bundlePaths    map[string]string
+	bundleFilePath string
+	// Protects bundleFilePath above
+	m sync.Mutex
+}
+
+func (b *bundleServing) setNextBundle(variant string) {
+	b.m.Lock()
+	defer b.m.Unlock()
+	p, ok := b.bundlePaths[variant]
+	if !ok {
+		b.t.Fatalf("no bundle for variant %s available", variant)
+		return
+	}
+	b.bundleFilePath = p
+}
+
+// setup sets up an HTTP server for serving bundles which can be controlled
+// through the returned bundleServing struct as well as the initial boot disk
+// and EFI variable storage. It also returns the required QEMU arguments to
+// boot the initial TestOS.
+func setup(t *testing.T) (*bundleServing, []string) {
+	t.Helper()
 	blobAddr := net.TCPAddr{
 		IP:   net.IPv4(10, 42, 0, 5),
 		Port: 80,
 	}
 
-	var nextBundlePathToInstall string
-	var nbpMutex sync.Mutex
+	b := bundleServing{
+		t:           t,
+		bundlePaths: make(map[string]string),
+	}
 
 	m := http.NewServeMux()
 	bundleYPath, err := datafile.ResolveRunfile("metropolis/node/core/update/e2e/testos/testos_bundle_y.zip")
 	if err != nil {
 		t.Fatal(err)
 	}
+	b.bundlePaths["Y"] = bundleYPath
 	bundleZPath, err := datafile.ResolveRunfile("metropolis/node/core/update/e2e/testos/testos_bundle_z.zip")
 	if err != nil {
 		t.Fatal(err)
 	}
+	b.bundlePaths["Z"] = bundleZPath
 	m.HandleFunc("/bundle.bin", func(w http.ResponseWriter, req *http.Request) {
-		nbpMutex.Lock()
-		bundleFilePath := nextBundlePathToInstall
-		nbpMutex.Unlock()
+		b.m.Lock()
+		bundleFilePath := b.bundleFilePath
+		b.m.Unlock()
 		if bundleFilePath == "" {
 			w.WriteHeader(http.StatusBadRequest)
 			w.Write([]byte("No next bundle set in the test harness"))
+			return
 		}
 		http.ServeFile(w, req, bundleFilePath)
 	})
@@ -130,6 +167,7 @@
 	if err != nil {
 		t.Fatal(err)
 	}
+	t.Cleanup(func() { blobLis.Close() })
 	blobListenAddr := blobLis.Addr().(*net.TCPAddr)
 	go http.Serve(blobLis, m)
 
@@ -139,7 +177,7 @@
 	if err != nil {
 		t.Fatal(err)
 	}
-	defer os.Remove(rootDevPath)
+	t.Cleanup(func() { os.Remove(rootDevPath) })
 	defer rootDisk.Close()
 
 	ovmfVarsPath, err := datafile.ResolveRunfile("external/edk2/OVMF_VARS.fd")
@@ -181,18 +219,20 @@
 	}); err != nil {
 		t.Fatalf("unable to generate starting point image: %v", err)
 	}
-	rootDisk.Close()
 
 	blobGuestFwd := fmt.Sprintf("guestfwd=tcp:%s-tcp:127.0.0.1:%d", blobAddr.String(), blobListenAddr.Port)
 
-	ovmfVars, err := os.CreateTemp("", "agent-ovmf-vars")
+	ovmfVars, err := os.CreateTemp("", "ab-ovmf-vars")
 	if err != nil {
 		t.Fatal(err)
 	}
+	defer ovmfVars.Close()
+	t.Cleanup(func() { os.Remove(ovmfVars.Name()) })
 	ovmfVarsTmpl, err := os.Open(ovmfVarsPath)
 	if err != nil {
 		t.Fatal(err)
 	}
+	defer ovmfVarsTmpl.Close()
 	if _, err := io.Copy(ovmfVars, ovmfVarsTmpl); err != nil {
 		t.Fatal(err)
 	}
@@ -207,25 +247,75 @@
 		"-device", "virtio-net-pci,netdev=net0,mac=22:d5:8e:76:1d:07",
 		"-device", "virtio-rng-pci",
 		"-serial", "stdio",
-		"-trace", "pflash*",
 		"-no-reboot",
 	}
-	// Install Bundle Y next
-	nbpMutex.Lock()
-	nextBundlePathToInstall = bundleYPath
-	nbpMutex.Unlock()
+	return &b, qemuArgs
+}
+
+func TestABUpdateSequenceReboot(t *testing.T) {
+	bsrv, qemuArgs := setup(t)
 
 	t.Log("Launching X image to install Y")
+	bsrv.setNextBundle("Y")
 	runAndCheckVariant(t, "X", qemuArgs)
 
-	// Install Bundle Z next
-	nbpMutex.Lock()
-	nextBundlePathToInstall = bundleZPath
-	nbpMutex.Unlock()
-
 	t.Log("Launching Y on slot B to install Z on slot A")
+	bsrv.setNextBundle("Z")
 	runAndCheckVariant(t, "Y", qemuArgs)
 
 	t.Log("Launching Z on slot A")
 	runAndCheckVariant(t, "Z", qemuArgs)
 }
+
+func TestABUpdateSequenceKexec(t *testing.T) {
+	bsrv, qemuArgs := setup(t)
+	qemuArgs = append(qemuArgs, "-fw_cfg", "name=use_kexec,string=1")
+
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+	qemuCmdLaunch := exec.CommandContext(ctx, "qemu-system-x86_64", qemuArgs...)
+	testosStarted := make(chan string, 1)
+	stdoutHandler(t, qemuCmdLaunch, cancel, testosStarted)
+	stderrHandler(t, qemuCmdLaunch)
+	if err := qemuCmdLaunch.Start(); err != nil {
+		t.Fatal(err)
+	}
+	procExit := make(chan error)
+	go func() {
+		procExit <- qemuCmdLaunch.Wait()
+		close(procExit)
+	}()
+	var expectedVariant = "X"
+	for {
+		select {
+		case variant := <-testosStarted:
+			if variant != expectedVariant {
+				t.Fatalf("expected variant %s to launch, got %s", expectedVariant, variant)
+			}
+			switch expectedVariant {
+			case "X":
+				expectedVariant = "Y"
+			case "Y":
+				expectedVariant = "Z"
+			case "Z":
+				// We're done, wait for everything to wind down and return
+				select {
+				case <-procExit:
+					return
+				case <-ctx.Done():
+					t.Error("Timed out waiting for VM to exit")
+					cancel()
+					<-procExit
+					return
+				}
+			}
+			bsrv.setNextBundle(expectedVariant)
+			t.Logf("Got %s, installing %s", variant, expectedVariant)
+		case err := <-procExit:
+			t.Fatalf("QEMU exited unexpectedly: %v", err)
+			return
+		case <-ctx.Done():
+			t.Fatalf("Waiting for TestOS variant %s launch timed out", expectedVariant)
+		}
+	}
+}
diff --git a/metropolis/node/core/update/e2e/testos/main.go b/metropolis/node/core/update/e2e/testos/main.go
index cae004c..cba1ade 100644
--- a/metropolis/node/core/update/e2e/testos/main.go
+++ b/metropolis/node/core/update/e2e/testos/main.go
@@ -123,8 +123,11 @@
 	if err := updateSvc.MarkBootSuccessful(); err != nil {
 		supervisor.Logger(ctx).Errorf("error marking boot successful: %w", err)
 	}
+	_, err = os.Stat("/sys/firmware/qemu_fw_cfg/by_name/use_kexec/raw")
+	useKexec := err == nil
+	supervisor.Logger(ctx).Infof("Kexec: %v", useKexec)
 	if Variant != "Z" {
-		if err := updateSvc.InstallBundle(ctx, "http://10.42.0.5:80/bundle.bin"); err != nil {
+		if err := updateSvc.InstallBundle(ctx, "http://10.42.0.5:80/bundle.bin", useKexec); err != nil {
 			supervisor.Logger(ctx).Errorf("Error installing new bundle: %v", err)
 		}
 	}
@@ -132,6 +135,10 @@
 	supervisor.Logger(ctx).Info("Installed bundle successfully, powering off")
 	unix.Sync()
 	time.Sleep(1 * time.Second)
-	unix.Reboot(unix.LINUX_REBOOT_CMD_POWER_OFF)
+	if useKexec && Variant != "Z" {
+		unix.Reboot(unix.LINUX_REBOOT_CMD_KEXEC)
+	} else {
+		unix.Reboot(unix.LINUX_REBOOT_CMD_POWER_OFF)
+	}
 	return nil
 }
diff --git a/metropolis/node/core/update/e2e/testos/testos.bzl b/metropolis/node/core/update/e2e/testos/testos.bzl
index a123ea6..29e218f 100644
--- a/metropolis/node/core/update/e2e/testos/testos.bzl
+++ b/metropolis/node/core/update/e2e/testos/testos.bzl
@@ -27,7 +27,7 @@
 
     efi_unified_kernel_image(
         name = "kernel_efi_" + variant,
-        cmdline = "console=ttyS0 init=/init",
+        cmdline = "console=ttyS0 quiet rootfstype=erofs init=/init loadpin.exclude=kexec-image,kexec-initramfs",
         kernel = "//third_party/linux",
         verity = ":verity_rootfs_" + variant,
         visibility = ["//metropolis/node/core/update/e2e:__pkg__"],
diff --git a/metropolis/node/core/update/update.go b/metropolis/node/core/update/update.go
index 107f9cc..84a5db9 100644
--- a/metropolis/node/core/update/update.go
+++ b/metropolis/node/core/update/update.go
@@ -4,6 +4,7 @@
 	"archive/zip"
 	"bytes"
 	"context"
+	"debug/pe"
 	"errors"
 	"fmt"
 	"io"
@@ -12,8 +13,10 @@
 	"path/filepath"
 	"regexp"
 	"strconv"
+	"strings"
 
 	"github.com/cenkalti/backoff/v4"
+	"golang.org/x/sys/unix"
 	"google.golang.org/grpc/codes"
 	"google.golang.org/grpc/status"
 
@@ -21,6 +24,7 @@
 	"source.monogon.dev/metropolis/pkg/blockdev"
 	"source.monogon.dev/metropolis/pkg/efivarfs"
 	"source.monogon.dev/metropolis/pkg/gpt"
+	"source.monogon.dev/metropolis/pkg/kexec"
 	"source.monogon.dev/metropolis/pkg/logtree"
 )
 
@@ -177,6 +181,7 @@
 			efivarfs.FilePath(slot.EFIBootPath()),
 		},
 	}
+	s.Logger.Infof("Recreated boot entry %s", newEntry.Description)
 	newIdx, err := efivarfs.AddBootEntry(newEntry)
 	if err == nil {
 		existing[newIdx] = newEntry
@@ -274,7 +279,7 @@
 // InstallBundle installs the bundle at the given HTTP(S) URL into the currently
 // inactive slot and sets that slot to boot next. If it doesn't return an error,
 // a reboot boots into the new slot.
-func (s *Service) InstallBundle(ctx context.Context, bundleURL string) error {
+func (s *Service) InstallBundle(ctx context.Context, bundleURL string, withKexec bool) error {
 	if s.ESPPath == "" {
 		return errors.New("no ESP information provided to update service, cannot continue")
 	}
@@ -353,8 +358,14 @@
 		return fmt.Errorf("failed setting boot entry %d active: %w", targetSlotBootEntryIdx, err)
 	}
 
-	if err := efivarfs.SetBootNext(uint16(targetSlotBootEntryIdx)); err != nil {
-		return fmt.Errorf("failed to set BootNext variable: %w", err)
+	if withKexec {
+		if err := s.stageKexec(bootFile, targetSlot); err != nil {
+			return fmt.Errorf("while kexec staging: %w", err)
+		}
+	} else {
+		if err := efivarfs.SetBootNext(uint16(targetSlotBootEntryIdx)); err != nil {
+			return fmt.Errorf("failed to set BootNext variable: %w", err)
+		}
 	}
 
 	return nil
@@ -388,3 +399,62 @@
 	}
 	return nil
 }
+
+// newMemfile creates a new file which is not located on a specific filesystem,
+// but is instead backed by anonymous memory.
+func newMemfile(name string, flags int) (*os.File, error) {
+	fd, err := unix.MemfdCreate(name, flags)
+	if err != nil {
+		return nil, fmt.Errorf("memfd_create: %w", err)
+	}
+	return os.NewFile(uintptr(fd), name), nil
+}
+
+// stageKexec stages the kernel, command line and initramfs if available for
+// a future kexec. It extracts the relevant data from the EFI boot executable.
+func (s *Service) stageKexec(bootFile io.ReaderAt, targetSlot Slot) error {
+	bootPE, err := pe.NewFile(bootFile)
+	if err != nil {
+		return fmt.Errorf("unable to open bootFile as PE: %w", err)
+	}
+	var cmdlineRaw []byte
+	cmdlineSection := bootPE.Section(".cmdline")
+	if cmdlineSection == nil {
+		return fmt.Errorf("no .cmdline section in boot PE")
+	}
+	cmdlineRaw, err = cmdlineSection.Data()
+	if err != nil {
+		return fmt.Errorf("while reading .cmdline PE section: %w", err)
+	}
+	cmdline := string(bytes.TrimRight(cmdlineRaw, "\x00"))
+	cmdline = strings.ReplaceAll(cmdline, "METROPOLIS-SYSTEM-X", fmt.Sprintf("METROPOLIS-SYSTEM-%s", targetSlot))
+	kernelFile, err := newMemfile("kernel", 0)
+	if err != nil {
+		return fmt.Errorf("failed to create kernel memfile: %w", err)
+	}
+	defer kernelFile.Close()
+	kernelSection := bootPE.Section(".linux")
+	if kernelSection == nil {
+		return fmt.Errorf("no .linux section in boot PE")
+	}
+	if _, err := io.Copy(kernelFile, kernelSection.Open()); err != nil {
+		return fmt.Errorf("while copying .linux PE section: %w", err)
+	}
+
+	initramfsSection := bootPE.Section(".initrd")
+	var initramfsFile *os.File
+	if initramfsSection != nil && initramfsSection.Size > 0 {
+		initramfsFile, err = newMemfile("initramfs", 0)
+		if err != nil {
+			return fmt.Errorf("failed to create initramfs memfile: %w", err)
+		}
+		defer initramfsFile.Close()
+		if _, err := io.Copy(initramfsFile, initramfsSection.Open()); err != nil {
+			return fmt.Errorf("while copying .initrd PE section: %w", err)
+		}
+	}
+	if err := kexec.FileLoad(kernelFile, initramfsFile, cmdline); err != nil {
+		return fmt.Errorf("while staging new kexec kernel: %w", err)
+	}
+	return nil
+}