m/n/c/update: implement kexec-based activation
EFI-based slot activation has caused us some issues, and enterprise
server firmware is extremely slow, so this implements kexec-based
activation: instead of rebooting, the node kexecs directly into the
freshly-installed slot. BootOrder is still updated on successful boot so
that cold boots work if the server crashes or loses power, but
kexec-based activation does not use the BootNext variable to boot into
the new slot once (kexec takes care of that).
Change-Id: I6092c47d988634ba39fb6bdd7fd7ccd41ceb02ef
Reviewed-on: https://review.monogon.dev/c/monogon/+/2021
Reviewed-by: Serge Bazanski <serge@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/node/core/mgmt/mgmt.go b/metropolis/node/core/mgmt/mgmt.go
index a9f5973..0c7cb54 100644
--- a/metropolis/node/core/mgmt/mgmt.go
+++ b/metropolis/node/core/mgmt/mgmt.go
@@ -6,6 +6,7 @@
"context"
"fmt"
"net"
+ "sync"
"google.golang.org/grpc"
@@ -27,6 +28,8 @@
LogTree *logtree.LogTree
// Update service handle for performing updates via the API.
UpdateService *update.Service
+ // Ensures only one UpdateNode RPC runs at a time; concurrent calls are rejected.
+ updateMutex sync.Mutex
// Automatically populated on Run.
LogService
diff --git a/metropolis/node/core/mgmt/update.go b/metropolis/node/core/mgmt/update.go
index 28a2a0a..ce8b26b 100644
--- a/metropolis/node/core/mgmt/update.go
+++ b/metropolis/node/core/mgmt/update.go
@@ -12,16 +12,30 @@
)
func (s *Service) UpdateNode(ctx context.Context, req *apb.UpdateNodeRequest) (*apb.UpdateNodeResponse, error) {
- if err := s.UpdateService.InstallBundle(ctx, req.BundleUrl); err != nil {
+ ok := s.updateMutex.TryLock()
+ if ok {
+ defer s.updateMutex.Unlock()
+ } else {
+ return nil, status.Error(codes.Aborted, "another UpdateNode RPC is in progress on this node")
+ }
+ if req.ActivationMode == apb.ActivationMode_ACTIVATION_INVALID {
+ return nil, status.Errorf(codes.InvalidArgument, "activation_mode needs to be explicitly specified")
+ }
+ if err := s.UpdateService.InstallBundle(ctx, req.BundleUrl, req.ActivationMode == apb.ActivationMode_ACTIVATION_KEXEC); err != nil {
return nil, status.Errorf(codes.Unavailable, "error installing update: %v", err)
}
- if !req.NoReboot {
+ if req.ActivationMode != apb.ActivationMode_ACTIVATION_NONE {
// TODO(#253): Tell Supervisor to shut down gracefully and reboot
go func() {
time.Sleep(10 * time.Second)
unix.Sync()
- unix.Reboot(unix.LINUX_REBOOT_CMD_RESTART)
+ if req.ActivationMode == apb.ActivationMode_ACTIVATION_KEXEC {
+ unix.Reboot(unix.LINUX_REBOOT_CMD_KEXEC)
+ } else {
+ unix.Reboot(unix.LINUX_REBOOT_CMD_RESTART)
+ }
}()
}
+
return &apb.UpdateNodeResponse{}, nil
}
diff --git a/metropolis/node/core/update/BUILD.bazel b/metropolis/node/core/update/BUILD.bazel
index 143231b..e3cdcd1 100644
--- a/metropolis/node/core/update/BUILD.bazel
+++ b/metropolis/node/core/update/BUILD.bazel
@@ -10,9 +10,11 @@
"//metropolis/pkg/blockdev",
"//metropolis/pkg/efivarfs",
"//metropolis/pkg/gpt",
+ "//metropolis/pkg/kexec",
"//metropolis/pkg/logtree",
"@com_github_cenkalti_backoff_v4//:backoff",
"@org_golang_google_grpc//codes",
"@org_golang_google_grpc//status",
+ "@org_golang_x_sys//unix",
],
)
diff --git a/metropolis/node/core/update/e2e/e2e_test.go b/metropolis/node/core/update/e2e/e2e_test.go
index 7524d2a..491259a 100644
--- a/metropolis/node/core/update/e2e/e2e_test.go
+++ b/metropolis/node/core/update/e2e/e2e_test.go
@@ -26,29 +26,20 @@
var variantRegexp = regexp.MustCompile(`TESTOS_VARIANT=([A-Z])`)
-func runAndCheckVariant(t *testing.T, expectedVariant string, qemuArgs []string) {
- t.Helper()
- ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
- defer cancel()
- qemuCmdLaunch := exec.CommandContext(ctx, "qemu-system-x86_64", qemuArgs...)
- stdoutPipe, err := qemuCmdLaunch.StdoutPipe()
+func stdoutHandler(t *testing.T, cmd *exec.Cmd, cancel context.CancelFunc, testosStarted chan string) {
+ stdoutPipe, err := cmd.StdoutPipe()
if err != nil {
t.Fatal(err)
}
- stderrPipe, err := qemuCmdLaunch.StderrPipe()
- if err != nil {
- t.Fatal(err)
- }
- testosStarted := make(chan string, 1)
+ s := bufio.NewScanner(stdoutPipe)
go func() {
- s := bufio.NewScanner(stdoutPipe)
for s.Scan() {
if strings.HasPrefix(s.Text(), "[") {
continue
}
errIdx := strings.Index(s.Text(), "Error installing new bundle")
if errIdx != -1 {
- t.Error(s.Text()[errIdx:])
+ cancel()
}
t.Log("vm: " + s.Text())
if m := variantRegexp.FindStringSubmatch(s.Text()); len(m) == 2 {
@@ -59,8 +50,15 @@
}
}
}()
+}
+
+func stderrHandler(t *testing.T, cmd *exec.Cmd) {
+ stderrPipe, err := cmd.StderrPipe()
+ if err != nil {
+ t.Fatal(err)
+ }
+ s := bufio.NewScanner(stderrPipe)
go func() {
- s := bufio.NewScanner(stderrPipe)
for s.Scan() {
if strings.HasPrefix(s.Text(), "[") {
continue
@@ -68,6 +66,16 @@
t.Log("qemu: " + s.Text())
}
}()
+}
+
+func runAndCheckVariant(t *testing.T, expectedVariant string, qemuArgs []string) {
+ t.Helper()
+ ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
+ defer cancel()
+ qemuCmdLaunch := exec.CommandContext(ctx, "qemu-system-x86_64", qemuArgs...)
+ testosStarted := make(chan string, 1)
+ stdoutHandler(t, qemuCmdLaunch, cancel, testosStarted)
+ stderrHandler(t, qemuCmdLaunch)
if err := qemuCmdLaunch.Start(); err != nil {
t.Fatal(err)
}
@@ -85,7 +93,7 @@
case <-procExit:
return
case <-ctx.Done():
- t.Log("Canceled VM")
+ t.Error("Timed out waiting for VM to exit")
cancel()
<-procExit
return
@@ -98,31 +106,60 @@
}
}
-func TestABUpdateSequence(t *testing.T) {
+type bundleServing struct {
+ t *testing.T
+ bundlePaths map[string]string
+ bundleFilePath string
+ // Protects bundleFilePath above
+ m sync.Mutex
+}
+
+func (b *bundleServing) setNextBundle(variant string) {
+ b.m.Lock()
+ defer b.m.Unlock()
+ p, ok := b.bundlePaths[variant]
+ if !ok {
+ b.t.Fatalf("no bundle for variant %s available", variant)
+ return
+ }
+ b.bundleFilePath = p
+}
+
+// setup sets up an HTTP server for serving bundles, which can be controlled
+// through the returned bundleServing struct, as well as the initial boot disk
+// and EFI variable storage. It also returns the QEMU arguments required to
+// boot the initial TestOS.
+func setup(t *testing.T) (*bundleServing, []string) {
+ t.Helper()
blobAddr := net.TCPAddr{
IP: net.IPv4(10, 42, 0, 5),
Port: 80,
}
- var nextBundlePathToInstall string
- var nbpMutex sync.Mutex
+ b := bundleServing{
+ t: t,
+ bundlePaths: make(map[string]string),
+ }
m := http.NewServeMux()
bundleYPath, err := datafile.ResolveRunfile("metropolis/node/core/update/e2e/testos/testos_bundle_y.zip")
if err != nil {
t.Fatal(err)
}
+ b.bundlePaths["Y"] = bundleYPath
bundleZPath, err := datafile.ResolveRunfile("metropolis/node/core/update/e2e/testos/testos_bundle_z.zip")
if err != nil {
t.Fatal(err)
}
+ b.bundlePaths["Z"] = bundleZPath
m.HandleFunc("/bundle.bin", func(w http.ResponseWriter, req *http.Request) {
- nbpMutex.Lock()
- bundleFilePath := nextBundlePathToInstall
- nbpMutex.Unlock()
+ b.m.Lock()
+ bundleFilePath := b.bundleFilePath
+ b.m.Unlock()
if bundleFilePath == "" {
w.WriteHeader(http.StatusBadRequest)
w.Write([]byte("No next bundle set in the test harness"))
+ return
}
http.ServeFile(w, req, bundleFilePath)
})
@@ -130,6 +167,7 @@
if err != nil {
t.Fatal(err)
}
+ t.Cleanup(func() { blobLis.Close() })
blobListenAddr := blobLis.Addr().(*net.TCPAddr)
go http.Serve(blobLis, m)
@@ -139,7 +177,7 @@
if err != nil {
t.Fatal(err)
}
- defer os.Remove(rootDevPath)
+ t.Cleanup(func() { os.Remove(rootDevPath) })
defer rootDisk.Close()
ovmfVarsPath, err := datafile.ResolveRunfile("external/edk2/OVMF_VARS.fd")
@@ -181,18 +219,20 @@
}); err != nil {
t.Fatalf("unable to generate starting point image: %v", err)
}
- rootDisk.Close()
blobGuestFwd := fmt.Sprintf("guestfwd=tcp:%s-tcp:127.0.0.1:%d", blobAddr.String(), blobListenAddr.Port)
- ovmfVars, err := os.CreateTemp("", "agent-ovmf-vars")
+ ovmfVars, err := os.CreateTemp("", "ab-ovmf-vars")
if err != nil {
t.Fatal(err)
}
+ defer ovmfVars.Close()
+ t.Cleanup(func() { os.Remove(ovmfVars.Name()) })
ovmfVarsTmpl, err := os.Open(ovmfVarsPath)
if err != nil {
t.Fatal(err)
}
+ defer ovmfVarsTmpl.Close()
if _, err := io.Copy(ovmfVars, ovmfVarsTmpl); err != nil {
t.Fatal(err)
}
@@ -207,25 +247,75 @@
"-device", "virtio-net-pci,netdev=net0,mac=22:d5:8e:76:1d:07",
"-device", "virtio-rng-pci",
"-serial", "stdio",
- "-trace", "pflash*",
"-no-reboot",
}
- // Install Bundle Y next
- nbpMutex.Lock()
- nextBundlePathToInstall = bundleYPath
- nbpMutex.Unlock()
+ return &b, qemuArgs
+}
+
+func TestABUpdateSequenceReboot(t *testing.T) {
+ bsrv, qemuArgs := setup(t)
t.Log("Launching X image to install Y")
+ bsrv.setNextBundle("Y")
runAndCheckVariant(t, "X", qemuArgs)
- // Install Bundle Z next
- nbpMutex.Lock()
- nextBundlePathToInstall = bundleZPath
- nbpMutex.Unlock()
-
t.Log("Launching Y on slot B to install Z on slot A")
+ bsrv.setNextBundle("Z")
runAndCheckVariant(t, "Y", qemuArgs)
t.Log("Launching Z on slot A")
runAndCheckVariant(t, "Z", qemuArgs)
}
+
+func TestABUpdateSequenceKexec(t *testing.T) {
+ bsrv, qemuArgs := setup(t)
+ qemuArgs = append(qemuArgs, "-fw_cfg", "name=use_kexec,string=1")
+
+ ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+ defer cancel()
+ qemuCmdLaunch := exec.CommandContext(ctx, "qemu-system-x86_64", qemuArgs...)
+ testosStarted := make(chan string, 1)
+ stdoutHandler(t, qemuCmdLaunch, cancel, testosStarted)
+ stderrHandler(t, qemuCmdLaunch)
+ if err := qemuCmdLaunch.Start(); err != nil {
+ t.Fatal(err)
+ }
+ procExit := make(chan error)
+ go func() {
+ procExit <- qemuCmdLaunch.Wait()
+ close(procExit)
+ }()
+ var expectedVariant = "X"
+ for {
+ select {
+ case variant := <-testosStarted:
+ if variant != expectedVariant {
+ t.Fatalf("expected variant %s to launch, got %s", expectedVariant, variant)
+ }
+ switch expectedVariant {
+ case "X":
+ expectedVariant = "Y"
+ case "Y":
+ expectedVariant = "Z"
+ case "Z":
+ // We're done, wait for everything to wind down and return
+ select {
+ case <-procExit:
+ return
+ case <-ctx.Done():
+ t.Error("Timed out waiting for VM to exit")
+ cancel()
+ <-procExit
+ return
+ }
+ }
+ bsrv.setNextBundle(expectedVariant)
+ t.Logf("Got %s, installing %s", variant, expectedVariant)
+ case err := <-procExit:
+ t.Fatalf("QEMU exited unexpectedly: %v", err)
+ return
+ case <-ctx.Done():
+ t.Fatalf("Waiting for TestOS variant %s launch timed out", expectedVariant)
+ }
+ }
+}
diff --git a/metropolis/node/core/update/e2e/testos/main.go b/metropolis/node/core/update/e2e/testos/main.go
index cae004c..cba1ade 100644
--- a/metropolis/node/core/update/e2e/testos/main.go
+++ b/metropolis/node/core/update/e2e/testos/main.go
@@ -123,8 +123,11 @@
if err := updateSvc.MarkBootSuccessful(); err != nil {
supervisor.Logger(ctx).Errorf("error marking boot successful: %w", err)
}
+ _, err = os.Stat("/sys/firmware/qemu_fw_cfg/by_name/use_kexec/raw")
+ useKexec := err == nil
+ supervisor.Logger(ctx).Infof("Kexec: %v", useKexec)
if Variant != "Z" {
- if err := updateSvc.InstallBundle(ctx, "http://10.42.0.5:80/bundle.bin"); err != nil {
+ if err := updateSvc.InstallBundle(ctx, "http://10.42.0.5:80/bundle.bin", useKexec); err != nil {
supervisor.Logger(ctx).Errorf("Error installing new bundle: %v", err)
}
}
@@ -132,6 +135,10 @@
supervisor.Logger(ctx).Info("Installed bundle successfully, powering off")
unix.Sync()
time.Sleep(1 * time.Second)
- unix.Reboot(unix.LINUX_REBOOT_CMD_POWER_OFF)
+ if useKexec && Variant != "Z" {
+ unix.Reboot(unix.LINUX_REBOOT_CMD_KEXEC)
+ } else {
+ unix.Reboot(unix.LINUX_REBOOT_CMD_POWER_OFF)
+ }
return nil
}
diff --git a/metropolis/node/core/update/e2e/testos/testos.bzl b/metropolis/node/core/update/e2e/testos/testos.bzl
index a123ea6..29e218f 100644
--- a/metropolis/node/core/update/e2e/testos/testos.bzl
+++ b/metropolis/node/core/update/e2e/testos/testos.bzl
@@ -27,7 +27,7 @@
efi_unified_kernel_image(
name = "kernel_efi_" + variant,
- cmdline = "console=ttyS0 init=/init",
+ cmdline = "console=ttyS0 quiet rootfstype=erofs init=/init loadpin.exclude=kexec-image,kexec-initramfs",
kernel = "//third_party/linux",
verity = ":verity_rootfs_" + variant,
visibility = ["//metropolis/node/core/update/e2e:__pkg__"],
diff --git a/metropolis/node/core/update/update.go b/metropolis/node/core/update/update.go
index 107f9cc..84a5db9 100644
--- a/metropolis/node/core/update/update.go
+++ b/metropolis/node/core/update/update.go
@@ -4,6 +4,7 @@
"archive/zip"
"bytes"
"context"
+ "debug/pe"
"errors"
"fmt"
"io"
@@ -12,8 +13,10 @@
"path/filepath"
"regexp"
"strconv"
+ "strings"
"github.com/cenkalti/backoff/v4"
+ "golang.org/x/sys/unix"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
@@ -21,6 +24,7 @@
"source.monogon.dev/metropolis/pkg/blockdev"
"source.monogon.dev/metropolis/pkg/efivarfs"
"source.monogon.dev/metropolis/pkg/gpt"
+ "source.monogon.dev/metropolis/pkg/kexec"
"source.monogon.dev/metropolis/pkg/logtree"
)
@@ -177,6 +181,7 @@
efivarfs.FilePath(slot.EFIBootPath()),
},
}
+ s.Logger.Infof("Recreated boot entry %s", newEntry.Description)
newIdx, err := efivarfs.AddBootEntry(newEntry)
if err == nil {
existing[newIdx] = newEntry
@@ -274,7 +279,7 @@
// InstallBundle installs the bundle at the given HTTP(S) URL into the currently
// inactive slot and sets that slot to boot next. If it doesn't return an error,
// a reboot boots into the new slot.
-func (s *Service) InstallBundle(ctx context.Context, bundleURL string) error {
+func (s *Service) InstallBundle(ctx context.Context, bundleURL string, withKexec bool) error {
if s.ESPPath == "" {
return errors.New("no ESP information provided to update service, cannot continue")
}
@@ -353,8 +358,14 @@
return fmt.Errorf("failed setting boot entry %d active: %w", targetSlotBootEntryIdx, err)
}
- if err := efivarfs.SetBootNext(uint16(targetSlotBootEntryIdx)); err != nil {
- return fmt.Errorf("failed to set BootNext variable: %w", err)
+ if withKexec {
+ if err := s.stageKexec(bootFile, targetSlot); err != nil {
+ return fmt.Errorf("while kexec staging: %w", err)
+ }
+ } else {
+ if err := efivarfs.SetBootNext(uint16(targetSlotBootEntryIdx)); err != nil {
+ return fmt.Errorf("failed to set BootNext variable: %w", err)
+ }
}
return nil
@@ -388,3 +399,62 @@
}
return nil
}
+
+// newMemfile creates a new file which is not located on a specific filesystem,
+// but is instead backed by anonymous memory.
+func newMemfile(name string, flags int) (*os.File, error) {
+ fd, err := unix.MemfdCreate(name, flags)
+ if err != nil {
+ return nil, fmt.Errorf("memfd_create: %w", err)
+ }
+ return os.NewFile(uintptr(fd), name), nil
+}
+
+// stageKexec stages the kernel, command line and (if present) initramfs for
+// a future kexec. It extracts the relevant data from the EFI boot executable.
+func (s *Service) stageKexec(bootFile io.ReaderAt, targetSlot Slot) error {
+ bootPE, err := pe.NewFile(bootFile)
+ if err != nil {
+ return fmt.Errorf("unable to open bootFile as PE: %w", err)
+ }
+ var cmdlineRaw []byte
+ cmdlineSection := bootPE.Section(".cmdline")
+ if cmdlineSection == nil {
+ return fmt.Errorf("no .cmdline section in boot PE")
+ }
+ cmdlineRaw, err = cmdlineSection.Data()
+ if err != nil {
+ return fmt.Errorf("while reading .cmdline PE section: %w", err)
+ }
+ cmdline := string(bytes.TrimRight(cmdlineRaw, "\x00"))
+ cmdline = strings.ReplaceAll(cmdline, "METROPOLIS-SYSTEM-X", fmt.Sprintf("METROPOLIS-SYSTEM-%s", targetSlot))
+ kernelFile, err := newMemfile("kernel", 0)
+ if err != nil {
+ return fmt.Errorf("failed to create kernel memfile: %w", err)
+ }
+ defer kernelFile.Close()
+ kernelSection := bootPE.Section(".linux")
+ if kernelSection == nil {
+ return fmt.Errorf("no .linux section in boot PE")
+ }
+ if _, err := io.Copy(kernelFile, kernelSection.Open()); err != nil {
+ return fmt.Errorf("while copying .linux PE section: %w", err)
+ }
+
+ initramfsSection := bootPE.Section(".initrd")
+ var initramfsFile *os.File
+ if initramfsSection != nil && initramfsSection.Size > 0 {
+ initramfsFile, err = newMemfile("initramfs", 0)
+ if err != nil {
+ return fmt.Errorf("failed to create initramfs memfile: %w", err)
+ }
+ defer initramfsFile.Close()
+ if _, err := io.Copy(initramfsFile, initramfsSection.Open()); err != nil {
+ return fmt.Errorf("while copying .initrd PE section: %w", err)
+ }
+ }
+ if err := kexec.FileLoad(kernelFile, initramfsFile, cmdline); err != nil {
+ return fmt.Errorf("while staging new kexec kernel: %w", err)
+ }
+ return nil
+}