metropolis: add Reboot RPC

This adds a new Reboot RPC to reboot a running node. It also supports
rebooting into the passive slot and powering off the node.

Change-Id: I329b22ea879adeb65a3e31103d39ad89813d61e8
Reviewed-on: https://review.monogon.dev/c/monogon/+/3354
Tested-by: Jenkins CI
Reviewed-by: Leopold Schabel <leo@monogon.tech>
diff --git a/metropolis/node/core/mgmt/BUILD.bazel b/metropolis/node/core/mgmt/BUILD.bazel
index a75dff9..b95565b 100644
--- a/metropolis/node/core/mgmt/BUILD.bazel
+++ b/metropolis/node/core/mgmt/BUILD.bazel
@@ -4,6 +4,7 @@
     name = "mgmt",
     srcs = [
         "mgmt.go",
+        "power.go",
         "svc_logs.go",
         "update.go",
     ],
@@ -16,6 +17,7 @@
         "//metropolis/node/core/update",
         "//metropolis/proto/api",
         "//metropolis/proto/common",
+        "//osbase/efivarfs",
         "//osbase/logtree",
         "//osbase/logtree/proto",
         "//osbase/supervisor",
diff --git a/metropolis/node/core/mgmt/power.go b/metropolis/node/core/mgmt/power.go
new file mode 100644
index 0000000..77601d7
--- /dev/null
+++ b/metropolis/node/core/mgmt/power.go
@@ -0,0 +1,68 @@
+package mgmt
+
+import (
+	"context"
+	"os"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+
+	apb "source.monogon.dev/metropolis/proto/api"
+	"source.monogon.dev/osbase/efivarfs"
+)
+
+func (s *Service) Reboot(ctx context.Context, req *apb.RebootRequest) (*apb.RebootResponse, error) {
+	var method int // reboot(2) command handed to unix.Reboot at the very end
+	// Only translate the request type into a reboot command here. The request
+	// might still get rejected below, so system-wide actions are done later.
+	switch req.Type {
+	case apb.RebootRequest_KEXEC:
+		method = unix.LINUX_REBOOT_CMD_KEXEC
+	case apb.RebootRequest_FIRMWARE:
+		method = unix.LINUX_REBOOT_CMD_RESTART
+	case apb.RebootRequest_POWER_OFF:
+		method = unix.LINUX_REBOOT_CMD_POWER_OFF
+	default:
+		return nil, status.Error(codes.Unimplemented, "unimplemented type value")
+	}
+	switch req.NextBoot {
+	case apb.RebootRequest_START_NORMAL: // nothing to prepare
+	case apb.RebootRequest_START_ROLLBACK:
+		if err := s.UpdateService.Rollback(); err != nil {
+			return nil, status.Errorf(codes.Unavailable, "performing rollback failed: %v", err)
+		}
+	case apb.RebootRequest_START_FIRMWARE_UI:
+		if req.Type == apb.RebootRequest_KEXEC {
+			return nil, status.Error(codes.InvalidArgument, "START_FIRMWARE_UI cannot be used with KEXEC type")
+		}
+		supp, err := efivarfs.OSIndicationsSupported()
+		if err != nil || supp&efivarfs.BootToFirmwareUI == 0 { // query failed or capability bit not set
+			return nil, status.Error(codes.Unimplemented, "Unable to boot into firmware UI on this platform")
+		}
+		if err := efivarfs.SetOSIndications(efivarfs.BootToFirmwareUI); err != nil {
+			return nil, status.Errorf(codes.Unavailable, "Unable to set UEFI boot to UI indication: %v", err)
+		}
+	default:
+		return nil, status.Error(codes.Unimplemented, "unimplemented next_boot value")
+	}
+
+	switch req.Type { // type-specific preparation; POWER_OFF needs none
+	case apb.RebootRequest_KEXEC:
+		if err := s.UpdateService.KexecLoadNext(); err != nil {
+			return nil, status.Errorf(codes.Unavailable, "failed to stage kexec kernel: %v", err)
+		}
+	case apb.RebootRequest_FIRMWARE:
+		// Best-effort cold reset; if the write fails this is still a firmware reboot.
+		os.WriteFile("/sys/kernel/reboot/mode", []byte("cold"), 0644)
+	}
+	s.LogTree.MustLeveledFor("root.mgmt").Warning("Reboot requested, rebooting in 2s")
+	go func() { // detached: the handler returns its response without waiting for this
+		time.Sleep(2 * time.Second) // the grace period announced in the log line above
+		unix.Unmount(s.UpdateService.ESPPath, 0) // return value ignored
+		unix.Sync()
+		unix.Reboot(method) // return value ignored; a failure here is not reported anywhere
+	}()
+	return &apb.RebootResponse{}, nil
+}
diff --git a/metropolis/proto/api/management.proto b/metropolis/proto/api/management.proto
index cd95a8e..f6900df 100644
--- a/metropolis/proto/api/management.proto
+++ b/metropolis/proto/api/management.proto
@@ -328,6 +328,45 @@
 message DeleteNodeResponse {
 }
 
+message RebootRequest {
+  enum Type {
+    TYPE_INVALID = 0;
+    // FIRMWARE performs a firmware-assisted (EFI, PSCI, ...) reboot and
+    // signals the firmware to perform a thorough reset if possible. This
+    // maximizes chances to clear hardware-related issues. The exact
+    // implementation is up to firmware.
+    FIRMWARE = 1;
+    // KEXEC performs a KEXEC reboot without going through firmware at all.
+    // This is the fastest reboot option, but does not fully reset most
+    // hardware and has compatibility issues on certain hardware.
+    KEXEC = 2;
+    // POWER_OFF fully powers off the system. It can only be started again by
+    // a physical power button, Wake-on-LAN if supported by the NIC, or
+    // an out-of-band management controller if available.
+    POWER_OFF = 3;
+  }
+  Type type = 1;
+  enum NextBoot {
+    // START_NORMAL starts the system normally, respecting standard A/B slot
+    // booting rules. Any staged but not activated updates will be activated
+    // as with a normal reboot.
+    START_NORMAL = 0;
+    // START_ROLLBACK tries to boot into the currently inactive slot on reboot.
+    START_ROLLBACK = 1;
+    // START_FIRMWARE_UI tries to boot into the EFI firmware UI. Cannot be used
+    // together with KEXEC as firmware is not involved there.
+    START_FIRMWARE_UI = 2;
+  }
+  // NextBoot can be used to select the boot slot to reboot into. This works
+  // even for POWER_OFF, but there the next boot will need to be triggered
+  // externally. START_FIRMWARE_UI cannot be used together with KEXEC.
+  NextBoot next_boot = 2;
+}
+
+message RebootResponse {
+  // Empty: this message declares no fields.
+}
+
 // NodeManagement runs on every node of the cluster and providers management
 // and troubleshooting RPCs to operators. All requests must be authenticated.
 service NodeManagement {
@@ -362,6 +401,14 @@
       need: PERMISSION_UPDATE_NODE
     };
   }
+
+  // Reboot initiates a node reboot or power-off. The next boot can optionally
+  // roll back to the inactive slot or enter the EFI firmware UI.
+  rpc Reboot(RebootRequest) returns (RebootResponse) {
+    option (metropolis.proto.ext.authorization) = {
+      need: PERMISSION_NODE_POWER_MANAGEMENT
+    };
+  }
 }
 
 message GetLogsRequest {
diff --git a/metropolis/proto/ext/authorization.proto b/metropolis/proto/ext/authorization.proto
index 4c27f3e..e526ec3 100644
--- a/metropolis/proto/ext/authorization.proto
+++ b/metropolis/proto/ext/authorization.proto
@@ -29,6 +29,7 @@
     PERMISSION_DECOMMISSION_NODE = 8;
     PERMISSION_DELETE_NODE = 9;
     PERMISSION_UPDATE_NODE_LABELS = 10;
+    PERMISSION_NODE_POWER_MANAGEMENT = 11;
 }
 
 // Authorization policy for an RPC method. This message/API does not have the