metropolis: add Reboot RPC
This adds a new Reboot RPC to reboot a running node. It also supports
rebooting into the passive slot and powering off the node.
Change-Id: I329b22ea879adeb65a3e31103d39ad89813d61e8
Reviewed-on: https://review.monogon.dev/c/monogon/+/3354
Tested-by: Jenkins CI
Reviewed-by: Leopold Schabel <leo@monogon.tech>
diff --git a/metropolis/node/core/mgmt/BUILD.bazel b/metropolis/node/core/mgmt/BUILD.bazel
index a75dff9..b95565b 100644
--- a/metropolis/node/core/mgmt/BUILD.bazel
+++ b/metropolis/node/core/mgmt/BUILD.bazel
@@ -4,6 +4,7 @@
name = "mgmt",
srcs = [
"mgmt.go",
+ "power.go",
"svc_logs.go",
"update.go",
],
@@ -16,6 +17,7 @@
"//metropolis/node/core/update",
"//metropolis/proto/api",
"//metropolis/proto/common",
+ "//osbase/efivarfs",
"//osbase/logtree",
"//osbase/logtree/proto",
"//osbase/supervisor",
diff --git a/metropolis/node/core/mgmt/power.go b/metropolis/node/core/mgmt/power.go
new file mode 100644
index 0000000..77601d7
--- /dev/null
+++ b/metropolis/node/core/mgmt/power.go
@@ -0,0 +1,68 @@
+package mgmt
+
+import (
+ "context"
+ "os"
+ "time"
+
+ "golang.org/x/sys/unix"
+ "google.golang.org/grpc/codes"
+ "google.golang.org/grpc/status"
+
+ apb "source.monogon.dev/metropolis/proto/api"
+ "source.monogon.dev/osbase/efivarfs"
+)
+
+func (s *Service) Reboot(ctx context.Context, req *apb.RebootRequest) (*apb.RebootResponse, error) {
+ var method int
+ // Do not yet perform any system-wide actions here as the request might
+ // still get rejected. There is another switch statement for that below.
+ switch req.Type {
+ case apb.RebootRequest_KEXEC:
+ method = unix.LINUX_REBOOT_CMD_KEXEC
+ case apb.RebootRequest_FIRMWARE:
+ method = unix.LINUX_REBOOT_CMD_RESTART
+ case apb.RebootRequest_POWER_OFF:
+ method = unix.LINUX_REBOOT_CMD_POWER_OFF
+ default:
+ return nil, status.Error(codes.Unimplemented, "unimplemented type value")
+ }
+ switch req.NextBoot {
+ case apb.RebootRequest_START_NORMAL:
+ case apb.RebootRequest_START_ROLLBACK:
+ if err := s.UpdateService.Rollback(); err != nil {
+ return nil, status.Errorf(codes.Unavailable, "performing rollback failed: %v", err)
+ }
+ case apb.RebootRequest_START_FIRMWARE_UI:
+ if req.Type == apb.RebootRequest_KEXEC {
+ return nil, status.Error(codes.InvalidArgument, "START_FIRMWARE_UI cannot be used with KEXEC type")
+ }
+ supp, err := efivarfs.OSIndicationsSupported()
+ if err != nil || supp&efivarfs.BootToFirmwareUI == 0 {
+ return nil, status.Error(codes.Unimplemented, "Unable to boot into firmware UI on this platform")
+ }
+ if err := efivarfs.SetOSIndications(efivarfs.BootToFirmwareUI); err != nil {
+ return nil, status.Errorf(codes.Unavailable, "Unable to set UEFI boot to UI indication: %v", err)
+ }
+ default:
+ return nil, status.Error(codes.Unimplemented, "unimplemented next_boot value")
+ }
+
+ switch req.Type {
+ case apb.RebootRequest_KEXEC:
+ if err := s.UpdateService.KexecLoadNext(); err != nil {
+ return nil, status.Errorf(codes.Unavailable, "failed to stage kexec kernel: %v", err)
+ }
+ case apb.RebootRequest_FIRMWARE:
+ // Best-effort, if it fails this will still be a firmware reboot.
+ os.WriteFile("/sys/kernel/reboot/mode", []byte("cold"), 0644)
+ }
+ s.LogTree.MustLeveledFor("root.mgmt").Warning("Reboot requested, rebooting in 2s")
+ go func() {
+ time.Sleep(2 * time.Second)
+ unix.Unmount(s.UpdateService.ESPPath, 0)
+ unix.Sync()
+ unix.Reboot(method)
+ }()
+ return &apb.RebootResponse{}, nil
+}
diff --git a/metropolis/proto/api/management.proto b/metropolis/proto/api/management.proto
index cd95a8e..f6900df 100644
--- a/metropolis/proto/api/management.proto
+++ b/metropolis/proto/api/management.proto
@@ -328,6 +328,45 @@
message DeleteNodeResponse {
}
+message RebootRequest {
+ enum Type {
+ TYPE_INVALID = 0; // Zero value; the Reboot handler rejects requests that leave type unset.
+ // FIRMWARE performs a firmware-assisted (EFI, PSCI, ...) reboot and
+ // signals the firmware to perform a thorough reset if possible. This
+ // maximizes chances to clear hardware-related issues. The exact
+ // implementation is up to firmware.
+ FIRMWARE = 1;
+ // KEXEC performs a kexec reboot without going through firmware at all.
+ // This is the fastest reboot option, but does not fully reset most
+ // hardware and has compatibility issues on certain hardware.
+ KEXEC = 2;
+ // POWER_OFF fully powers off the system. It can only be started again by
+ // a physical power button, Wake-on-LAN if supported by the NIC, or
+ // an out-of-band management controller if available.
+ POWER_OFF = 3;
+ }
+ Type type = 1;
+ enum NextBoot {
+ // START_NORMAL starts the system normally, respecting standard A/B slot
+ // booting rules. Any staged but not activated updates will be activated
+ // as with a normal reboot.
+ START_NORMAL = 0;
+ // START_ROLLBACK tries to boot into the currently inactive slot on reboot.
+ START_ROLLBACK = 1;
+ // START_FIRMWARE_UI tries to boot into the EFI firmware UI. Cannot be used
+ // together with KEXEC as firmware is not involved there.
+ START_FIRMWARE_UI = 2;
+ }
+ // NextBoot can be used to select the boot slot to reboot into. This works
+ // even for POWER_OFF, but there the next boot will need to be triggered
+ // externally. START_FIRMWARE_UI cannot be used together with KEXEC.
+ NextBoot next_boot = 2;
+}
+
+message RebootResponse {
+ // Intentionally empty; the Reboot handler returns it once the reboot has been scheduled.
+}
+
// NodeManagement runs on every node of the cluster and providers management
// and troubleshooting RPCs to operators. All requests must be authenticated.
service NodeManagement {
@@ -362,6 +401,14 @@
need: PERMISSION_UPDATE_NODE
};
}
+
+ // Reboot initiates a node reboot or power-off. It can also be used to roll
+ // back to the inactive slot.
+ rpc Reboot(RebootRequest) returns (RebootResponse) {
+ option (metropolis.proto.ext.authorization) = {
+ need: PERMISSION_NODE_POWER_MANAGEMENT
+ };
+ }
}
message GetLogsRequest {
diff --git a/metropolis/proto/ext/authorization.proto b/metropolis/proto/ext/authorization.proto
index 4c27f3e..e526ec3 100644
--- a/metropolis/proto/ext/authorization.proto
+++ b/metropolis/proto/ext/authorization.proto
@@ -29,6 +29,7 @@
PERMISSION_DECOMMISSION_NODE = 8;
PERMISSION_DELETE_NODE = 9;
PERMISSION_UPDATE_NODE_LABELS = 10;
+ PERMISSION_NODE_POWER_MANAGEMENT = 11;
}
// Authorization policy for an RPC method. This message/API does not have the