m/c/metroctl: implement reboot and poweroff

Implements node-level reboot and poweroff commands, both routed to the
Reboot RPC.

These are approximately modeled after systemd's systemctl as that's
what most people will be familiar with.

Change-Id: I5578bb0a37cd8f0ac9438ae5f2f5db0bf025672b
Reviewed-on: https://review.monogon.dev/c/monogon/+/3391
Tested-by: Jenkins CI
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
diff --git a/metropolis/cli/metroctl/cmd_node.go b/metropolis/cli/metroctl/cmd_node.go
index 4b3a546..59b4d9b 100644
--- a/metropolis/cli/metroctl/cmd_node.go
+++ b/metropolis/cli/metroctl/cmd_node.go
@@ -286,6 +286,126 @@
 	Args: cobra.ExactArgs(1),
 }
 
+// dialNode resolves the given node ID via the cluster management service and
+// returns a NodeManagementClient connected to that node's external address.
+func dialNode(ctx context.Context, node string) (apb.NodeManagementClient, error) {
+	// Ask the cluster for the node carrying this ID to learn its address.
+	filter := fmt.Sprintf("node.id == %q", node)
+	matches, err := core.GetNodes(ctx, apb.NewManagementClient(dialAuthenticated(ctx)), filter)
+	if err != nil {
+		return nil, fmt.Errorf("when getting node info: %w", err)
+	}
+
+	switch {
+	case len(matches) == 0:
+		return nil, fmt.Errorf("no such node")
+	case len(matches) > 1:
+		return nil, fmt.Errorf("expression matched more than one node")
+	}
+	target := matches[0]
+	if st := target.Status; st == nil || st.ExternalAddress == "" {
+		return nil, fmt.Errorf("node has no external address")
+	}
+
+	// The cluster CA certificate is required to dial the node below.
+	ca, err := core.GetClusterCAWithTOFU(ctx, connectOptions())
+	if err != nil {
+		return nil, fmt.Errorf("could not get CA certificate: %w", err)
+	}
+
+	// Connect to the node itself and wrap the connection in a client.
+	conn := dialAuthenticatedNode(ctx, target.Id, target.Status.ExternalAddress, ca)
+	return apb.NewNodeManagementClient(conn), nil
+}
+
+// nodeRebootCmd reboots one node by sending it a Reboot RPC built from flags.
+var nodeRebootCmd = &cobra.Command{
+	Short: "Reboot a node",
+	Long: `Reboot a node.
+
+This command can be used quite flexibly. Without any options it performs a
+normal, firmware-assisted reboot. It can roll back the last update by also
+passing the --rollback option. To reboot quicker the --kexec option can be used
+to skip firmware during reboot and boot straight into the kernel.
+
+It can also be used to reboot into the firmware (BIOS) setup UI by passing the
+--firmware flag. This flag cannot be combined with any others.`,
+	Use:          "reboot [node-id]",
+	Args:         cobra.ExactArgs(1),
+	SilenceUsage: true,
+	RunE: func(cmd *cobra.Command, args []string) error {
+		ctx := cmd.Context()
+
+		kexecFlag, err := cmd.Flags().GetBool("kexec")
+		if err != nil {
+			return err
+		}
+		rollbackFlag, err := cmd.Flags().GetBool("rollback")
+		if err != nil {
+			return err
+		}
+		firmwareFlag, err := cmd.Flags().GetBool("firmware")
+		if err != nil {
+			return err
+		}
+
+		// Reject contradictory flag combinations before dialing anything.
+		if kexecFlag && firmwareFlag {
+			return errors.New("--kexec cannot be used with --firmware as firmware is not involved when using kexec")
+		}
+		if firmwareFlag && rollbackFlag {
+			return errors.New("--firmware cannot be used with --rollback as the next boot won't be into the OS")
+		}
+		// Reboot through firmware unless --kexec asks to skip it.
+		req := &apb.RebootRequest{Type: apb.RebootRequest_FIRMWARE}
+		if kexecFlag {
+			req.Type = apb.RebootRequest_KEXEC
+		}
+		if firmwareFlag {
+			req.NextBoot = apb.RebootRequest_START_FIRMWARE_UI
+		}
+		if rollbackFlag {
+			req.NextBoot = apb.RebootRequest_START_ROLLBACK
+		}
+
+		nmgmt, err := dialNode(ctx, args[0])
+		if err != nil {
+			return fmt.Errorf("failed to dial node: %w", err)
+		}
+
+		if _, err := nmgmt.Reboot(ctx, req); err != nil {
+			return fmt.Errorf("reboot RPC failed: %w", err)
+		}
+		fmt.Printf("Node %v is being rebooted\n", args[0])
+
+		return nil
+	},
+}
+
+// nodePoweroffCmd powers off one node via a Reboot RPC of type POWER_OFF.
+var nodePoweroffCmd = &cobra.Command{
+	Short:        "Power off a node",
+	Use:          "poweroff [node-id]",
+	Args:         cobra.ExactArgs(1),
+	SilenceUsage: true,
+	RunE: func(cmd *cobra.Command, args []string) error {
+		ctx := cmd.Context()
+
+		nmgmt, err := dialNode(ctx, args[0])
+		if err != nil {
+			return fmt.Errorf("failed to dial node: %w", err)
+		}
+
+		req := &apb.RebootRequest{Type: apb.RebootRequest_POWER_OFF}
+		if _, err := nmgmt.Reboot(ctx, req); err != nil {
+			return fmt.Errorf("reboot RPC failed: %w", err)
+		}
+		fmt.Printf("Node %v is being powered off\n", args[0])
+
+		return nil
+	},
+}
+
 func init() {
 	nodeUpdateCmd.Flags().String("bundle-url", "", "The URL to the new version")
 	nodeUpdateCmd.Flags().String("activation-mode", "reboot", "How the update should be activated (kexec, reboot, none)")
@@ -295,10 +415,16 @@
 	nodeDeleteCmd.Flags().Bool("bypass-has-roles", false, "Allows to bypass the HasRoles check")
 	nodeDeleteCmd.Flags().Bool("bypass-not-decommissioned", false, "Allows to bypass the NotDecommissioned check")
 
+	nodeRebootCmd.Flags().Bool("rollback", false, "Reboot into the last OS version in the other slot")
+	nodeRebootCmd.Flags().Bool("firmware", false, "Reboot into the firmware (BIOS) setup UI")
+	nodeRebootCmd.Flags().Bool("kexec", false, "Use kexec to reboot much quicker without going through firmware")
+
 	nodeCmd.AddCommand(nodeDescribeCmd)
 	nodeCmd.AddCommand(nodeListCmd)
 	nodeCmd.AddCommand(nodeUpdateCmd)
 	nodeCmd.AddCommand(nodeDeleteCmd)
+	nodeCmd.AddCommand(nodeRebootCmd)
+	nodeCmd.AddCommand(nodePoweroffCmd)
 	rootCmd.AddCommand(nodeCmd)
 }