blob: 30dd965cf466c0f51c875a15fc4f1f92e5f1cbcc [file] [log] [blame]
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +01001package main
2
3import (
4 "context"
Lorenz Brun9ce40712024-02-13 21:54:46 +01005 "errors"
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +02006 "fmt"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +01007 "io"
8 "log"
9 "os"
Tim Windelschmidtb765f242024-05-08 01:40:02 +020010 "os/signal"
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020011 "strings"
Lorenz Brun9ce40712024-02-13 21:54:46 +010012 "sync"
13 "time"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010014
15 "github.com/spf13/cobra"
Lorenz Brun9ce40712024-02-13 21:54:46 +010016 "golang.org/x/sync/semaphore"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010017
Serge Bazanskie0c06172023-09-19 12:28:16 +000018 "source.monogon.dev/go/clitable"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010019 "source.monogon.dev/metropolis/cli/metroctl/core"
Lorenz Brun9ce40712024-02-13 21:54:46 +010020 "source.monogon.dev/version"
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020021
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010022 apb "source.monogon.dev/metropolis/proto/api"
23)
24
25var nodeCmd = &cobra.Command{
26 Short: "Updates and queries node information.",
27 Use: "node",
28}
29
30var nodeDescribeCmd = &cobra.Command{
31 Short: "Describes cluster nodes.",
Serge Bazanski98840342024-05-22 13:03:55 +020032 Use: "describe [node-id] [--filter] [--output] [--format] [--columns]",
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010033 Example: "metroctl node describe metropolis-c556e31c3fa2bf0a36e9ccb9fd5d6056",
34 Run: func(cmd *cobra.Command, args []string) {
Tim Windelschmidtb765f242024-05-08 01:40:02 +020035 ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010036 cc := dialAuthenticated(ctx)
37 mgmt := apb.NewManagementClient(cc)
38
39 nodes, err := core.GetNodes(ctx, mgmt, flags.filter)
40 if err != nil {
41 log.Fatalf("While calling Management.GetNodes: %v", err)
42 }
43
Serge Bazanski98840342024-05-22 13:03:55 +020044 var columns map[string]bool
45 if flags.columns != "" {
46 columns = make(map[string]bool)
47 for _, p := range strings.Split(flags.columns, ",") {
48 p = strings.ToLower(p)
49 p = strings.TrimSpace(p)
50 columns[p] = true
51 }
52 }
53 printNodes(nodes, args, columns)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010054 },
Tim Windelschmidtfc6e1cf2024-09-18 17:34:07 +020055 Args: PrintUsageOnWrongArgs(cobra.ArbitraryArgs),
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010056}
57
58var nodeListCmd = &cobra.Command{
59 Short: "Lists cluster nodes.",
60 Use: "list [node-id] [--filter] [--output] [--format]",
61 Example: "metroctl node list --filter node.status.external_address==\"10.8.0.2\"",
62 Run: func(cmd *cobra.Command, args []string) {
Tim Windelschmidtb765f242024-05-08 01:40:02 +020063 ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010064 cc := dialAuthenticated(ctx)
65 mgmt := apb.NewManagementClient(cc)
66
67 nodes, err := core.GetNodes(ctx, mgmt, flags.filter)
68 if err != nil {
69 log.Fatalf("While calling Management.GetNodes: %v", err)
70 }
71
72 printNodes(nodes, args, map[string]bool{"node id": true})
73 },
Tim Windelschmidtfc6e1cf2024-09-18 17:34:07 +020074 Args: PrintUsageOnWrongArgs(cobra.ArbitraryArgs),
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010075}
76
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020077var nodeUpdateCmd = &cobra.Command{
78 Short: "Updates the operating system of a cluster node.",
Lorenz Brun9ce40712024-02-13 21:54:46 +010079 Use: "update [NodeIDs]",
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020080 Example: "metroctl node update --bundle-url https://example.com/bundle.zip --activation-mode reboot metropolis-25fa5f5e9349381d4a5e9e59de0215e3",
81 RunE: func(cmd *cobra.Command, args []string) error {
82 bundleUrl, err := cmd.Flags().GetString("bundle-url")
83 if err != nil {
84 return err
85 }
86
87 if len(bundleUrl) == 0 {
88 return fmt.Errorf("flag bundle-url is required")
89 }
90
91 activationMode, err := cmd.Flags().GetString("activation-mode")
92 if err != nil {
93 return err
94 }
95
96 var am apb.ActivationMode
97 switch strings.ToLower(activationMode) {
98 case "none":
99 am = apb.ActivationMode_ACTIVATION_NONE
100 case "reboot":
101 am = apb.ActivationMode_ACTIVATION_REBOOT
102 case "kexec":
103 am = apb.ActivationMode_ACTIVATION_KEXEC
104 default:
105 return fmt.Errorf("invalid value for flag activation-mode")
106 }
107
Lorenz Brun9ce40712024-02-13 21:54:46 +0100108 maxUnavailable, err := cmd.Flags().GetUint64("max-unavailable")
109 if err != nil {
110 return err
111 }
112 if maxUnavailable == 0 {
113 return errors.New("unable to update notes with max-unavailable set to zero")
114 }
115 unavailableSemaphore := semaphore.NewWeighted(int64(maxUnavailable))
116
Tim Windelschmidtb765f242024-05-08 01:40:02 +0200117 ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200118
Serge Bazanskic51d47d2024-02-13 18:40:26 +0100119 cacert, err := core.GetClusterCAWithTOFU(ctx, connectOptions())
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200120 if err != nil {
Serge Bazanskic51d47d2024-02-13 18:40:26 +0100121 return fmt.Errorf("could not get CA certificate: %w", err)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200122 }
Serge Bazanskic51d47d2024-02-13 18:40:26 +0100123
124 mgmt := apb.NewManagementClient(dialAuthenticated(ctx))
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200125
126 nodes, err := core.GetNodes(ctx, mgmt, "")
127 if err != nil {
Tim Windelschmidt58786032024-05-21 13:47:41 +0200128 return fmt.Errorf("while calling Management.GetNodes: %w", err)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200129 }
130 // Narrow down the output set to supplied node IDs, if any.
131 qids := make(map[string]bool)
132 if len(args) != 0 && args[0] != "all" {
133 for _, a := range args {
134 qids[a] = true
135 }
136 }
137
Lorenz Bruncceb6a32024-04-16 13:33:15 +0000138 excludedNodesSlice, err := cmd.Flags().GetStringArray("exclude")
139 if err != nil {
140 return err
141 }
142 excludedNodes := make(map[string]bool)
143 for _, n := range excludedNodesSlice {
144 excludedNodes[n] = true
145 }
146
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200147 updateReq := &apb.UpdateNodeRequest{
148 BundleUrl: bundleUrl,
149 ActivationMode: am,
150 }
151
Lorenz Brun9ce40712024-02-13 21:54:46 +0100152 var wg sync.WaitGroup
153
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200154 for _, n := range nodes {
155 // Filter the information we want client-side.
156 if len(qids) != 0 {
Jan Schär39d9c242024-09-24 13:49:55 +0200157 if _, e := qids[n.Id]; !e {
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200158 continue
159 }
160 }
Jan Schär39d9c242024-09-24 13:49:55 +0200161 if excludedNodes[n.Id] {
Lorenz Bruncceb6a32024-04-16 13:33:15 +0000162 continue
163 }
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200164
Lorenz Brun9ce40712024-02-13 21:54:46 +0100165 if err := unavailableSemaphore.Acquire(ctx, 1); err != nil {
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200166 return err
167 }
Lorenz Brun9ce40712024-02-13 21:54:46 +0100168 wg.Add(1)
169
Tim Windelschmidtb41b5482024-04-18 23:24:01 +0200170 go func(n *apb.Node) {
Lorenz Brun9ce40712024-02-13 21:54:46 +0100171 defer wg.Done()
172 cc := dialAuthenticatedNode(ctx, n.Id, n.Status.ExternalAddress, cacert)
173 nodeMgmt := apb.NewNodeManagementClient(cc)
174 log.Printf("sending update request to: %s (%s)", n.Id, n.Status.ExternalAddress)
175 start := time.Now()
176 _, err := nodeMgmt.UpdateNode(ctx, updateReq)
177 if err != nil {
178 log.Printf("update request to node %s failed: %v", n.Id, err)
179 // A failed UpdateNode does not mean that the node is now unavailable as it
180 // hasn't started activating yet.
181 unavailableSemaphore.Release(1)
182 }
183 // Wait for the internal activation sleep plus the heartbeat
184 // to make sure the node has missed one heartbeat (or is
185 // back up already).
186 time.Sleep((5 + 10) * time.Second)
187 for {
188 select {
189 case <-time.After(10 * time.Second):
190 nodes, err := core.GetNodes(ctx, mgmt, fmt.Sprintf("node.id == %q", n.Id))
191 if err != nil {
192 log.Printf("while getting node status for %s: %v", n.Id, err)
Lorenz Brun76612022024-03-05 19:20:36 +0100193 continue
Lorenz Brun9ce40712024-02-13 21:54:46 +0100194 }
195 if len(nodes) == 0 {
196 log.Printf("node status for %s returned no node", n.Id)
Lorenz Brun76612022024-03-05 19:20:36 +0100197 continue
Lorenz Brun9ce40712024-02-13 21:54:46 +0100198 }
199 if len(nodes) > 1 {
200 log.Printf("node status for %s returned too many nodes (%d)", n.Id, len(nodes))
Lorenz Brun76612022024-03-05 19:20:36 +0100201 continue
Lorenz Brun9ce40712024-02-13 21:54:46 +0100202 }
203 s := nodes[0]
Tim Windelschmidtb41b5482024-04-18 23:24:01 +0200204 if s.Health == apb.Node_HEALTHY {
Lorenz Brun9ce40712024-02-13 21:54:46 +0100205 if s.Status != nil && s.Status.Version != nil {
206 log.Printf("node %s updated in %v to version %s", s.Id, time.Since(start), version.Semver(s.Status.Version))
207 } else {
208 log.Printf("node %s updated in %v to unknown version", s.Id, time.Since(start))
209 }
210 unavailableSemaphore.Release(1)
211 return
212 }
213 case <-ctx.Done():
214 log.Printf("update to node %s incomplete", n.Id)
215 return
216 }
217 }
218 }(n)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200219 }
220
Lorenz Brun9ce40712024-02-13 21:54:46 +0100221 // Wait for all update processes to finish
222 wg.Wait()
223
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200224 return nil
225 },
Tim Windelschmidtfc6e1cf2024-09-18 17:34:07 +0200226 Args: PrintUsageOnWrongArgs(cobra.MinimumNArgs(1)),
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200227}
228
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100229var nodeDeleteCmd = &cobra.Command{
230 Short: "Deletes a node from the cluster.",
231 Use: "delete [NodeID] [--bypass-has-roles] [--bypass-not-decommissioned]",
232 Example: "metroctl node delete metropolis-25fa5f5e9349381d4a5e9e59de0215e3",
233 RunE: func(cmd *cobra.Command, args []string) error {
234 bypassHasRoles, err := cmd.Flags().GetBool("bypass-has-roles")
235 if err != nil {
236 return err
237 }
238
239 bypassNotDecommissioned, err := cmd.Flags().GetBool("bypass-not-decommissioned")
240 if err != nil {
241 return err
242 }
243
Tim Windelschmidtb765f242024-05-08 01:40:02 +0200244 ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100245 mgmt := apb.NewManagementClient(dialAuthenticated(ctx))
246
247 nodes, err := core.GetNodes(ctx, mgmt, fmt.Sprintf("node.id==%q", args[0]))
248 if err != nil {
Tim Windelschmidt58786032024-05-21 13:47:41 +0200249 return fmt.Errorf("while calling Management.GetNodes: %w", err)
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100250 }
251
252 if len(nodes) == 0 {
253 return fmt.Errorf("could not find node with id: %s", args[0])
254 }
255
256 if len(nodes) != 1 {
257 return fmt.Errorf("expected one node, got %d", len(nodes))
258 }
259
260 n := nodes[0]
Lorenz Brun2542ef82024-08-20 13:33:02 +0200261 if n.Status != nil && n.Status.ExternalAddress != "" {
262 log.Printf("deleting node: %s (%s)", n.Id, n.Status.ExternalAddress)
263 } else {
264 log.Printf("deleting node: %s", n.Id)
265 }
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100266
267 req := &apb.DeleteNodeRequest{
268 Node: &apb.DeleteNodeRequest_Id{
269 Id: n.Id,
270 },
271 }
272
273 if bypassHasRoles {
274 req.SafetyBypassHasRoles = &apb.DeleteNodeRequest_SafetyBypassHasRoles{}
275 }
276
277 if bypassNotDecommissioned {
278 req.SafetyBypassNotDecommissioned = &apb.DeleteNodeRequest_SafetyBypassNotDecommissioned{}
279 }
280
281 _, err = mgmt.DeleteNode(ctx, req)
282 return err
283 },
Tim Windelschmidtfc6e1cf2024-09-18 17:34:07 +0200284 Args: PrintUsageOnWrongArgs(cobra.ExactArgs(1)),
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100285}
286
Lorenz Bruncc32cc42024-09-09 20:14:05 +0000287func dialNode(ctx context.Context, node string) (apb.NodeManagementClient, error) {
288 // First connect to the main management service and figure out the node's IP
289 // address.
290 cc := dialAuthenticated(ctx)
291 mgmt := apb.NewManagementClient(cc)
292 nodes, err := core.GetNodes(ctx, mgmt, fmt.Sprintf("node.id == %q", node))
293 if err != nil {
294 return nil, fmt.Errorf("when getting node info: %w", err)
295 }
296
297 if len(nodes) == 0 {
298 return nil, fmt.Errorf("no such node")
299 }
300 if len(nodes) > 1 {
301 return nil, fmt.Errorf("expression matched more than one node")
302 }
303 n := nodes[0]
304 if n.Status == nil || n.Status.ExternalAddress == "" {
305 return nil, fmt.Errorf("node has no external address")
306 }
307
308 cacert, err := core.GetClusterCAWithTOFU(ctx, connectOptions())
309 if err != nil {
310 return nil, fmt.Errorf("could not get CA certificate: %w", err)
311 }
312
313 // Dial the actual node at its management port.
314 cl := dialAuthenticatedNode(ctx, n.Id, n.Status.ExternalAddress, cacert)
315 nmgmt := apb.NewNodeManagementClient(cl)
316 return nmgmt, nil
317}
318
319var nodeRebootCmd = &cobra.Command{
320 Short: "Reboot a node",
321 Long: `Reboot a node.
322
323This command can be used quite flexibly. Without any options it performs a
324normal, firmware-assisted reboot. It can roll back the last update by also
325passing the --rollback option. To reboot quicker the --kexec option can be used
326to skip firmware during reboot and boot straigt into the kernel.
327
328It can also be used to reboot into the firmware (BIOS) setup UI by passing the
329--firmware flag. This flag cannot be combined with any others.
330 `,
331 Use: "reboot [node-id]",
Tim Windelschmidtfc6e1cf2024-09-18 17:34:07 +0200332 Args: PrintUsageOnWrongArgs(cobra.ExactArgs(1)),
Lorenz Bruncc32cc42024-09-09 20:14:05 +0000333 SilenceUsage: true,
334 RunE: func(cmd *cobra.Command, args []string) error {
335 ctx := cmd.Context()
336
337 kexecFlag, err := cmd.Flags().GetBool("kexec")
338 if err != nil {
339 return err
340 }
341 rollbackFlag, err := cmd.Flags().GetBool("rollback")
342 if err != nil {
343 return err
344 }
345 firmwareFlag, err := cmd.Flags().GetBool("firmware")
346 if err != nil {
347 return err
348 }
349
350 if kexecFlag && firmwareFlag {
351 return errors.New("--kexec cannot be used with --firmware as firmware is not involved when using kexec")
352 }
353 if firmwareFlag && rollbackFlag {
354 return errors.New("--firmware cannot be used with --rollback as the next boot won't be into the OS")
355 }
356 var req apb.RebootRequest
357 if kexecFlag {
358 req.Type = apb.RebootRequest_KEXEC
359 } else {
360 req.Type = apb.RebootRequest_FIRMWARE
361 }
362 if firmwareFlag {
363 req.NextBoot = apb.RebootRequest_START_FIRMWARE_UI
364 }
365 if rollbackFlag {
366 req.NextBoot = apb.RebootRequest_START_ROLLBACK
367 }
368
369 nmgmt, err := dialNode(ctx, args[0])
370 if err != nil {
371 return fmt.Errorf("failed to dial node: %w", err)
372 }
373
374 if _, err := nmgmt.Reboot(ctx, &req); err != nil {
375 return fmt.Errorf("reboot RPC failed: %w", err)
376 }
377 fmt.Printf("Node %v is being rebooted", args[0])
378
379 return nil
380 },
381}
382
383var nodePoweroffCmd = &cobra.Command{
384 Short: "Power off a node",
385 Use: "poweroff [node-id]",
Tim Windelschmidtfc6e1cf2024-09-18 17:34:07 +0200386 Args: PrintUsageOnWrongArgs(cobra.ExactArgs(1)),
Lorenz Bruncc32cc42024-09-09 20:14:05 +0000387 SilenceUsage: true,
388 RunE: func(cmd *cobra.Command, args []string) error {
389 ctx := cmd.Context()
390
391 nmgmt, err := dialNode(ctx, args[0])
392 if err != nil {
393 return err
394 }
395
396 if _, err := nmgmt.Reboot(ctx, &apb.RebootRequest{
397 Type: apb.RebootRequest_POWER_OFF,
398 }); err != nil {
399 return fmt.Errorf("reboot RPC failed: %w", err)
400 }
401 fmt.Printf("Node %v is being powered off", args[0])
402
403 return nil
404 },
405}
406
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100407func init() {
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200408 nodeUpdateCmd.Flags().String("bundle-url", "", "The URL to the new version")
409 nodeUpdateCmd.Flags().String("activation-mode", "reboot", "How the update should be activated (kexec, reboot, none)")
Lorenz Brun9ce40712024-02-13 21:54:46 +0100410 nodeUpdateCmd.Flags().Uint64("max-unavailable", 1, "Maximum nodes which can be unavailable during the update process")
Lorenz Bruncceb6a32024-04-16 13:33:15 +0000411 nodeUpdateCmd.Flags().StringArray("exclude", nil, "List of nodes to exclude (useful with the \"all\" argument)")
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200412
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100413 nodeDeleteCmd.Flags().Bool("bypass-has-roles", false, "Allows to bypass the HasRoles check")
414 nodeDeleteCmd.Flags().Bool("bypass-not-decommissioned", false, "Allows to bypass the NotDecommissioned check")
415
Lorenz Bruncc32cc42024-09-09 20:14:05 +0000416 nodeRebootCmd.Flags().Bool("rollback", false, "Reboot into the last OS version in the other slot")
417 nodeRebootCmd.Flags().Bool("firmware", false, "Reboot into the firmware (BIOS) setup UI")
418 nodeRebootCmd.Flags().Bool("kexec", false, "Use kexec to reboot much quicker without going through firmware")
419
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100420 nodeCmd.AddCommand(nodeDescribeCmd)
421 nodeCmd.AddCommand(nodeListCmd)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200422 nodeCmd.AddCommand(nodeUpdateCmd)
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100423 nodeCmd.AddCommand(nodeDeleteCmd)
Lorenz Bruncc32cc42024-09-09 20:14:05 +0000424 nodeCmd.AddCommand(nodeRebootCmd)
425 nodeCmd.AddCommand(nodePoweroffCmd)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100426 rootCmd.AddCommand(nodeCmd)
427}
428
429func printNodes(nodes []*apb.Node, args []string, onlyColumns map[string]bool) {
430 o := io.WriteCloser(os.Stdout)
431 if flags.output != "" {
432 of, err := os.Create(flags.output)
433 if err != nil {
434 log.Fatalf("Couldn't create the output file at %s: %v", flags.output, err)
435 }
436 o = of
437 }
438
439 // Narrow down the output set to supplied node IDs, if any.
440 qids := make(map[string]bool)
441 if len(args) != 0 && args[0] != "all" {
442 for _, a := range args {
443 qids[a] = true
444 }
445 }
446
Serge Bazanskie0c06172023-09-19 12:28:16 +0000447 var t clitable.Table
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100448 for _, n := range nodes {
449 // Filter the information we want client-side.
450 if len(qids) != 0 {
Jan Schär39d9c242024-09-24 13:49:55 +0200451 if _, e := qids[n.Id]; !e {
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100452 continue
453 }
454 }
Serge Bazanskie0c06172023-09-19 12:28:16 +0000455 t.Add(nodeEntry(n))
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100456 }
457
Serge Bazanskie0c06172023-09-19 12:28:16 +0000458 t.Print(o, onlyColumns)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100459}