blob: d7a23eeb15d4e0965412b71ff028aaf34d0ebe59 [file] [log] [blame]
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +01001package main
2
3import (
4 "context"
Lorenz Brun9ce40712024-02-13 21:54:46 +01005 "errors"
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +02006 "fmt"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +01007 "io"
8 "log"
9 "os"
Tim Windelschmidtb765f242024-05-08 01:40:02 +020010 "os/signal"
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020011 "strings"
Lorenz Brun9ce40712024-02-13 21:54:46 +010012 "sync"
13 "time"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010014
15 "github.com/spf13/cobra"
Lorenz Brun9ce40712024-02-13 21:54:46 +010016 "golang.org/x/sync/semaphore"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010017
Serge Bazanskie0c06172023-09-19 12:28:16 +000018 "source.monogon.dev/go/clitable"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010019 "source.monogon.dev/metropolis/cli/metroctl/core"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010020 "source.monogon.dev/metropolis/node/core/identity"
Lorenz Brun9ce40712024-02-13 21:54:46 +010021 "source.monogon.dev/version"
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020022
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010023 apb "source.monogon.dev/metropolis/proto/api"
24)
25
26var nodeCmd = &cobra.Command{
27 Short: "Updates and queries node information.",
28 Use: "node",
29}
30
31var nodeDescribeCmd = &cobra.Command{
32 Short: "Describes cluster nodes.",
Serge Bazanski98840342024-05-22 13:03:55 +020033 Use: "describe [node-id] [--filter] [--output] [--format] [--columns]",
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010034 Example: "metroctl node describe metropolis-c556e31c3fa2bf0a36e9ccb9fd5d6056",
35 Run: func(cmd *cobra.Command, args []string) {
Tim Windelschmidtb765f242024-05-08 01:40:02 +020036 ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010037 cc := dialAuthenticated(ctx)
38 mgmt := apb.NewManagementClient(cc)
39
40 nodes, err := core.GetNodes(ctx, mgmt, flags.filter)
41 if err != nil {
42 log.Fatalf("While calling Management.GetNodes: %v", err)
43 }
44
Serge Bazanski98840342024-05-22 13:03:55 +020045 var columns map[string]bool
46 if flags.columns != "" {
47 columns = make(map[string]bool)
48 for _, p := range strings.Split(flags.columns, ",") {
49 p = strings.ToLower(p)
50 p = strings.TrimSpace(p)
51 columns[p] = true
52 }
53 }
54 printNodes(nodes, args, columns)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010055 },
Tim Windelschmidtfc6e1cf2024-09-18 17:34:07 +020056 Args: PrintUsageOnWrongArgs(cobra.ArbitraryArgs),
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010057}
58
59var nodeListCmd = &cobra.Command{
60 Short: "Lists cluster nodes.",
61 Use: "list [node-id] [--filter] [--output] [--format]",
62 Example: "metroctl node list --filter node.status.external_address==\"10.8.0.2\"",
63 Run: func(cmd *cobra.Command, args []string) {
Tim Windelschmidtb765f242024-05-08 01:40:02 +020064 ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010065 cc := dialAuthenticated(ctx)
66 mgmt := apb.NewManagementClient(cc)
67
68 nodes, err := core.GetNodes(ctx, mgmt, flags.filter)
69 if err != nil {
70 log.Fatalf("While calling Management.GetNodes: %v", err)
71 }
72
73 printNodes(nodes, args, map[string]bool{"node id": true})
74 },
Tim Windelschmidtfc6e1cf2024-09-18 17:34:07 +020075 Args: PrintUsageOnWrongArgs(cobra.ArbitraryArgs),
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010076}
77
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020078var nodeUpdateCmd = &cobra.Command{
79 Short: "Updates the operating system of a cluster node.",
Lorenz Brun9ce40712024-02-13 21:54:46 +010080 Use: "update [NodeIDs]",
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020081 Example: "metroctl node update --bundle-url https://example.com/bundle.zip --activation-mode reboot metropolis-25fa5f5e9349381d4a5e9e59de0215e3",
82 RunE: func(cmd *cobra.Command, args []string) error {
83 bundleUrl, err := cmd.Flags().GetString("bundle-url")
84 if err != nil {
85 return err
86 }
87
88 if len(bundleUrl) == 0 {
89 return fmt.Errorf("flag bundle-url is required")
90 }
91
92 activationMode, err := cmd.Flags().GetString("activation-mode")
93 if err != nil {
94 return err
95 }
96
97 var am apb.ActivationMode
98 switch strings.ToLower(activationMode) {
99 case "none":
100 am = apb.ActivationMode_ACTIVATION_NONE
101 case "reboot":
102 am = apb.ActivationMode_ACTIVATION_REBOOT
103 case "kexec":
104 am = apb.ActivationMode_ACTIVATION_KEXEC
105 default:
106 return fmt.Errorf("invalid value for flag activation-mode")
107 }
108
Lorenz Brun9ce40712024-02-13 21:54:46 +0100109 maxUnavailable, err := cmd.Flags().GetUint64("max-unavailable")
110 if err != nil {
111 return err
112 }
113 if maxUnavailable == 0 {
114 return errors.New("unable to update notes with max-unavailable set to zero")
115 }
116 unavailableSemaphore := semaphore.NewWeighted(int64(maxUnavailable))
117
Tim Windelschmidtb765f242024-05-08 01:40:02 +0200118 ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200119
Serge Bazanskic51d47d2024-02-13 18:40:26 +0100120 cacert, err := core.GetClusterCAWithTOFU(ctx, connectOptions())
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200121 if err != nil {
Serge Bazanskic51d47d2024-02-13 18:40:26 +0100122 return fmt.Errorf("could not get CA certificate: %w", err)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200123 }
Serge Bazanskic51d47d2024-02-13 18:40:26 +0100124
125 mgmt := apb.NewManagementClient(dialAuthenticated(ctx))
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200126
127 nodes, err := core.GetNodes(ctx, mgmt, "")
128 if err != nil {
Tim Windelschmidt58786032024-05-21 13:47:41 +0200129 return fmt.Errorf("while calling Management.GetNodes: %w", err)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200130 }
131 // Narrow down the output set to supplied node IDs, if any.
132 qids := make(map[string]bool)
133 if len(args) != 0 && args[0] != "all" {
134 for _, a := range args {
135 qids[a] = true
136 }
137 }
138
Lorenz Bruncceb6a32024-04-16 13:33:15 +0000139 excludedNodesSlice, err := cmd.Flags().GetStringArray("exclude")
140 if err != nil {
141 return err
142 }
143 excludedNodes := make(map[string]bool)
144 for _, n := range excludedNodesSlice {
145 excludedNodes[n] = true
146 }
147
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200148 updateReq := &apb.UpdateNodeRequest{
149 BundleUrl: bundleUrl,
150 ActivationMode: am,
151 }
152
Lorenz Brun9ce40712024-02-13 21:54:46 +0100153 var wg sync.WaitGroup
154
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200155 for _, n := range nodes {
156 // Filter the information we want client-side.
Lorenz Bruncceb6a32024-04-16 13:33:15 +0000157 nid := identity.NodeID(n.Pubkey)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200158 if len(qids) != 0 {
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200159 if _, e := qids[nid]; !e {
160 continue
161 }
162 }
Lorenz Bruncceb6a32024-04-16 13:33:15 +0000163 if excludedNodes[nid] {
164 continue
165 }
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200166
Lorenz Brun9ce40712024-02-13 21:54:46 +0100167 if err := unavailableSemaphore.Acquire(ctx, 1); err != nil {
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200168 return err
169 }
Lorenz Brun9ce40712024-02-13 21:54:46 +0100170 wg.Add(1)
171
Tim Windelschmidtb41b5482024-04-18 23:24:01 +0200172 go func(n *apb.Node) {
Lorenz Brun9ce40712024-02-13 21:54:46 +0100173 defer wg.Done()
174 cc := dialAuthenticatedNode(ctx, n.Id, n.Status.ExternalAddress, cacert)
175 nodeMgmt := apb.NewNodeManagementClient(cc)
176 log.Printf("sending update request to: %s (%s)", n.Id, n.Status.ExternalAddress)
177 start := time.Now()
178 _, err := nodeMgmt.UpdateNode(ctx, updateReq)
179 if err != nil {
180 log.Printf("update request to node %s failed: %v", n.Id, err)
181 // A failed UpdateNode does not mean that the node is now unavailable as it
182 // hasn't started activating yet.
183 unavailableSemaphore.Release(1)
184 }
185 // Wait for the internal activation sleep plus the heartbeat
186 // to make sure the node has missed one heartbeat (or is
187 // back up already).
188 time.Sleep((5 + 10) * time.Second)
189 for {
190 select {
191 case <-time.After(10 * time.Second):
192 nodes, err := core.GetNodes(ctx, mgmt, fmt.Sprintf("node.id == %q", n.Id))
193 if err != nil {
194 log.Printf("while getting node status for %s: %v", n.Id, err)
Lorenz Brun76612022024-03-05 19:20:36 +0100195 continue
Lorenz Brun9ce40712024-02-13 21:54:46 +0100196 }
197 if len(nodes) == 0 {
198 log.Printf("node status for %s returned no node", n.Id)
Lorenz Brun76612022024-03-05 19:20:36 +0100199 continue
Lorenz Brun9ce40712024-02-13 21:54:46 +0100200 }
201 if len(nodes) > 1 {
202 log.Printf("node status for %s returned too many nodes (%d)", n.Id, len(nodes))
Lorenz Brun76612022024-03-05 19:20:36 +0100203 continue
Lorenz Brun9ce40712024-02-13 21:54:46 +0100204 }
205 s := nodes[0]
Tim Windelschmidtb41b5482024-04-18 23:24:01 +0200206 if s.Health == apb.Node_HEALTHY {
Lorenz Brun9ce40712024-02-13 21:54:46 +0100207 if s.Status != nil && s.Status.Version != nil {
208 log.Printf("node %s updated in %v to version %s", s.Id, time.Since(start), version.Semver(s.Status.Version))
209 } else {
210 log.Printf("node %s updated in %v to unknown version", s.Id, time.Since(start))
211 }
212 unavailableSemaphore.Release(1)
213 return
214 }
215 case <-ctx.Done():
216 log.Printf("update to node %s incomplete", n.Id)
217 return
218 }
219 }
220 }(n)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200221 }
222
Lorenz Brun9ce40712024-02-13 21:54:46 +0100223 // Wait for all update processes to finish
224 wg.Wait()
225
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200226 return nil
227 },
Tim Windelschmidtfc6e1cf2024-09-18 17:34:07 +0200228 Args: PrintUsageOnWrongArgs(cobra.MinimumNArgs(1)),
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200229}
230
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100231var nodeDeleteCmd = &cobra.Command{
232 Short: "Deletes a node from the cluster.",
233 Use: "delete [NodeID] [--bypass-has-roles] [--bypass-not-decommissioned]",
234 Example: "metroctl node delete metropolis-25fa5f5e9349381d4a5e9e59de0215e3",
235 RunE: func(cmd *cobra.Command, args []string) error {
236 bypassHasRoles, err := cmd.Flags().GetBool("bypass-has-roles")
237 if err != nil {
238 return err
239 }
240
241 bypassNotDecommissioned, err := cmd.Flags().GetBool("bypass-not-decommissioned")
242 if err != nil {
243 return err
244 }
245
Tim Windelschmidtb765f242024-05-08 01:40:02 +0200246 ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100247 mgmt := apb.NewManagementClient(dialAuthenticated(ctx))
248
249 nodes, err := core.GetNodes(ctx, mgmt, fmt.Sprintf("node.id==%q", args[0]))
250 if err != nil {
Tim Windelschmidt58786032024-05-21 13:47:41 +0200251 return fmt.Errorf("while calling Management.GetNodes: %w", err)
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100252 }
253
254 if len(nodes) == 0 {
255 return fmt.Errorf("could not find node with id: %s", args[0])
256 }
257
258 if len(nodes) != 1 {
259 return fmt.Errorf("expected one node, got %d", len(nodes))
260 }
261
262 n := nodes[0]
Lorenz Brun2542ef82024-08-20 13:33:02 +0200263 if n.Status != nil && n.Status.ExternalAddress != "" {
264 log.Printf("deleting node: %s (%s)", n.Id, n.Status.ExternalAddress)
265 } else {
266 log.Printf("deleting node: %s", n.Id)
267 }
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100268
269 req := &apb.DeleteNodeRequest{
270 Node: &apb.DeleteNodeRequest_Id{
271 Id: n.Id,
272 },
273 }
274
275 if bypassHasRoles {
276 req.SafetyBypassHasRoles = &apb.DeleteNodeRequest_SafetyBypassHasRoles{}
277 }
278
279 if bypassNotDecommissioned {
280 req.SafetyBypassNotDecommissioned = &apb.DeleteNodeRequest_SafetyBypassNotDecommissioned{}
281 }
282
283 _, err = mgmt.DeleteNode(ctx, req)
284 return err
285 },
Tim Windelschmidtfc6e1cf2024-09-18 17:34:07 +0200286 Args: PrintUsageOnWrongArgs(cobra.ExactArgs(1)),
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100287}
288
Lorenz Bruncc32cc42024-09-09 20:14:05 +0000289func dialNode(ctx context.Context, node string) (apb.NodeManagementClient, error) {
290 // First connect to the main management service and figure out the node's IP
291 // address.
292 cc := dialAuthenticated(ctx)
293 mgmt := apb.NewManagementClient(cc)
294 nodes, err := core.GetNodes(ctx, mgmt, fmt.Sprintf("node.id == %q", node))
295 if err != nil {
296 return nil, fmt.Errorf("when getting node info: %w", err)
297 }
298
299 if len(nodes) == 0 {
300 return nil, fmt.Errorf("no such node")
301 }
302 if len(nodes) > 1 {
303 return nil, fmt.Errorf("expression matched more than one node")
304 }
305 n := nodes[0]
306 if n.Status == nil || n.Status.ExternalAddress == "" {
307 return nil, fmt.Errorf("node has no external address")
308 }
309
310 cacert, err := core.GetClusterCAWithTOFU(ctx, connectOptions())
311 if err != nil {
312 return nil, fmt.Errorf("could not get CA certificate: %w", err)
313 }
314
315 // Dial the actual node at its management port.
316 cl := dialAuthenticatedNode(ctx, n.Id, n.Status.ExternalAddress, cacert)
317 nmgmt := apb.NewNodeManagementClient(cl)
318 return nmgmt, nil
319}
320
321var nodeRebootCmd = &cobra.Command{
322 Short: "Reboot a node",
323 Long: `Reboot a node.
324
325This command can be used quite flexibly. Without any options it performs a
326normal, firmware-assisted reboot. It can roll back the last update by also
327passing the --rollback option. To reboot quicker the --kexec option can be used
328to skip firmware during reboot and boot straigt into the kernel.
329
330It can also be used to reboot into the firmware (BIOS) setup UI by passing the
331--firmware flag. This flag cannot be combined with any others.
332 `,
333 Use: "reboot [node-id]",
Tim Windelschmidtfc6e1cf2024-09-18 17:34:07 +0200334 Args: PrintUsageOnWrongArgs(cobra.ExactArgs(1)),
Lorenz Bruncc32cc42024-09-09 20:14:05 +0000335 SilenceUsage: true,
336 RunE: func(cmd *cobra.Command, args []string) error {
337 ctx := cmd.Context()
338
339 kexecFlag, err := cmd.Flags().GetBool("kexec")
340 if err != nil {
341 return err
342 }
343 rollbackFlag, err := cmd.Flags().GetBool("rollback")
344 if err != nil {
345 return err
346 }
347 firmwareFlag, err := cmd.Flags().GetBool("firmware")
348 if err != nil {
349 return err
350 }
351
352 if kexecFlag && firmwareFlag {
353 return errors.New("--kexec cannot be used with --firmware as firmware is not involved when using kexec")
354 }
355 if firmwareFlag && rollbackFlag {
356 return errors.New("--firmware cannot be used with --rollback as the next boot won't be into the OS")
357 }
358 var req apb.RebootRequest
359 if kexecFlag {
360 req.Type = apb.RebootRequest_KEXEC
361 } else {
362 req.Type = apb.RebootRequest_FIRMWARE
363 }
364 if firmwareFlag {
365 req.NextBoot = apb.RebootRequest_START_FIRMWARE_UI
366 }
367 if rollbackFlag {
368 req.NextBoot = apb.RebootRequest_START_ROLLBACK
369 }
370
371 nmgmt, err := dialNode(ctx, args[0])
372 if err != nil {
373 return fmt.Errorf("failed to dial node: %w", err)
374 }
375
376 if _, err := nmgmt.Reboot(ctx, &req); err != nil {
377 return fmt.Errorf("reboot RPC failed: %w", err)
378 }
379 fmt.Printf("Node %v is being rebooted", args[0])
380
381 return nil
382 },
383}
384
385var nodePoweroffCmd = &cobra.Command{
386 Short: "Power off a node",
387 Use: "poweroff [node-id]",
Tim Windelschmidtfc6e1cf2024-09-18 17:34:07 +0200388 Args: PrintUsageOnWrongArgs(cobra.ExactArgs(1)),
Lorenz Bruncc32cc42024-09-09 20:14:05 +0000389 SilenceUsage: true,
390 RunE: func(cmd *cobra.Command, args []string) error {
391 ctx := cmd.Context()
392
393 nmgmt, err := dialNode(ctx, args[0])
394 if err != nil {
395 return err
396 }
397
398 if _, err := nmgmt.Reboot(ctx, &apb.RebootRequest{
399 Type: apb.RebootRequest_POWER_OFF,
400 }); err != nil {
401 return fmt.Errorf("reboot RPC failed: %w", err)
402 }
403 fmt.Printf("Node %v is being powered off", args[0])
404
405 return nil
406 },
407}
408
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100409func init() {
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200410 nodeUpdateCmd.Flags().String("bundle-url", "", "The URL to the new version")
411 nodeUpdateCmd.Flags().String("activation-mode", "reboot", "How the update should be activated (kexec, reboot, none)")
Lorenz Brun9ce40712024-02-13 21:54:46 +0100412 nodeUpdateCmd.Flags().Uint64("max-unavailable", 1, "Maximum nodes which can be unavailable during the update process")
Lorenz Bruncceb6a32024-04-16 13:33:15 +0000413 nodeUpdateCmd.Flags().StringArray("exclude", nil, "List of nodes to exclude (useful with the \"all\" argument)")
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200414
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100415 nodeDeleteCmd.Flags().Bool("bypass-has-roles", false, "Allows to bypass the HasRoles check")
416 nodeDeleteCmd.Flags().Bool("bypass-not-decommissioned", false, "Allows to bypass the NotDecommissioned check")
417
Lorenz Bruncc32cc42024-09-09 20:14:05 +0000418 nodeRebootCmd.Flags().Bool("rollback", false, "Reboot into the last OS version in the other slot")
419 nodeRebootCmd.Flags().Bool("firmware", false, "Reboot into the firmware (BIOS) setup UI")
420 nodeRebootCmd.Flags().Bool("kexec", false, "Use kexec to reboot much quicker without going through firmware")
421
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100422 nodeCmd.AddCommand(nodeDescribeCmd)
423 nodeCmd.AddCommand(nodeListCmd)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200424 nodeCmd.AddCommand(nodeUpdateCmd)
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100425 nodeCmd.AddCommand(nodeDeleteCmd)
Lorenz Bruncc32cc42024-09-09 20:14:05 +0000426 nodeCmd.AddCommand(nodeRebootCmd)
427 nodeCmd.AddCommand(nodePoweroffCmd)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100428 rootCmd.AddCommand(nodeCmd)
429}
430
431func printNodes(nodes []*apb.Node, args []string, onlyColumns map[string]bool) {
432 o := io.WriteCloser(os.Stdout)
433 if flags.output != "" {
434 of, err := os.Create(flags.output)
435 if err != nil {
436 log.Fatalf("Couldn't create the output file at %s: %v", flags.output, err)
437 }
438 o = of
439 }
440
441 // Narrow down the output set to supplied node IDs, if any.
442 qids := make(map[string]bool)
443 if len(args) != 0 && args[0] != "all" {
444 for _, a := range args {
445 qids[a] = true
446 }
447 }
448
Serge Bazanskie0c06172023-09-19 12:28:16 +0000449 var t clitable.Table
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100450 for _, n := range nodes {
451 // Filter the information we want client-side.
452 if len(qids) != 0 {
453 nid := identity.NodeID(n.Pubkey)
454 if _, e := qids[nid]; !e {
455 continue
456 }
457 }
Serge Bazanskie0c06172023-09-19 12:28:16 +0000458 t.Add(nodeEntry(n))
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100459 }
460
Serge Bazanskie0c06172023-09-19 12:28:16 +0000461 t.Print(o, onlyColumns)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100462}