blob: 4b3a5461fa505ae93e09f90173e5e26ea18d7b01 [file] [log] [blame]
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +01001package main
2
3import (
4 "context"
Lorenz Brun9ce40712024-02-13 21:54:46 +01005 "errors"
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +02006 "fmt"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +01007 "io"
8 "log"
9 "os"
Tim Windelschmidtb765f242024-05-08 01:40:02 +020010 "os/signal"
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020011 "strings"
Lorenz Brun9ce40712024-02-13 21:54:46 +010012 "sync"
13 "time"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010014
15 "github.com/spf13/cobra"
Lorenz Brun9ce40712024-02-13 21:54:46 +010016 "golang.org/x/sync/semaphore"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010017
Serge Bazanskie0c06172023-09-19 12:28:16 +000018 "source.monogon.dev/go/clitable"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010019 "source.monogon.dev/metropolis/cli/metroctl/core"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010020 "source.monogon.dev/metropolis/node/core/identity"
Lorenz Brun9ce40712024-02-13 21:54:46 +010021 "source.monogon.dev/version"
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020022
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010023 apb "source.monogon.dev/metropolis/proto/api"
24)
25
26var nodeCmd = &cobra.Command{
27 Short: "Updates and queries node information.",
28 Use: "node",
29}
30
31var nodeDescribeCmd = &cobra.Command{
32 Short: "Describes cluster nodes.",
Serge Bazanski98840342024-05-22 13:03:55 +020033 Use: "describe [node-id] [--filter] [--output] [--format] [--columns]",
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010034 Example: "metroctl node describe metropolis-c556e31c3fa2bf0a36e9ccb9fd5d6056",
35 Run: func(cmd *cobra.Command, args []string) {
Tim Windelschmidtb765f242024-05-08 01:40:02 +020036 ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010037 cc := dialAuthenticated(ctx)
38 mgmt := apb.NewManagementClient(cc)
39
40 nodes, err := core.GetNodes(ctx, mgmt, flags.filter)
41 if err != nil {
42 log.Fatalf("While calling Management.GetNodes: %v", err)
43 }
44
Serge Bazanski98840342024-05-22 13:03:55 +020045 var columns map[string]bool
46 if flags.columns != "" {
47 columns = make(map[string]bool)
48 for _, p := range strings.Split(flags.columns, ",") {
49 p = strings.ToLower(p)
50 p = strings.TrimSpace(p)
51 columns[p] = true
52 }
53 }
54 printNodes(nodes, args, columns)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010055 },
56 Args: cobra.ArbitraryArgs,
57}
58
59var nodeListCmd = &cobra.Command{
60 Short: "Lists cluster nodes.",
61 Use: "list [node-id] [--filter] [--output] [--format]",
62 Example: "metroctl node list --filter node.status.external_address==\"10.8.0.2\"",
63 Run: func(cmd *cobra.Command, args []string) {
Tim Windelschmidtb765f242024-05-08 01:40:02 +020064 ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010065 cc := dialAuthenticated(ctx)
66 mgmt := apb.NewManagementClient(cc)
67
68 nodes, err := core.GetNodes(ctx, mgmt, flags.filter)
69 if err != nil {
70 log.Fatalf("While calling Management.GetNodes: %v", err)
71 }
72
73 printNodes(nodes, args, map[string]bool{"node id": true})
74 },
75 Args: cobra.ArbitraryArgs,
76}
77
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020078var nodeUpdateCmd = &cobra.Command{
79 Short: "Updates the operating system of a cluster node.",
Lorenz Brun9ce40712024-02-13 21:54:46 +010080 Use: "update [NodeIDs]",
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020081 Example: "metroctl node update --bundle-url https://example.com/bundle.zip --activation-mode reboot metropolis-25fa5f5e9349381d4a5e9e59de0215e3",
82 RunE: func(cmd *cobra.Command, args []string) error {
83 bundleUrl, err := cmd.Flags().GetString("bundle-url")
84 if err != nil {
85 return err
86 }
87
88 if len(bundleUrl) == 0 {
89 return fmt.Errorf("flag bundle-url is required")
90 }
91
92 activationMode, err := cmd.Flags().GetString("activation-mode")
93 if err != nil {
94 return err
95 }
96
97 var am apb.ActivationMode
98 switch strings.ToLower(activationMode) {
99 case "none":
100 am = apb.ActivationMode_ACTIVATION_NONE
101 case "reboot":
102 am = apb.ActivationMode_ACTIVATION_REBOOT
103 case "kexec":
104 am = apb.ActivationMode_ACTIVATION_KEXEC
105 default:
106 return fmt.Errorf("invalid value for flag activation-mode")
107 }
108
Lorenz Brun9ce40712024-02-13 21:54:46 +0100109 maxUnavailable, err := cmd.Flags().GetUint64("max-unavailable")
110 if err != nil {
111 return err
112 }
113 if maxUnavailable == 0 {
114 return errors.New("unable to update notes with max-unavailable set to zero")
115 }
116 unavailableSemaphore := semaphore.NewWeighted(int64(maxUnavailable))
117
Tim Windelschmidtb765f242024-05-08 01:40:02 +0200118 ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200119
Serge Bazanskic51d47d2024-02-13 18:40:26 +0100120 cacert, err := core.GetClusterCAWithTOFU(ctx, connectOptions())
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200121 if err != nil {
Serge Bazanskic51d47d2024-02-13 18:40:26 +0100122 return fmt.Errorf("could not get CA certificate: %w", err)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200123 }
Serge Bazanskic51d47d2024-02-13 18:40:26 +0100124
125 mgmt := apb.NewManagementClient(dialAuthenticated(ctx))
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200126
127 nodes, err := core.GetNodes(ctx, mgmt, "")
128 if err != nil {
Tim Windelschmidt58786032024-05-21 13:47:41 +0200129 return fmt.Errorf("while calling Management.GetNodes: %w", err)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200130 }
131 // Narrow down the output set to supplied node IDs, if any.
132 qids := make(map[string]bool)
133 if len(args) != 0 && args[0] != "all" {
134 for _, a := range args {
135 qids[a] = true
136 }
137 }
138
Lorenz Bruncceb6a32024-04-16 13:33:15 +0000139 excludedNodesSlice, err := cmd.Flags().GetStringArray("exclude")
140 if err != nil {
141 return err
142 }
143 excludedNodes := make(map[string]bool)
144 for _, n := range excludedNodesSlice {
145 excludedNodes[n] = true
146 }
147
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200148 updateReq := &apb.UpdateNodeRequest{
149 BundleUrl: bundleUrl,
150 ActivationMode: am,
151 }
152
Lorenz Brun9ce40712024-02-13 21:54:46 +0100153 var wg sync.WaitGroup
154
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200155 for _, n := range nodes {
156 // Filter the information we want client-side.
Lorenz Bruncceb6a32024-04-16 13:33:15 +0000157 nid := identity.NodeID(n.Pubkey)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200158 if len(qids) != 0 {
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200159 if _, e := qids[nid]; !e {
160 continue
161 }
162 }
Lorenz Bruncceb6a32024-04-16 13:33:15 +0000163 if excludedNodes[nid] {
164 continue
165 }
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200166
Lorenz Brun9ce40712024-02-13 21:54:46 +0100167 if err := unavailableSemaphore.Acquire(ctx, 1); err != nil {
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200168 return err
169 }
Lorenz Brun9ce40712024-02-13 21:54:46 +0100170 wg.Add(1)
171
Tim Windelschmidtb41b5482024-04-18 23:24:01 +0200172 go func(n *apb.Node) {
Lorenz Brun9ce40712024-02-13 21:54:46 +0100173 defer wg.Done()
174 cc := dialAuthenticatedNode(ctx, n.Id, n.Status.ExternalAddress, cacert)
175 nodeMgmt := apb.NewNodeManagementClient(cc)
176 log.Printf("sending update request to: %s (%s)", n.Id, n.Status.ExternalAddress)
177 start := time.Now()
178 _, err := nodeMgmt.UpdateNode(ctx, updateReq)
179 if err != nil {
180 log.Printf("update request to node %s failed: %v", n.Id, err)
181 // A failed UpdateNode does not mean that the node is now unavailable as it
182 // hasn't started activating yet.
183 unavailableSemaphore.Release(1)
184 }
185 // Wait for the internal activation sleep plus the heartbeat
186 // to make sure the node has missed one heartbeat (or is
187 // back up already).
188 time.Sleep((5 + 10) * time.Second)
189 for {
190 select {
191 case <-time.After(10 * time.Second):
192 nodes, err := core.GetNodes(ctx, mgmt, fmt.Sprintf("node.id == %q", n.Id))
193 if err != nil {
194 log.Printf("while getting node status for %s: %v", n.Id, err)
Lorenz Brun76612022024-03-05 19:20:36 +0100195 continue
Lorenz Brun9ce40712024-02-13 21:54:46 +0100196 }
197 if len(nodes) == 0 {
198 log.Printf("node status for %s returned no node", n.Id)
Lorenz Brun76612022024-03-05 19:20:36 +0100199 continue
Lorenz Brun9ce40712024-02-13 21:54:46 +0100200 }
201 if len(nodes) > 1 {
202 log.Printf("node status for %s returned too many nodes (%d)", n.Id, len(nodes))
Lorenz Brun76612022024-03-05 19:20:36 +0100203 continue
Lorenz Brun9ce40712024-02-13 21:54:46 +0100204 }
205 s := nodes[0]
Tim Windelschmidtb41b5482024-04-18 23:24:01 +0200206 if s.Health == apb.Node_HEALTHY {
Lorenz Brun9ce40712024-02-13 21:54:46 +0100207 if s.Status != nil && s.Status.Version != nil {
208 log.Printf("node %s updated in %v to version %s", s.Id, time.Since(start), version.Semver(s.Status.Version))
209 } else {
210 log.Printf("node %s updated in %v to unknown version", s.Id, time.Since(start))
211 }
212 unavailableSemaphore.Release(1)
213 return
214 }
215 case <-ctx.Done():
216 log.Printf("update to node %s incomplete", n.Id)
217 return
218 }
219 }
220 }(n)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200221 }
222
Lorenz Brun9ce40712024-02-13 21:54:46 +0100223 // Wait for all update processes to finish
224 wg.Wait()
225
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200226 return nil
227 },
Lorenz Brun9ce40712024-02-13 21:54:46 +0100228 Args: cobra.MinimumNArgs(1),
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200229}
230
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100231var nodeDeleteCmd = &cobra.Command{
232 Short: "Deletes a node from the cluster.",
233 Use: "delete [NodeID] [--bypass-has-roles] [--bypass-not-decommissioned]",
234 Example: "metroctl node delete metropolis-25fa5f5e9349381d4a5e9e59de0215e3",
235 RunE: func(cmd *cobra.Command, args []string) error {
236 bypassHasRoles, err := cmd.Flags().GetBool("bypass-has-roles")
237 if err != nil {
238 return err
239 }
240
241 bypassNotDecommissioned, err := cmd.Flags().GetBool("bypass-not-decommissioned")
242 if err != nil {
243 return err
244 }
245
Tim Windelschmidtb765f242024-05-08 01:40:02 +0200246 ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100247 mgmt := apb.NewManagementClient(dialAuthenticated(ctx))
248
249 nodes, err := core.GetNodes(ctx, mgmt, fmt.Sprintf("node.id==%q", args[0]))
250 if err != nil {
Tim Windelschmidt58786032024-05-21 13:47:41 +0200251 return fmt.Errorf("while calling Management.GetNodes: %w", err)
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100252 }
253
254 if len(nodes) == 0 {
255 return fmt.Errorf("could not find node with id: %s", args[0])
256 }
257
258 if len(nodes) != 1 {
259 return fmt.Errorf("expected one node, got %d", len(nodes))
260 }
261
262 n := nodes[0]
Lorenz Brun2542ef82024-08-20 13:33:02 +0200263 if n.Status != nil && n.Status.ExternalAddress != "" {
264 log.Printf("deleting node: %s (%s)", n.Id, n.Status.ExternalAddress)
265 } else {
266 log.Printf("deleting node: %s", n.Id)
267 }
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100268
269 req := &apb.DeleteNodeRequest{
270 Node: &apb.DeleteNodeRequest_Id{
271 Id: n.Id,
272 },
273 }
274
275 if bypassHasRoles {
276 req.SafetyBypassHasRoles = &apb.DeleteNodeRequest_SafetyBypassHasRoles{}
277 }
278
279 if bypassNotDecommissioned {
280 req.SafetyBypassNotDecommissioned = &apb.DeleteNodeRequest_SafetyBypassNotDecommissioned{}
281 }
282
283 _, err = mgmt.DeleteNode(ctx, req)
284 return err
285 },
286 Args: cobra.ExactArgs(1),
287}
288
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100289func init() {
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200290 nodeUpdateCmd.Flags().String("bundle-url", "", "The URL to the new version")
291 nodeUpdateCmd.Flags().String("activation-mode", "reboot", "How the update should be activated (kexec, reboot, none)")
Lorenz Brun9ce40712024-02-13 21:54:46 +0100292 nodeUpdateCmd.Flags().Uint64("max-unavailable", 1, "Maximum nodes which can be unavailable during the update process")
Lorenz Bruncceb6a32024-04-16 13:33:15 +0000293 nodeUpdateCmd.Flags().StringArray("exclude", nil, "List of nodes to exclude (useful with the \"all\" argument)")
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200294
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100295 nodeDeleteCmd.Flags().Bool("bypass-has-roles", false, "Allows to bypass the HasRoles check")
296 nodeDeleteCmd.Flags().Bool("bypass-not-decommissioned", false, "Allows to bypass the NotDecommissioned check")
297
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100298 nodeCmd.AddCommand(nodeDescribeCmd)
299 nodeCmd.AddCommand(nodeListCmd)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200300 nodeCmd.AddCommand(nodeUpdateCmd)
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100301 nodeCmd.AddCommand(nodeDeleteCmd)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100302 rootCmd.AddCommand(nodeCmd)
303}
304
305func printNodes(nodes []*apb.Node, args []string, onlyColumns map[string]bool) {
306 o := io.WriteCloser(os.Stdout)
307 if flags.output != "" {
308 of, err := os.Create(flags.output)
309 if err != nil {
310 log.Fatalf("Couldn't create the output file at %s: %v", flags.output, err)
311 }
312 o = of
313 }
314
315 // Narrow down the output set to supplied node IDs, if any.
316 qids := make(map[string]bool)
317 if len(args) != 0 && args[0] != "all" {
318 for _, a := range args {
319 qids[a] = true
320 }
321 }
322
Serge Bazanskie0c06172023-09-19 12:28:16 +0000323 var t clitable.Table
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100324 for _, n := range nodes {
325 // Filter the information we want client-side.
326 if len(qids) != 0 {
327 nid := identity.NodeID(n.Pubkey)
328 if _, e := qids[nid]; !e {
329 continue
330 }
331 }
Serge Bazanskie0c06172023-09-19 12:28:16 +0000332 t.Add(nodeEntry(n))
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100333 }
334
Serge Bazanskie0c06172023-09-19 12:28:16 +0000335 t.Print(o, onlyColumns)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100336}