blob: 0476d998533ef9bf459081d601e73a0cdd6b05d0 [file] [log] [blame]
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +01001package main
2
3import (
4 "context"
Lorenz Brun9ce40712024-02-13 21:54:46 +01005 "errors"
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +02006 "fmt"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +01007 "io"
8 "log"
9 "os"
Tim Windelschmidtb765f242024-05-08 01:40:02 +020010 "os/signal"
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020011 "strings"
Lorenz Brun9ce40712024-02-13 21:54:46 +010012 "sync"
13 "time"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010014
15 "github.com/spf13/cobra"
Lorenz Brun9ce40712024-02-13 21:54:46 +010016 "golang.org/x/sync/semaphore"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010017
Serge Bazanskie0c06172023-09-19 12:28:16 +000018 "source.monogon.dev/go/clitable"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010019 "source.monogon.dev/metropolis/cli/metroctl/core"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010020 "source.monogon.dev/metropolis/node/core/identity"
Lorenz Brun9ce40712024-02-13 21:54:46 +010021 "source.monogon.dev/version"
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020022
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010023 apb "source.monogon.dev/metropolis/proto/api"
24)
25
26var nodeCmd = &cobra.Command{
27 Short: "Updates and queries node information.",
28 Use: "node",
29}
30
31var nodeDescribeCmd = &cobra.Command{
32 Short: "Describes cluster nodes.",
33 Use: "describe [node-id] [--filter] [--output] [--format]",
34 Example: "metroctl node describe metropolis-c556e31c3fa2bf0a36e9ccb9fd5d6056",
35 Run: func(cmd *cobra.Command, args []string) {
Tim Windelschmidtb765f242024-05-08 01:40:02 +020036 ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010037 cc := dialAuthenticated(ctx)
38 mgmt := apb.NewManagementClient(cc)
39
40 nodes, err := core.GetNodes(ctx, mgmt, flags.filter)
41 if err != nil {
42 log.Fatalf("While calling Management.GetNodes: %v", err)
43 }
44
45 printNodes(nodes, args, nil)
46 },
47 Args: cobra.ArbitraryArgs,
48}
49
50var nodeListCmd = &cobra.Command{
51 Short: "Lists cluster nodes.",
52 Use: "list [node-id] [--filter] [--output] [--format]",
53 Example: "metroctl node list --filter node.status.external_address==\"10.8.0.2\"",
54 Run: func(cmd *cobra.Command, args []string) {
Tim Windelschmidtb765f242024-05-08 01:40:02 +020055 ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010056 cc := dialAuthenticated(ctx)
57 mgmt := apb.NewManagementClient(cc)
58
59 nodes, err := core.GetNodes(ctx, mgmt, flags.filter)
60 if err != nil {
61 log.Fatalf("While calling Management.GetNodes: %v", err)
62 }
63
64 printNodes(nodes, args, map[string]bool{"node id": true})
65 },
66 Args: cobra.ArbitraryArgs,
67}
68
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020069var nodeUpdateCmd = &cobra.Command{
70 Short: "Updates the operating system of a cluster node.",
Lorenz Brun9ce40712024-02-13 21:54:46 +010071 Use: "update [NodeIDs]",
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020072 Example: "metroctl node update --bundle-url https://example.com/bundle.zip --activation-mode reboot metropolis-25fa5f5e9349381d4a5e9e59de0215e3",
73 RunE: func(cmd *cobra.Command, args []string) error {
74 bundleUrl, err := cmd.Flags().GetString("bundle-url")
75 if err != nil {
76 return err
77 }
78
79 if len(bundleUrl) == 0 {
80 return fmt.Errorf("flag bundle-url is required")
81 }
82
83 activationMode, err := cmd.Flags().GetString("activation-mode")
84 if err != nil {
85 return err
86 }
87
88 var am apb.ActivationMode
89 switch strings.ToLower(activationMode) {
90 case "none":
91 am = apb.ActivationMode_ACTIVATION_NONE
92 case "reboot":
93 am = apb.ActivationMode_ACTIVATION_REBOOT
94 case "kexec":
95 am = apb.ActivationMode_ACTIVATION_KEXEC
96 default:
97 return fmt.Errorf("invalid value for flag activation-mode")
98 }
99
Lorenz Brun9ce40712024-02-13 21:54:46 +0100100 maxUnavailable, err := cmd.Flags().GetUint64("max-unavailable")
101 if err != nil {
102 return err
103 }
104 if maxUnavailable == 0 {
105 return errors.New("unable to update notes with max-unavailable set to zero")
106 }
107 unavailableSemaphore := semaphore.NewWeighted(int64(maxUnavailable))
108
Tim Windelschmidtb765f242024-05-08 01:40:02 +0200109 ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200110
Serge Bazanskic51d47d2024-02-13 18:40:26 +0100111 cacert, err := core.GetClusterCAWithTOFU(ctx, connectOptions())
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200112 if err != nil {
Serge Bazanskic51d47d2024-02-13 18:40:26 +0100113 return fmt.Errorf("could not get CA certificate: %w", err)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200114 }
Serge Bazanskic51d47d2024-02-13 18:40:26 +0100115
116 mgmt := apb.NewManagementClient(dialAuthenticated(ctx))
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200117
118 nodes, err := core.GetNodes(ctx, mgmt, "")
119 if err != nil {
Tim Windelschmidt58786032024-05-21 13:47:41 +0200120 return fmt.Errorf("while calling Management.GetNodes: %w", err)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200121 }
122 // Narrow down the output set to supplied node IDs, if any.
123 qids := make(map[string]bool)
124 if len(args) != 0 && args[0] != "all" {
125 for _, a := range args {
126 qids[a] = true
127 }
128 }
129
Lorenz Bruncceb6a32024-04-16 13:33:15 +0000130 excludedNodesSlice, err := cmd.Flags().GetStringArray("exclude")
131 if err != nil {
132 return err
133 }
134 excludedNodes := make(map[string]bool)
135 for _, n := range excludedNodesSlice {
136 excludedNodes[n] = true
137 }
138
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200139 updateReq := &apb.UpdateNodeRequest{
140 BundleUrl: bundleUrl,
141 ActivationMode: am,
142 }
143
Lorenz Brun9ce40712024-02-13 21:54:46 +0100144 var wg sync.WaitGroup
145
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200146 for _, n := range nodes {
147 // Filter the information we want client-side.
Lorenz Bruncceb6a32024-04-16 13:33:15 +0000148 nid := identity.NodeID(n.Pubkey)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200149 if len(qids) != 0 {
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200150 if _, e := qids[nid]; !e {
151 continue
152 }
153 }
Lorenz Bruncceb6a32024-04-16 13:33:15 +0000154 if excludedNodes[nid] {
155 continue
156 }
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200157
Lorenz Brun9ce40712024-02-13 21:54:46 +0100158 if err := unavailableSemaphore.Acquire(ctx, 1); err != nil {
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200159 return err
160 }
Lorenz Brun9ce40712024-02-13 21:54:46 +0100161 wg.Add(1)
162
Tim Windelschmidtb41b5482024-04-18 23:24:01 +0200163 go func(n *apb.Node) {
Lorenz Brun9ce40712024-02-13 21:54:46 +0100164 defer wg.Done()
165 cc := dialAuthenticatedNode(ctx, n.Id, n.Status.ExternalAddress, cacert)
166 nodeMgmt := apb.NewNodeManagementClient(cc)
167 log.Printf("sending update request to: %s (%s)", n.Id, n.Status.ExternalAddress)
168 start := time.Now()
169 _, err := nodeMgmt.UpdateNode(ctx, updateReq)
170 if err != nil {
171 log.Printf("update request to node %s failed: %v", n.Id, err)
172 // A failed UpdateNode does not mean that the node is now unavailable as it
173 // hasn't started activating yet.
174 unavailableSemaphore.Release(1)
175 }
176 // Wait for the internal activation sleep plus the heartbeat
177 // to make sure the node has missed one heartbeat (or is
178 // back up already).
179 time.Sleep((5 + 10) * time.Second)
180 for {
181 select {
182 case <-time.After(10 * time.Second):
183 nodes, err := core.GetNodes(ctx, mgmt, fmt.Sprintf("node.id == %q", n.Id))
184 if err != nil {
185 log.Printf("while getting node status for %s: %v", n.Id, err)
Lorenz Brun76612022024-03-05 19:20:36 +0100186 continue
Lorenz Brun9ce40712024-02-13 21:54:46 +0100187 }
188 if len(nodes) == 0 {
189 log.Printf("node status for %s returned no node", n.Id)
Lorenz Brun76612022024-03-05 19:20:36 +0100190 continue
Lorenz Brun9ce40712024-02-13 21:54:46 +0100191 }
192 if len(nodes) > 1 {
193 log.Printf("node status for %s returned too many nodes (%d)", n.Id, len(nodes))
Lorenz Brun76612022024-03-05 19:20:36 +0100194 continue
Lorenz Brun9ce40712024-02-13 21:54:46 +0100195 }
196 s := nodes[0]
Tim Windelschmidtb41b5482024-04-18 23:24:01 +0200197 if s.Health == apb.Node_HEALTHY {
Lorenz Brun9ce40712024-02-13 21:54:46 +0100198 if s.Status != nil && s.Status.Version != nil {
199 log.Printf("node %s updated in %v to version %s", s.Id, time.Since(start), version.Semver(s.Status.Version))
200 } else {
201 log.Printf("node %s updated in %v to unknown version", s.Id, time.Since(start))
202 }
203 unavailableSemaphore.Release(1)
204 return
205 }
206 case <-ctx.Done():
207 log.Printf("update to node %s incomplete", n.Id)
208 return
209 }
210 }
211 }(n)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200212 }
213
Lorenz Brun9ce40712024-02-13 21:54:46 +0100214 // Wait for all update processes to finish
215 wg.Wait()
216
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200217 return nil
218 },
Lorenz Brun9ce40712024-02-13 21:54:46 +0100219 Args: cobra.MinimumNArgs(1),
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200220}
221
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100222var nodeDeleteCmd = &cobra.Command{
223 Short: "Deletes a node from the cluster.",
224 Use: "delete [NodeID] [--bypass-has-roles] [--bypass-not-decommissioned]",
225 Example: "metroctl node delete metropolis-25fa5f5e9349381d4a5e9e59de0215e3",
226 RunE: func(cmd *cobra.Command, args []string) error {
227 bypassHasRoles, err := cmd.Flags().GetBool("bypass-has-roles")
228 if err != nil {
229 return err
230 }
231
232 bypassNotDecommissioned, err := cmd.Flags().GetBool("bypass-not-decommissioned")
233 if err != nil {
234 return err
235 }
236
Tim Windelschmidtb765f242024-05-08 01:40:02 +0200237 ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100238 mgmt := apb.NewManagementClient(dialAuthenticated(ctx))
239
240 nodes, err := core.GetNodes(ctx, mgmt, fmt.Sprintf("node.id==%q", args[0]))
241 if err != nil {
Tim Windelschmidt58786032024-05-21 13:47:41 +0200242 return fmt.Errorf("while calling Management.GetNodes: %w", err)
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100243 }
244
245 if len(nodes) == 0 {
246 return fmt.Errorf("could not find node with id: %s", args[0])
247 }
248
249 if len(nodes) != 1 {
250 return fmt.Errorf("expected one node, got %d", len(nodes))
251 }
252
253 n := nodes[0]
254 log.Printf("deleting node: %s (%s)", n.Id, n.Status.ExternalAddress)
255
256 req := &apb.DeleteNodeRequest{
257 Node: &apb.DeleteNodeRequest_Id{
258 Id: n.Id,
259 },
260 }
261
262 if bypassHasRoles {
263 req.SafetyBypassHasRoles = &apb.DeleteNodeRequest_SafetyBypassHasRoles{}
264 }
265
266 if bypassNotDecommissioned {
267 req.SafetyBypassNotDecommissioned = &apb.DeleteNodeRequest_SafetyBypassNotDecommissioned{}
268 }
269
270 _, err = mgmt.DeleteNode(ctx, req)
271 return err
272 },
273 Args: cobra.ExactArgs(1),
274}
275
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100276func init() {
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200277 nodeUpdateCmd.Flags().String("bundle-url", "", "The URL to the new version")
278 nodeUpdateCmd.Flags().String("activation-mode", "reboot", "How the update should be activated (kexec, reboot, none)")
Lorenz Brun9ce40712024-02-13 21:54:46 +0100279 nodeUpdateCmd.Flags().Uint64("max-unavailable", 1, "Maximum nodes which can be unavailable during the update process")
Lorenz Bruncceb6a32024-04-16 13:33:15 +0000280 nodeUpdateCmd.Flags().StringArray("exclude", nil, "List of nodes to exclude (useful with the \"all\" argument)")
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200281
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100282 nodeDeleteCmd.Flags().Bool("bypass-has-roles", false, "Allows to bypass the HasRoles check")
283 nodeDeleteCmd.Flags().Bool("bypass-not-decommissioned", false, "Allows to bypass the NotDecommissioned check")
284
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100285 nodeCmd.AddCommand(nodeDescribeCmd)
286 nodeCmd.AddCommand(nodeListCmd)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200287 nodeCmd.AddCommand(nodeUpdateCmd)
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100288 nodeCmd.AddCommand(nodeDeleteCmd)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100289 rootCmd.AddCommand(nodeCmd)
290}
291
292func printNodes(nodes []*apb.Node, args []string, onlyColumns map[string]bool) {
293 o := io.WriteCloser(os.Stdout)
294 if flags.output != "" {
295 of, err := os.Create(flags.output)
296 if err != nil {
297 log.Fatalf("Couldn't create the output file at %s: %v", flags.output, err)
298 }
299 o = of
300 }
301
302 // Narrow down the output set to supplied node IDs, if any.
303 qids := make(map[string]bool)
304 if len(args) != 0 && args[0] != "all" {
305 for _, a := range args {
306 qids[a] = true
307 }
308 }
309
Serge Bazanskie0c06172023-09-19 12:28:16 +0000310 var t clitable.Table
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100311 for _, n := range nodes {
312 // Filter the information we want client-side.
313 if len(qids) != 0 {
314 nid := identity.NodeID(n.Pubkey)
315 if _, e := qids[nid]; !e {
316 continue
317 }
318 }
Serge Bazanskie0c06172023-09-19 12:28:16 +0000319 t.Add(nodeEntry(n))
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100320 }
321
Serge Bazanskie0c06172023-09-19 12:28:16 +0000322 t.Print(o, onlyColumns)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100323}