blob: 5332009b16bd694966c24a6654090079b713ee28 [file] [log] [blame]
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +01001package main
2
3import (
4 "context"
Lorenz Brun9ce40712024-02-13 21:54:46 +01005 "errors"
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +02006 "fmt"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +01007 "io"
8 "log"
9 "os"
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020010 "strings"
Lorenz Brun9ce40712024-02-13 21:54:46 +010011 "sync"
12 "time"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010013
14 "github.com/spf13/cobra"
Lorenz Brun9ce40712024-02-13 21:54:46 +010015 "golang.org/x/sync/semaphore"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010016
Serge Bazanskie0c06172023-09-19 12:28:16 +000017 "source.monogon.dev/go/clitable"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010018 "source.monogon.dev/metropolis/cli/metroctl/core"
19 clicontext "source.monogon.dev/metropolis/cli/pkg/context"
20 "source.monogon.dev/metropolis/node/core/identity"
Lorenz Brun9ce40712024-02-13 21:54:46 +010021 "source.monogon.dev/version"
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020022
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010023 apb "source.monogon.dev/metropolis/proto/api"
24)
25
26var nodeCmd = &cobra.Command{
27 Short: "Updates and queries node information.",
28 Use: "node",
29}
30
31var nodeDescribeCmd = &cobra.Command{
32 Short: "Describes cluster nodes.",
33 Use: "describe [node-id] [--filter] [--output] [--format]",
34 Example: "metroctl node describe metropolis-c556e31c3fa2bf0a36e9ccb9fd5d6056",
35 Run: func(cmd *cobra.Command, args []string) {
36 ctx := clicontext.WithInterrupt(context.Background())
37 cc := dialAuthenticated(ctx)
38 mgmt := apb.NewManagementClient(cc)
39
40 nodes, err := core.GetNodes(ctx, mgmt, flags.filter)
41 if err != nil {
42 log.Fatalf("While calling Management.GetNodes: %v", err)
43 }
44
45 printNodes(nodes, args, nil)
46 },
47 Args: cobra.ArbitraryArgs,
48}
49
50var nodeListCmd = &cobra.Command{
51 Short: "Lists cluster nodes.",
52 Use: "list [node-id] [--filter] [--output] [--format]",
53 Example: "metroctl node list --filter node.status.external_address==\"10.8.0.2\"",
54 Run: func(cmd *cobra.Command, args []string) {
55 ctx := clicontext.WithInterrupt(context.Background())
56 cc := dialAuthenticated(ctx)
57 mgmt := apb.NewManagementClient(cc)
58
59 nodes, err := core.GetNodes(ctx, mgmt, flags.filter)
60 if err != nil {
61 log.Fatalf("While calling Management.GetNodes: %v", err)
62 }
63
64 printNodes(nodes, args, map[string]bool{"node id": true})
65 },
66 Args: cobra.ArbitraryArgs,
67}
68
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020069var nodeUpdateCmd = &cobra.Command{
70 Short: "Updates the operating system of a cluster node.",
Lorenz Brun9ce40712024-02-13 21:54:46 +010071 Use: "update [NodeIDs]",
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020072 Example: "metroctl node update --bundle-url https://example.com/bundle.zip --activation-mode reboot metropolis-25fa5f5e9349381d4a5e9e59de0215e3",
73 RunE: func(cmd *cobra.Command, args []string) error {
74 bundleUrl, err := cmd.Flags().GetString("bundle-url")
75 if err != nil {
76 return err
77 }
78
79 if len(bundleUrl) == 0 {
80 return fmt.Errorf("flag bundle-url is required")
81 }
82
83 activationMode, err := cmd.Flags().GetString("activation-mode")
84 if err != nil {
85 return err
86 }
87
88 var am apb.ActivationMode
89 switch strings.ToLower(activationMode) {
90 case "none":
91 am = apb.ActivationMode_ACTIVATION_NONE
92 case "reboot":
93 am = apb.ActivationMode_ACTIVATION_REBOOT
94 case "kexec":
95 am = apb.ActivationMode_ACTIVATION_KEXEC
96 default:
97 return fmt.Errorf("invalid value for flag activation-mode")
98 }
99
Lorenz Brun9ce40712024-02-13 21:54:46 +0100100 maxUnavailable, err := cmd.Flags().GetUint64("max-unavailable")
101 if err != nil {
102 return err
103 }
104 if maxUnavailable == 0 {
105 return errors.New("unable to update notes with max-unavailable set to zero")
106 }
107 unavailableSemaphore := semaphore.NewWeighted(int64(maxUnavailable))
108
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200109 ctx := clicontext.WithInterrupt(context.Background())
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200110
Serge Bazanskic51d47d2024-02-13 18:40:26 +0100111 cacert, err := core.GetClusterCAWithTOFU(ctx, connectOptions())
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200112 if err != nil {
Serge Bazanskic51d47d2024-02-13 18:40:26 +0100113 return fmt.Errorf("could not get CA certificate: %w", err)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200114 }
Serge Bazanskic51d47d2024-02-13 18:40:26 +0100115
116 mgmt := apb.NewManagementClient(dialAuthenticated(ctx))
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200117
118 nodes, err := core.GetNodes(ctx, mgmt, "")
119 if err != nil {
120 return fmt.Errorf("while calling Management.GetNodes: %v", err)
121 }
122 // Narrow down the output set to supplied node IDs, if any.
123 qids := make(map[string]bool)
124 if len(args) != 0 && args[0] != "all" {
125 for _, a := range args {
126 qids[a] = true
127 }
128 }
129
130 updateReq := &apb.UpdateNodeRequest{
131 BundleUrl: bundleUrl,
132 ActivationMode: am,
133 }
134
Lorenz Brun9ce40712024-02-13 21:54:46 +0100135 var wg sync.WaitGroup
136
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200137 for _, n := range nodes {
138 // Filter the information we want client-side.
139 if len(qids) != 0 {
140 nid := identity.NodeID(n.Pubkey)
141 if _, e := qids[nid]; !e {
142 continue
143 }
144 }
145
Lorenz Brun9ce40712024-02-13 21:54:46 +0100146 if err := unavailableSemaphore.Acquire(ctx, 1); err != nil {
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200147 return err
148 }
Lorenz Brun9ce40712024-02-13 21:54:46 +0100149 wg.Add(1)
150
Tim Windelschmidtb41b5482024-04-18 23:24:01 +0200151 go func(n *apb.Node) {
Lorenz Brun9ce40712024-02-13 21:54:46 +0100152 defer wg.Done()
153 cc := dialAuthenticatedNode(ctx, n.Id, n.Status.ExternalAddress, cacert)
154 nodeMgmt := apb.NewNodeManagementClient(cc)
155 log.Printf("sending update request to: %s (%s)", n.Id, n.Status.ExternalAddress)
156 start := time.Now()
157 _, err := nodeMgmt.UpdateNode(ctx, updateReq)
158 if err != nil {
159 log.Printf("update request to node %s failed: %v", n.Id, err)
160 // A failed UpdateNode does not mean that the node is now unavailable as it
161 // hasn't started activating yet.
162 unavailableSemaphore.Release(1)
163 }
164 // Wait for the internal activation sleep plus the heartbeat
165 // to make sure the node has missed one heartbeat (or is
166 // back up already).
167 time.Sleep((5 + 10) * time.Second)
168 for {
169 select {
170 case <-time.After(10 * time.Second):
171 nodes, err := core.GetNodes(ctx, mgmt, fmt.Sprintf("node.id == %q", n.Id))
172 if err != nil {
173 log.Printf("while getting node status for %s: %v", n.Id, err)
Lorenz Brun76612022024-03-05 19:20:36 +0100174 continue
Lorenz Brun9ce40712024-02-13 21:54:46 +0100175 }
176 if len(nodes) == 0 {
177 log.Printf("node status for %s returned no node", n.Id)
Lorenz Brun76612022024-03-05 19:20:36 +0100178 continue
Lorenz Brun9ce40712024-02-13 21:54:46 +0100179 }
180 if len(nodes) > 1 {
181 log.Printf("node status for %s returned too many nodes (%d)", n.Id, len(nodes))
Lorenz Brun76612022024-03-05 19:20:36 +0100182 continue
Lorenz Brun9ce40712024-02-13 21:54:46 +0100183 }
184 s := nodes[0]
Tim Windelschmidtb41b5482024-04-18 23:24:01 +0200185 if s.Health == apb.Node_HEALTHY {
Lorenz Brun9ce40712024-02-13 21:54:46 +0100186 if s.Status != nil && s.Status.Version != nil {
187 log.Printf("node %s updated in %v to version %s", s.Id, time.Since(start), version.Semver(s.Status.Version))
188 } else {
189 log.Printf("node %s updated in %v to unknown version", s.Id, time.Since(start))
190 }
191 unavailableSemaphore.Release(1)
192 return
193 }
194 case <-ctx.Done():
195 log.Printf("update to node %s incomplete", n.Id)
196 return
197 }
198 }
199 }(n)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200200 }
201
Lorenz Brun9ce40712024-02-13 21:54:46 +0100202 // Wait for all update processes to finish
203 wg.Wait()
204
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200205 return nil
206 },
Lorenz Brun9ce40712024-02-13 21:54:46 +0100207 Args: cobra.MinimumNArgs(1),
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200208}
209
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100210var nodeDeleteCmd = &cobra.Command{
211 Short: "Deletes a node from the cluster.",
212 Use: "delete [NodeID] [--bypass-has-roles] [--bypass-not-decommissioned]",
213 Example: "metroctl node delete metropolis-25fa5f5e9349381d4a5e9e59de0215e3",
214 RunE: func(cmd *cobra.Command, args []string) error {
215 bypassHasRoles, err := cmd.Flags().GetBool("bypass-has-roles")
216 if err != nil {
217 return err
218 }
219
220 bypassNotDecommissioned, err := cmd.Flags().GetBool("bypass-not-decommissioned")
221 if err != nil {
222 return err
223 }
224
225 ctx := clicontext.WithInterrupt(context.Background())
226 mgmt := apb.NewManagementClient(dialAuthenticated(ctx))
227
228 nodes, err := core.GetNodes(ctx, mgmt, fmt.Sprintf("node.id==%q", args[0]))
229 if err != nil {
230 return fmt.Errorf("while calling Management.GetNodes: %v", err)
231 }
232
233 if len(nodes) == 0 {
234 return fmt.Errorf("could not find node with id: %s", args[0])
235 }
236
237 if len(nodes) != 1 {
238 return fmt.Errorf("expected one node, got %d", len(nodes))
239 }
240
241 n := nodes[0]
242 log.Printf("deleting node: %s (%s)", n.Id, n.Status.ExternalAddress)
243
244 req := &apb.DeleteNodeRequest{
245 Node: &apb.DeleteNodeRequest_Id{
246 Id: n.Id,
247 },
248 }
249
250 if bypassHasRoles {
251 req.SafetyBypassHasRoles = &apb.DeleteNodeRequest_SafetyBypassHasRoles{}
252 }
253
254 if bypassNotDecommissioned {
255 req.SafetyBypassNotDecommissioned = &apb.DeleteNodeRequest_SafetyBypassNotDecommissioned{}
256 }
257
258 _, err = mgmt.DeleteNode(ctx, req)
259 return err
260 },
261 Args: cobra.ExactArgs(1),
262}
263
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100264func init() {
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200265 nodeUpdateCmd.Flags().String("bundle-url", "", "The URL to the new version")
266 nodeUpdateCmd.Flags().String("activation-mode", "reboot", "How the update should be activated (kexec, reboot, none)")
Lorenz Brun9ce40712024-02-13 21:54:46 +0100267 nodeUpdateCmd.Flags().Uint64("max-unavailable", 1, "Maximum nodes which can be unavailable during the update process")
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200268
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100269 nodeDeleteCmd.Flags().Bool("bypass-has-roles", false, "Allows to bypass the HasRoles check")
270 nodeDeleteCmd.Flags().Bool("bypass-not-decommissioned", false, "Allows to bypass the NotDecommissioned check")
271
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100272 nodeCmd.AddCommand(nodeDescribeCmd)
273 nodeCmd.AddCommand(nodeListCmd)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200274 nodeCmd.AddCommand(nodeUpdateCmd)
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100275 nodeCmd.AddCommand(nodeDeleteCmd)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100276 rootCmd.AddCommand(nodeCmd)
277}
278
279func printNodes(nodes []*apb.Node, args []string, onlyColumns map[string]bool) {
280 o := io.WriteCloser(os.Stdout)
281 if flags.output != "" {
282 of, err := os.Create(flags.output)
283 if err != nil {
284 log.Fatalf("Couldn't create the output file at %s: %v", flags.output, err)
285 }
286 o = of
287 }
288
289 // Narrow down the output set to supplied node IDs, if any.
290 qids := make(map[string]bool)
291 if len(args) != 0 && args[0] != "all" {
292 for _, a := range args {
293 qids[a] = true
294 }
295 }
296
Serge Bazanskie0c06172023-09-19 12:28:16 +0000297 var t clitable.Table
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100298 for _, n := range nodes {
299 // Filter the information we want client-side.
300 if len(qids) != 0 {
301 nid := identity.NodeID(n.Pubkey)
302 if _, e := qids[nid]; !e {
303 continue
304 }
305 }
Serge Bazanskie0c06172023-09-19 12:28:16 +0000306 t.Add(nodeEntry(n))
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100307 }
308
Serge Bazanskie0c06172023-09-19 12:28:16 +0000309 t.Print(o, onlyColumns)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100310}