blob: 18cec838bec8e587118706a3d90556d42b0048ef [file] [log] [blame]
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +01001package main
2
3import (
4 "context"
Lorenz Brun9ce40712024-02-13 21:54:46 +01005 "errors"
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +02006 "fmt"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +01007 "io"
8 "log"
9 "os"
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020010 "strings"
Lorenz Brun9ce40712024-02-13 21:54:46 +010011 "sync"
12 "time"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010013
14 "github.com/spf13/cobra"
Lorenz Brun9ce40712024-02-13 21:54:46 +010015 "golang.org/x/sync/semaphore"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010016
Serge Bazanskie0c06172023-09-19 12:28:16 +000017 "source.monogon.dev/go/clitable"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010018 "source.monogon.dev/metropolis/cli/metroctl/core"
19 clicontext "source.monogon.dev/metropolis/cli/pkg/context"
20 "source.monogon.dev/metropolis/node/core/identity"
Lorenz Brun9ce40712024-02-13 21:54:46 +010021 "source.monogon.dev/version"
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020022
Lorenz Brun9ce40712024-02-13 21:54:46 +010023 "source.monogon.dev/metropolis/proto/api"
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +010024 apb "source.monogon.dev/metropolis/proto/api"
25)
26
27var nodeCmd = &cobra.Command{
28 Short: "Updates and queries node information.",
29 Use: "node",
30}
31
32var nodeDescribeCmd = &cobra.Command{
33 Short: "Describes cluster nodes.",
34 Use: "describe [node-id] [--filter] [--output] [--format]",
35 Example: "metroctl node describe metropolis-c556e31c3fa2bf0a36e9ccb9fd5d6056",
36 Run: func(cmd *cobra.Command, args []string) {
37 ctx := clicontext.WithInterrupt(context.Background())
38 cc := dialAuthenticated(ctx)
39 mgmt := apb.NewManagementClient(cc)
40
41 nodes, err := core.GetNodes(ctx, mgmt, flags.filter)
42 if err != nil {
43 log.Fatalf("While calling Management.GetNodes: %v", err)
44 }
45
46 printNodes(nodes, args, nil)
47 },
48 Args: cobra.ArbitraryArgs,
49}
50
51var nodeListCmd = &cobra.Command{
52 Short: "Lists cluster nodes.",
53 Use: "list [node-id] [--filter] [--output] [--format]",
54 Example: "metroctl node list --filter node.status.external_address==\"10.8.0.2\"",
55 Run: func(cmd *cobra.Command, args []string) {
56 ctx := clicontext.WithInterrupt(context.Background())
57 cc := dialAuthenticated(ctx)
58 mgmt := apb.NewManagementClient(cc)
59
60 nodes, err := core.GetNodes(ctx, mgmt, flags.filter)
61 if err != nil {
62 log.Fatalf("While calling Management.GetNodes: %v", err)
63 }
64
65 printNodes(nodes, args, map[string]bool{"node id": true})
66 },
67 Args: cobra.ArbitraryArgs,
68}
69
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020070var nodeUpdateCmd = &cobra.Command{
71 Short: "Updates the operating system of a cluster node.",
Lorenz Brun9ce40712024-02-13 21:54:46 +010072 Use: "update [NodeIDs]",
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +020073 Example: "metroctl node update --bundle-url https://example.com/bundle.zip --activation-mode reboot metropolis-25fa5f5e9349381d4a5e9e59de0215e3",
74 RunE: func(cmd *cobra.Command, args []string) error {
75 bundleUrl, err := cmd.Flags().GetString("bundle-url")
76 if err != nil {
77 return err
78 }
79
80 if len(bundleUrl) == 0 {
81 return fmt.Errorf("flag bundle-url is required")
82 }
83
84 activationMode, err := cmd.Flags().GetString("activation-mode")
85 if err != nil {
86 return err
87 }
88
89 var am apb.ActivationMode
90 switch strings.ToLower(activationMode) {
91 case "none":
92 am = apb.ActivationMode_ACTIVATION_NONE
93 case "reboot":
94 am = apb.ActivationMode_ACTIVATION_REBOOT
95 case "kexec":
96 am = apb.ActivationMode_ACTIVATION_KEXEC
97 default:
98 return fmt.Errorf("invalid value for flag activation-mode")
99 }
100
Lorenz Brun9ce40712024-02-13 21:54:46 +0100101 maxUnavailable, err := cmd.Flags().GetUint64("max-unavailable")
102 if err != nil {
103 return err
104 }
105 if maxUnavailable == 0 {
106 return errors.New("unable to update notes with max-unavailable set to zero")
107 }
108 unavailableSemaphore := semaphore.NewWeighted(int64(maxUnavailable))
109
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200110 ctx := clicontext.WithInterrupt(context.Background())
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200111
Serge Bazanskic51d47d2024-02-13 18:40:26 +0100112 cacert, err := core.GetClusterCAWithTOFU(ctx, connectOptions())
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200113 if err != nil {
Serge Bazanskic51d47d2024-02-13 18:40:26 +0100114 return fmt.Errorf("could not get CA certificate: %w", err)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200115 }
Serge Bazanskic51d47d2024-02-13 18:40:26 +0100116
117 mgmt := apb.NewManagementClient(dialAuthenticated(ctx))
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200118
119 nodes, err := core.GetNodes(ctx, mgmt, "")
120 if err != nil {
121 return fmt.Errorf("while calling Management.GetNodes: %v", err)
122 }
123 // Narrow down the output set to supplied node IDs, if any.
124 qids := make(map[string]bool)
125 if len(args) != 0 && args[0] != "all" {
126 for _, a := range args {
127 qids[a] = true
128 }
129 }
130
131 updateReq := &apb.UpdateNodeRequest{
132 BundleUrl: bundleUrl,
133 ActivationMode: am,
134 }
135
Lorenz Brun9ce40712024-02-13 21:54:46 +0100136 var wg sync.WaitGroup
137
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200138 for _, n := range nodes {
139 // Filter the information we want client-side.
140 if len(qids) != 0 {
141 nid := identity.NodeID(n.Pubkey)
142 if _, e := qids[nid]; !e {
143 continue
144 }
145 }
146
Lorenz Brun9ce40712024-02-13 21:54:46 +0100147 if err := unavailableSemaphore.Acquire(ctx, 1); err != nil {
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200148 return err
149 }
Lorenz Brun9ce40712024-02-13 21:54:46 +0100150 wg.Add(1)
151
152 go func(n *api.Node) {
153 defer wg.Done()
154 cc := dialAuthenticatedNode(ctx, n.Id, n.Status.ExternalAddress, cacert)
155 nodeMgmt := apb.NewNodeManagementClient(cc)
156 log.Printf("sending update request to: %s (%s)", n.Id, n.Status.ExternalAddress)
157 start := time.Now()
158 _, err := nodeMgmt.UpdateNode(ctx, updateReq)
159 if err != nil {
160 log.Printf("update request to node %s failed: %v", n.Id, err)
161 // A failed UpdateNode does not mean that the node is now unavailable as it
162 // hasn't started activating yet.
163 unavailableSemaphore.Release(1)
164 }
165 // Wait for the internal activation sleep plus the heartbeat
166 // to make sure the node has missed one heartbeat (or is
167 // back up already).
168 time.Sleep((5 + 10) * time.Second)
169 for {
170 select {
171 case <-time.After(10 * time.Second):
172 nodes, err := core.GetNodes(ctx, mgmt, fmt.Sprintf("node.id == %q", n.Id))
173 if err != nil {
174 log.Printf("while getting node status for %s: %v", n.Id, err)
175 }
176 if len(nodes) == 0 {
177 log.Printf("node status for %s returned no node", n.Id)
178 }
179 if len(nodes) > 1 {
180 log.Printf("node status for %s returned too many nodes (%d)", n.Id, len(nodes))
181 }
182 s := nodes[0]
183 if s.Health == api.Node_HEALTHY {
184 if s.Status != nil && s.Status.Version != nil {
185 log.Printf("node %s updated in %v to version %s", s.Id, time.Since(start), version.Semver(s.Status.Version))
186 } else {
187 log.Printf("node %s updated in %v to unknown version", s.Id, time.Since(start))
188 }
189 unavailableSemaphore.Release(1)
190 return
191 }
192 case <-ctx.Done():
193 log.Printf("update to node %s incomplete", n.Id)
194 return
195 }
196 }
197 }(n)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200198 }
199
Lorenz Brun9ce40712024-02-13 21:54:46 +0100200 // Wait for all update processes to finish
201 wg.Wait()
202
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200203 return nil
204 },
Lorenz Brun9ce40712024-02-13 21:54:46 +0100205 Args: cobra.MinimumNArgs(1),
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200206}
207
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100208var nodeDeleteCmd = &cobra.Command{
209 Short: "Deletes a node from the cluster.",
210 Use: "delete [NodeID] [--bypass-has-roles] [--bypass-not-decommissioned]",
211 Example: "metroctl node delete metropolis-25fa5f5e9349381d4a5e9e59de0215e3",
212 RunE: func(cmd *cobra.Command, args []string) error {
213 bypassHasRoles, err := cmd.Flags().GetBool("bypass-has-roles")
214 if err != nil {
215 return err
216 }
217
218 bypassNotDecommissioned, err := cmd.Flags().GetBool("bypass-not-decommissioned")
219 if err != nil {
220 return err
221 }
222
223 ctx := clicontext.WithInterrupt(context.Background())
224 mgmt := apb.NewManagementClient(dialAuthenticated(ctx))
225
226 nodes, err := core.GetNodes(ctx, mgmt, fmt.Sprintf("node.id==%q", args[0]))
227 if err != nil {
228 return fmt.Errorf("while calling Management.GetNodes: %v", err)
229 }
230
231 if len(nodes) == 0 {
232 return fmt.Errorf("could not find node with id: %s", args[0])
233 }
234
235 if len(nodes) != 1 {
236 return fmt.Errorf("expected one node, got %d", len(nodes))
237 }
238
239 n := nodes[0]
240 log.Printf("deleting node: %s (%s)", n.Id, n.Status.ExternalAddress)
241
242 req := &apb.DeleteNodeRequest{
243 Node: &apb.DeleteNodeRequest_Id{
244 Id: n.Id,
245 },
246 }
247
248 if bypassHasRoles {
249 req.SafetyBypassHasRoles = &apb.DeleteNodeRequest_SafetyBypassHasRoles{}
250 }
251
252 if bypassNotDecommissioned {
253 req.SafetyBypassNotDecommissioned = &apb.DeleteNodeRequest_SafetyBypassNotDecommissioned{}
254 }
255
256 _, err = mgmt.DeleteNode(ctx, req)
257 return err
258 },
259 Args: cobra.ExactArgs(1),
260}
261
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100262func init() {
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200263 nodeUpdateCmd.Flags().String("bundle-url", "", "The URL to the new version")
264 nodeUpdateCmd.Flags().String("activation-mode", "reboot", "How the update should be activated (kexec, reboot, none)")
Lorenz Brun9ce40712024-02-13 21:54:46 +0100265 nodeUpdateCmd.Flags().Uint64("max-unavailable", 1, "Maximum nodes which can be unavailable during the update process")
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200266
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100267 nodeDeleteCmd.Flags().Bool("bypass-has-roles", false, "Allows to bypass the HasRoles check")
268 nodeDeleteCmd.Flags().Bool("bypass-not-decommissioned", false, "Allows to bypass the NotDecommissioned check")
269
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100270 nodeCmd.AddCommand(nodeDescribeCmd)
271 nodeCmd.AddCommand(nodeListCmd)
Tim Windelschmidt3b25cf72023-07-17 16:58:10 +0200272 nodeCmd.AddCommand(nodeUpdateCmd)
Tim Windelschmidt7dbf18c2023-10-31 22:39:42 +0100273 nodeCmd.AddCommand(nodeDeleteCmd)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100274 rootCmd.AddCommand(nodeCmd)
275}
276
277func printNodes(nodes []*apb.Node, args []string, onlyColumns map[string]bool) {
278 o := io.WriteCloser(os.Stdout)
279 if flags.output != "" {
280 of, err := os.Create(flags.output)
281 if err != nil {
282 log.Fatalf("Couldn't create the output file at %s: %v", flags.output, err)
283 }
284 o = of
285 }
286
287 // Narrow down the output set to supplied node IDs, if any.
288 qids := make(map[string]bool)
289 if len(args) != 0 && args[0] != "all" {
290 for _, a := range args {
291 qids[a] = true
292 }
293 }
294
Serge Bazanskie0c06172023-09-19 12:28:16 +0000295 var t clitable.Table
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100296 for _, n := range nodes {
297 // Filter the information we want client-side.
298 if len(qids) != 0 {
299 nid := identity.NodeID(n.Pubkey)
300 if _, e := qids[nid]; !e {
301 continue
302 }
303 }
Serge Bazanskie0c06172023-09-19 12:28:16 +0000304 t.Add(nodeEntry(n))
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100305 }
306
Serge Bazanskie0c06172023-09-19 12:28:16 +0000307 t.Print(o, onlyColumns)
Serge Bazanskicfbbbdb2023-03-22 17:48:08 +0100308}