Serge Bazanski | 6dff6d6 | 2022-01-28 18:15:14 +0100 | [diff] [blame] | 1 | package roleserve |
| 2 | |
| 3 | import ( |
| 4 | "context" |
| 5 | "fmt" |
| 6 | |
Serge Bazanski | 186109c | 2023-06-21 16:57:36 +0200 | [diff] [blame] | 7 | "google.golang.org/protobuf/proto" |
| 8 | |
| 9 | "source.monogon.dev/metropolis/node/core/localstorage" |
Serge Bazanski | 37110c3 | 2023-03-01 13:57:27 +0000 | [diff] [blame] | 10 | "source.monogon.dev/metropolis/pkg/event/memory" |
Serge Bazanski | 6dff6d6 | 2022-01-28 18:15:14 +0100 | [diff] [blame] | 11 | "source.monogon.dev/metropolis/pkg/supervisor" |
Serge Bazanski | 186109c | 2023-06-21 16:57:36 +0200 | [diff] [blame] | 12 | |
| 13 | ipb "source.monogon.dev/metropolis/node/core/curator/proto/api" |
Serge Bazanski | 37110c3 | 2023-03-01 13:57:27 +0000 | [diff] [blame] | 14 | cpb "source.monogon.dev/metropolis/proto/common" |
Serge Bazanski | 6dff6d6 | 2022-01-28 18:15:14 +0100 | [diff] [blame] | 15 | ) |
| 16 | |
| 17 | // workerRoleFetch is the Role Fetcher, an internal bookkeeping service |
Serge Bazanski | fe3d8fd | 2023-05-30 20:50:09 +0200 | [diff] [blame] | 18 | // responsible for populating localRoles based on a curatorConnection whenever |
Serge Bazanski | 6dff6d6 | 2022-01-28 18:15:14 +0100 | [diff] [blame] | 19 | // the node is HOME and cluster credentials / curator access is available. |
| 20 | type workerRoleFetch struct { |
Serge Bazanski | 186109c | 2023-06-21 16:57:36 +0200 | [diff] [blame] | 21 | storageRoot *localstorage.Root |
Serge Bazanski | fe3d8fd | 2023-05-30 20:50:09 +0200 | [diff] [blame] | 22 | curatorConnection *memory.Value[*curatorConnection] |
Serge Bazanski | 6dff6d6 | 2022-01-28 18:15:14 +0100 | [diff] [blame] | 23 | |
| 24 | // localRoles will be written. |
Serge Bazanski | 37110c3 | 2023-03-01 13:57:27 +0000 | [diff] [blame] | 25 | localRoles *memory.Value[*cpb.NodeRoles] |
Serge Bazanski | 6dff6d6 | 2022-01-28 18:15:14 +0100 | [diff] [blame] | 26 | } |
| 27 | |
| 28 | func (s *workerRoleFetch) run(ctx context.Context) error { |
Serge Bazanski | 186109c | 2023-06-21 16:57:36 +0200 | [diff] [blame] | 29 | // Wait for a 'curator connection' first. This isn't a real connection but just a |
| 30 | // set of credentials and a potentially working resolver. Here, we just use its |
| 31 | // presence as a marking that the local disk has been mounted and thus we can |
| 32 | // attempt to read a persisted cluster directory. |
Serge Bazanski | fe3d8fd | 2023-05-30 20:50:09 +0200 | [diff] [blame] | 33 | w := s.curatorConnection.Watch() |
Serge Bazanski | 186109c | 2023-06-21 16:57:36 +0200 | [diff] [blame] | 34 | _, err := w.Get(ctx) |
Serge Bazanski | 6dff6d6 | 2022-01-28 18:15:14 +0100 | [diff] [blame] | 35 | if err != nil { |
Serge Bazanski | 186109c | 2023-06-21 16:57:36 +0200 | [diff] [blame] | 36 | w.Close() |
Serge Bazanski | 6dff6d6 | 2022-01-28 18:15:14 +0100 | [diff] [blame] | 37 | return err |
| 38 | } |
Serge Bazanski | 186109c | 2023-06-21 16:57:36 +0200 | [diff] [blame] | 39 | w.Close() |
Serge Bazanski | 6dff6d6 | 2022-01-28 18:15:14 +0100 | [diff] [blame] | 40 | |
Serge Bazanski | 186109c | 2023-06-21 16:57:36 +0200 | [diff] [blame] | 41 | // Read persisted roles if available. |
| 42 | exists, _ := s.storageRoot.Data.Node.PersistedRoles.Exists() |
| 43 | if exists { |
| 44 | supervisor.Logger(ctx).Infof("Attempting to read persisted node roles...") |
| 45 | data, err := s.storageRoot.Data.Node.PersistedRoles.Read() |
Serge Bazanski | 6dff6d6 | 2022-01-28 18:15:14 +0100 | [diff] [blame] | 46 | if err != nil { |
Serge Bazanski | 186109c | 2023-06-21 16:57:36 +0200 | [diff] [blame] | 47 | supervisor.Logger(ctx).Errorf("Failed to read persisted roles: %w", err) |
| 48 | } else { |
| 49 | var nr cpb.NodeRoles |
| 50 | if err := proto.Unmarshal(data, &nr); err != nil { |
| 51 | supervisor.Logger(ctx).Errorf("Failed to unmarshal persisted roles: %w", err) |
| 52 | } else { |
| 53 | supervisor.Logger(ctx).Infof("Got persisted role data from disk:") |
Serge Bazanski | 6dff6d6 | 2022-01-28 18:15:14 +0100 | [diff] [blame] | 54 | } |
Serge Bazanski | 186109c | 2023-06-21 16:57:36 +0200 | [diff] [blame] | 55 | if nr.ConsensusMember != nil { |
| 56 | supervisor.Logger(ctx).Infof(" - control plane member, existing peers: %+v", nr.ConsensusMember.Peers) |
Serge Bazanski | 6dff6d6 | 2022-01-28 18:15:14 +0100 | [diff] [blame] | 57 | } |
Serge Bazanski | 186109c | 2023-06-21 16:57:36 +0200 | [diff] [blame] | 58 | if nr.KubernetesController != nil { |
Serge Bazanski | 15f7f63 | 2023-03-14 17:17:20 +0100 | [diff] [blame] | 59 | supervisor.Logger(ctx).Infof(" - kubernetes controller") |
| 60 | } |
Serge Bazanski | 186109c | 2023-06-21 16:57:36 +0200 | [diff] [blame] | 61 | if nr.KubernetesWorker != nil { |
Serge Bazanski | 6dff6d6 | 2022-01-28 18:15:14 +0100 | [diff] [blame] | 62 | supervisor.Logger(ctx).Infof(" - kubernetes worker") |
| 63 | } |
Serge Bazanski | 186109c | 2023-06-21 16:57:36 +0200 | [diff] [blame] | 64 | s.localRoles.Set(&nr) |
Serge Bazanski | 6dff6d6 | 2022-01-28 18:15:14 +0100 | [diff] [blame] | 65 | } |
Serge Bazanski | 186109c | 2023-06-21 16:57:36 +0200 | [diff] [blame] | 66 | } else { |
| 67 | supervisor.Logger(ctx).Infof("No persisted node roles.") |
Serge Bazanski | 6dff6d6 | 2022-01-28 18:15:14 +0100 | [diff] [blame] | 68 | } |
Serge Bazanski | 186109c | 2023-06-21 16:57:36 +0200 | [diff] [blame] | 69 | |
| 70 | // Run networked part in a sub-runnable so that network errors don't cause us to |
| 71 | // retry the above and make us possibly trigger spurious restarts. |
| 72 | supervisor.Run(ctx, "watcher", func(ctx context.Context) error { |
| 73 | w := s.curatorConnection.Watch() |
| 74 | defer w.Close() |
| 75 | cc, err := w.Get(ctx) |
| 76 | if err != nil { |
| 77 | return err |
| 78 | } |
| 79 | |
| 80 | nodeID := cc.nodeID() |
| 81 | cur := ipb.NewCuratorClient(cc.conn) |
| 82 | |
| 83 | // Start watch for current node, update localRoles whenever we get something |
| 84 | // new. |
| 85 | srv, err := cur.Watch(ctx, &ipb.WatchRequest{Kind: &ipb.WatchRequest_NodeInCluster_{ |
| 86 | NodeInCluster: &ipb.WatchRequest_NodeInCluster{ |
| 87 | NodeId: nodeID, |
| 88 | }, |
| 89 | }}) |
| 90 | if err != nil { |
| 91 | return fmt.Errorf("watch failed: %w", err) |
| 92 | } |
| 93 | defer srv.CloseSend() |
| 94 | |
| 95 | supervisor.Signal(ctx, supervisor.SignalHealthy) |
| 96 | for { |
| 97 | ev, err := srv.Recv() |
| 98 | if err != nil { |
| 99 | return fmt.Errorf("watch event receive failed: %w", err) |
| 100 | } |
| 101 | for _, n := range ev.Nodes { |
| 102 | n := n |
| 103 | // Skip spuriously sent other nodes. |
| 104 | if n.Id != nodeID { |
| 105 | continue |
| 106 | } |
| 107 | supervisor.Logger(ctx).Infof("Got new node data. Roles:") |
| 108 | if n.Roles.ConsensusMember != nil { |
| 109 | supervisor.Logger(ctx).Infof(" - control plane member, existing peers: %+v", n.Roles.ConsensusMember.Peers) |
| 110 | } |
| 111 | if n.Roles.KubernetesController != nil { |
| 112 | supervisor.Logger(ctx).Infof(" - kubernetes controller") |
| 113 | } |
| 114 | if n.Roles.KubernetesWorker != nil { |
| 115 | supervisor.Logger(ctx).Infof(" - kubernetes worker") |
| 116 | } |
| 117 | s.localRoles.Set(n.Roles) |
| 118 | |
| 119 | // Persist role data to disk. |
| 120 | bytes, err := proto.Marshal(n.Roles) |
| 121 | if err != nil { |
| 122 | supervisor.Logger(ctx).Errorf("Failed to marshal node roles: %w", err) |
| 123 | } else { |
| 124 | err = s.storageRoot.Data.Node.PersistedRoles.Write(bytes, 0400) |
| 125 | if err != nil { |
| 126 | supervisor.Logger(ctx).Errorf("Failed to write node roles: %w", err) |
| 127 | } |
| 128 | } |
| 129 | break |
| 130 | } |
| 131 | } |
| 132 | }) |
| 133 | |
| 134 | supervisor.Signal(ctx, supervisor.SignalHealthy) |
| 135 | <-ctx.Done() |
| 136 | return ctx.Err() |
| 137 | |
Serge Bazanski | 6dff6d6 | 2022-01-28 18:15:14 +0100 | [diff] [blame] | 138 | } |