blob: a6650441f4e90f439be6a38e1e0a21ea33a72711 [file] [log] [blame]
// Package roleserve implements the roleserver/“Role Server”.
//
// The Role Server runs on every node and is responsible for running all the
// node's role-dependent services, like the control plane (Consensus/etcd and
// Curator) and Kubernetes. It watches the node roles as assigned by the
// cluster's curator, updates the status of the node within the curator, and
// spawns on-demand services.
Serge Bazanski0d937772021-06-17 15:54:40 +02008//
//  .-----------.          .--------.  Watches  .------------.
//  | Cluster   |--------->| Role   |<----------| Node Roles |
//  | Enrolment | Provides | Server |  Updates  '------------'
//  '-----------'   Data   |        |----.      .-------------.
//                         '--------'    '----->| Node Status |
//                  Spawns |        | Spawns    '-------------'
//                   .-----'        '-----.
//                   V                    V
//             .-----------.        .------------.
//             | Consensus |        | Kubernetes |
//             | & Curator |        |            |
//             '-----------'        '------------'
Serge Bazanski6dff6d62022-01-28 18:15:14 +010021//
22// The internal state of the Role Server (eg. status of services, input from
23// Cluster Enrolment, current node roles as retrieved from the cluster) is
24// stored as in-memory Event Value variables, with some of them being exposed
25// externally for other services to consume (ie. ones that wish to depend on
26// some information managed by the Role Server but which do not need to be
27// spawned on demand by the Role Server). These Event Values and code which acts
28// upon them form a reactive/dataflow-driven model which drives the Role Server
29// logic forward.
30//
// The Role Server also has to handle the complex bootstrap problem involved in
// simultaneously accessing the control plane (for node roles and other cluster
// data) while maintaining a control plane instance (possibly the only one in
// the cluster). The state of resolution of this bootstrap problem is maintained
// within ClusterMembership, which contains critical information about the
// control plane, like the information required to connect to a Curator (local
// or remote). It is updated both by external processes (ie. data from the
// Cluster Enrolment) as well as logic responsible for spawning the control
// plane.
Serge Bazanski0d937772021-06-17 15:54:40 +020040package roleserve
41
42import (
43 "context"
Serge Bazanski6dff6d62022-01-28 18:15:14 +010044 "crypto/ed25519"
Serge Bazanski0d937772021-06-17 15:54:40 +020045
Serge Bazanskib43d0f02022-06-23 17:32:10 +020046 common "source.monogon.dev/metropolis/node"
Serge Bazanski79208522023-03-28 20:14:58 +020047 "source.monogon.dev/metropolis/node/core/clusternet"
Lorenz Brun1de8b182021-12-21 17:15:18 +010048 "source.monogon.dev/metropolis/node/core/identity"
Serge Bazanski0d937772021-06-17 15:54:40 +020049 "source.monogon.dev/metropolis/node/core/localstorage"
50 "source.monogon.dev/metropolis/node/core/network"
Serge Bazanskib43d0f02022-06-23 17:32:10 +020051 "source.monogon.dev/metropolis/node/core/rpc/resolver"
Serge Bazanski37110c32023-03-01 13:57:27 +000052 "source.monogon.dev/metropolis/pkg/event/memory"
Serge Bazanskie012b722023-03-29 17:49:04 +020053 "source.monogon.dev/metropolis/pkg/logtree"
Serge Bazanski0d937772021-06-17 15:54:40 +020054 "source.monogon.dev/metropolis/pkg/supervisor"
Serge Bazanskie012b722023-03-29 17:49:04 +020055
Serge Bazanski6dff6d62022-01-28 18:15:14 +010056 cpb "source.monogon.dev/metropolis/proto/common"
Serge Bazanski0d937772021-06-17 15:54:40 +020057)
58
59// Config is the configuration of the role server.
60type Config struct {
Serge Bazanski0d937772021-06-17 15:54:40 +020061 // StorageRoot is a handle to access all of the Node's storage. This is needed
62 // as the roleserver spawns complex workloads like Kubernetes which need access
63 // to a broad range of storage.
64 StorageRoot *localstorage.Root
65
66 // Network is a handle to the network service, used by workloads.
67 Network *network.Service
Serge Bazanski58ddc092022-06-30 18:23:33 +020068
69 // resolver is the main, long-lived, authenticated cluster resolver that is used
70 // for all subsequent gRPC calls by the subordinates of the roleserver. It is
71 // created early in the roleserver lifecycle, and is seeded with node
72 // information as the first subordinate runs DialCurator().
73 Resolver *resolver.Resolver
Serge Bazanskie012b722023-03-29 17:49:04 +020074
75 LogTree *logtree.LogTree
Serge Bazanski0d937772021-06-17 15:54:40 +020076}
77
78// Service is the roleserver/“Role Server” service. See the package-level
79// documentation for more details.
80type Service struct {
81 Config
82
Serge Bazanski1fb2b102023-04-06 10:13:46 +020083 ClusterMembership memory.Value[*ClusterMembership]
84 KubernetesStatus memory.Value[*KubernetesStatus]
85 bootstrapData memory.Value[*bootstrapData]
86 localRoles memory.Value[*cpb.NodeRoles]
87 podNetwork memory.Value[*clusternet.Prefixes]
88 clusterDirectorySaved memory.Value[bool]
Serge Bazanski0d937772021-06-17 15:54:40 +020089
Serge Bazanski6dff6d62022-01-28 18:15:14 +010090 controlPlane *workerControlPlane
91 statusPush *workerStatusPush
Mateusz Zalega32b19292022-05-17 13:26:55 +020092 heartbeat *workerHeartbeat
Serge Bazanski6dff6d62022-01-28 18:15:14 +010093 kubernetes *workerKubernetes
94 rolefetch *workerRoleFetch
Serge Bazanskib40c0082023-03-29 14:28:04 +020095 nodeMgmt *workerNodeMgmt
Serge Bazanski79208522023-03-28 20:14:58 +020096 clusternet *workerClusternet
Serge Bazanski1fb2b102023-04-06 10:13:46 +020097 hostsfile *workerHostsfile
Serge Bazanski0d937772021-06-17 15:54:40 +020098}
99
100// New creates a Role Server services from a Config.
101func New(c Config) *Service {
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200102 s := &Service{
Serge Bazanski58ddc092022-06-30 18:23:33 +0200103 Config: c,
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200104 }
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100105 s.controlPlane = &workerControlPlane{
106 storageRoot: s.StorageRoot,
107
108 bootstrapData: &s.bootstrapData,
109 clusterMembership: &s.ClusterMembership,
110 localRoles: &s.localRoles,
Serge Bazanski58ddc092022-06-30 18:23:33 +0200111 resolver: s.Resolver,
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100112 }
113
114 s.statusPush = &workerStatusPush{
115 network: s.Network,
116
Serge Bazanski1fb2b102023-04-06 10:13:46 +0200117 clusterMembership: &s.ClusterMembership,
118 clusterDirectorySaved: &s.clusterDirectorySaved,
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100119 }
120
Mateusz Zalega32b19292022-05-17 13:26:55 +0200121 s.heartbeat = &workerHeartbeat{
122 network: s.Network,
123
124 clusterMembership: &s.ClusterMembership,
125 }
126
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100127 s.kubernetes = &workerKubernetes{
128 network: s.Network,
129 storageRoot: s.StorageRoot,
130
131 localRoles: &s.localRoles,
132 clusterMembership: &s.ClusterMembership,
133
134 kubernetesStatus: &s.KubernetesStatus,
Serge Bazanski79208522023-03-28 20:14:58 +0200135 podNetwork: &s.podNetwork,
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100136 }
137
138 s.rolefetch = &workerRoleFetch{
139 clusterMembership: &s.ClusterMembership,
140
141 localRoles: &s.localRoles,
142 }
143
Serge Bazanskib40c0082023-03-29 14:28:04 +0200144 s.nodeMgmt = &workerNodeMgmt{
145 clusterMembership: &s.ClusterMembership,
Serge Bazanskie012b722023-03-29 17:49:04 +0200146 logTree: s.LogTree,
Serge Bazanskib40c0082023-03-29 14:28:04 +0200147 }
Serge Bazanski1fb2b102023-04-06 10:13:46 +0200148
Serge Bazanski79208522023-03-28 20:14:58 +0200149 s.clusternet = &workerClusternet{
150 storageRoot: s.StorageRoot,
151
152 clusterMembership: &s.ClusterMembership,
153 podNetwork: &s.podNetwork,
154 }
Serge Bazanskib40c0082023-03-29 14:28:04 +0200155
Serge Bazanski1fb2b102023-04-06 10:13:46 +0200156 s.hostsfile = &workerHostsfile{
157 storageRoot: s.StorageRoot,
158 network: s.Network,
159 clusterMembership: &s.ClusterMembership,
160 clusterDirectorySaved: &s.clusterDirectorySaved,
161 }
162
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100163 return s
Serge Bazanski0d937772021-06-17 15:54:40 +0200164}
165
Mateusz Zalega2930e992022-04-25 12:52:35 +0200166func (s *Service) ProvideBootstrapData(privkey ed25519.PrivateKey, iok, cuk, nuk, jkey []byte) {
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200167 pubkey := privkey.Public().(ed25519.PublicKey)
168 nid := identity.NodeID(pubkey)
169
170 // This is the first time we have the node ID, tell the resolver that it's
171 // available on the loopback interface.
Serge Bazanski58ddc092022-06-30 18:23:33 +0200172 s.Resolver.AddOverride(nid, resolver.NodeByHostPort("127.0.0.1", uint16(common.CuratorServicePort)))
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200173
Serge Bazanski37110c32023-03-01 13:57:27 +0000174 s.ClusterMembership.Set(&ClusterMembership{
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200175 pubkey: pubkey,
Serge Bazanski58ddc092022-06-30 18:23:33 +0200176 resolver: s.Resolver,
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100177 })
Serge Bazanski37110c32023-03-01 13:57:27 +0000178 s.bootstrapData.Set(&bootstrapData{
Mateusz Zalega2930e992022-04-25 12:52:35 +0200179 nodePrivateKey: privkey,
180 initialOwnerKey: iok,
181 clusterUnlockKey: cuk,
182 nodeUnlockKey: nuk,
183 nodePrivateJoinKey: jkey,
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100184 })
Serge Bazanski0d937772021-06-17 15:54:40 +0200185}
186
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100187func (s *Service) ProvideRegisterData(credentials identity.NodeCredentials, directory *cpb.ClusterDirectory) {
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200188 // This is the first time we have the node ID, tell the resolver that it's
189 // available on the loopback interface.
Serge Bazanski58ddc092022-06-30 18:23:33 +0200190 s.Resolver.AddOverride(credentials.ID(), resolver.NodeByHostPort("127.0.0.1", uint16(common.CuratorServicePort)))
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200191
Serge Bazanski37110c32023-03-01 13:57:27 +0000192 s.ClusterMembership.Set(&ClusterMembership{
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100193 remoteCurators: directory,
194 credentials: &credentials,
195 pubkey: credentials.PublicKey(),
Serge Bazanski58ddc092022-06-30 18:23:33 +0200196 resolver: s.Resolver,
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100197 })
Serge Bazanski0d937772021-06-17 15:54:40 +0200198}
199
Mateusz Zalega2930e992022-04-25 12:52:35 +0200200func (s *Service) ProvideJoinData(credentials identity.NodeCredentials, directory *cpb.ClusterDirectory) {
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200201 // This is the first time we have the node ID, tell the resolver that it's
202 // available on the loopback interface.
Serge Bazanski58ddc092022-06-30 18:23:33 +0200203 s.Resolver.AddOverride(credentials.ID(), resolver.NodeByHostPort("127.0.0.1", uint16(common.CuratorServicePort)))
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200204
Serge Bazanski37110c32023-03-01 13:57:27 +0000205 s.ClusterMembership.Set(&ClusterMembership{
Mateusz Zalega2930e992022-04-25 12:52:35 +0200206 remoteCurators: directory,
207 credentials: &credentials,
208 pubkey: credentials.PublicKey(),
Serge Bazanski58ddc092022-06-30 18:23:33 +0200209 resolver: s.Resolver,
Mateusz Zalega2930e992022-04-25 12:52:35 +0200210 })
Serge Bazanski1fb2b102023-04-06 10:13:46 +0200211 s.clusterDirectorySaved.Set(true)
Mateusz Zalega2930e992022-04-25 12:52:35 +0200212}
213
Serge Bazanski0d937772021-06-17 15:54:40 +0200214// Run the Role Server service, which uses intermediary workload launchers to
215// start/stop subordinate services as the Node's roles change.
216func (s *Service) Run(ctx context.Context) error {
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100217 supervisor.Run(ctx, "controlplane", s.controlPlane.run)
218 supervisor.Run(ctx, "kubernetes", s.kubernetes.run)
219 supervisor.Run(ctx, "statuspush", s.statusPush.run)
Mateusz Zalega32b19292022-05-17 13:26:55 +0200220 supervisor.Run(ctx, "heartbeat", s.heartbeat.run)
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100221 supervisor.Run(ctx, "rolefetch", s.rolefetch.run)
Serge Bazanskib40c0082023-03-29 14:28:04 +0200222 supervisor.Run(ctx, "nodemgmt", s.nodeMgmt.run)
Serge Bazanski79208522023-03-28 20:14:58 +0200223 supervisor.Run(ctx, "clusternet", s.clusternet.run)
Serge Bazanski1fb2b102023-04-06 10:13:46 +0200224 supervisor.Run(ctx, "hostsfile", s.hostsfile.run)
Serge Bazanski0d937772021-06-17 15:54:40 +0200225 supervisor.Signal(ctx, supervisor.SignalHealthy)
226
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100227 <-ctx.Done()
228 return ctx.Err()
Serge Bazanski0d937772021-06-17 15:54:40 +0200229}