blob: 3140ea8f2cdb0b1f98a2c09410076f8a498fef0b [file] [log] [blame]
Serge Bazanski5df62ba2023-03-22 17:56:46 +01001// Package roleserve implements the roleserver/“Role Server”.
Serge Bazanski0d937772021-06-17 15:54:40 +02002//
// The Role Server runs on every node and is responsible for running all of the
// node's role-dependent services, like the control plane (Consensus/etcd and
// Curator) and Kubernetes. It watches the node roles as assigned by the
// cluster's curator, updates the status of the node within the curator, and
// spawns on-demand services.
Serge Bazanski0d937772021-06-17 15:54:40 +02008//
//   .-----------.          .--------.  Watches  .------------.
//   |  Cluster  |--------->|  Role  |<----------| Node Roles |
//   | Enrolment | Provides | Server |  Updates  '------------'
//   '-----------'   Data   |        |----.      .-------------.
//                          '--------'    '----->| Node Status |
//                   Spawns |        | Spawns    '-------------'
//                    .-----'        '-----.
//                    V                    V
//              .-----------.        .------------.
//              | Consensus |        | Kubernetes |
//              | & Curator |        |            |
//              '-----------'        '------------'
Serge Bazanski6dff6d62022-01-28 18:15:14 +010021//
22// The internal state of the Role Server (eg. status of services, input from
23// Cluster Enrolment, current node roles as retrieved from the cluster) is
24// stored as in-memory Event Value variables, with some of them being exposed
25// externally for other services to consume (ie. ones that wish to depend on
26// some information managed by the Role Server but which do not need to be
27// spawned on demand by the Role Server). These Event Values and code which acts
28// upon them form a reactive/dataflow-driven model which drives the Role Server
29// logic forward.
30//
31// The Role Server also has to handle the complex bootstrap problem involved in
32// simultaneously accessing the control plane (for node roles and other cluster
33// data) while maintaining (possibly the only one in the cluster) control plane
Serge Bazanskife3d8fd2023-05-30 20:50:09 +020034// instance. This problem is resolved by using the RPC resolver package which
35// allows dynamic reconfiguration of endpoints as the cluster is running.
Serge Bazanski0d937772021-06-17 15:54:40 +020036package roleserve
37
38import (
39 "context"
Serge Bazanski6dff6d62022-01-28 18:15:14 +010040 "crypto/ed25519"
Serge Bazanski0d937772021-06-17 15:54:40 +020041
Serge Bazanskib43d0f02022-06-23 17:32:10 +020042 common "source.monogon.dev/metropolis/node"
Serge Bazanski79208522023-03-28 20:14:58 +020043 "source.monogon.dev/metropolis/node/core/clusternet"
Serge Bazanski5df62ba2023-03-22 17:56:46 +010044 "source.monogon.dev/metropolis/node/core/curator"
Lorenz Brun1de8b182021-12-21 17:15:18 +010045 "source.monogon.dev/metropolis/node/core/identity"
Serge Bazanski0d937772021-06-17 15:54:40 +020046 "source.monogon.dev/metropolis/node/core/localstorage"
47 "source.monogon.dev/metropolis/node/core/network"
Serge Bazanskib43d0f02022-06-23 17:32:10 +020048 "source.monogon.dev/metropolis/node/core/rpc/resolver"
Lorenz Brun35fcf032023-06-29 04:15:58 +020049 "source.monogon.dev/metropolis/node/core/update"
Serge Bazanski37110c32023-03-01 13:57:27 +000050 "source.monogon.dev/metropolis/pkg/event/memory"
Serge Bazanskie012b722023-03-29 17:49:04 +020051 "source.monogon.dev/metropolis/pkg/logtree"
Serge Bazanski0d937772021-06-17 15:54:40 +020052 "source.monogon.dev/metropolis/pkg/supervisor"
Serge Bazanskie012b722023-03-29 17:49:04 +020053
Serge Bazanski6dff6d62022-01-28 18:15:14 +010054 cpb "source.monogon.dev/metropolis/proto/common"
Serge Bazanski0d937772021-06-17 15:54:40 +020055)
56
// Config is the configuration of the role server. It is embedded in Service,
// and New hands its members to the individual workers.
type Config struct {
	// StorageRoot is a handle to access all of the Node's storage. This is needed
	// as the roleserver spawns complex workloads like Kubernetes which need access
	// to a broad range of storage.
	StorageRoot *localstorage.Root

	// Network is a handle to the network service, used by workloads.
	Network *network.Service

	// Resolver is the main, long-lived, authenticated cluster resolver that is used
	// for all subsequent gRPC calls by the subordinates of the roleserver. It is
	// created early in the roleserver lifecycle, and is seeded with node
	// information from the ProvideXXX methods.
	Resolver *resolver.Resolver

	// Update is a handle to the update service, used by workloads.
	Update *update.Service

	// LogTree is handed to the node management worker by New.
	// NOTE(review): presumably the node's main log tree - confirm against the
	// caller that builds this Config.
	LogTree *logtree.LogTree
}
78
// Service is the roleserver/“Role Server” service. See the package-level
// documentation for more details.
//
// It owns a set of in-memory Event Values and a set of workers. New wires
// pointers to the Event Values into the workers, and Run starts every worker
// under the supervisor.
type Service struct {
	Config

	// KubernetesStatus is shared with the Kubernetes worker. It is exported so
	// that code outside of this package can reach it.
	KubernetesStatus memory.Value[*KubernetesStatus]

	// bootstrapData is set by ProvideBootstrapData and shared with the control
	// plane worker.
	bootstrapData memory.Value[*bootstrapData]

	// localRoles is shared between the control plane, Kubernetes, role fetch and
	// metrics workers.
	localRoles memory.Value[*cpb.NodeRoles]

	// podNetwork is shared between the Kubernetes and clusternet workers.
	podNetwork memory.Value[*clusternet.Prefixes]

	// clusterDirectorySaved is set to true by ProvideJoinData and is shared with
	// the status push and hostsfile workers.
	clusterDirectorySaved memory.Value[bool]

	// localControlPlane is shared between the control plane, status push,
	// Kubernetes and metrics workers.
	localControlPlane memory.Value[*localControlPlane]

	// CuratorConnection is set by ProvideRegisterData and ProvideJoinData, and is
	// shared with every worker. It is exported so that code outside of this
	// package can reach it.
	CuratorConnection memory.Value[*curatorConnection]

	// Workers, constructed by New and each started as a supervisor runnable by
	// Run.
	controlPlane *workerControlPlane
	statusPush   *workerStatusPush
	heartbeat    *workerHeartbeat
	kubernetes   *workerKubernetes
	rolefetch    *workerRoleFetch
	nodeMgmt     *workerNodeMgmt
	clusternet   *workerClusternet
	hostsfile    *workerHostsfile
	metrics      *workerMetrics
}
102
103// New creates a Role Server services from a Config.
104func New(c Config) *Service {
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200105 s := &Service{
Serge Bazanski58ddc092022-06-30 18:23:33 +0200106 Config: c,
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200107 }
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100108 s.controlPlane = &workerControlPlane{
109 storageRoot: s.StorageRoot,
110
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200111 bootstrapData: &s.bootstrapData,
112 localRoles: &s.localRoles,
113 resolver: s.Resolver,
114
115 localControlPlane: &s.localControlPlane,
116 curatorConnection: &s.CuratorConnection,
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100117 }
118
119 s.statusPush = &workerStatusPush{
120 network: s.Network,
121
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200122 curatorConnection: &s.CuratorConnection,
123 localControlPlane: &s.localControlPlane,
Serge Bazanski1fb2b102023-04-06 10:13:46 +0200124 clusterDirectorySaved: &s.clusterDirectorySaved,
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100125 }
126
Mateusz Zalega32b19292022-05-17 13:26:55 +0200127 s.heartbeat = &workerHeartbeat{
128 network: s.Network,
129
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200130 curatorConnection: &s.CuratorConnection,
Mateusz Zalega32b19292022-05-17 13:26:55 +0200131 }
132
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100133 s.kubernetes = &workerKubernetes{
134 network: s.Network,
135 storageRoot: s.StorageRoot,
136
137 localRoles: &s.localRoles,
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200138 localControlPlane: &s.localControlPlane,
139 curatorConnection: &s.CuratorConnection,
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100140
141 kubernetesStatus: &s.KubernetesStatus,
Serge Bazanski79208522023-03-28 20:14:58 +0200142 podNetwork: &s.podNetwork,
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100143 }
144
145 s.rolefetch = &workerRoleFetch{
Serge Bazanski186109c2023-06-21 16:57:36 +0200146 storageRoot: s.StorageRoot,
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200147 curatorConnection: &s.CuratorConnection,
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100148
149 localRoles: &s.localRoles,
150 }
151
Serge Bazanskib40c0082023-03-29 14:28:04 +0200152 s.nodeMgmt = &workerNodeMgmt{
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200153 curatorConnection: &s.CuratorConnection,
Serge Bazanskie012b722023-03-29 17:49:04 +0200154 logTree: s.LogTree,
Lorenz Brun35fcf032023-06-29 04:15:58 +0200155 updateService: s.Update,
Serge Bazanskib40c0082023-03-29 14:28:04 +0200156 }
Serge Bazanski1fb2b102023-04-06 10:13:46 +0200157
Serge Bazanski79208522023-03-28 20:14:58 +0200158 s.clusternet = &workerClusternet{
159 storageRoot: s.StorageRoot,
160
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200161 curatorConnection: &s.CuratorConnection,
Serge Bazanski79208522023-03-28 20:14:58 +0200162 podNetwork: &s.podNetwork,
Serge Bazanskib565cc62023-03-30 18:43:51 +0200163 network: s.Network,
Serge Bazanski79208522023-03-28 20:14:58 +0200164 }
Serge Bazanskib40c0082023-03-29 14:28:04 +0200165
Serge Bazanski1fb2b102023-04-06 10:13:46 +0200166 s.hostsfile = &workerHostsfile{
167 storageRoot: s.StorageRoot,
168 network: s.Network,
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200169 curatorConnection: &s.CuratorConnection,
Serge Bazanski1fb2b102023-04-06 10:13:46 +0200170 clusterDirectorySaved: &s.clusterDirectorySaved,
171 }
172
Serge Bazanski54e212a2023-06-14 13:45:11 +0200173 s.metrics = &workerMetrics{
174 curatorConnection: &s.CuratorConnection,
Tim Windelschmidtb551b652023-07-17 16:01:42 +0200175 localRoles: &s.localRoles,
Tim Windelschmidtfd49f222023-07-20 14:27:50 +0200176 localControlplane: &s.localControlPlane,
Serge Bazanski54e212a2023-06-14 13:45:11 +0200177 }
178
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100179 return s
Serge Bazanski0d937772021-06-17 15:54:40 +0200180}
181
Serge Bazanskie4a4ce12023-03-22 18:29:54 +0100182func (s *Service) ProvideBootstrapData(privkey ed25519.PrivateKey, iok, cuk, nuk, jkey []byte, icc *curator.Cluster, tpmUsage cpb.NodeTPMUsage) {
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200183 pubkey := privkey.Public().(ed25519.PublicKey)
184 nid := identity.NodeID(pubkey)
185
186 // This is the first time we have the node ID, tell the resolver that it's
187 // available on the loopback interface.
Serge Bazanski58ddc092022-06-30 18:23:33 +0200188 s.Resolver.AddOverride(nid, resolver.NodeByHostPort("127.0.0.1", uint16(common.CuratorServicePort)))
Serge Bazanski90a70a02023-05-30 15:15:27 +0200189 s.Resolver.AddEndpoint(resolver.NodeByHostPort("127.0.0.1", uint16(common.CuratorServicePort)))
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200190
Serge Bazanski37110c32023-03-01 13:57:27 +0000191 s.bootstrapData.Set(&bootstrapData{
Serge Bazanski5df62ba2023-03-22 17:56:46 +0100192 nodePrivateKey: privkey,
193 initialOwnerKey: iok,
194 clusterUnlockKey: cuk,
195 nodeUnlockKey: nuk,
196 nodePrivateJoinKey: jkey,
197 initialClusterConfiguration: icc,
Serge Bazanskie4a4ce12023-03-22 18:29:54 +0100198 nodeTPMUsage: tpmUsage,
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100199 })
Serge Bazanski0d937772021-06-17 15:54:40 +0200200}
201
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100202func (s *Service) ProvideRegisterData(credentials identity.NodeCredentials, directory *cpb.ClusterDirectory) {
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200203 // This is the first time we have the node ID, tell the resolver that it's
204 // available on the loopback interface.
Serge Bazanski58ddc092022-06-30 18:23:33 +0200205 s.Resolver.AddOverride(credentials.ID(), resolver.NodeByHostPort("127.0.0.1", uint16(common.CuratorServicePort)))
Serge Bazanski90a70a02023-05-30 15:15:27 +0200206 // Also tell the resolver about all the existing nodes in the cluster we just
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200207 // registered into. The directory passed here was used to issue the initial
208 // Register call, which means at least one of the nodes was running the control
209 // plane and thus can be used to seed the rest of the resolver.
Serge Bazanski90a70a02023-05-30 15:15:27 +0200210 for _, n := range directory.Nodes {
Serge Bazanski90a70a02023-05-30 15:15:27 +0200211 for _, addr := range n.Addresses {
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200212 s.Resolver.AddEndpoint(resolver.NodeAtAddressWithDefaultPort(addr.Host))
Serge Bazanski90a70a02023-05-30 15:15:27 +0200213 }
214 }
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200215
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200216 s.CuratorConnection.Set(newCuratorConnection(&credentials, s.Resolver))
Serge Bazanski0d937772021-06-17 15:54:40 +0200217}
218
Mateusz Zalega2930e992022-04-25 12:52:35 +0200219func (s *Service) ProvideJoinData(credentials identity.NodeCredentials, directory *cpb.ClusterDirectory) {
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200220 // This is the first time we have the node ID, tell the resolver that it's
221 // available on the loopback interface.
Serge Bazanski58ddc092022-06-30 18:23:33 +0200222 s.Resolver.AddOverride(credentials.ID(), resolver.NodeByHostPort("127.0.0.1", uint16(common.CuratorServicePort)))
Serge Bazanski90a70a02023-05-30 15:15:27 +0200223 // Also tell the resolver about all the existing nodes in the cluster we just
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200224 // joined into. The directory passed here was used to issue the initial
225 // Join call, which means at least one of the nodes was running the control
226 // plane and thus can be used to seed the rest of the resolver.
Serge Bazanski90a70a02023-05-30 15:15:27 +0200227 for _, n := range directory.Nodes {
Serge Bazanski90a70a02023-05-30 15:15:27 +0200228 for _, addr := range n.Addresses {
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200229 s.Resolver.AddEndpoint(resolver.NodeAtAddressWithDefaultPort(addr.Host))
Serge Bazanski90a70a02023-05-30 15:15:27 +0200230 }
231 }
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200232
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200233 s.CuratorConnection.Set(newCuratorConnection(&credentials, s.Resolver))
Serge Bazanski1fb2b102023-04-06 10:13:46 +0200234 s.clusterDirectorySaved.Set(true)
Mateusz Zalega2930e992022-04-25 12:52:35 +0200235}
236
Serge Bazanski0d937772021-06-17 15:54:40 +0200237// Run the Role Server service, which uses intermediary workload launchers to
238// start/stop subordinate services as the Node's roles change.
239func (s *Service) Run(ctx context.Context) error {
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100240 supervisor.Run(ctx, "controlplane", s.controlPlane.run)
241 supervisor.Run(ctx, "kubernetes", s.kubernetes.run)
242 supervisor.Run(ctx, "statuspush", s.statusPush.run)
Mateusz Zalega32b19292022-05-17 13:26:55 +0200243 supervisor.Run(ctx, "heartbeat", s.heartbeat.run)
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100244 supervisor.Run(ctx, "rolefetch", s.rolefetch.run)
Serge Bazanskib40c0082023-03-29 14:28:04 +0200245 supervisor.Run(ctx, "nodemgmt", s.nodeMgmt.run)
Serge Bazanski79208522023-03-28 20:14:58 +0200246 supervisor.Run(ctx, "clusternet", s.clusternet.run)
Serge Bazanski1fb2b102023-04-06 10:13:46 +0200247 supervisor.Run(ctx, "hostsfile", s.hostsfile.run)
Serge Bazanski54e212a2023-06-14 13:45:11 +0200248 supervisor.Run(ctx, "metrics", s.metrics.run)
Serge Bazanski0d937772021-06-17 15:54:40 +0200249 supervisor.Signal(ctx, supervisor.SignalHealthy)
250
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100251 <-ctx.Done()
252 return ctx.Err()
Serge Bazanski0d937772021-06-17 15:54:40 +0200253}