blob: 4c1f6102273d95b30a5892d2bbcdd19905c8c1fa [file] [log] [blame]
Tim Windelschmidt6d33a432025-02-04 14:34:25 +01001// Copyright The Monogon Project Authors.
2// SPDX-License-Identifier: Apache-2.0
3
Serge Bazanski5df62ba2023-03-22 17:56:46 +01004// Package roleserve implements the roleserver/“Role Server”.
Serge Bazanski0d937772021-06-17 15:54:40 +02005//
Serge Bazanski6dff6d62022-01-28 18:15:14 +01006// The Role Server runs on every node and is responsible for running all of the
// node's role-dependent services, like the control plane (Consensus/etcd and
8// Curator) and Kubernetes. It watches the node roles as assigned by the
9// cluster's curator, updates the status of the node within the curator, and
10// spawns on-demand services.
Serge Bazanski0d937772021-06-17 15:54:40 +020011//
//	.-----------.          .--------.  Watches  .------------.
//	| Cluster   |--------->| Role   |<----------| Node Roles |
//	| Enrolment | Provides | Server |  Updates  '------------'
//	'-----------'   Data   |        |----.      .-------------.
//	                       '--------'    '----->| Node Status |
//	                Spawns |        | Spawns    '-------------'
//	                 .-----'        '-----.
//	                 V                    V
//	           .-----------.       .------------.
//	           | Consensus |       | Kubernetes |
//	           | & Curator |       |            |
//	           '-----------'       '------------'
Serge Bazanski6dff6d62022-01-28 18:15:14 +010024//
25// The internal state of the Role Server (eg. status of services, input from
26// Cluster Enrolment, current node roles as retrieved from the cluster) is
27// stored as in-memory Event Value variables, with some of them being exposed
28// externally for other services to consume (ie. ones that wish to depend on
29// some information managed by the Role Server but which do not need to be
30// spawned on demand by the Role Server). These Event Values and code which acts
31// upon them form a reactive/dataflow-driven model which drives the Role Server
32// logic forward.
33//
34// The Role Server also has to handle the complex bootstrap problem involved in
35// simultaneously accessing the control plane (for node roles and other cluster
36// data) while maintaining (possibly the only one in the cluster) control plane
Serge Bazanskife3d8fd2023-05-30 20:50:09 +020037// instance. This problem is resolved by using the RPC resolver package which
38// allows dynamic reconfiguration of endpoints as the cluster is running.
Serge Bazanski0d937772021-06-17 15:54:40 +020039package roleserve
40
41import (
42 "context"
Serge Bazanski6dff6d62022-01-28 18:15:14 +010043 "crypto/ed25519"
Serge Bazanski0d937772021-06-17 15:54:40 +020044
Serge Bazanskib43d0f02022-06-23 17:32:10 +020045 common "source.monogon.dev/metropolis/node"
Serge Bazanski79208522023-03-28 20:14:58 +020046 "source.monogon.dev/metropolis/node/core/clusternet"
Serge Bazanski5df62ba2023-03-22 17:56:46 +010047 "source.monogon.dev/metropolis/node/core/curator"
Lorenz Brun1de8b182021-12-21 17:15:18 +010048 "source.monogon.dev/metropolis/node/core/identity"
Serge Bazanski0d937772021-06-17 15:54:40 +020049 "source.monogon.dev/metropolis/node/core/localstorage"
50 "source.monogon.dev/metropolis/node/core/network"
Serge Bazanskib43d0f02022-06-23 17:32:10 +020051 "source.monogon.dev/metropolis/node/core/rpc/resolver"
Lorenz Brun35fcf032023-06-29 04:15:58 +020052 "source.monogon.dev/metropolis/node/core/update"
Serge Bazanski6dff6d62022-01-28 18:15:14 +010053 cpb "source.monogon.dev/metropolis/proto/common"
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020054 "source.monogon.dev/osbase/event/memory"
55 "source.monogon.dev/osbase/logtree"
56 "source.monogon.dev/osbase/supervisor"
Serge Bazanski0d937772021-06-17 15:54:40 +020057)
58
59// Config is the configuration of the role server.
60type Config struct {
Serge Bazanski0d937772021-06-17 15:54:40 +020061 // StorageRoot is a handle to access all of the Node's storage. This is needed
62 // as the roleserver spawns complex workloads like Kubernetes which need access
63 // to a broad range of storage.
64 StorageRoot *localstorage.Root
65
66 // Network is a handle to the network service, used by workloads.
67 Network *network.Service
Serge Bazanski58ddc092022-06-30 18:23:33 +020068
Lorenz Brunc607bf62025-07-22 20:25:26 +020069 PodNetwork *memory.Value[*clusternet.Prefixes]
70
Serge Bazanski58ddc092022-06-30 18:23:33 +020071 // resolver is the main, long-lived, authenticated cluster resolver that is used
72 // for all subsequent gRPC calls by the subordinates of the roleserver. It is
73 // created early in the roleserver lifecycle, and is seeded with node
Serge Bazanskife3d8fd2023-05-30 20:50:09 +020074 // information from the ProvideXXX methods.
Serge Bazanski58ddc092022-06-30 18:23:33 +020075 Resolver *resolver.Resolver
Serge Bazanskie012b722023-03-29 17:49:04 +020076
Lorenz Brun35fcf032023-06-29 04:15:58 +020077 // Update is a handle to the update service, used by workloads.
78 Update *update.Service
79
Serge Bazanskie012b722023-03-29 17:49:04 +020080 LogTree *logtree.LogTree
Serge Bazanski0d937772021-06-17 15:54:40 +020081}
82
83// Service is the roleserver/“Role Server” service. See the package-level
84// documentation for more details.
85type Service struct {
86 Config
87
Serge Bazanski1fb2b102023-04-06 10:13:46 +020088 KubernetesStatus memory.Value[*KubernetesStatus]
Serge Bazanski11198c82024-05-22 14:11:01 +020089 bootstrapData memory.Value[*BootstrapData]
Serge Bazanskib2d6c332024-09-03 12:18:24 +020090 LocalRoles memory.Value[*cpb.NodeRoles]
Serge Bazanski1fb2b102023-04-06 10:13:46 +020091 clusterDirectorySaved memory.Value[bool]
Serge Bazanskife3d8fd2023-05-30 20:50:09 +020092 localControlPlane memory.Value[*localControlPlane]
Serge Bazanskib2d6c332024-09-03 12:18:24 +020093 CuratorConnection memory.Value[*CuratorConnection]
Serge Bazanski0d937772021-06-17 15:54:40 +020094
Serge Bazanski6dff6d62022-01-28 18:15:14 +010095 controlPlane *workerControlPlane
96 statusPush *workerStatusPush
Mateusz Zalega32b19292022-05-17 13:26:55 +020097 heartbeat *workerHeartbeat
Serge Bazanski6dff6d62022-01-28 18:15:14 +010098 kubernetes *workerKubernetes
99 rolefetch *workerRoleFetch
Serge Bazanskib40c0082023-03-29 14:28:04 +0200100 nodeMgmt *workerNodeMgmt
Serge Bazanski79208522023-03-28 20:14:58 +0200101 clusternet *workerClusternet
Serge Bazanski1fb2b102023-04-06 10:13:46 +0200102 hostsfile *workerHostsfile
Serge Bazanski54e212a2023-06-14 13:45:11 +0200103 metrics *workerMetrics
Serge Bazanski0d937772021-06-17 15:54:40 +0200104}
105
106// New creates a Role Server services from a Config.
107func New(c Config) *Service {
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200108 s := &Service{
Serge Bazanski58ddc092022-06-30 18:23:33 +0200109 Config: c,
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200110 }
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100111 s.controlPlane = &workerControlPlane{
112 storageRoot: s.StorageRoot,
113
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200114 bootstrapData: &s.bootstrapData,
Serge Bazanskib2d6c332024-09-03 12:18:24 +0200115 localRoles: &s.LocalRoles,
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200116 resolver: s.Resolver,
117
118 localControlPlane: &s.localControlPlane,
119 curatorConnection: &s.CuratorConnection,
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100120 }
121
122 s.statusPush = &workerStatusPush{
123 network: s.Network,
124
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200125 curatorConnection: &s.CuratorConnection,
126 localControlPlane: &s.localControlPlane,
Serge Bazanski1fb2b102023-04-06 10:13:46 +0200127 clusterDirectorySaved: &s.clusterDirectorySaved,
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100128 }
129
Mateusz Zalega32b19292022-05-17 13:26:55 +0200130 s.heartbeat = &workerHeartbeat{
131 network: s.Network,
132
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200133 curatorConnection: &s.CuratorConnection,
Mateusz Zalega32b19292022-05-17 13:26:55 +0200134 }
135
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100136 s.kubernetes = &workerKubernetes{
137 network: s.Network,
138 storageRoot: s.StorageRoot,
139
Serge Bazanskib2d6c332024-09-03 12:18:24 +0200140 localRoles: &s.LocalRoles,
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200141 localControlPlane: &s.localControlPlane,
142 curatorConnection: &s.CuratorConnection,
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100143
144 kubernetesStatus: &s.KubernetesStatus,
Lorenz Brunc607bf62025-07-22 20:25:26 +0200145 podNetwork: s.Config.PodNetwork,
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100146 }
147
148 s.rolefetch = &workerRoleFetch{
Serge Bazanski186109c2023-06-21 16:57:36 +0200149 storageRoot: s.StorageRoot,
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200150 curatorConnection: &s.CuratorConnection,
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100151
Serge Bazanskib2d6c332024-09-03 12:18:24 +0200152 localRoles: &s.LocalRoles,
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100153 }
154
Serge Bazanskib40c0082023-03-29 14:28:04 +0200155 s.nodeMgmt = &workerNodeMgmt{
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200156 curatorConnection: &s.CuratorConnection,
Serge Bazanskie012b722023-03-29 17:49:04 +0200157 logTree: s.LogTree,
Lorenz Brun35fcf032023-06-29 04:15:58 +0200158 updateService: s.Update,
Serge Bazanskib40c0082023-03-29 14:28:04 +0200159 }
Serge Bazanski1fb2b102023-04-06 10:13:46 +0200160
Serge Bazanski79208522023-03-28 20:14:58 +0200161 s.clusternet = &workerClusternet{
162 storageRoot: s.StorageRoot,
163
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200164 curatorConnection: &s.CuratorConnection,
Lorenz Brunc607bf62025-07-22 20:25:26 +0200165 podNetwork: s.Config.PodNetwork,
Serge Bazanskib565cc62023-03-30 18:43:51 +0200166 network: s.Network,
Serge Bazanski79208522023-03-28 20:14:58 +0200167 }
Serge Bazanskib40c0082023-03-29 14:28:04 +0200168
Serge Bazanski1fb2b102023-04-06 10:13:46 +0200169 s.hostsfile = &workerHostsfile{
170 storageRoot: s.StorageRoot,
171 network: s.Network,
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200172 curatorConnection: &s.CuratorConnection,
Serge Bazanski1fb2b102023-04-06 10:13:46 +0200173 clusterDirectorySaved: &s.clusterDirectorySaved,
174 }
175
Serge Bazanski54e212a2023-06-14 13:45:11 +0200176 s.metrics = &workerMetrics{
177 curatorConnection: &s.CuratorConnection,
Serge Bazanskib2d6c332024-09-03 12:18:24 +0200178 localRoles: &s.LocalRoles,
Tim Windelschmidtfd49f222023-07-20 14:27:50 +0200179 localControlplane: &s.localControlPlane,
Serge Bazanski54e212a2023-06-14 13:45:11 +0200180 }
181
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100182 return s
Serge Bazanski0d937772021-06-17 15:54:40 +0200183}
184
Serge Bazanski11198c82024-05-22 14:11:01 +0200185// BootstrapData contains all the information needed to be injected into the
186// roleserver by the cluster bootstrap logic via ProvideBootstrapData.
187type BootstrapData struct {
188 // Data about the bootstrapping node.
189 Node struct {
Jan Schär39d9c242024-09-24 13:49:55 +0200190 ID string
Serge Bazanski11198c82024-05-22 14:11:01 +0200191 PrivateKey ed25519.PrivateKey
192
193 // CUK/NUK for storage, if storage encryption is enabled.
194 ClusterUnlockKey []byte
195 NodeUnlockKey []byte
196
197 // Join key for subsequent reboots.
198 JoinKey ed25519.PrivateKey
199
200 // Reported TPM usage by the node.
201 TPMUsage cpb.NodeTPMUsage
202
203 // Initial labels for the node.
204 Labels map[string]string
205 }
206 // Cluster-specific data.
207 Cluster struct {
208 // Public keys of initial owner of cluster. Used to escrow real user credentials
209 // during the takeownership metroctl process.
210 InitialOwnerKey []byte
211 // Initial cluster configuration.
212 Configuration *curator.Cluster
213 }
214}
215
216func (s *Service) ProvideBootstrapData(data *BootstrapData) {
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200217 // This is the first time we have the node ID, tell the resolver that it's
218 // available on the loopback interface.
Jan Schär39d9c242024-09-24 13:49:55 +0200219 s.Resolver.AddOverride(data.Node.ID, resolver.NodeByHostPort("127.0.0.1", uint16(common.CuratorServicePort)))
Serge Bazanski90a70a02023-05-30 15:15:27 +0200220 s.Resolver.AddEndpoint(resolver.NodeByHostPort("127.0.0.1", uint16(common.CuratorServicePort)))
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200221
Serge Bazanski11198c82024-05-22 14:11:01 +0200222 s.bootstrapData.Set(data)
Serge Bazanski0d937772021-06-17 15:54:40 +0200223}
224
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100225func (s *Service) ProvideRegisterData(credentials identity.NodeCredentials, directory *cpb.ClusterDirectory) {
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200226 // This is the first time we have the node ID, tell the resolver that it's
227 // available on the loopback interface.
Serge Bazanski58ddc092022-06-30 18:23:33 +0200228 s.Resolver.AddOverride(credentials.ID(), resolver.NodeByHostPort("127.0.0.1", uint16(common.CuratorServicePort)))
Serge Bazanski90a70a02023-05-30 15:15:27 +0200229 // Also tell the resolver about all the existing nodes in the cluster we just
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200230 // registered into. The directory passed here was used to issue the initial
231 // Register call, which means at least one of the nodes was running the control
232 // plane and thus can be used to seed the rest of the resolver.
Serge Bazanski90a70a02023-05-30 15:15:27 +0200233 for _, n := range directory.Nodes {
Serge Bazanski90a70a02023-05-30 15:15:27 +0200234 for _, addr := range n.Addresses {
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200235 s.Resolver.AddEndpoint(resolver.NodeAtAddressWithDefaultPort(addr.Host))
Serge Bazanski90a70a02023-05-30 15:15:27 +0200236 }
237 }
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200238
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200239 s.CuratorConnection.Set(newCuratorConnection(&credentials, s.Resolver))
Serge Bazanski0d937772021-06-17 15:54:40 +0200240}
241
Mateusz Zalega2930e992022-04-25 12:52:35 +0200242func (s *Service) ProvideJoinData(credentials identity.NodeCredentials, directory *cpb.ClusterDirectory) {
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200243 // This is the first time we have the node ID, tell the resolver that it's
244 // available on the loopback interface.
Serge Bazanski58ddc092022-06-30 18:23:33 +0200245 s.Resolver.AddOverride(credentials.ID(), resolver.NodeByHostPort("127.0.0.1", uint16(common.CuratorServicePort)))
Serge Bazanski90a70a02023-05-30 15:15:27 +0200246 // Also tell the resolver about all the existing nodes in the cluster we just
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200247 // joined into. The directory passed here was used to issue the initial
248 // Join call, which means at least one of the nodes was running the control
249 // plane and thus can be used to seed the rest of the resolver.
Serge Bazanski90a70a02023-05-30 15:15:27 +0200250 for _, n := range directory.Nodes {
Serge Bazanski90a70a02023-05-30 15:15:27 +0200251 for _, addr := range n.Addresses {
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200252 s.Resolver.AddEndpoint(resolver.NodeAtAddressWithDefaultPort(addr.Host))
Serge Bazanski90a70a02023-05-30 15:15:27 +0200253 }
254 }
Serge Bazanskib43d0f02022-06-23 17:32:10 +0200255
Serge Bazanskife3d8fd2023-05-30 20:50:09 +0200256 s.CuratorConnection.Set(newCuratorConnection(&credentials, s.Resolver))
Serge Bazanski1fb2b102023-04-06 10:13:46 +0200257 s.clusterDirectorySaved.Set(true)
Mateusz Zalega2930e992022-04-25 12:52:35 +0200258}
259
Serge Bazanski0d937772021-06-17 15:54:40 +0200260// Run the Role Server service, which uses intermediary workload launchers to
261// start/stop subordinate services as the Node's roles change.
262func (s *Service) Run(ctx context.Context) error {
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100263 supervisor.Run(ctx, "controlplane", s.controlPlane.run)
264 supervisor.Run(ctx, "kubernetes", s.kubernetes.run)
265 supervisor.Run(ctx, "statuspush", s.statusPush.run)
Mateusz Zalega32b19292022-05-17 13:26:55 +0200266 supervisor.Run(ctx, "heartbeat", s.heartbeat.run)
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100267 supervisor.Run(ctx, "rolefetch", s.rolefetch.run)
Serge Bazanskib40c0082023-03-29 14:28:04 +0200268 supervisor.Run(ctx, "nodemgmt", s.nodeMgmt.run)
Serge Bazanski79208522023-03-28 20:14:58 +0200269 supervisor.Run(ctx, "clusternet", s.clusternet.run)
Serge Bazanski1fb2b102023-04-06 10:13:46 +0200270 supervisor.Run(ctx, "hostsfile", s.hostsfile.run)
Serge Bazanski54e212a2023-06-14 13:45:11 +0200271 supervisor.Run(ctx, "metrics", s.metrics.run)
Serge Bazanski0d937772021-06-17 15:54:40 +0200272 supervisor.Signal(ctx, supervisor.SignalHealthy)
273
Serge Bazanski6dff6d62022-01-28 18:15:14 +0100274 <-ctx.Done()
275 return ctx.Err()
Serge Bazanski0d937772021-06-17 15:54:40 +0200276}