blob: 625466954627785418be096f3f721dff44470339 [file] [log] [blame]
Serge Bazanski42e61c62021-03-18 15:07:18 +01001// Copyright 2020 The Monogon Project Authors.
2//
3// SPDX-License-Identifier: Apache-2.0
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
// Package cluster implements low-level clustering logic, especially logic
// regarding bootstrapping, registering into and joining a cluster. Its goal
// is to provide the rest of the node code with the following:
//   - A mounted plaintext storage.
//   - Node credentials/identity.
//   - A locally running etcd server if the node is supposed to run one, and a
//     client connection to that etcd cluster if so.
//   - The state of the cluster as seen by the node, to enable code to respond to
//     node lifecycle changes.
Serge Bazanski42e61c62021-03-18 15:07:18 +010026package cluster
27
28import (
Serge Bazanskia959cbd2021-06-17 15:56:51 +020029 "context"
Leopold Schabela5545282021-12-04 23:29:44 +010030 "encoding/base64"
Serge Bazanskia959cbd2021-06-17 15:56:51 +020031 "errors"
Serge Bazanski42e61c62021-03-18 15:07:18 +010032 "fmt"
Leopold Schabela5545282021-12-04 23:29:44 +010033 "io"
34 "net/http"
Lorenz Brun764a2de2021-11-22 16:26:36 +010035 "os"
Mateusz Zalega2930e992022-04-25 12:52:35 +020036 "strings"
Serge Bazanskia959cbd2021-06-17 15:56:51 +020037 "sync"
Serge Bazanski42e61c62021-03-18 15:07:18 +010038
Leopold Schabela5545282021-12-04 23:29:44 +010039 "github.com/cenkalti/backoff/v4"
Serge Bazanskia959cbd2021-06-17 15:56:51 +020040 "google.golang.org/protobuf/proto"
41
42 "source.monogon.dev/metropolis/node/core/consensus"
43 "source.monogon.dev/metropolis/node/core/localstorage"
44 "source.monogon.dev/metropolis/node/core/network"
Serge Bazanski6dff6d62022-01-28 18:15:14 +010045 "source.monogon.dev/metropolis/node/core/roleserve"
Serge Bazanskia959cbd2021-06-17 15:56:51 +020046 "source.monogon.dev/metropolis/pkg/supervisor"
47 apb "source.monogon.dev/metropolis/proto/api"
Mateusz Zalega2930e992022-04-25 12:52:35 +020048 cpb "source.monogon.dev/metropolis/proto/common"
Serge Bazanskia959cbd2021-06-17 15:56:51 +020049 ppb "source.monogon.dev/metropolis/proto/private"
Serge Bazanski42e61c62021-03-18 15:07:18 +010050)
51
Serge Bazanskia959cbd2021-06-17 15:56:51 +020052type state struct {
53 mu sync.RWMutex
Serge Bazanski42e61c62021-03-18 15:07:18 +010054
Serge Bazanskia959cbd2021-06-17 15:56:51 +020055 oneway bool
Serge Bazanski42e61c62021-03-18 15:07:18 +010056
Serge Bazanskia959cbd2021-06-17 15:56:51 +020057 configuration *ppb.SealedConfiguration
Serge Bazanski42e61c62021-03-18 15:07:18 +010058}
59
Serge Bazanskia959cbd2021-06-17 15:56:51 +020060type Manager struct {
61 storageRoot *localstorage.Root
62 networkService *network.Service
Serge Bazanski6dff6d62022-01-28 18:15:14 +010063 roleServer *roleserve.Service
Serge Bazanskia959cbd2021-06-17 15:56:51 +020064
65 state
66
67 // consensus is the spawned etcd/consensus service, if the Manager brought
68 // up a Node that should run one.
69 consensus *consensus.Service
70}
71
72// NewManager creates a new cluster Manager. The given localstorage Root must
73// be places, but not yet started (and will be started as the Manager makes
74// progress). The given network Service must already be running.
Serge Bazanski6dff6d62022-01-28 18:15:14 +010075func NewManager(storageRoot *localstorage.Root, networkService *network.Service, rs *roleserve.Service) *Manager {
Serge Bazanskia959cbd2021-06-17 15:56:51 +020076 return &Manager{
77 storageRoot: storageRoot,
78 networkService: networkService,
Serge Bazanski6dff6d62022-01-28 18:15:14 +010079 roleServer: rs,
Serge Bazanskia959cbd2021-06-17 15:56:51 +020080
81 state: state{},
82 }
83}
84
85func (m *Manager) lock() (*state, func()) {
86 m.mu.Lock()
87 return &m.state, m.mu.Unlock
88}
89
90func (m *Manager) rlock() (*state, func()) {
91 m.mu.RLock()
92 return &m.state, m.mu.RUnlock
93}
94
95// Run is the runnable of the Manager, to be started using the Supervisor. It
96// is one-shot, and should not be restarted.
97func (m *Manager) Run(ctx context.Context) error {
98 state, unlock := m.lock()
99 if state.oneway {
100 unlock()
101 // TODO(q3k): restart the entire system if this happens
102 return fmt.Errorf("cannot restart cluster manager")
103 }
104 state.oneway = true
105 unlock()
106
Lorenz Brun6c35e972021-12-14 03:08:23 +0100107 configuration, err := m.storageRoot.ESP.Metropolis.SealedConfiguration.Unseal()
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200108 if err == nil {
109 supervisor.Logger(ctx).Info("Sealed configuration present. attempting to join cluster")
Mateusz Zalega2930e992022-04-25 12:52:35 +0200110
111 // Read Cluster Directory and unmarshal it. Since the node is already
112 // registered with the cluster, the directory won't be bootstrapped from
113 // Node Parameters.
114 cd, err := m.storageRoot.ESP.Metropolis.ClusterDirectory.Unmarshal()
115 if err != nil {
116 return fmt.Errorf("while reading cluster directory: %w", err)
117 }
118 return m.join(ctx, configuration, cd)
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200119 }
120
121 if !errors.Is(err, localstorage.ErrNoSealed) {
122 return fmt.Errorf("unexpected sealed config error: %w", err)
123 }
124
125 supervisor.Logger(ctx).Info("No sealed configuration, looking for node parameters")
126
127 params, err := m.nodeParams(ctx)
128 if err != nil {
129 return fmt.Errorf("no parameters available: %w", err)
130 }
131
132 switch inner := params.Cluster.(type) {
133 case *apb.NodeParameters_ClusterBootstrap_:
Serge Bazanski5839e972021-11-16 15:46:19 +0100134 err = m.bootstrap(ctx, inner.ClusterBootstrap)
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200135 case *apb.NodeParameters_ClusterRegister_:
Serge Bazanski5839e972021-11-16 15:46:19 +0100136 err = m.register(ctx, inner.ClusterRegister)
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200137 default:
Serge Bazanski5839e972021-11-16 15:46:19 +0100138 err = fmt.Errorf("node parameters misconfigured: neither cluster_bootstrap nor cluster_register set")
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200139 }
Serge Bazanski5839e972021-11-16 15:46:19 +0100140
141 if err == nil {
142 supervisor.Logger(ctx).Info("Cluster enrolment done.")
143 }
144 return err
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200145}
146
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200147func (m *Manager) nodeParamsFWCFG(ctx context.Context) (*apb.NodeParameters, error) {
Lorenz Brun764a2de2021-11-22 16:26:36 +0100148 bytes, err := os.ReadFile("/sys/firmware/qemu_fw_cfg/by_name/dev.monogon.metropolis/parameters.pb/raw")
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200149 if err != nil {
150 return nil, fmt.Errorf("could not read firmware enrolment file: %w", err)
151 }
152
153 config := apb.NodeParameters{}
154 err = proto.Unmarshal(bytes, &config)
155 if err != nil {
156 return nil, fmt.Errorf("could not unmarshal: %v", err)
157 }
158
159 return &config, nil
160}
161
Leopold Schabela5545282021-12-04 23:29:44 +0100162// nodeParamsGCPMetadata attempts to retrieve the node parameters from the
163// GCP metadata service. Returns nil if the metadata service is available,
164// but no node parameters are specified.
165func (m *Manager) nodeParamsGCPMetadata(ctx context.Context) (*apb.NodeParameters, error) {
166 const metadataURL = "http://169.254.169.254/computeMetadata/v1/instance/attributes/metropolis-node-params"
167 req, err := http.NewRequestWithContext(ctx, "GET", metadataURL, nil)
168 if err != nil {
169 return nil, fmt.Errorf("could not create request: %w", err)
170 }
171 req.Header.Set("Metadata-Flavor", "Google")
172 resp, err := http.DefaultClient.Do(req)
173 if err != nil {
174 return nil, fmt.Errorf("HTTP request failed: %w", err)
175 }
176 defer resp.Body.Close()
177 if resp.StatusCode != http.StatusOK {
178 if resp.StatusCode == http.StatusNotFound {
179 return nil, nil
180 }
181 return nil, fmt.Errorf("non-200 status code: %d", resp.StatusCode)
182 }
183 decoded, err := io.ReadAll(base64.NewDecoder(base64.StdEncoding, resp.Body))
184 if err != nil {
185 return nil, fmt.Errorf("cannot decode base64: %w", err)
186 }
187 config := apb.NodeParameters{}
188 err = proto.Unmarshal(decoded, &config)
189 if err != nil {
190 return nil, fmt.Errorf("failed unmarshalling NodeParameters: %w", err)
191 }
192 return &config, nil
193}
194
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200195func (m *Manager) nodeParams(ctx context.Context) (*apb.NodeParameters, error) {
Leopold Schabela5545282021-12-04 23:29:44 +0100196 boardName, err := getDMIBoardName()
197 if err != nil {
198 supervisor.Logger(ctx).Warningf("Could not get board name, cannot detect platform: %v", err)
199 }
200 supervisor.Logger(ctx).Infof("Board name: %q", boardName)
201
202 // When running on GCP, attempt to retrieve the node parameters from the
203 // metadata server first. Retry until we get a response, since we need to
204 // wait for the network service to assign an IP address first.
205 if isGCPInstance(boardName) {
206 var params *apb.NodeParameters
207 op := func() error {
208 supervisor.Logger(ctx).Info("Running on GCP, attempting to retrieve node parameters from metadata server")
209 params, err = m.nodeParamsGCPMetadata(ctx)
210 return err
211 }
212 err := backoff.Retry(op, backoff.WithContext(backoff.NewExponentialBackOff(), ctx))
213 if err != nil {
214 supervisor.Logger(ctx).Errorf("Failed to retrieve node parameters: %v", err)
215 }
216 if params != nil {
217 supervisor.Logger(ctx).Info("Retrieved parameters from GCP metadata server")
218 return params, nil
219 }
220 supervisor.Logger(ctx).Infof("\"metropolis-node-params\" metadata not found")
221 }
222
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200223 // Retrieve node parameters from qemu's fwcfg interface or ESP.
224 // TODO(q3k): probably abstract this away and implement per platform/build/...
225 paramsFWCFG, err := m.nodeParamsFWCFG(ctx)
226 if err != nil {
227 supervisor.Logger(ctx).Warningf("Could not retrieve node parameters from qemu fwcfg: %v", err)
228 paramsFWCFG = nil
229 } else {
230 supervisor.Logger(ctx).Infof("Retrieved node parameters from qemu fwcfg")
231 }
Lorenz Brun6c35e972021-12-14 03:08:23 +0100232 paramsESP, err := m.storageRoot.ESP.Metropolis.NodeParameters.Unmarshal()
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200233 if err != nil {
234 supervisor.Logger(ctx).Warningf("Could not retrieve node parameters from ESP: %v", err)
235 paramsESP = nil
236 } else {
237 supervisor.Logger(ctx).Infof("Retrieved node parameters from ESP")
238 }
239 if paramsFWCFG == nil && paramsESP == nil {
240 return nil, fmt.Errorf("could not find node parameters in ESP or qemu fwcfg")
241 }
242 if paramsFWCFG != nil && paramsESP != nil {
243 supervisor.Logger(ctx).Warningf("Node parameters found both in both ESP and qemu fwcfg, using the latter")
244 return paramsFWCFG, nil
245 } else if paramsFWCFG != nil {
246 return paramsFWCFG, nil
247 } else {
248 return paramsESP, nil
249 }
250}
251
Mateusz Zalega2930e992022-04-25 12:52:35 +0200252// logClusterDirectory verbosely logs the whole Cluster Directory passed to it.
253func logClusterDirectory(ctx context.Context, cd *cpb.ClusterDirectory) {
254 for _, node := range cd.Nodes {
Mateusz Zalega2930e992022-04-25 12:52:35 +0200255 var addresses []string
256 for _, add := range node.Addresses {
257 addresses = append(addresses, add.Host)
258 }
Mateusz Zalegade821502022-04-29 16:37:17 +0200259 supervisor.Logger(ctx).Infof(" Addresses: %s", strings.Join(addresses, ","))
Mateusz Zalega2930e992022-04-25 12:52:35 +0200260 }
261}