blob: c484132d627698f8ecce294dc9ec36f616d7af0e [file] [log] [blame]
// Copyright 2020 The Monogon Project Authors.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
16
// Package cluster implements low-level clustering logic, especially logic
// related to bootstrapping, registering into and joining a cluster. Its goal
// is to provide the rest of the node code with the following:
//   - A mounted plaintext storage.
//   - Node credentials/identity.
//   - A locally running etcd server if the node is supposed to run one, and a
//     client connection to that etcd cluster if so.
//   - The state of the cluster as seen by the node, to enable code to respond
//     to node lifecycle changes.
Serge Bazanski42e61c62021-03-18 15:07:18 +010026package cluster
27
28import (
Serge Bazanskia959cbd2021-06-17 15:56:51 +020029 "context"
Leopold Schabela5545282021-12-04 23:29:44 +010030 "encoding/base64"
Serge Bazanskia959cbd2021-06-17 15:56:51 +020031 "errors"
Serge Bazanski42e61c62021-03-18 15:07:18 +010032 "fmt"
Leopold Schabela5545282021-12-04 23:29:44 +010033 "io"
34 "net/http"
Lorenz Brun764a2de2021-11-22 16:26:36 +010035 "os"
Mateusz Zalega2930e992022-04-25 12:52:35 +020036 "strings"
Serge Bazanski42e61c62021-03-18 15:07:18 +010037
Leopold Schabela5545282021-12-04 23:29:44 +010038 "github.com/cenkalti/backoff/v4"
Serge Bazanskia959cbd2021-06-17 15:56:51 +020039 "google.golang.org/protobuf/proto"
40
Serge Bazanskia959cbd2021-06-17 15:56:51 +020041 "source.monogon.dev/metropolis/node/core/localstorage"
42 "source.monogon.dev/metropolis/node/core/network"
Serge Bazanski6dff6d62022-01-28 18:15:14 +010043 "source.monogon.dev/metropolis/node/core/roleserve"
Serge Bazanskia959cbd2021-06-17 15:56:51 +020044 "source.monogon.dev/metropolis/pkg/supervisor"
45 apb "source.monogon.dev/metropolis/proto/api"
Mateusz Zalega2930e992022-04-25 12:52:35 +020046 cpb "source.monogon.dev/metropolis/proto/common"
Serge Bazanski42e61c62021-03-18 15:07:18 +010047)
48
Serge Bazanskia959cbd2021-06-17 15:56:51 +020049type Manager struct {
50 storageRoot *localstorage.Root
51 networkService *network.Service
Serge Bazanski6dff6d62022-01-28 18:15:14 +010052 roleServer *roleserve.Service
Serge Bazanskia959cbd2021-06-17 15:56:51 +020053
Serge Bazanskife5192d2023-03-16 11:33:56 +010054 oneway chan struct{}
Serge Bazanskia959cbd2021-06-17 15:56:51 +020055}
56
57// NewManager creates a new cluster Manager. The given localstorage Root must
58// be places, but not yet started (and will be started as the Manager makes
59// progress). The given network Service must already be running.
Serge Bazanski6dff6d62022-01-28 18:15:14 +010060func NewManager(storageRoot *localstorage.Root, networkService *network.Service, rs *roleserve.Service) *Manager {
Serge Bazanskia959cbd2021-06-17 15:56:51 +020061 return &Manager{
62 storageRoot: storageRoot,
63 networkService: networkService,
Serge Bazanski6dff6d62022-01-28 18:15:14 +010064 roleServer: rs,
Serge Bazanskife5192d2023-03-16 11:33:56 +010065 oneway: make(chan struct{}),
Serge Bazanskia959cbd2021-06-17 15:56:51 +020066 }
67}
68
Serge Bazanskia959cbd2021-06-17 15:56:51 +020069// Run is the runnable of the Manager, to be started using the Supervisor. It
70// is one-shot, and should not be restarted.
71func (m *Manager) Run(ctx context.Context) error {
Serge Bazanskife5192d2023-03-16 11:33:56 +010072 select {
73 case <-m.oneway:
Serge Bazanskia959cbd2021-06-17 15:56:51 +020074 return fmt.Errorf("cannot restart cluster manager")
Serge Bazanskife5192d2023-03-16 11:33:56 +010075 default:
Serge Bazanskia959cbd2021-06-17 15:56:51 +020076 }
Serge Bazanskife5192d2023-03-16 11:33:56 +010077 close(m.oneway)
Serge Bazanskia959cbd2021-06-17 15:56:51 +020078
Lorenz Brun6c35e972021-12-14 03:08:23 +010079 configuration, err := m.storageRoot.ESP.Metropolis.SealedConfiguration.Unseal()
Serge Bazanskia959cbd2021-06-17 15:56:51 +020080 if err == nil {
81 supervisor.Logger(ctx).Info("Sealed configuration present. attempting to join cluster")
Mateusz Zalega2930e992022-04-25 12:52:35 +020082
83 // Read Cluster Directory and unmarshal it. Since the node is already
84 // registered with the cluster, the directory won't be bootstrapped from
85 // Node Parameters.
86 cd, err := m.storageRoot.ESP.Metropolis.ClusterDirectory.Unmarshal()
87 if err != nil {
88 return fmt.Errorf("while reading cluster directory: %w", err)
89 }
90 return m.join(ctx, configuration, cd)
Serge Bazanskia959cbd2021-06-17 15:56:51 +020091 }
92
93 if !errors.Is(err, localstorage.ErrNoSealed) {
94 return fmt.Errorf("unexpected sealed config error: %w", err)
95 }
96
97 supervisor.Logger(ctx).Info("No sealed configuration, looking for node parameters")
98
99 params, err := m.nodeParams(ctx)
100 if err != nil {
101 return fmt.Errorf("no parameters available: %w", err)
102 }
103
104 switch inner := params.Cluster.(type) {
105 case *apb.NodeParameters_ClusterBootstrap_:
Serge Bazanski5839e972021-11-16 15:46:19 +0100106 err = m.bootstrap(ctx, inner.ClusterBootstrap)
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200107 case *apb.NodeParameters_ClusterRegister_:
Serge Bazanski5839e972021-11-16 15:46:19 +0100108 err = m.register(ctx, inner.ClusterRegister)
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200109 default:
Serge Bazanski5839e972021-11-16 15:46:19 +0100110 err = fmt.Errorf("node parameters misconfigured: neither cluster_bootstrap nor cluster_register set")
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200111 }
Serge Bazanski5839e972021-11-16 15:46:19 +0100112
113 if err == nil {
114 supervisor.Logger(ctx).Info("Cluster enrolment done.")
Serge Bazanskife5192d2023-03-16 11:33:56 +0100115 return nil
Serge Bazanski5839e972021-11-16 15:46:19 +0100116 }
117 return err
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200118}
119
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200120func (m *Manager) nodeParamsFWCFG(ctx context.Context) (*apb.NodeParameters, error) {
Lorenz Brun764a2de2021-11-22 16:26:36 +0100121 bytes, err := os.ReadFile("/sys/firmware/qemu_fw_cfg/by_name/dev.monogon.metropolis/parameters.pb/raw")
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200122 if err != nil {
123 return nil, fmt.Errorf("could not read firmware enrolment file: %w", err)
124 }
125
126 config := apb.NodeParameters{}
127 err = proto.Unmarshal(bytes, &config)
128 if err != nil {
129 return nil, fmt.Errorf("could not unmarshal: %v", err)
130 }
131
132 return &config, nil
133}
134
Leopold Schabela5545282021-12-04 23:29:44 +0100135// nodeParamsGCPMetadata attempts to retrieve the node parameters from the
136// GCP metadata service. Returns nil if the metadata service is available,
137// but no node parameters are specified.
138func (m *Manager) nodeParamsGCPMetadata(ctx context.Context) (*apb.NodeParameters, error) {
139 const metadataURL = "http://169.254.169.254/computeMetadata/v1/instance/attributes/metropolis-node-params"
140 req, err := http.NewRequestWithContext(ctx, "GET", metadataURL, nil)
141 if err != nil {
142 return nil, fmt.Errorf("could not create request: %w", err)
143 }
144 req.Header.Set("Metadata-Flavor", "Google")
145 resp, err := http.DefaultClient.Do(req)
146 if err != nil {
147 return nil, fmt.Errorf("HTTP request failed: %w", err)
148 }
149 defer resp.Body.Close()
150 if resp.StatusCode != http.StatusOK {
151 if resp.StatusCode == http.StatusNotFound {
152 return nil, nil
153 }
154 return nil, fmt.Errorf("non-200 status code: %d", resp.StatusCode)
155 }
156 decoded, err := io.ReadAll(base64.NewDecoder(base64.StdEncoding, resp.Body))
157 if err != nil {
158 return nil, fmt.Errorf("cannot decode base64: %w", err)
159 }
160 config := apb.NodeParameters{}
161 err = proto.Unmarshal(decoded, &config)
162 if err != nil {
163 return nil, fmt.Errorf("failed unmarshalling NodeParameters: %w", err)
164 }
165 return &config, nil
166}
167
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200168func (m *Manager) nodeParams(ctx context.Context) (*apb.NodeParameters, error) {
Leopold Schabela5545282021-12-04 23:29:44 +0100169 boardName, err := getDMIBoardName()
170 if err != nil {
171 supervisor.Logger(ctx).Warningf("Could not get board name, cannot detect platform: %v", err)
172 }
173 supervisor.Logger(ctx).Infof("Board name: %q", boardName)
174
175 // When running on GCP, attempt to retrieve the node parameters from the
176 // metadata server first. Retry until we get a response, since we need to
177 // wait for the network service to assign an IP address first.
178 if isGCPInstance(boardName) {
179 var params *apb.NodeParameters
180 op := func() error {
181 supervisor.Logger(ctx).Info("Running on GCP, attempting to retrieve node parameters from metadata server")
182 params, err = m.nodeParamsGCPMetadata(ctx)
183 return err
184 }
185 err := backoff.Retry(op, backoff.WithContext(backoff.NewExponentialBackOff(), ctx))
186 if err != nil {
187 supervisor.Logger(ctx).Errorf("Failed to retrieve node parameters: %v", err)
188 }
189 if params != nil {
190 supervisor.Logger(ctx).Info("Retrieved parameters from GCP metadata server")
191 return params, nil
192 }
193 supervisor.Logger(ctx).Infof("\"metropolis-node-params\" metadata not found")
194 }
195
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200196 // Retrieve node parameters from qemu's fwcfg interface or ESP.
197 // TODO(q3k): probably abstract this away and implement per platform/build/...
198 paramsFWCFG, err := m.nodeParamsFWCFG(ctx)
199 if err != nil {
200 supervisor.Logger(ctx).Warningf("Could not retrieve node parameters from qemu fwcfg: %v", err)
201 paramsFWCFG = nil
202 } else {
203 supervisor.Logger(ctx).Infof("Retrieved node parameters from qemu fwcfg")
204 }
Lorenz Brun6c35e972021-12-14 03:08:23 +0100205 paramsESP, err := m.storageRoot.ESP.Metropolis.NodeParameters.Unmarshal()
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200206 if err != nil {
207 supervisor.Logger(ctx).Warningf("Could not retrieve node parameters from ESP: %v", err)
208 paramsESP = nil
209 } else {
210 supervisor.Logger(ctx).Infof("Retrieved node parameters from ESP")
211 }
212 if paramsFWCFG == nil && paramsESP == nil {
213 return nil, fmt.Errorf("could not find node parameters in ESP or qemu fwcfg")
214 }
215 if paramsFWCFG != nil && paramsESP != nil {
216 supervisor.Logger(ctx).Warningf("Node parameters found both in both ESP and qemu fwcfg, using the latter")
217 return paramsFWCFG, nil
218 } else if paramsFWCFG != nil {
219 return paramsFWCFG, nil
220 } else {
221 return paramsESP, nil
222 }
223}
224
Mateusz Zalega2930e992022-04-25 12:52:35 +0200225// logClusterDirectory verbosely logs the whole Cluster Directory passed to it.
226func logClusterDirectory(ctx context.Context, cd *cpb.ClusterDirectory) {
227 for _, node := range cd.Nodes {
Mateusz Zalega2930e992022-04-25 12:52:35 +0200228 var addresses []string
229 for _, add := range node.Addresses {
230 addresses = append(addresses, add.Host)
231 }
Mateusz Zalegade821502022-04-29 16:37:17 +0200232 supervisor.Logger(ctx).Infof(" Addresses: %s", strings.Join(addresses, ","))
Mateusz Zalega2930e992022-04-25 12:52:35 +0200233 }
234}