blob: 3ff1ad40f866d6c10e83392735d714a985299a73 [file] [log] [blame]
Serge Bazanski42e61c62021-03-18 15:07:18 +01001// Copyright 2020 The Monogon Project Authors.
2//
3// SPDX-License-Identifier: Apache-2.0
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
// Package cluster implements low-level clustering logic, especially logic
// related to bootstrapping, registering into and joining a cluster. Its goal
// is to provide the rest of the node code with the following:
//   - A mounted plaintext storage.
//   - Node credentials/identity.
//   - A locally running etcd server if the node is supposed to run one, and a
//     client connection to that etcd cluster if so.
//   - The state of the cluster as seen by the node, to enable code to respond to
//     node lifecycle changes.
Serge Bazanski42e61c62021-03-18 15:07:18 +010026package cluster
27
28import (
Serge Bazanskia959cbd2021-06-17 15:56:51 +020029 "context"
Leopold Schabela5545282021-12-04 23:29:44 +010030 "encoding/base64"
Serge Bazanskia959cbd2021-06-17 15:56:51 +020031 "errors"
Serge Bazanski42e61c62021-03-18 15:07:18 +010032 "fmt"
Leopold Schabela5545282021-12-04 23:29:44 +010033 "io"
34 "net/http"
Lorenz Brun764a2de2021-11-22 16:26:36 +010035 "os"
Serge Bazanskia959cbd2021-06-17 15:56:51 +020036 "sync"
Serge Bazanski42e61c62021-03-18 15:07:18 +010037
Leopold Schabela5545282021-12-04 23:29:44 +010038 "github.com/cenkalti/backoff/v4"
Serge Bazanskia959cbd2021-06-17 15:56:51 +020039 "google.golang.org/protobuf/proto"
40
41 "source.monogon.dev/metropolis/node/core/consensus"
42 "source.monogon.dev/metropolis/node/core/localstorage"
43 "source.monogon.dev/metropolis/node/core/network"
44 "source.monogon.dev/metropolis/pkg/event/memory"
45 "source.monogon.dev/metropolis/pkg/supervisor"
46 apb "source.monogon.dev/metropolis/proto/api"
47 ppb "source.monogon.dev/metropolis/proto/private"
Serge Bazanski42e61c62021-03-18 15:07:18 +010048)
49
Serge Bazanskia959cbd2021-06-17 15:56:51 +020050type state struct {
51 mu sync.RWMutex
Serge Bazanski42e61c62021-03-18 15:07:18 +010052
Serge Bazanskia959cbd2021-06-17 15:56:51 +020053 oneway bool
Serge Bazanski42e61c62021-03-18 15:07:18 +010054
Serge Bazanskia959cbd2021-06-17 15:56:51 +020055 configuration *ppb.SealedConfiguration
Serge Bazanski42e61c62021-03-18 15:07:18 +010056}
57
Serge Bazanskia959cbd2021-06-17 15:56:51 +020058type Manager struct {
59 storageRoot *localstorage.Root
60 networkService *network.Service
61 status memory.Value
62
63 state
64
65 // consensus is the spawned etcd/consensus service, if the Manager brought
66 // up a Node that should run one.
67 consensus *consensus.Service
68}
69
70// NewManager creates a new cluster Manager. The given localstorage Root must
71// be places, but not yet started (and will be started as the Manager makes
72// progress). The given network Service must already be running.
73func NewManager(storageRoot *localstorage.Root, networkService *network.Service) *Manager {
74 return &Manager{
75 storageRoot: storageRoot,
76 networkService: networkService,
77
78 state: state{},
79 }
80}
81
82func (m *Manager) lock() (*state, func()) {
83 m.mu.Lock()
84 return &m.state, m.mu.Unlock
85}
86
87func (m *Manager) rlock() (*state, func()) {
88 m.mu.RLock()
89 return &m.state, m.mu.RUnlock
90}
91
92// Run is the runnable of the Manager, to be started using the Supervisor. It
93// is one-shot, and should not be restarted.
94func (m *Manager) Run(ctx context.Context) error {
95 state, unlock := m.lock()
96 if state.oneway {
97 unlock()
98 // TODO(q3k): restart the entire system if this happens
99 return fmt.Errorf("cannot restart cluster manager")
100 }
101 state.oneway = true
102 unlock()
103
104 configuration, err := m.storageRoot.ESP.SealedConfiguration.Unseal()
105 if err == nil {
106 supervisor.Logger(ctx).Info("Sealed configuration present. attempting to join cluster")
107 return m.join(ctx, configuration)
108 }
109
110 if !errors.Is(err, localstorage.ErrNoSealed) {
111 return fmt.Errorf("unexpected sealed config error: %w", err)
112 }
113
114 supervisor.Logger(ctx).Info("No sealed configuration, looking for node parameters")
115
116 params, err := m.nodeParams(ctx)
117 if err != nil {
118 return fmt.Errorf("no parameters available: %w", err)
119 }
120
121 switch inner := params.Cluster.(type) {
122 case *apb.NodeParameters_ClusterBootstrap_:
123 return m.bootstrap(ctx, inner.ClusterBootstrap)
124 case *apb.NodeParameters_ClusterRegister_:
125 return m.register(ctx, inner.ClusterRegister)
126 default:
127 return fmt.Errorf("node parameters misconfigured: neither cluster_bootstrap nor cluster_register set")
128 }
129}
130
131func (m *Manager) register(ctx context.Context, bootstrap *apb.NodeParameters_ClusterRegister) error {
132 return fmt.Errorf("unimplemented")
133}
134
135func (m *Manager) nodeParamsFWCFG(ctx context.Context) (*apb.NodeParameters, error) {
Lorenz Brun764a2de2021-11-22 16:26:36 +0100136 bytes, err := os.ReadFile("/sys/firmware/qemu_fw_cfg/by_name/dev.monogon.metropolis/parameters.pb/raw")
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200137 if err != nil {
138 return nil, fmt.Errorf("could not read firmware enrolment file: %w", err)
139 }
140
141 config := apb.NodeParameters{}
142 err = proto.Unmarshal(bytes, &config)
143 if err != nil {
144 return nil, fmt.Errorf("could not unmarshal: %v", err)
145 }
146
147 return &config, nil
148}
149
Leopold Schabela5545282021-12-04 23:29:44 +0100150// nodeParamsGCPMetadata attempts to retrieve the node parameters from the
151// GCP metadata service. Returns nil if the metadata service is available,
152// but no node parameters are specified.
153func (m *Manager) nodeParamsGCPMetadata(ctx context.Context) (*apb.NodeParameters, error) {
154 const metadataURL = "http://169.254.169.254/computeMetadata/v1/instance/attributes/metropolis-node-params"
155 req, err := http.NewRequestWithContext(ctx, "GET", metadataURL, nil)
156 if err != nil {
157 return nil, fmt.Errorf("could not create request: %w", err)
158 }
159 req.Header.Set("Metadata-Flavor", "Google")
160 resp, err := http.DefaultClient.Do(req)
161 if err != nil {
162 return nil, fmt.Errorf("HTTP request failed: %w", err)
163 }
164 defer resp.Body.Close()
165 if resp.StatusCode != http.StatusOK {
166 if resp.StatusCode == http.StatusNotFound {
167 return nil, nil
168 }
169 return nil, fmt.Errorf("non-200 status code: %d", resp.StatusCode)
170 }
171 decoded, err := io.ReadAll(base64.NewDecoder(base64.StdEncoding, resp.Body))
172 if err != nil {
173 return nil, fmt.Errorf("cannot decode base64: %w", err)
174 }
175 config := apb.NodeParameters{}
176 err = proto.Unmarshal(decoded, &config)
177 if err != nil {
178 return nil, fmt.Errorf("failed unmarshalling NodeParameters: %w", err)
179 }
180 return &config, nil
181}
182
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200183func (m *Manager) nodeParams(ctx context.Context) (*apb.NodeParameters, error) {
Leopold Schabela5545282021-12-04 23:29:44 +0100184 boardName, err := getDMIBoardName()
185 if err != nil {
186 supervisor.Logger(ctx).Warningf("Could not get board name, cannot detect platform: %v", err)
187 }
188 supervisor.Logger(ctx).Infof("Board name: %q", boardName)
189
190 // When running on GCP, attempt to retrieve the node parameters from the
191 // metadata server first. Retry until we get a response, since we need to
192 // wait for the network service to assign an IP address first.
193 if isGCPInstance(boardName) {
194 var params *apb.NodeParameters
195 op := func() error {
196 supervisor.Logger(ctx).Info("Running on GCP, attempting to retrieve node parameters from metadata server")
197 params, err = m.nodeParamsGCPMetadata(ctx)
198 return err
199 }
200 err := backoff.Retry(op, backoff.WithContext(backoff.NewExponentialBackOff(), ctx))
201 if err != nil {
202 supervisor.Logger(ctx).Errorf("Failed to retrieve node parameters: %v", err)
203 }
204 if params != nil {
205 supervisor.Logger(ctx).Info("Retrieved parameters from GCP metadata server")
206 return params, nil
207 }
208 supervisor.Logger(ctx).Infof("\"metropolis-node-params\" metadata not found")
209 }
210
Serge Bazanskia959cbd2021-06-17 15:56:51 +0200211 // Retrieve node parameters from qemu's fwcfg interface or ESP.
212 // TODO(q3k): probably abstract this away and implement per platform/build/...
213 paramsFWCFG, err := m.nodeParamsFWCFG(ctx)
214 if err != nil {
215 supervisor.Logger(ctx).Warningf("Could not retrieve node parameters from qemu fwcfg: %v", err)
216 paramsFWCFG = nil
217 } else {
218 supervisor.Logger(ctx).Infof("Retrieved node parameters from qemu fwcfg")
219 }
220 paramsESP, err := m.storageRoot.ESP.NodeParameters.Unmarshal()
221 if err != nil {
222 supervisor.Logger(ctx).Warningf("Could not retrieve node parameters from ESP: %v", err)
223 paramsESP = nil
224 } else {
225 supervisor.Logger(ctx).Infof("Retrieved node parameters from ESP")
226 }
227 if paramsFWCFG == nil && paramsESP == nil {
228 return nil, fmt.Errorf("could not find node parameters in ESP or qemu fwcfg")
229 }
230 if paramsFWCFG != nil && paramsESP != nil {
231 supervisor.Logger(ctx).Warningf("Node parameters found both in both ESP and qemu fwcfg, using the latter")
232 return paramsFWCFG, nil
233 } else if paramsFWCFG != nil {
234 return paramsFWCFG, nil
235 } else {
236 return paramsESP, nil
237 }
238}
239
240func (m *Manager) join(ctx context.Context, cfg *ppb.SealedConfiguration) error {
241 return fmt.Errorf("unimplemented")
242}