// Copyright 2020 The Monogon Project Authors.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cluster

import (
	"context"
	"errors"
	"fmt"
	"io/ioutil"
	"sync"

	"google.golang.org/protobuf/proto"

	"source.monogon.dev/metropolis/node/core/consensus"
	"source.monogon.dev/metropolis/node/core/consensus/client"
	"source.monogon.dev/metropolis/node/core/localstorage"
	"source.monogon.dev/metropolis/node/core/network"
	"source.monogon.dev/metropolis/pkg/event"
	"source.monogon.dev/metropolis/pkg/supervisor"
	apb "source.monogon.dev/metropolis/proto/api"
	ppb "source.monogon.dev/metropolis/proto/private"
)

// Status is returned to Cluster clients (i.e., node code) on Manager.Watch/.Get.
type Status struct {
	// State is the current state of the cluster, as seen by the node.
	State ClusterState
	// Node is the configuration of this node in the cluster.
	Node *Node

	consensusClient client.Namespaced
}

// ConsensusUser is the top-level user of an etcd client in Metropolis node
// code. These need to be defined ahead of time in a Go 'enum', and different
// ConsensusUsers should not be shared by different codepaths.
type ConsensusUser string

const (
	ConsensusUserKubernetesPKI ConsensusUser = "kubernetes-pki"
)

// ConsensusClient returns an etcd/consensus client for a given ConsensusUser.
func (s *Status) ConsensusClient(user ConsensusUser) (client.Namespaced, error) {
	// Ensure that we are already connected to etcd and are in a state in which
	// we should be handing out cluster connectivity.
	if s.consensusClient == nil {
		return nil, fmt.Errorf("not connected")
	}
	switch s.State {
	case ClusterHome:
	case ClusterSplit:
		return nil, fmt.Errorf("refusing connection with cluster state %v", s.State)
	default:
	}

	// Ensure only defined 'applications' are used to prevent programmer error and
	// casting to ConsensusUser from an arbitrary string.
	switch user {
	case ConsensusUserKubernetesPKI:
	default:
		return nil, fmt.Errorf("unknown ConsensusUser %q", user)
	}
	client, err := s.consensusClient.Sub(string(user))
	if err != nil {
		return nil, fmt.Errorf("retrieving subclient failed: %w", err)
	}
	return client, nil
}
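
// A minimal usage sketch (hypothetical caller code; error handling is
// elided, and a Watcher w obtained from Manager.Watch is assumed):
//
//	status, _ := w.GetHome(ctx)
//	kv, err := status.ConsensusClient(ConsensusUserKubernetesPKI)
//	if err != nil {
//		// The cluster is split or the node is not yet connected.
//		return err
//	}
//	// kv is an etcd client namespaced to the kubernetes-pki prefix.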

// state is the internal state of the Manager, protected by its mutex.
type state struct {
	mu sync.RWMutex

	// oneway is set once Run has been started, as the Manager is one-shot.
	oneway       bool
	stateCluster ClusterState
	stateNode    ppb.Node_FSMState

	configuration *ppb.SealedConfiguration
}

// Watcher allows cluster Status updates to be observed by node code. It wraps
// an event.Watcher as returned by Manager.Watch.
type Watcher struct {
	event.Watcher
}

// Get returns the current cluster Status, blocking until one is available.
func (w *Watcher) Get(ctx context.Context) (*Status, error) {
	val, err := w.Watcher.Get(ctx)
	if err != nil {
		return nil, err
	}
	status := val.(Status)
	return &status, nil
}

// GetHome waits until the cluster, from the point of view of this node, is in
// the ClusterHome state. This can be used to wait for the cluster manager to
// 'settle' before clients start more node services.
func (w *Watcher) GetHome(ctx context.Context) (*Status, error) {
	for {
		status, err := w.Get(ctx)
		if err != nil {
			return nil, err
		}
		switch status.State {
		case ClusterHome:
			return status, nil
		case ClusterDisowning:
			return nil, fmt.Errorf("the cluster has disowned this node")
		}
	}
}

// Watch returns a Watcher that can be used to observe the cluster Status as
// seen by this node.
func (m *Manager) Watch() Watcher {
	return Watcher{
		Watcher: m.status.Watch(),
	}
}
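
// A minimal sketch of observing Status updates from a node service (the
// enclosing runnable and the manager variable are assumed):
//
//	w := manager.Watch()
//	for {
//		status, err := w.Get(ctx)
//		if err != nil {
//			return err // usually context cancellation
//		}
//		supervisor.Logger(ctx).Infof("cluster state: %v", status.State)
//	}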

// Manager is the cluster manager of a node. It is a one-shot service that
// brings this node into a cluster (by bootstrapping, registering or joining)
// and publishes the resulting cluster Status.
type Manager struct {
	storageRoot    *localstorage.Root
	networkService *network.Service
	status         event.MemoryValue

	state

	// consensus is the spawned etcd/consensus service, if the Manager brought
	// up a Node that should run one.
	consensus *consensus.Service
}

// NewManager creates a new cluster Manager. The given localstorage Root must
// be placed, but not yet started (it will be started as the Manager makes
// progress). The given network Service must already be running.
func NewManager(storageRoot *localstorage.Root, networkService *network.Service) *Manager {
	return &Manager{
		storageRoot:    storageRoot,
		networkService: networkService,

		state: state{
			stateCluster: ClusterUnknown,
			stateNode:    ppb.Node_FSM_STATE_INVALID,
		},
	}
}
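
// A minimal sketch of wiring the Manager into a supervision tree (assumes a
// supervisor root runnable in which the network service is already running):
//
//	m := cluster.NewManager(storageRoot, networkService)
//	if err := supervisor.Run(ctx, "cluster-manager", m.Run); err != nil {
//		return err
//	}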

// lock takes the state lock for writing and returns the state alongside an
// unlock function.
func (m *Manager) lock() (*state, func()) {
	m.mu.Lock()
	return &m.state, m.mu.Unlock
}

// rlock takes the state lock for reading and returns the state alongside an
// unlock function.
func (m *Manager) rlock() (*state, func()) {
	m.mu.RLock()
	return &m.state, m.mu.RUnlock
}

// Run is the runnable of the Manager, to be started using the Supervisor. It
// is one-shot, and should not be restarted.
func (m *Manager) Run(ctx context.Context) error {
	state, unlock := m.lock()
	if state.oneway {
		unlock()
		// TODO(q3k): restart the entire system if this happens
		return fmt.Errorf("cannot restart cluster manager")
	}
	state.oneway = true
	unlock()

	configuration, err := m.storageRoot.ESP.SealedConfiguration.Unseal()
	if err == nil {
		supervisor.Logger(ctx).Info("Sealed configuration present, attempting to join cluster")
		return m.join(ctx, configuration)
	}

	if !errors.Is(err, localstorage.ErrNoSealed) {
		return fmt.Errorf("unexpected sealed config error: %w", err)
	}

	supervisor.Logger(ctx).Info("No sealed configuration, looking for node parameters")

	params, err := m.nodeParams(ctx)
	if err != nil {
		return fmt.Errorf("no parameters available: %w", err)
	}

	switch inner := params.Cluster.(type) {
	case *apb.NodeParameters_ClusterBootstrap_:
		return m.bootstrap(ctx, inner.ClusterBootstrap)
	case *apb.NodeParameters_ClusterRegister_:
		return m.register(ctx, inner.ClusterRegister)
	default:
		return fmt.Errorf("node parameters misconfigured: neither cluster_bootstrap nor cluster_register set")
	}
}
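
// A sketch of constructing the NodeParameters message that selects the
// bootstrap path above (the exact fields of ClusterBootstrap are defined in
// metropolis/proto/api and elided here):
//
//	params := &apb.NodeParameters{
//		Cluster: &apb.NodeParameters_ClusterBootstrap_{
//			ClusterBootstrap: &apb.NodeParameters_ClusterBootstrap{},
//		},
//	}
//	raw, err := proto.Marshal(params)
//	// raw can then be delivered via qemu fwcfg or the ESP (see nodeParams).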

func (m *Manager) register(ctx context.Context, register *apb.NodeParameters_ClusterRegister) error {
	return fmt.Errorf("unimplemented")
}

func (m *Manager) nodeParamsFWCFG(ctx context.Context) (*apb.NodeParameters, error) {
	bytes, err := ioutil.ReadFile("/sys/firmware/qemu_fw_cfg/by_name/dev.monogon.metropolis/parameters.pb/raw")
	if err != nil {
		return nil, fmt.Errorf("could not read firmware enrolment file: %w", err)
	}

	config := apb.NodeParameters{}
	err = proto.Unmarshal(bytes, &config)
	if err != nil {
		return nil, fmt.Errorf("could not unmarshal: %w", err)
	}

	return &config, nil
}
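
// For local development, the file read above can be injected with qemu's
// fw_cfg device (a sketch; the guest sysfs path maps to the fw_cfg item
// name, and the host-side file name is assumed):
//
//	qemu-system-x86_64 ... \
//	    -fw_cfg name=dev.monogon.metropolis/parameters.pb,file=parameters.pb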

func (m *Manager) nodeParams(ctx context.Context) (*apb.NodeParameters, error) {
	// Retrieve node parameters from qemu's fwcfg interface or ESP.
	// TODO(q3k): probably abstract this away and implement per platform/build/...
	paramsFWCFG, err := m.nodeParamsFWCFG(ctx)
	if err != nil {
		supervisor.Logger(ctx).Warningf("Could not retrieve node parameters from qemu fwcfg: %v", err)
		paramsFWCFG = nil
	} else {
		supervisor.Logger(ctx).Infof("Retrieved node parameters from qemu fwcfg")
	}
	paramsESP, err := m.storageRoot.ESP.NodeParameters.Unmarshal()
	if err != nil {
		supervisor.Logger(ctx).Warningf("Could not retrieve node parameters from ESP: %v", err)
		paramsESP = nil
	} else {
		supervisor.Logger(ctx).Infof("Retrieved node parameters from ESP")
	}
	if paramsFWCFG == nil && paramsESP == nil {
		return nil, fmt.Errorf("could not find node parameters in ESP or qemu fwcfg")
	}
	if paramsFWCFG != nil && paramsESP != nil {
		supervisor.Logger(ctx).Warningf("Node parameters found in both ESP and qemu fwcfg, using the latter")
		return paramsFWCFG, nil
	} else if paramsFWCFG != nil {
		return paramsFWCFG, nil
	} else {
		return paramsESP, nil
	}
}

func (m *Manager) join(ctx context.Context, cfg *ppb.SealedConfiguration) error {
	return fmt.Errorf("unimplemented")
}

// Node returns the Node that the Manager brought into a cluster, or nil if the
// Manager is not running. This is safe to call from any goroutine.
func (m *Manager) Node() *Node {
	return nil
}
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200265}