blob: cc98d8e31eb6e8f590dd983c20c6ddd4e3a34d42 [file] [log] [blame]
Serge Bazanski1ebd1e12020-07-13 19:17:16 +02001// Copyright 2020 The Monogon Project Authors.
2//
3// SPDX-License-Identifier: Apache-2.0
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
17package cluster
18
19import (
20 "context"
Serge Bazanski42e61c62021-03-18 15:07:18 +010021 "errors"
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020022 "fmt"
23 "io/ioutil"
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020024 "sync"
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020025
Serge Bazanski0ed2f962021-03-15 16:39:30 +010026 "google.golang.org/protobuf/proto"
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020027
Serge Bazanski31370b02021-01-07 16:31:14 +010028 "source.monogon.dev/metropolis/node/core/consensus"
Serge Bazanskia105db52021-04-12 19:57:46 +020029 "source.monogon.dev/metropolis/node/core/consensus/client"
Serge Bazanski31370b02021-01-07 16:31:14 +010030 "source.monogon.dev/metropolis/node/core/localstorage"
Serge Bazanski31370b02021-01-07 16:31:14 +010031 "source.monogon.dev/metropolis/node/core/network"
Serge Bazanskia105db52021-04-12 19:57:46 +020032 "source.monogon.dev/metropolis/pkg/event"
Serge Bazanski68ca5ee2021-04-27 16:09:16 +020033 "source.monogon.dev/metropolis/pkg/event/memory"
Serge Bazanski31370b02021-01-07 16:31:14 +010034 "source.monogon.dev/metropolis/pkg/supervisor"
35 apb "source.monogon.dev/metropolis/proto/api"
Serge Bazanski42e61c62021-03-18 15:07:18 +010036 ppb "source.monogon.dev/metropolis/proto/private"
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020037)
38
// Status is returned to Cluster clients (i.e., node code) on Manager.Watch/.Get.
type Status struct {
	// State is the current state of the cluster, as seen by the node.
	State ClusterState
	// Node is the configuration of this node in the cluster.
	Node *Node

	// consensusClient is the node's namespaced etcd client, nil until the node
	// is connected to the cluster's consensus. Handed out (sub-namespaced) to
	// consumers via ConsensusClient.
	consensusClient client.Namespaced
}
48
// ConsensusUser is the top-level user of an etcd client in Metropolis node
// code. These need to be defined ahead of time in a Go 'enum', and different
// ConsensusUsers should not be shared by different codepaths.
type ConsensusUser string

const (
	ConsensusUserKubernetesPKI ConsensusUser = "kubernetes-pki"
	ConsensusUserCurator       ConsensusUser = "curator"
)
58
59// ConsensusClient returns an etcd/consensus client for a given ConsensusUser.
60func (s *Status) ConsensusClient(user ConsensusUser) (client.Namespaced, error) {
61 // Ensure that we already are connected to etcd and are in a state in which we
62 // should be handing out cluster connectivity.
63 if s.consensusClient == nil {
64 return nil, fmt.Errorf("not connected")
65 }
66 switch s.State {
67 case ClusterHome:
68 case ClusterSplit:
69 return nil, fmt.Errorf("refusing connection with cluster state %v", s.State)
70 default:
71 }
72
73 // Ensure only defined 'applications' are used to prevent programmer error and
74 // casting to ConsensusUser from an arbitrary string.
75 switch user {
76 case ConsensusUserKubernetesPKI:
Serge Bazanski76003f82021-06-17 16:39:01 +020077 case ConsensusUserCurator:
Serge Bazanskia105db52021-04-12 19:57:46 +020078 default:
79 return nil, fmt.Errorf("unknown ConsensusUser %q", user)
80 }
81 client, err := s.consensusClient.Sub(string(user))
82 if err != nil {
83 return nil, fmt.Errorf("retrieving subclient failed: %w", err)
84 }
85 return client, nil
Serge Bazanski42e61c62021-03-18 15:07:18 +010086}
87
// state is the internal, mutex-guarded state of a Manager.
type state struct {
	// mu guards all fields below.
	mu sync.RWMutex

	// oneway is set once Run has been started; the Manager is one-shot and
	// must not be restarted afterwards.
	// stateCluster is the current cluster state as seen by this node.
	// stateNode is the node's FSM state (ppb.Node_FSMState).
	oneway       bool
	stateCluster ClusterState
	stateNode    ppb.Node_FSMState

	// configuration is this node's sealed configuration, if available.
	configuration *ppb.SealedConfiguration
}
97
// Watcher wraps an event.Watcher to provide typed access to the Status values
// emitted by a Manager.
type Watcher struct {
	event.Watcher
}
101
102func (w *Watcher) Get(ctx context.Context) (*Status, error) {
103 val, err := w.Watcher.Get(ctx)
104 if err != nil {
105 return nil, err
Serge Bazanski42e61c62021-03-18 15:07:18 +0100106 }
Serge Bazanskia105db52021-04-12 19:57:46 +0200107 status := val.(Status)
108 return &status, err
109}
110
111// GetHome waits until the cluster, from the point of view of this node, is in
112// the ClusterHome state. This can be used to wait for the cluster manager to
113// 'settle', before clients start more node services.
114func (w *Watcher) GetHome(ctx context.Context) (*Status, error) {
115 for {
116 status, err := w.Get(ctx)
117 if err != nil {
118 return nil, err
119 }
120 switch status.State {
121 case ClusterHome:
122 return status, nil
123 case ClusterDisowning:
124 return nil, fmt.Errorf("the cluster has disowned this node")
125 }
Serge Bazanski42e61c62021-03-18 15:07:18 +0100126 }
Serge Bazanskia105db52021-04-12 19:57:46 +0200127}
128
// Watch returns a Watcher which can be used to observe Status updates emitted
// by this Manager.
func (m *Manager) Watch() Watcher {
	return Watcher{
		Watcher: m.status.Watch(),
	}
}
134
// Manager brings this node into a cluster, by bootstrapping a new cluster,
// registering into an existing one, or rejoining based on sealed
// configuration. Progress is emitted as Status values via Watch.
type Manager struct {
	storageRoot    *localstorage.Root
	networkService *network.Service
	// status is the event value through which Status updates are published to
	// Watchers.
	status memory.Value

	state

	// consensus is the spawned etcd/consensus service, if the Manager brought
	// up a Node that should run one.
	consensus *consensus.Service
}
146
// NewManager creates a new cluster Manager. The given localstorage Root must
// be placed, but not yet started (and will be started as the Manager makes
// progress). The given network Service must already be running.
func NewManager(storageRoot *localstorage.Root, networkService *network.Service) *Manager {
	return &Manager{
		storageRoot:    storageRoot,
		networkService: networkService,

		state: state{
			stateCluster: ClusterUnknown,
			stateNode:    ppb.Node_FSM_STATE_INVALID,
		},
	}
}
161
// lock takes the Manager's state mutex for writing, returning the guarded
// state and an unlock function that must be called when done.
func (m *Manager) lock() (*state, func()) {
	m.mu.Lock()
	return &m.state, m.mu.Unlock
}
166
// rlock takes the Manager's state mutex for reading, returning the guarded
// state and an unlock function that must be called when done.
func (m *Manager) rlock() (*state, func()) {
	m.mu.RLock()
	return &m.state, m.mu.RUnlock
}
171
Serge Bazanski42e61c62021-03-18 15:07:18 +0100172// Run is the runnable of the Manager, to be started using the Supervisor. It
173// is one-shot, and should not be restarted.
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200174func (m *Manager) Run(ctx context.Context) error {
Serge Bazanski42e61c62021-03-18 15:07:18 +0100175 state, unlock := m.lock()
176 if state.oneway {
177 unlock()
178 // TODO(q3k): restart the entire system if this happens
179 return fmt.Errorf("cannot restart cluster manager")
180 }
181 state.oneway = true
182 unlock()
183
184 configuration, err := m.storageRoot.ESP.SealedConfiguration.Unseal()
185 if err == nil {
186 supervisor.Logger(ctx).Info("Sealed configuration present. attempting to join cluster")
187 return m.join(ctx, configuration)
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200188 }
189
Serge Bazanski42e61c62021-03-18 15:07:18 +0100190 if !errors.Is(err, localstorage.ErrNoSealed) {
191 return fmt.Errorf("unexpected sealed config error: %w", err)
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200192 }
193
Serge Bazanski42e61c62021-03-18 15:07:18 +0100194 supervisor.Logger(ctx).Info("No sealed configuration, looking for node parameters")
195
196 params, err := m.nodeParams(ctx)
197 if err != nil {
198 return fmt.Errorf("no parameters available: %w", err)
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200199 }
Serge Bazanski42e61c62021-03-18 15:07:18 +0100200
201 switch inner := params.Cluster.(type) {
202 case *apb.NodeParameters_ClusterBootstrap_:
203 return m.bootstrap(ctx, inner.ClusterBootstrap)
204 case *apb.NodeParameters_ClusterRegister_:
205 return m.register(ctx, inner.ClusterRegister)
206 default:
207 return fmt.Errorf("node parameters misconfigured: neither cluster_bootstrap nor cluster_register set")
208 }
209}
210
Serge Bazanski42e61c62021-03-18 15:07:18 +0100211func (m *Manager) register(ctx context.Context, bootstrap *apb.NodeParameters_ClusterRegister) error {
212 return fmt.Errorf("unimplemented")
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200213}
214
Serge Bazanski42e61c62021-03-18 15:07:18 +0100215func (m *Manager) nodeParamsFWCFG(ctx context.Context) (*apb.NodeParameters, error) {
216 bytes, err := ioutil.ReadFile("/sys/firmware/qemu_fw_cfg/by_name/dev.monogon.metropolis/parameters.pb/raw")
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200217 if err != nil {
Serge Bazanski42e61c62021-03-18 15:07:18 +0100218 return nil, fmt.Errorf("could not read firmware enrolment file: %w", err)
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200219 }
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200220
Serge Bazanski42e61c62021-03-18 15:07:18 +0100221 config := apb.NodeParameters{}
222 err = proto.Unmarshal(bytes, &config)
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200223 if err != nil {
Serge Bazanski42e61c62021-03-18 15:07:18 +0100224 return nil, fmt.Errorf("could not unmarshal: %v", err)
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200225 }
226
Serge Bazanski42e61c62021-03-18 15:07:18 +0100227 return &config, nil
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200228}
229
Serge Bazanski42e61c62021-03-18 15:07:18 +0100230func (m *Manager) nodeParams(ctx context.Context) (*apb.NodeParameters, error) {
231 // Retrieve node parameters from qemu's fwcfg interface or ESP.
232 // TODO(q3k): probably abstract this away and implement per platform/build/...
233 paramsFWCFG, err := m.nodeParamsFWCFG(ctx)
234 if err != nil {
235 supervisor.Logger(ctx).Warningf("Could not retrieve node parameters from qemu fwcfg: %v", err)
236 paramsFWCFG = nil
237 } else {
238 supervisor.Logger(ctx).Infof("Retrieved node parameters from qemu fwcfg")
239 }
240 paramsESP, err := m.storageRoot.ESP.NodeParameters.Unmarshal()
241 if err != nil {
242 supervisor.Logger(ctx).Warningf("Could not retrieve node parameters from ESP: %v", err)
243 paramsESP = nil
244 } else {
245 supervisor.Logger(ctx).Infof("Retrieved node parameters from ESP")
246 }
247 if paramsFWCFG == nil && paramsESP == nil {
248 return nil, fmt.Errorf("could not find node parameters in ESP or qemu fwcfg")
249 }
250 if paramsFWCFG != nil && paramsESP != nil {
Serge Bazanskia105db52021-04-12 19:57:46 +0200251 supervisor.Logger(ctx).Warningf("Node parameters found both in both ESP and qemu fwcfg, using the latter")
Serge Bazanski42e61c62021-03-18 15:07:18 +0100252 return paramsFWCFG, nil
253 } else if paramsFWCFG != nil {
254 return paramsFWCFG, nil
255 } else {
256 return paramsESP, nil
257 }
258}
259
// join rejoins this node into the cluster it was previously part of, based on
// the given unsealed configuration.
//
// TODO: unimplemented.
func (m *Manager) join(ctx context.Context, cfg *ppb.SealedConfiguration) error {
	return fmt.Errorf("unimplemented")
}
263
// Node returns the Node that the Manager brought into a cluster, or nil if the
// Manager is not Running. This is safe to call from any goroutine.
//
// NOTE(review): currently a stub that always returns nil.
func (m *Manager) Node() *Node {
	return nil
}
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200268}