blob: 65af21243dee26ade6070b9dc75c1b0c45194049 [file] [log] [blame]
Serge Bazanski1ebd1e12020-07-13 19:17:16 +02001// Copyright 2020 The Monogon Project Authors.
2//
3// SPDX-License-Identifier: Apache-2.0
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
17package cluster
18
19import (
20 "context"
Serge Bazanski42e61c62021-03-18 15:07:18 +010021 "errors"
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020022 "fmt"
23 "io/ioutil"
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020024 "sync"
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020025
Serge Bazanski0ed2f962021-03-15 16:39:30 +010026 "google.golang.org/protobuf/proto"
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020027
Serge Bazanski31370b02021-01-07 16:31:14 +010028 "source.monogon.dev/metropolis/node/core/consensus"
29 "source.monogon.dev/metropolis/node/core/localstorage"
Serge Bazanski31370b02021-01-07 16:31:14 +010030 "source.monogon.dev/metropolis/node/core/network"
31 "source.monogon.dev/metropolis/pkg/supervisor"
32 apb "source.monogon.dev/metropolis/proto/api"
Serge Bazanski42e61c62021-03-18 15:07:18 +010033 ppb "source.monogon.dev/metropolis/proto/private"
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020034)
35
// managerResult is the outcome recorded by state.setResult and returned to
// callers of Manager.Wait: a node together with an error.
type managerResult struct {
	// node is the Node passed to setResult.
	node *Node
	// err is the error passed to setResult.
	err error
}
40
// state is the mutable part of a Manager. It is embedded into Manager and
// handed out by Manager.lock and Manager.rlock after mu has been acquired.
type state struct {
	// mu is taken by Manager.lock (write) and Manager.rlock (read) before a
	// pointer to this state is returned to the caller.
	mu sync.RWMutex

	// oneway is set to true by the first call to Manager.Run; a subsequent
	// call that finds it set returns an error instead of running again.
	oneway bool
	// stateCluster is initialised to ClusterUnknown by NewManager.
	stateCluster ClusterState
	// stateNode is initialised to ppb.Node_FSM_STATE_INVALID by NewManager.
	stateNode ppb.Node_FSMState

	// configuration is not read or written by any code visible in this part
	// of the file. NOTE(review): presumably the unsealed node configuration -
	// confirm against its users.
	configuration *ppb.SealedConfiguration

	// result is nil until setResult has been called.
	result *managerResult
	// waiters holds one channel per Wait call that arrived before a result
	// was available. setResult sends the result to each of them and then
	// resets the slice to nil.
	waiters []chan *managerResult
}
53
54func (s *state) setResult(node *Node, err error) {
55 s.result = &managerResult{
56 node: node,
57 err: err,
58 }
59 for _, w := range s.waiters {
60 go func(c chan *managerResult) {
61 c <- s.result
62 }(w)
63 }
64 s.waiters = nil
65}
66
// Manager is the cluster manager of a node. Its Run method decides, based on
// the sealed configuration and node parameters it can find, whether to join,
// bootstrap or register into a cluster; Wait returns the result recorded via
// setResult.
type Manager struct {
	// storageRoot is the local storage tree; Run and nodeParams read the
	// sealed configuration and node parameters from its ESP.
	storageRoot *localstorage.Root
	// networkService is the network service handed to NewManager.
	networkService *network.Service

	// state is the Manager's mutable state. It is embedded, and is accessed
	// through lock/rlock which acquire its mutex first.
	state

	// consensus is the spawned etcd/consensus service, if the Manager brought
	// up a Node that should run one.
	consensus *consensus.Service
}
77
Serge Bazanski42e61c62021-03-18 15:07:18 +010078// NewManager creates a new cluster Manager. The given localstorage Root must
79// be places, but not yet started (and will be started as the Manager makes
80// progress). The given network Service must already be running.
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020081func NewManager(storageRoot *localstorage.Root, networkService *network.Service) *Manager {
82 return &Manager{
83 storageRoot: storageRoot,
84 networkService: networkService,
Serge Bazanski42e61c62021-03-18 15:07:18 +010085
86 state: state{
87 stateCluster: ClusterUnknown,
88 stateNode: ppb.Node_FSM_STATE_INVALID,
89 },
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020090 }
91}
92
Serge Bazanski42e61c62021-03-18 15:07:18 +010093func (m *Manager) lock() (*state, func()) {
94 m.mu.Lock()
95 return &m.state, m.mu.Unlock
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020096}
97
Serge Bazanski42e61c62021-03-18 15:07:18 +010098func (m *Manager) rlock() (*state, func()) {
99 m.mu.RLock()
100 return &m.state, m.mu.RUnlock
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200101}
102
Serge Bazanski42e61c62021-03-18 15:07:18 +0100103func (m *Manager) Wait() (*Node, error) {
104 state, unlock := m.lock()
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200105
Serge Bazanski42e61c62021-03-18 15:07:18 +0100106 if state.result != nil {
107 unlock()
108 return state.result.node, state.result.err
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200109 }
110
Serge Bazanski42e61c62021-03-18 15:07:18 +0100111 c := make(chan *managerResult)
112 state.waiters = append(state.waiters, c)
113 unlock()
114 res := <-c
115 return res.node, res.err
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200116}
117
Serge Bazanski42e61c62021-03-18 15:07:18 +0100118// Run is the runnable of the Manager, to be started using the Supervisor. It
119// is one-shot, and should not be restarted.
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200120func (m *Manager) Run(ctx context.Context) error {
Serge Bazanski42e61c62021-03-18 15:07:18 +0100121 state, unlock := m.lock()
122 if state.oneway {
123 unlock()
124 // TODO(q3k): restart the entire system if this happens
125 return fmt.Errorf("cannot restart cluster manager")
126 }
127 state.oneway = true
128 unlock()
129
130 configuration, err := m.storageRoot.ESP.SealedConfiguration.Unseal()
131 if err == nil {
132 supervisor.Logger(ctx).Info("Sealed configuration present. attempting to join cluster")
133 return m.join(ctx, configuration)
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200134 }
135
Serge Bazanski42e61c62021-03-18 15:07:18 +0100136 if !errors.Is(err, localstorage.ErrNoSealed) {
137 return fmt.Errorf("unexpected sealed config error: %w", err)
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200138 }
139
Serge Bazanski42e61c62021-03-18 15:07:18 +0100140 supervisor.Logger(ctx).Info("No sealed configuration, looking for node parameters")
141
142 params, err := m.nodeParams(ctx)
143 if err != nil {
144 return fmt.Errorf("no parameters available: %w", err)
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200145 }
Serge Bazanski42e61c62021-03-18 15:07:18 +0100146
147 switch inner := params.Cluster.(type) {
148 case *apb.NodeParameters_ClusterBootstrap_:
149 return m.bootstrap(ctx, inner.ClusterBootstrap)
150 case *apb.NodeParameters_ClusterRegister_:
151 return m.register(ctx, inner.ClusterRegister)
152 default:
153 return fmt.Errorf("node parameters misconfigured: neither cluster_bootstrap nor cluster_register set")
154 }
155}
156
Serge Bazanski42e61c62021-03-18 15:07:18 +0100157func (m *Manager) register(ctx context.Context, bootstrap *apb.NodeParameters_ClusterRegister) error {
158 return fmt.Errorf("unimplemented")
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200159}
160
Serge Bazanski42e61c62021-03-18 15:07:18 +0100161func (m *Manager) nodeParamsFWCFG(ctx context.Context) (*apb.NodeParameters, error) {
162 bytes, err := ioutil.ReadFile("/sys/firmware/qemu_fw_cfg/by_name/dev.monogon.metropolis/parameters.pb/raw")
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200163 if err != nil {
Serge Bazanski42e61c62021-03-18 15:07:18 +0100164 return nil, fmt.Errorf("could not read firmware enrolment file: %w", err)
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200165 }
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200166
Serge Bazanski42e61c62021-03-18 15:07:18 +0100167 config := apb.NodeParameters{}
168 err = proto.Unmarshal(bytes, &config)
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200169 if err != nil {
Serge Bazanski42e61c62021-03-18 15:07:18 +0100170 return nil, fmt.Errorf("could not unmarshal: %v", err)
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200171 }
172
Serge Bazanski42e61c62021-03-18 15:07:18 +0100173 return &config, nil
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200174}
175
Serge Bazanski42e61c62021-03-18 15:07:18 +0100176func (m *Manager) nodeParams(ctx context.Context) (*apb.NodeParameters, error) {
177 // Retrieve node parameters from qemu's fwcfg interface or ESP.
178 // TODO(q3k): probably abstract this away and implement per platform/build/...
179 paramsFWCFG, err := m.nodeParamsFWCFG(ctx)
180 if err != nil {
181 supervisor.Logger(ctx).Warningf("Could not retrieve node parameters from qemu fwcfg: %v", err)
182 paramsFWCFG = nil
183 } else {
184 supervisor.Logger(ctx).Infof("Retrieved node parameters from qemu fwcfg")
185 }
186 paramsESP, err := m.storageRoot.ESP.NodeParameters.Unmarshal()
187 if err != nil {
188 supervisor.Logger(ctx).Warningf("Could not retrieve node parameters from ESP: %v", err)
189 paramsESP = nil
190 } else {
191 supervisor.Logger(ctx).Infof("Retrieved node parameters from ESP")
192 }
193 if paramsFWCFG == nil && paramsESP == nil {
194 return nil, fmt.Errorf("could not find node parameters in ESP or qemu fwcfg")
195 }
196 if paramsFWCFG != nil && paramsESP != nil {
197 supervisor.Logger(ctx).Warningf("Node parameters found both inboth ESP and qemu fwcfg, using the latter")
198 return paramsFWCFG, nil
199 } else if paramsFWCFG != nil {
200 return paramsFWCFG, nil
201 } else {
202 return paramsESP, nil
203 }
204}
205
206func (m *Manager) join(ctx context.Context, cfg *ppb.SealedConfiguration) error {
207 return fmt.Errorf("unimplemented")
208}
209
// Node is meant to return the Node that the Manager brought into a cluster,
// or nil if the Manager is not Running.
//
// NOTE(review): the current body does not consult the Manager's state at all
// and unconditionally returns nil, so callers never receive a Node from it.
// It touches no shared state, so it can be called from any goroutine.
func (m *Manager) Node() *Node {
	return nil
}