Blame - core/internal/cluster/manager.go - monogon

blob: 7e5af9f7c913f4e0ac89a181dd2b160be84fbeb9 [file] [log] [blame]

Serge Bazanski	1ebd1e1	2020-07-13 19:17:16 +0200	[diff] [blame]	1	// Copyright 2020 The Monogon Project Authors.
				2	//
				3	// SPDX-License-Identifier: Apache-2.0
				4	//
				5	// Licensed under the Apache License, Version 2.0 (the "License");
				6	// you may not use this file except in compliance with the License.
				7	// You may obtain a copy of the License at
				8	//
				9	// http://www.apache.org/licenses/LICENSE-2.0
				10	//
				11	// Unless required by applicable law or agreed to in writing, software
				12	// distributed under the License is distributed on an "AS IS" BASIS,
				13	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	// See the License for the specific language governing permissions and
				15	// limitations under the License.
				16
				17	package cluster
				18
				19	import (
				20	"context"
				21	"fmt"
				22	"io/ioutil"
				23	"os"
				24	"sync"
				25	"time"
				26
				27	"github.com/cenkalti/backoff/v4"
				28	"go.etcd.io/etcd/clientv3"
				29	"go.uber.org/zap"
				30
				31	"git.monogon.dev/source/nexantic.git/core/internal/common/supervisor"
				32	"git.monogon.dev/source/nexantic.git/core/internal/consensus"
				33	"git.monogon.dev/source/nexantic.git/core/internal/localstorage"
				34	"git.monogon.dev/source/nexantic.git/core/internal/network"
				35	)
				36
				37	// Manager is a finite state machine that joins this node (ie., Smalltown instance running on a virtual/physical machine)
				38	// into a Smalltown cluster (ie. group of nodes that act as a single control plane for Smalltown services). It does that
				39	// by bringing up all required operating-system level components, including mounting the local filesystem, bringing up
				40	// a consensus (etcd) server/client, ...
				41	//
				42	// The Manager runs as a single-shot Runnable. It will attempt to progress its state from the initial state (New) to
				43	// either Running (meaning that the node is now part of a cluster), or Failed (meaning that the node couldn't become
				44	// part of a cluster). It is not restartable, as it mutates quite a bit of implicit operating-system level state (like
				45	// filesystem mounts). As such, it's difficult to recover reliably from failures, and since these failures indicate
				46	// some high issues with the cluster configuration/state, a failure requires a full kernel reboot to retry (or fix/
				47	// reconfigure the node).
				48	//
				49	// Currently, the Manager only supports one flow for bringing up a Node: by creating a new cluster. As such, it's
				50	// missing the following flows:
				51	// - joining a new node into an already running cluster
				52	// - restarting a node into an already existing cluster
				53	// - restarting a node into an already running cluster (ie. full reboot of whole cluster)
				54	//
				55	type Manager struct {
				56	storageRoot *localstorage.Root
				57	networkService *network.Service
				58
				59	// stateLock locks all state* variables.
				60	stateLock sync.RWMutex
				61	// state is the FSM state of the Manager.
				62	state State
				63	// stateRunningNode is the Node that this Manager got from joining a cluster. It's only valid if the Manager is
				64	// Running.
				65	stateRunningNode *Node
				66	// stateWaiters is a list of channels that wish to be notified (by sending true or false) for when the Manager
				67	// reaches a final state (Running or Failed respectively).
				68	stateWaiters []chan bool
				69
				70	// consensus is the spawned etcd/consensus service, if the Manager brought up a Node that should run one.
				71	consensus *consensus.Service
				72	}
				73
				74	// NewManager creates a new cluster Manager. The given localstorage Root must be places, but not yet started (and will
				75	// be started as the Manager makes progress). The given network Service must already be running.
				76	func NewManager(storageRoot localstorage.Root, networkService network.Service) *Manager {
				77	return &Manager{
				78	storageRoot: storageRoot,
				79	networkService: networkService,
				80	}
				81	}
				82
				// State identifies a state of the Manager finite state machine.
				type State int

				const (
					// StateNew is the state a Manager starts in. From here it decides whether to join or to create a cluster.
					StateNew State = iota
					// StateCreatingCluster means the Manager is attempting to create a new cluster. This happens when a node is
					// started with no EnrolmentConfig.
					StateCreatingCluster
					// StateRunning means the Manager successfully made the node part of a cluster. stateRunningNode is valid.
					StateRunning
					// StateFailed means the Manager failed to get the node to be part of a cluster.
					StateFailed
				)

				// String returns the human-readable name of the state, or "UNKNOWN" for a value that is not one of the declared
				// State constants.
				func (s State) String() string {
					// Lookup table indexed by the State constants themselves.
					names := [...]string{
						StateNew:             "New",
						StateCreatingCluster: "CreatingCluster",
						StateRunning:         "Running",
						StateFailed:          "Failed",
					}
					if s < 0 || int(s) >= len(names) {
						return "UNKNOWN"
					}
					return names[s]
				}
				112
				113	// allowedTransition describes all allowed state transitions (map[From][]To).
				114	var allowedTransitions = map[State][]State{
				115	StateNew: {StateCreatingCluster},
				116	StateCreatingCluster: {StateRunning, StateFailed},
				117	}
				118
				119	// allowed returns whether a transition from a state to another state is allowed (ie. is defined in allowedTransitions).
				120	func (m *Manager) allowed(from, to State) bool {
				121	for _, allowed := range allowedTransitions[from] {
				122	if to == allowed {
				123	return true
				124	}
				125	}
				126	return false
				127	}
				128
				129	// next moves the Manager finite state machine from its current state to `n`, or to Failed if the transition is not
				130	// allowed.
				131	func (m *Manager) next(ctx context.Context, n State) {
				132	m.stateLock.Lock()
				133	defer m.stateLock.Unlock()
				134
				135	if !m.allowed(m.state, n) {
				136	supervisor.Logger(ctx).Error("Attempted invalid enrolment state transition, failing enrolment",
				137	zap.String("from", m.state.String()), zap.String("to", m.state.String()))
				138	m.state = StateFailed
				139	return
				140	}
				141
				142	supervisor.Logger(ctx).Info("Enrolment state change",
				143	zap.String("from", m.state.String()), zap.String("to", n.String()))
				144
				145	m.state = n
				146	}
				147
				148	// State returns the state of the Manager. It's safe to call this from any goroutine.
				149	func (m *Manager) State() State {
				150	m.stateLock.RLock()
				151	defer m.stateLock.RUnlock()
				152	return m.state
				153	}
				154
				155	// WaitFinished waits until the Manager FSM reaches Running or Failed, and returns true if the FSM is Running. It's
				156	// safe to call this from any goroutine.
				157	func (m *Manager) WaitFinished() (success bool) {
				158	m.stateLock.Lock()
				159	switch m.state {
				160	case StateFailed:
				161	m.stateLock.Unlock()
				162	return false
				163	case StateRunning:
				164	m.stateLock.Unlock()
				165	return true
				166	}
				167
				168	C := make(chan bool)
				169	m.stateWaiters = append(m.stateWaiters, C)
				170	m.stateLock.Unlock()
				171	return <-C
				172	}
				173
				174	// wakeWaiters wakes any WaitFinished waiters and lets them know about the current state of the Manager.
				175	// The stateLock must already been taken, and the state must have been set in the same critical section (otherwise
				176	// this can cause a race condition).
				177	func (m *Manager) wakeWaiters() {
				178	state := m.state
				179	waiters := m.stateWaiters
				180	m.stateWaiters = nil
				181
				182	for _, waiter := range waiters {
				183	go func(w chan bool) {
				184	w <- state == StateRunning
				185	}(waiter)
				186	}
				187	}
				188
				189	// Run is the runnable of the Manager, to be started using the Supervisor. It is one-shot, and should not be restarted.
				190	func (m *Manager) Run(ctx context.Context) error {
				191	if state := m.State(); state != StateNew {
				192	supervisor.Logger(ctx).Error("Manager started with non-New state, failing", zap.String("state", state.String()))
				193	m.stateLock.Lock()
				194	m.state = StateFailed
				195	m.wakeWaiters()
				196	m.stateLock.Unlock()
				197	return nil
				198	}
				199
				200	var err error
				201	bo := backoff.NewExponentialBackOff()
				202	for {
				203	done := false
				204	state := m.State()
				205	switch state {
				206	case StateNew:
				207	err = m.stateNew(ctx)
				208	case StateCreatingCluster:
				209	err = m.stateCreatingCluster(ctx)
				210	default:
				211	done = true
				212	break
				213	}
				214
				215	if err != nil \|\| done {
				216	break
				217	}
				218
				219	if state == m.State() && !m.allowed(state, m.State()) {
				220	supervisor.Logger(ctx).Error("Enrolment got stuck, failing", zap.String("state", m.state.String()))
				221	m.stateLock.Lock()
				222	m.state = StateFailed
				223	m.stateLock.Unlock()
				224	} else {
				225	bo.Reset()
				226	}
				227	}
				228
				229	m.stateLock.Lock()
				230	state := m.state
				231	if state != StateRunning {
				232	supervisor.Logger(ctx).Error("Enrolment failed", zap.Error(err), zap.String("state", m.state.String()))
				233	} else {
				234	supervisor.Logger(ctx).Info("Enrolment successful!")
				235	}
				236	m.wakeWaiters()
				237	m.stateLock.Unlock()
				238
				239	supervisor.Signal(ctx, supervisor.SignalHealthy)
				240	supervisor.Signal(ctx, supervisor.SignalDone)
				241	return nil
				242	}
				243
				244	// stateNew is called when a Manager is New. It makes the decision on how to join this node into a cluster.
				245	func (m *Manager) stateNew(ctx context.Context) error {
				246	supervisor.Logger(ctx).Info("Starting enrolment process...")
				247
				248	// Check for presence of EnrolmentConfig on ESP or in qemu firmware variables.
				249	var configRaw []byte
				250	configRaw, err := m.storageRoot.ESP.Enrolment.Read()
				251	if err != nil && !os.IsNotExist(err) {
				252	return fmt.Errorf("could not read local enrolment file: %w", err)
				253	} else if err != nil {
				254	configRaw, err = ioutil.ReadFile("/sys/firmware/qemu_fw_cfg/by_name/com.nexantic.smalltown/enrolment.pb/raw")
				255	if err != nil && !os.IsNotExist(err) {
				256	return fmt.Errorf("could not read firmware enrolment file: %w", err)
				257	}
				258	}
				259
				260	// If no enrolment file exists, we create a new cluster.
				261	if configRaw == nil {
				262	m.next(ctx, StateCreatingCluster)
				263	return nil
				264	}
				265
				266	// Enrolment file exists, this is not yet implemented (need to enroll into or join existing cluster).
				267	return fmt.Errorf("unimplemented join/enroll")
				268	}
				269
				270	// stateCreatingCluster is called when the Manager has decided to create a new cluster.
				271	//
				272	// The process to create a new cluster is as follows:
				273	// - wait for IP address
				274	// - initialize new data partition, by generating local and cluster unlock keys (the local unlock key is saved to
				275	// the ESP, while the cluster unlock key is returned)
				276	// - create a new node certificate and Node (with new given cluster unlock key)
				277	// - start up a new etcd cluster, with this node being the only member
				278	// - save the new Node to the new etcd cluster (thereby saving the node's cluster unlock key to etcd)
				279	func (m *Manager) stateCreatingCluster(ctx context.Context) error {
				280	logger := supervisor.Logger(ctx)
				281	logger.Info("Creating new cluster: waiting for IP address...")
				282	ip, err := m.networkService.GetIP(ctx, true)
				283	if err != nil {
				284	return fmt.Errorf("when getting IP address: %w", err)
				285	}
				286	logger.Info("Creating new cluster: got IP address", zap.String("address", ip.String()))
				287
				288	logger.Info("Creating new cluster: initializing storage...")
				289	cuk, err := m.storageRoot.Data.MountNew(&m.storageRoot.ESP.LocalUnlock)
				290	if err != nil {
				291	return fmt.Errorf("when making new data partition: %w", err)
				292	}
				293	logger.Info("Creating new cluster: storage initialized")
				294
				295	// Create certificate for node.
				296	cert, err := m.storageRoot.Data.Node.EnsureSelfSigned(localstorage.CertificateForNode)
				297	if err != nil {
				298	return fmt.Errorf("failed to create new node certificate: %w", err)
				299	}
				300
				301	node := NewNode(cuk, ip, cert.Leaf)
				302
				303	m.consensus = consensus.New(consensus.Config{
				304	Data: &m.storageRoot.Data.Etcd,
				305	Ephemeral: &m.storageRoot.Ephemeral.Consensus,
				306	NewCluster: true,
				307	Name: node.ID(),
				308	InitialCluster: ip.String(),
				309	ExternalHost: ip.String(),
				310	ListenHost: ip.String(),
				311	})
				312	if err := supervisor.Run(ctx, "consensus", m.consensus.Run); err != nil {
				313	return fmt.Errorf("when starting consensus: %w", err)
				314	}
				315
				316	// TODO(q3k): make timeout configurable?
				317	ctxT, ctxC := context.WithTimeout(ctx, 5*time.Second)
				318	defer ctxC()
				319
				320	supervisor.Logger(ctx).Info("Creating new cluster: waiting for consensus...")
				321	if err := m.consensus.WaitReady(ctxT); err != nil {
				322	return fmt.Errorf("consensus service failed to become ready: %w", err)
				323	}
				324
				325	// Configure node to be a consensus member and kubernetes worker. In the future, different nodes will have
				326	// different roles, but for now they're all symmetrical.
				327	_, consensusName, err := m.consensus.MemberInfo(ctx)
				328	if err != nil {
				329	return fmt.Errorf("could not get consensus MemberInfo: %w", err)
				330	}
				331	if err := node.MakeConsensusMember(consensusName); err != nil {
				332	return fmt.Errorf("could not make new node into consensus member: %w", err)
				333	}
				334	if err := node.MakeKubernetesWorker(node.ID()); err != nil {
				335	return fmt.Errorf("could not make new node into kubernetes worker: %w", err)
				336	}
				337
				338	// Save node into etcd.
				339	supervisor.Logger(ctx).Info("Creating new cluster: storing first node...")
				340	if err := node.Store(ctx, m.consensus.KV("cluster", "enrolment")); err != nil {
				341	return fmt.Errorf("could not save new node: %w", err)
				342	}
				343
				344	m.stateLock.Lock()
				345	m.stateRunningNode = node
				346	m.stateLock.Unlock()
				347
				348	m.next(ctx, StateRunning)
				349	return nil
				350	}
				351
				352	// Node returns the Node that the Manager brought into a cluster, or nil if the Manager is not Running.
				353	// This is safe to call from any goroutine.
				354	func (m Manager) Node() Node {
				355	m.stateLock.Lock()
				356	defer m.stateLock.Unlock()
				357	if m.state != StateRunning {
				358	return nil
				359	}
				360	return m.stateRunningNode
				361	}
				362
				363	// ConsensusKV returns a namespaced etcd KV client, or nil if the Manager is not Running.
				364	// This is safe to call from any goroutine.
				365	func (m *Manager) ConsensusKV(module, space string) clientv3.KV {
				366	m.stateLock.Lock()
				367	defer m.stateLock.Unlock()
				368	if m.state != StateRunning {
				369	return nil
				370	}
				371	if m.stateRunningNode.ConsensusMember() == nil {
				372	// TODO(q3k): in this case, we should return a client to etcd even though this
				373	// node is not a member of consensus. For now, all nodes are consensus members.
				374	return nil
				375	}
				376	return m.consensus.KV(module, space)
				377	}
				378
				379	// ConsensusKVRoot returns a non-namespaced etcd KV client, or nil if the Manager is not Running.
				380	// This is safe to call from any goroutine.
				381	func (m *Manager) ConsensusKVRoot() clientv3.KV {
				382	m.stateLock.Lock()
				383	defer m.stateLock.Unlock()
				384	if m.state != StateRunning {
				385	return nil
				386	}
				387	if m.stateRunningNode.ConsensusMember() == nil {
				388	// TODO(q3k): in this case, we should return a client to etcd even though this
				389	// node is not a member of consensus. For now, all nodes are consensus members.
				390	return nil
				391	}
				392	return m.consensus.KVRoot()
				393	}
				394
				395	// ConsensusCluster returns an etcd Cluster client, or nil if the Manager is not Running.
				396	// This is safe to call from any goroutine.
				397	func (m *Manager) ConsensusCluster() clientv3.Cluster {
				398	m.stateLock.Lock()
				399	defer m.stateLock.Unlock()
				400	if m.state != StateRunning {
				401	return nil
				402	}
				403	if m.stateRunningNode.ConsensusMember() == nil {
				404	// TODO(q3k): in this case, we should return a client to etcd even though this
				405	// node is not a member of consensus. For now, all nodes are consensus members.
				406	return nil
				407	}
				408	return m.consensus.Cluster()
				409	}