blob: 1d6e73d20a084bc39fd5ac1babe993c837502c6b [file] [log] [blame]
// Copyright 2020 The Monogon Project Authors.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
16
17package cluster
18
19import (
20 "context"
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020021 "encoding/hex"
22 "fmt"
23 "net"
Serge Bazanski42e61c62021-03-18 15:07:18 +010024 "strings"
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020025
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020026 "go.etcd.io/etcd/clientv3"
27 "golang.org/x/sys/unix"
Serge Bazanski42e61c62021-03-18 15:07:18 +010028 "google.golang.org/protobuf/proto"
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020029
Serge Bazanski31370b02021-01-07 16:31:14 +010030 "source.monogon.dev/metropolis/node/core/localstorage"
Serge Bazanski42e61c62021-03-18 15:07:18 +010031 "source.monogon.dev/metropolis/pkg/supervisor"
32 ppb "source.monogon.dev/metropolis/proto/private"
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020033)
34
Serge Bazanski42e61c62021-03-18 15:07:18 +010035// Node is a Metropolis cluster member. A node is a virtual or physical machine
36// running Metropolis. This object represents a node only as part of a cluster
37// - ie., this object will never be available outside of
38// //metropolis/node/core/cluster if the Node is not part of a Cluster. Nodes
39// are inherently tied to their long term storage, which is etcd. As such,
40// methods on this object relate heavily to the Node's expected lifecycle on
41// etcd.
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020042type Node struct {
Serge Bazanski42e61c62021-03-18 15:07:18 +010043 // clusterUnlockKey is half of the unlock key required to mount the node's
44 // data partition. It's stored in etcd, and will only be provided to the
45 // Node if it can prove its identity via an integrity mechanism (ie. via
46 // TPM), or when the Node was just created (as the key is generated locally
47 // by localstorage on first format/mount). The other part of the unlock
48 // key is the LocalUnlockKey that's present on the node's ESP partition.
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020049 clusterUnlockKey []byte
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020050
Serge Bazanski42e61c62021-03-18 15:07:18 +010051 pubkey []byte
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020052
Serge Bazanski42e61c62021-03-18 15:07:18 +010053 state ppb.Node_FSMState
54
55 // A Node can have multiple Roles. Each Role is represented by the presence
56 // of NodeRole* structures in this structure, with a nil pointer
57 // representing the lack of a role.
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020058 consensusMember *NodeRoleConsensusMember
59 kubernetesWorker *NodeRoleKubernetesWorker
60}
61
// NodeRoleConsensusMember marks a Node as being a member of the consensus
// (etcd) cluster. It currently carries no role-specific data.
type NodeRoleConsensusMember struct{}
66
// NodeRoleKubernetesWorker marks a Node as one that should run the Kubernetes
// control and data plane. It currently carries no role-specific data.
type NodeRoleKubernetesWorker struct{}
71
Serge Bazanski42e61c62021-03-18 15:07:18 +010072// ID returns the name of this node, which is `metropolis-{pubkeyHash}`. This
73// name should be the primary way to refer to Metropoils nodes within a
74// cluster, and is guaranteed to be unique by relying on cryptographic
75// randomness.
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020076func (n *Node) ID() string {
Serge Bazanski662b5b32020-12-21 13:49:00 +010077 return fmt.Sprintf("metropolis-%s", n.IDBare())
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020078}
79
80// IDBare returns the `{pubkeyHash}` part of the node ID.
81func (n Node) IDBare() string {
Serge Bazanski42e61c62021-03-18 15:07:18 +010082 return hex.EncodeToString(n.pubkey[:16])
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020083}
84
85func (n *Node) String() string {
86 return n.ID()
87}
88
Serge Bazanski42e61c62021-03-18 15:07:18 +010089// ConsensusMember returns a copy of the NodeRoleConsensusMember struct if the
90// Node is a consensus member, otherwise nil.
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020091func (n *Node) ConsensusMember() *NodeRoleConsensusMember {
92 if n.consensusMember == nil {
93 return nil
94 }
95 cm := *n.consensusMember
96 return &cm
97}
98
Serge Bazanski42e61c62021-03-18 15:07:18 +010099// KubernetesWorker returns a copy of the NodeRoleKubernetesWorker struct if
100// the Node is a kubernetes worker, otherwise nil.
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200101func (n *Node) KubernetesWorker() *NodeRoleKubernetesWorker {
102 if n.kubernetesWorker == nil {
103 return nil
104 }
105 kw := *n.kubernetesWorker
106 return &kw
107}
108
Serge Bazanski42e61c62021-03-18 15:07:18 +0100109// etcdPath builds the etcd path in which this node's protobuf-serialized state
110// is stored in etcd.
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200111func (n *Node) etcdPath() string {
112 return fmt.Sprintf("/nodes/%s", n.ID())
113}
114
Serge Bazanski42e61c62021-03-18 15:07:18 +0100115// proto serializes the Node object into protobuf, to be used for saving to
116// etcd.
117func (n *Node) proto() *ppb.Node {
118 msg := &ppb.Node{
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200119 ClusterUnlockKey: n.clusterUnlockKey,
Serge Bazanski42e61c62021-03-18 15:07:18 +0100120 PublicKey: n.pubkey,
121 FsmState: n.state,
122 Roles: &ppb.Node_Roles{},
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200123 }
124 if n.consensusMember != nil {
Serge Bazanski42e61c62021-03-18 15:07:18 +0100125 msg.Roles.ConsensusMember = &ppb.Node_Roles_ConsensusMember{}
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200126 }
127 if n.kubernetesWorker != nil {
Serge Bazanski42e61c62021-03-18 15:07:18 +0100128 msg.Roles.KubernetesWorker = &ppb.Node_Roles_KubernetesWorker{}
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200129 }
130 return msg
131}
132
Serge Bazanski42e61c62021-03-18 15:07:18 +0100133// Store saves the Node into etcd. This should be called only once per Node
134// (ie. when the Node has been created).
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200135func (n *Node) Store(ctx context.Context, kv clientv3.KV) error {
Serge Bazanski42e61c62021-03-18 15:07:18 +0100136 // Currently the only flow to store a node to etcd is a write-once flow:
137 // once a node is created, it cannot be deleted or updated. In the future,
138 // flows to change cluster node roles might be introduced (ie. to promote
139 // nodes to consensus members, etc).
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200140 key := n.etcdPath()
141 msg := n.proto()
142 nodeRaw, err := proto.Marshal(msg)
143 if err != nil {
144 return fmt.Errorf("failed to marshal node: %w", err)
145 }
146
147 res, err := kv.Txn(ctx).If(
148 clientv3.Compare(clientv3.CreateRevision(key), "=", 0),
149 ).Then(
150 clientv3.OpPut(key, string(nodeRaw)),
151 ).Commit()
152 if err != nil {
153 return fmt.Errorf("failed to store node: %w", err)
154 }
155
156 if !res.Succeeded {
157 return fmt.Errorf("attempted to re-register node (unsupported flow)")
158 }
159 return nil
160}
161
Serge Bazanski42e61c62021-03-18 15:07:18 +0100162// MakeConsensusMember turns the node into a consensus member. This only
163// configures internal fields, and does not actually start any services.
164func (n *Node) MakeConsensusMember() error {
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200165 if n.consensusMember != nil {
166 return fmt.Errorf("node already is consensus member")
167 }
Serge Bazanski42e61c62021-03-18 15:07:18 +0100168 n.consensusMember = &NodeRoleConsensusMember{}
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200169 return nil
170}
171
Serge Bazanski42e61c62021-03-18 15:07:18 +0100172// MakeKubernetesWorker turns the node into a kubernetes worker. This only
173// configures internal fields, and does not actually start any services.
174func (n *Node) MakeKubernetesWorker() error {
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200175 if n.kubernetesWorker != nil {
176 return fmt.Errorf("node is already kubernetes worker")
177 }
Serge Bazanski42e61c62021-03-18 15:07:18 +0100178 n.kubernetesWorker = &NodeRoleKubernetesWorker{}
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200179 return nil
180}
181
Serge Bazanski42e61c62021-03-18 15:07:18 +0100182// ConfigureLocalHostname uses the node's ID as a hostname, and sets the
183// current hostname, and local files like hosts and machine-id accordingly.
184func (n *Node) ConfigureLocalHostname(ctx context.Context, ephemeral *localstorage.EphemeralDirectory, address net.IP) error {
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200185 if err := unix.Sethostname([]byte(n.ID())); err != nil {
186 return fmt.Errorf("failed to set runtime hostname: %w", err)
187 }
Serge Bazanski42e61c62021-03-18 15:07:18 +0100188 hosts := []string{
189 "127.0.0.1 localhost",
190 "::1 localhost",
191 fmt.Sprintf("%s %s", address.String(), n.ID()),
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200192 }
Serge Bazanski42e61c62021-03-18 15:07:18 +0100193 if err := ephemeral.Hosts.Write([]byte(strings.Join(hosts, "\n")), 0644); err != nil {
194 return fmt.Errorf("failed to write /ephemeral/hosts: %w", err)
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200195 }
Serge Bazanski42e61c62021-03-18 15:07:18 +0100196 if err := ephemeral.MachineID.Write([]byte(n.IDBare()), 0644); err != nil {
197 return fmt.Errorf("failed to write /ephemeral/machine-id: %w", err)
198 }
199
200 // Check that we are self-resolvable.
201 ip, err := net.ResolveIPAddr("ip", n.ID())
202 if err != nil {
203 return fmt.Errorf("failed to self-resolve: %w", err)
204 }
205 supervisor.Logger(ctx).Infof("This is node %s at %v", n.ID(), ip)
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200206 return nil
207}