Serge Bazanski | 1ebd1e1 | 2020-07-13 19:17:16 +0200 | [diff] [blame] | 1 | // Copyright 2020 The Monogon Project Authors. |
| 2 | // |
| 3 | // SPDX-License-Identifier: Apache-2.0 |
| 4 | // |
| 5 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | // you may not use this file except in compliance with the License. |
| 7 | // You may obtain a copy of the License at |
| 8 | // |
| 9 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | // |
| 11 | // Unless required by applicable law or agreed to in writing, software |
| 12 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | // See the License for the specific language governing permissions and |
| 15 | // limitations under the License. |
| 16 | |
| 17 | package cluster |
| 18 | |
| 19 | import ( |
| 20 | "context" |
| 21 | "crypto/ed25519" |
| 22 | "crypto/x509" |
| 23 | "encoding/hex" |
| 24 | "fmt" |
| 25 | "net" |
| 26 | |
| 27 | "github.com/golang/protobuf/proto" |
| 28 | "go.etcd.io/etcd/clientv3" |
| 29 | "golang.org/x/sys/unix" |
| 30 | |
| 31 | "git.monogon.dev/source/nexantic.git/core/internal/localstorage" |
| 32 | ipb "git.monogon.dev/source/nexantic.git/core/proto/internal" |
| 33 | ) |
| 34 | |
// Node is a Smalltown cluster member. A node is a virtual or physical machine running Smalltown. This object represents
// a node only as part of a Cluster - i.e., this object will never be available outside of //core/internal/cluster if
// the Node is not part of a Cluster.
// Nodes are inherently tied to their long-term storage, which is etcd. As such, methods on this object relate heavily
// to the Node's expected lifecycle on etcd.
type Node struct {
	// clusterUnlockKey is half of the unlock key required to mount the node's data partition. It's stored in etcd, and
	// will only be provided to the Node if it can prove its identity via an integrity mechanism (i.e. via TPM), or
	// when the Node was just created (as the key is generated locally by localstorage on first format/mount).
	// The other part of the unlock key is the LocalUnlockKey that's present on the node's ESP partition.
	clusterUnlockKey []byte
	// certificate is the node's TLS certificate, used to authenticate Smalltown gRPC calls/services (but not
	// consensus/etcd). The certificate for a node is permanent (and never expires). It's self-signed by the node on
	// startup, and contains the node's IP address in its SAN. Callers/services should check directly against the
	// expected certificate, and not against a CA.
	// The certificate's public key is also what the node's ID is derived from (see IDBare).
	certificate x509.Certificate
	// address is the management IP address of the node. The management IP address of a node is permanent.
	address net.IP

	// A Node can have multiple Roles. Each Role is represented by the presence of NodeRole* structures in this
	// structure, with a nil pointer representing the lack of a role.

	consensusMember  *NodeRoleConsensusMember
	kubernetesWorker *NodeRoleKubernetesWorker
}
| 60 | |
| 61 | // NewNode creates a new Node. This is only called when a New node is supposed to be created as part of a cluster, |
| 62 | // otherwise it should be loaded from Etcd. |
| 63 | func NewNode(cuk []byte, address net.IP, certificate x509.Certificate) *Node { |
| 64 | if certificate.Raw == nil { |
| 65 | panic("new node must contain raw certificate") |
| 66 | } |
| 67 | return &Node{ |
| 68 | clusterUnlockKey: cuk, |
| 69 | certificate: certificate, |
| 70 | address: address, |
| 71 | } |
| 72 | } |
| 73 | |
// NodeRoleConsensusMember defines that the Node is a consensus (etcd) cluster member.
type NodeRoleConsensusMember struct {
	// etcdMemberName is the name of the node as a member of the consensus (etcd) cluster. This is for now usually the
	// same as the ID() of the Node.
	etcdMemberName string
}
| 79 | |
// NodeRoleKubernetesWorker defines that the Node should be running the Kubernetes control and data plane.
type NodeRoleKubernetesWorker struct {
	// nodeName is the name of the node in Kubernetes. This is for now usually the same as the ID() of the Node.
	nodeName string
}
| 85 | |
| 86 | // ID returns the name of this node, which is `smalltown-{pubkeyHash}`. This name should be the primary way to refer to |
| 87 | // Smalltown nodes within a cluster, and is guaranteed to be unique by relying on cryptographic randomness. |
| 88 | func (n *Node) ID() string { |
| 89 | return fmt.Sprintf("smalltown-%s", n.IDBare()) |
| 90 | } |
| 91 | |
| 92 | // IDBare returns the `{pubkeyHash}` part of the node ID. |
| 93 | func (n Node) IDBare() string { |
| 94 | pubKey, ok := n.certificate.PublicKey.(ed25519.PublicKey) |
| 95 | if !ok { |
| 96 | panic("node has non-ed25519 public key") |
| 97 | } |
| 98 | return hex.EncodeToString(pubKey[:16]) |
| 99 | } |
| 100 | |
// String returns the node's ID(), so that a *Node prints as its ID when formatted (it satisfies fmt.Stringer).
func (n *Node) String() string {
	return n.ID()
}
| 104 | |
| 105 | // ConsensusMember returns a copy of the NodeRoleConsensusMember struct if the Node is a consensus member, otherwise |
| 106 | // nil. |
| 107 | func (n *Node) ConsensusMember() *NodeRoleConsensusMember { |
| 108 | if n.consensusMember == nil { |
| 109 | return nil |
| 110 | } |
| 111 | cm := *n.consensusMember |
| 112 | return &cm |
| 113 | } |
| 114 | |
| 115 | // KubernetesWorker returns a copy of the NodeRoleKubernetesWorker struct if the Node is a kubernetes worker, otherwise |
| 116 | // nil. |
| 117 | func (n *Node) KubernetesWorker() *NodeRoleKubernetesWorker { |
| 118 | if n.kubernetesWorker == nil { |
| 119 | return nil |
| 120 | } |
| 121 | kw := *n.kubernetesWorker |
| 122 | return &kw |
| 123 | } |
| 124 | |
| 125 | // etcdPath builds the etcd path in which this node's protobuf-serialized state is stored in etcd. |
| 126 | func (n *Node) etcdPath() string { |
| 127 | return fmt.Sprintf("/nodes/%s", n.ID()) |
| 128 | } |
| 129 | |
| 130 | // proto serializes the Node object into protobuf, to be used for saving to etcd. |
| 131 | func (n *Node) proto() *ipb.Node { |
| 132 | msg := &ipb.Node{ |
| 133 | Certificate: n.certificate.Raw, |
| 134 | ClusterUnlockKey: n.clusterUnlockKey, |
| 135 | Address: n.address.String(), |
| 136 | Roles: &ipb.Node_Roles{}, |
| 137 | } |
| 138 | if n.consensusMember != nil { |
| 139 | msg.Roles.ConsensusMember = &ipb.Node_Roles_ConsensusMember{ |
| 140 | EtcdMemberName: n.consensusMember.etcdMemberName, |
| 141 | } |
| 142 | } |
| 143 | if n.kubernetesWorker != nil { |
| 144 | msg.Roles.KubernetesWorker = &ipb.Node_Roles_KubernetesWorker{ |
| 145 | NodeName: n.kubernetesWorker.nodeName, |
| 146 | } |
| 147 | } |
| 148 | return msg |
| 149 | } |
| 150 | |
| 151 | // Store saves the Node into etcd. This should be called only once per Node (ie. when the Node has been created). |
| 152 | func (n *Node) Store(ctx context.Context, kv clientv3.KV) error { |
| 153 | // Currently the only flow to store a node to etcd is a write-once flow: once a node is created, it cannot be |
| 154 | // deleted or updated. In the future, flows to change cluster node roles might be introduced (ie. to promote nodes |
| 155 | // to consensus members, etc). |
| 156 | key := n.etcdPath() |
| 157 | msg := n.proto() |
| 158 | nodeRaw, err := proto.Marshal(msg) |
| 159 | if err != nil { |
| 160 | return fmt.Errorf("failed to marshal node: %w", err) |
| 161 | } |
| 162 | |
| 163 | res, err := kv.Txn(ctx).If( |
| 164 | clientv3.Compare(clientv3.CreateRevision(key), "=", 0), |
| 165 | ).Then( |
| 166 | clientv3.OpPut(key, string(nodeRaw)), |
| 167 | ).Commit() |
| 168 | if err != nil { |
| 169 | return fmt.Errorf("failed to store node: %w", err) |
| 170 | } |
| 171 | |
| 172 | if !res.Succeeded { |
| 173 | return fmt.Errorf("attempted to re-register node (unsupported flow)") |
| 174 | } |
| 175 | return nil |
| 176 | } |
| 177 | |
| 178 | // MakeConsensusMember turns the node into a consensus member with a given name. This only configures internal fields, |
| 179 | // and does not actually start any services. |
| 180 | func (n *Node) MakeConsensusMember(etcdMemberName string) error { |
| 181 | if n.consensusMember != nil { |
| 182 | return fmt.Errorf("node already is consensus member") |
| 183 | } |
| 184 | n.consensusMember = &NodeRoleConsensusMember{ |
| 185 | etcdMemberName: etcdMemberName, |
| 186 | } |
| 187 | return nil |
| 188 | } |
| 189 | |
| 190 | // MakeKubernetesWorker turns the node into a kubernetes worker with a given name. This only configures internal fields, |
| 191 | // and does not actually start any services. |
| 192 | func (n *Node) MakeKubernetesWorker(name string) error { |
| 193 | if n.kubernetesWorker != nil { |
| 194 | return fmt.Errorf("node is already kubernetes worker") |
| 195 | } |
| 196 | n.kubernetesWorker = &NodeRoleKubernetesWorker{ |
| 197 | nodeName: name, |
| 198 | } |
| 199 | return nil |
| 200 | } |
| 201 | |
// Address returns the management IP address of the node. The returned slice is the Node's own (it is not copied), so
// callers should treat it as read-only.
func (n *Node) Address() net.IP {
	return n.address
}
| 205 | |
| 206 | // ConfigureLocalHostname uses the node's ID as a hostname, and sets the current hostname, and local files like hosts |
| 207 | // and machine-id accordingly. |
| 208 | func (n *Node) ConfigureLocalHostname(etc *localstorage.EtcDirectory) error { |
| 209 | if err := unix.Sethostname([]byte(n.ID())); err != nil { |
| 210 | return fmt.Errorf("failed to set runtime hostname: %w", err) |
| 211 | } |
| 212 | if err := etc.Hosts.Write([]byte(fmt.Sprintf("%s %s", "127.0.0.1", n.ID())), 0644); err != nil { |
| 213 | return fmt.Errorf("failed to write /etc/hosts: %w", err) |
| 214 | } |
| 215 | if err := etc.MachineID.Write([]byte(n.IDBare()), 0644); err != nil { |
| 216 | return fmt.Errorf("failed to write /etc/machine-id: %w", err) |
| 217 | } |
| 218 | return nil |
| 219 | } |