core -> metropolis

Smalltown is now called Metropolis!

This is the first commit in a series of cleanup commits that prepare us
for an open source release. This one just moves some Bazel packages around to
follow a stricter directory layout.

All of Metropolis now lives in `//metropolis`.

All of Metropolis Node code now lives in `//metropolis/node`.

All of the main /init now lives in `//m/n/core`.

All of the Kubernetes functionality/glue now lives in `//m/n/kubernetes`.

Next steps:
     - hunt down all references to Smalltown and replace them appropriately
     - narrow down visibility rules
     - document new code organization
     - move `//build/toolchain` to `//monogon/build/toolchain`
     - do another cleanup pass between `//golibs` and
       `//monogon/node/{core,common}`.
     - remove `//delta` and `//anubis`

Fixes T799.

Test Plan: Just a very large refactor. CI should help us out here.

Bug: T799

X-Origin-Diff: phab/D667
GitOrigin-RevId: 6029b8d4edc42325d50042596b639e8b122d0ded
diff --git a/metropolis/node/core/consensus/BUILD.bazel b/metropolis/node/core/consensus/BUILD.bazel
new file mode 100644
index 0000000..cab2c0a
--- /dev/null
+++ b/metropolis/node/core/consensus/BUILD.bazel
@@ -0,0 +1,30 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+go_library(
+    name = "go_default_library",
+    srcs = ["consensus.go"],
+    importpath = "git.monogon.dev/source/nexantic.git/metropolis/node/core/consensus",
+    visibility = ["//:__subpackages__"],
+    deps = [
+        "//metropolis/node:go_default_library",
+        "//metropolis/node/common/supervisor:go_default_library",
+        "//metropolis/node/core/consensus/ca:go_default_library",
+        "//metropolis/node/core/localstorage:go_default_library",
+        "@io_etcd_go_etcd//clientv3:go_default_library",
+        "@io_etcd_go_etcd//clientv3/namespace:go_default_library",
+        "@io_etcd_go_etcd//embed:go_default_library",
+        "@org_uber_go_atomic//:go_default_library",
+    ],
+)
+
+go_test(
+    name = "go_default_test",
+    srcs = ["consensus_test.go"],
+    embed = [":go_default_library"],
+    deps = [
+        "//golibs/common:go_default_library",
+        "//metropolis/node/common/supervisor:go_default_library",
+        "//metropolis/node/core/localstorage:go_default_library",
+        "//metropolis/node/core/localstorage/declarative:go_default_library",
+    ],
+)
diff --git a/metropolis/node/core/consensus/ca/BUILD.bazel b/metropolis/node/core/consensus/ca/BUILD.bazel
new file mode 100644
index 0000000..fecffa0
--- /dev/null
+++ b/metropolis/node/core/consensus/ca/BUILD.bazel
@@ -0,0 +1,9 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+    name = "go_default_library",
+    srcs = ["ca.go"],
+    importpath = "git.monogon.dev/source/nexantic.git/metropolis/node/core/consensus/ca",
+    visibility = ["//:__subpackages__"],
+    deps = ["@io_etcd_go_etcd//clientv3:go_default_library"],
+)
diff --git a/metropolis/node/core/consensus/ca/ca.go b/metropolis/node/core/consensus/ca/ca.go
new file mode 100644
index 0000000..9a1b634
--- /dev/null
+++ b/metropolis/node/core/consensus/ca/ca.go
@@ -0,0 +1,440 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package ca implements a simple standards-compliant certificate authority.
+// It only supports ed25519 keys, and does not maintain any persistent state.
+//
+// The CA is backed by etcd storage, but can also bootstrap itself before etcd storage is available (and commit
+// in-memory secrets to etcd at a later date).
+//
+// CA and certificates successfully pass https://github.com/zmap/zlint
+// (minus the CA/B rules that a public CA would adhere to, which requires
+// things like OCSP servers, Certificate Policies and ECDSA/RSA-only keys).
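+//
+// A minimal usage sketch (illustrative only; kv is assumed to be a connected
+// etcd clientv3.KV, error handling is elided, and all names are placeholders):
+//
+//   authority, err := ca.New("example etcd peer Root CA")
+//   // Issue a certificate in memory before etcd is available...
+//   cert, key, err := authority.Issue(ctx, nil, "node-0", net.ParseIP("10.0.0.1"))
+//   // ...then commit the CA and all bootstrap-issued certificates to etcd.
+//   err = authority.Save(ctx, kv)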
+package ca
+
+// TODO(leo): add zlint test
+
+import (
+	"context"
+	"crypto"
+	"crypto/ed25519"
+	"crypto/rand"
+	"crypto/sha1"
+	"crypto/x509"
+	"crypto/x509/pkix"
+	"encoding/asn1"
+	"encoding/hex"
+	"errors"
+	"fmt"
+	"math/big"
+	"net"
+	"time"
+
+	"go.etcd.io/etcd/clientv3"
+)
+
+const (
+	// TODO(q3k): move this to a declarative storage layer
+	pathCACertificate      = "/etcd-ca/ca.der"
+	pathCAKey              = "/etcd-ca/ca-key.der"
+	pathCACRL              = "/etcd-ca/crl.der"
+	pathIssuedCertificates = "/etcd-ca/certs/"
+)
+
+func pathIssuedCertificate(serial *big.Int) string {
+	return pathIssuedCertificates + hex.EncodeToString(serial.Bytes())
+}
+
+var (
+	// From RFC 5280 Section 4.1.2.5
+	unknownNotAfter = time.Unix(253402300799, 0)
+)
+
+type CA struct {
+	// TODO: Potentially protect the key with memguard
+	privateKey *ed25519.PrivateKey
+	CACert     *x509.Certificate
+	CACertRaw  []byte
+
+	// bootstrapIssued are certificates that have been issued by the CA before it has been successfully Saved to etcd.
+	bootstrapIssued [][]byte
+	// canBootstrapIssue is set on CAs that have been created by New and not yet stored to etcd. If not set,
+	// certificates cannot be issued in-memory.
+	canBootstrapIssue bool
+}
+
+// Workaround for https://github.com/golang/go/issues/26676 in Go's crypto/x509. Specifically Go
+// violates Section 4.2.1.2 of RFC 5280 without this.
+// Fixed for 1.15 in https://go-review.googlesource.com/c/go/+/227098/.
+//
+// Taken from https://github.com/FiloSottile/mkcert/blob/master/cert.go#L295 written by one of Go's
+// crypto engineers (BSD 3-clause).
+func calculateSKID(pubKey crypto.PublicKey) ([]byte, error) {
+	spkiASN1, err := x509.MarshalPKIXPublicKey(pubKey)
+	if err != nil {
+		return nil, err
+	}
+
+	var spki struct {
+		Algorithm        pkix.AlgorithmIdentifier
+		SubjectPublicKey asn1.BitString
+	}
+	_, err = asn1.Unmarshal(spkiASN1, &spki)
+	if err != nil {
+		return nil, err
+	}
+	skid := sha1.Sum(spki.SubjectPublicKey.Bytes)
+	return skid[:], nil
+}
+
+// New creates a new certificate authority with the given common name. The newly created CA will be stored in memory
+// until committed to etcd by calling .Save.
+func New(name string) (*CA, error) {
+	pubKey, privKey, err := ed25519.GenerateKey(rand.Reader)
+	if err != nil {
+		panic(err)
+	}
+
+	serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 127)
+	serialNumber, err := rand.Int(rand.Reader, serialNumberLimit)
+	if err != nil {
+		return nil, fmt.Errorf("failed to generate serial number: %w", err)
+	}
+
+	skid, err := calculateSKID(pubKey)
+	if err != nil {
+		return nil, err
+	}
+
+	caCert := &x509.Certificate{
+		SerialNumber: serialNumber,
+		Subject: pkix.Name{
+			CommonName: name,
+		},
+		IsCA:                  true,
+		BasicConstraintsValid: true,
+		NotBefore:             time.Now(),
+		NotAfter:              unknownNotAfter,
+		KeyUsage:              x509.KeyUsageCertSign | x509.KeyUsageCRLSign | x509.KeyUsageDigitalSignature,
+		ExtKeyUsage:           []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth, x509.ExtKeyUsageServerAuth, x509.ExtKeyUsageOCSPSigning},
+		AuthorityKeyId:        skid,
+		SubjectKeyId:          skid,
+	}
+
+	caCertRaw, err := x509.CreateCertificate(rand.Reader, caCert, caCert, pubKey, privKey)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create root certificate: %w", err)
+	}
+
+	ca := &CA{
+		privateKey: &privKey,
+		CACertRaw:  caCertRaw,
+		CACert:     caCert,
+
+		canBootstrapIssue: true,
+	}
+
+	return ca, nil
+}
+
+// Load restores CA state from etcd.
+func Load(ctx context.Context, kv clientv3.KV) (*CA, error) {
+	resp, err := kv.Txn(ctx).Then(
+		clientv3.OpGet(pathCACertificate),
+		clientv3.OpGet(pathCAKey),
+		// We only read the CRL to ensure it exists in etcd (and fail early on inconsistency)
+		clientv3.OpGet(pathCACRL)).Commit()
+	if err != nil {
+		return nil, fmt.Errorf("failed to retrieve CA from etcd: %w", err)
+	}
+
+	var caCert, caKey, caCRL []byte
+	for _, el := range resp.Responses {
+		for _, kv := range el.GetResponseRange().GetKvs() {
+			switch string(kv.Key) {
+			case pathCACertificate:
+				caCert = kv.Value
+			case pathCAKey:
+				caKey = kv.Value
+			case pathCACRL:
+				caCRL = kv.Value
+			}
+		}
+	}
+	if caCert == nil || caKey == nil || caCRL == nil {
+		return nil, fmt.Errorf("failed to retrieve CA from etcd, missing at least one of {ca key, ca crt, ca crl}")
+	}
+
+	if len(caKey) != ed25519.PrivateKeySize {
+		return nil, errors.New("invalid CA private key size")
+	}
+	privateKey := ed25519.PrivateKey(caKey)
+
+	caCertVal, err := x509.ParseCertificate(caCert)
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse CA certificate: %w", err)
+	}
+	return &CA{
+		privateKey: &privateKey,
+		CACertRaw:  caCert,
+		CACert:     caCertVal,
+	}, nil
+}
+
+// Save stores a newly created CA into etcd, committing both the CA data and any certificates issued until then.
+func (c *CA) Save(ctx context.Context, kv clientv3.KV) error {
+	crl, err := c.makeCRL(nil)
+	if err != nil {
+		return fmt.Errorf("failed to generate initial CRL: %w", err)
+	}
+
+	ops := []clientv3.Op{
+		clientv3.OpPut(pathCACertificate, string(c.CACertRaw)),
+		clientv3.OpPut(pathCAKey, string([]byte(*c.privateKey))),
+		clientv3.OpPut(pathCACRL, string(crl)),
+	}
+	for i, certRaw := range c.bootstrapIssued {
+		cert, err := x509.ParseCertificate(certRaw)
+		if err != nil {
+			return fmt.Errorf("failed to parse in-memory certificate %d", i)
+		}
+		ops = append(ops, clientv3.OpPut(pathIssuedCertificate(cert.SerialNumber), string(certRaw)))
+	}
+
+	res, err := kv.Txn(ctx).If(
+		clientv3.Compare(clientv3.CreateRevision(pathCAKey), "=", 0),
+	).Then(ops...).Commit()
+	if err != nil {
+		return fmt.Errorf("failed to store CA to etcd: %w", err)
+	}
+	if !res.Succeeded {
+		// This should pretty much never happen, but we want to catch it just in case.
+		return fmt.Errorf("failed to store CA to etcd: CA already present - cluster-level data inconsistency")
+	}
+	c.bootstrapIssued = nil
+	c.canBootstrapIssue = false
+	return nil
+}
+
+// Issue issues a certificate. If kv is non-nil, the newly issued certificate will be immediately stored to etcd,
+// otherwise it will be kept in memory (until .Save is called). Certificates can only be issued to memory on
+// newly-created CAs that have not been saved to etcd yet.
+func (c *CA) Issue(ctx context.Context, kv clientv3.KV, commonName string, externalAddress net.IP) (cert []byte, privkey []byte, err error) {
+	serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 127)
+	serialNumber, err := rand.Int(rand.Reader, serialNumberLimit)
+	if err != nil {
+		err = fmt.Errorf("failed to generate serial number: %w", err)
+		return
+	}
+
+	pubKey, privKeyRaw, err := ed25519.GenerateKey(rand.Reader)
+	if err != nil {
+		return
+	}
+	privkey, err = x509.MarshalPKCS8PrivateKey(privKeyRaw)
+	if err != nil {
+		return
+	}
+
+	etcdCert := &x509.Certificate{
+		SerialNumber: serialNumber,
+		Subject: pkix.Name{
+			CommonName:         commonName,
+			OrganizationalUnit: []string{"etcd"},
+		},
+		IsCA:                  false,
+		BasicConstraintsValid: true,
+		NotBefore:             time.Now(),
+		NotAfter:              unknownNotAfter,
+		ExtKeyUsage:           []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth, x509.ExtKeyUsageServerAuth},
+		DNSNames:              []string{commonName},
+		IPAddresses:           []net.IP{externalAddress},
+	}
+	cert, err = x509.CreateCertificate(rand.Reader, etcdCert, c.CACert, pubKey, c.privateKey)
+	if err != nil {
+		err = fmt.Errorf("failed to sign new certificate: %w", err)
+		return
+	}
+
+	if kv != nil {
+		path := pathIssuedCertificate(serialNumber)
+		_, err = kv.Put(ctx, path, string(cert))
+		if err != nil {
+			err = fmt.Errorf("failed to commit new certificate to etcd: %w", err)
+			return
+		}
+	} else {
+		if !c.canBootstrapIssue {
+			err = fmt.Errorf("cannot issue new certificate to memory on existing, etcd-backed CA")
+			return
+		}
+		c.bootstrapIssued = append(c.bootstrapIssued, cert)
+	}
+	return
+}
+
+func (c *CA) makeCRL(revoked []pkix.RevokedCertificate) ([]byte, error) {
+	crl, err := c.CACert.CreateCRL(rand.Reader, c.privateKey, revoked, time.Now(), unknownNotAfter)
+	if err != nil {
+		return nil, fmt.Errorf("failed to generate CRL: %w", err)
+	}
+	return crl, nil
+}
+
+// Revoke revokes a certificate by hostname. The certificate issued for that hostname will be added to the CRL stored
+// in etcd. This call might fail (safely) if a simultaneous revocation bumped the CRL in the meantime; the call can
+// then be retried safely.
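+//
+// A caller that needs the revocation to stick could retry in a loop (sketch only; the hostname is a placeholder):
+//
+//   for {
+//       if err := c.Revoke(ctx, kv, "node-0"); err == nil {
+//           break
+//       }
+//       // Inspect/log the error and retry, ideally with backoff and a retry limit.
+//   }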
+func (c *CA) Revoke(ctx context.Context, kv clientv3.KV, hostname string) error {
+	res, err := kv.Txn(ctx).Then(
+		clientv3.OpGet(pathCACRL),
+		clientv3.OpGet(pathIssuedCertificates, clientv3.WithPrefix())).Commit()
+	if err != nil {
+		return fmt.Errorf("failed to retrieve certificates and CRL from etcd: %w", err)
+	}
+
+	var certs []*x509.Certificate
+	var crlRevision int64
+	var crl *pkix.CertificateList
+	for _, el := range res.Responses {
+		for _, kv := range el.GetResponseRange().GetKvs() {
+			if string(kv.Key) == pathCACRL {
+				crl, err = x509.ParseCRL(kv.Value)
+				if err != nil {
+					return fmt.Errorf("could not parse CRL from etcd: %w", err)
+				}
+				crlRevision = kv.CreateRevision
+			} else {
+				cert, err := x509.ParseCertificate(kv.Value)
+				if err != nil {
+					return fmt.Errorf("could not parse certificate %q from etcd: %w", string(kv.Key), err)
+				}
+				certs = append(certs, cert)
+			}
+		}
+	}
+
+	if crl == nil {
+		return fmt.Errorf("could not find CRL in etcd")
+	}
+	revoked := crl.TBSCertList.RevokedCertificates
+
+	// Find requested hostname in issued certificates.
+	var serial *big.Int
+	for _, cert := range certs {
+		for _, dnsName := range cert.DNSNames {
+			if dnsName == hostname {
+				serial = cert.SerialNumber
+				break
+			}
+		}
+		if serial != nil {
+			break
+		}
+	}
+	if serial == nil {
+		return fmt.Errorf("could not find requested hostname")
+	}
+
+	// Check if certificate has already been revoked.
+	for _, revokedCert := range revoked {
+		if revokedCert.SerialNumber.Cmp(serial) == 0 {
+			return nil // Already revoked
+		}
+	}
+
+	revoked = append(revoked, pkix.RevokedCertificate{
+		SerialNumber:   serial,
+		RevocationTime: time.Now(),
+	})
+
+	crlRaw, err := c.makeCRL(revoked)
+	if err != nil {
+		return fmt.Errorf("when generating new CRL for revocation: %w", err)
+	}
+
+	res, err = kv.Txn(ctx).If(
+		clientv3.Compare(clientv3.CreateRevision(pathCACRL), "=", crlRevision),
+	).Then(
+		clientv3.OpPut(pathCACRL, string(crlRaw)),
+	).Commit()
+	if err != nil {
+		return fmt.Errorf("when saving new CRL: %w", err)
+	}
+	if !res.Succeeded {
+		return fmt.Errorf("CRL save transaction failed (possible concurrent CRL update), please retry")
+	}
+
+	return nil
+}
+
+// WaitCRLChange returns a channel that will receive a CRLUpdate any time the remote CRL changes. Immediately after
+// calling this method, the current CRL is retrieved from the cluster and put into the channel.
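+//
+// A typical consumer (see watchCRL in the consensus service) drains the channel in a loop and restarts the call on
+// error - sketch:
+//
+//   for u := range c.WaitCRLChange(ctx, kv, watcher) {
+//       if u.Err != nil {
+//           // The watch is broken; restart WaitCRLChange.
+//           break
+//       }
+//       // Persist or otherwise act on u.CRL.
+//   }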
+func (c *CA) WaitCRLChange(ctx context.Context, kv clientv3.KV, w clientv3.Watcher) <-chan CRLUpdate {
+	C := make(chan CRLUpdate)
+
+	go func(ctx context.Context) {
+		ctxC, cancel := context.WithCancel(ctx)
+		defer cancel()
+
+		fail := func(f string, args ...interface{}) {
+			C <- CRLUpdate{Err: fmt.Errorf(f, args...)}
+			close(C)
+		}
+
+		initial, err := kv.Get(ctx, pathCACRL)
+		if err != nil {
+			fail("failed to retrieve initial CRL: %w", err)
+			return
+		}
+
+		C <- CRLUpdate{CRL: initial.Kvs[0].Value}
+
+		for wr := range w.Watch(ctxC, pathCACRL, clientv3.WithRev(initial.Kvs[0].CreateRevision)) {
+			if wr.Err() != nil {
+				fail("failed watching CRL: %w", wr.Err())
+				return
+			}
+
+			for _, e := range wr.Events {
+				if string(e.Kv.Key) != pathCACRL {
+					continue
+				}
+
+				C <- CRLUpdate{CRL: e.Kv.Value}
+			}
+		}
+	}(ctx)
+
+	return C
+}
+
+// CRLUpdate is emitted for every remote CRL change, and once immediately on every new WaitCRLChange call.
+type CRLUpdate struct {
+	// The new (or existing, in the case of the first call) CRL. If nil, Err will be set.
+	CRL []byte
+	// If set, an error occurred and the WaitCRLChange call must be restarted. If set, CRL will be nil.
+	Err error
+}
+
+// GetCurrentCRL returns the current CRL for the CA. This should only be used for one-shot operations like
+// bootstrapping a new node that doesn't yet have access to etcd - otherwise, WaitCRLChange should be used.
+func (c *CA) GetCurrentCRL(ctx context.Context, kv clientv3.KV) ([]byte, error) {
+	initial, err := kv.Get(ctx, pathCACRL)
+	if err != nil {
+		return nil, fmt.Errorf("failed to retrieve initial CRL: %w", err)
+	}
+	return initial.Kvs[0].Value, nil
+}
diff --git a/metropolis/node/core/consensus/consensus.go b/metropolis/node/core/consensus/consensus.go
new file mode 100644
index 0000000..8916164
--- /dev/null
+++ b/metropolis/node/core/consensus/consensus.go
@@ -0,0 +1,429 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package consensus implements a managed etcd cluster member service, with a self-hosted CA system for issuing peer
+// certificates. Currently each Smalltown node runs an etcd member, and connects to the etcd member locally over a unix
+// domain socket.
+//
+// The service supports two modes of startup:
+//  - initializing a new cluster, by bootstrapping the CA in memory, starting a cluster, committing the CA to etcd
+//    afterwards, and saving the new node's certificate to local storage
+//  - joining an existing cluster, using certificates from local storage and loading the CA from etcd. This flow is also
+//    used when the node joins a cluster for the first time (in which case the required certificates must be
+//    provisioned externally before starting the consensus service).
+//
+// Regardless of how the etcd member service was started, the resulting running service is further managed and used
+// in the same way.
+//
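+// A minimal construction sketch for bootstrapping a new single-node cluster (the directory fields come from a placed
+// localstorage.Root; all other values shown are placeholders):
+//
+//   svc := consensus.New(consensus.Config{
+//       Data:         &root.Data.Etcd,
+//       Ephemeral:    &root.Ephemeral.Consensus,
+//       Name:         "node-0",
+//       NewCluster:   true,
+//       ExternalHost: "10.0.0.1",
+//       ListenHost:   "10.0.0.1",
+//   })
+//
+// svc.Run is then started under a supervisor (e.g. supervisor.New(ctx, svc.Run), as done in the tests).
+//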
+package consensus
+
+import (
+	"context"
+	"encoding/pem"
+	"fmt"
+	"net"
+	"net/url"
+	"sync"
+	"time"
+
+	"go.etcd.io/etcd/clientv3"
+	"go.etcd.io/etcd/clientv3/namespace"
+	"go.etcd.io/etcd/embed"
+	"go.uber.org/atomic"
+
+	common "git.monogon.dev/source/nexantic.git/metropolis/node"
+	"git.monogon.dev/source/nexantic.git/metropolis/node/common/supervisor"
+	"git.monogon.dev/source/nexantic.git/metropolis/node/core/consensus/ca"
+	"git.monogon.dev/source/nexantic.git/metropolis/node/core/localstorage"
+)
+
+const (
+	DefaultClusterToken = "SIGNOS"
+	DefaultLogger       = "zap"
+)
+
+// Service is the etcd cluster member service.
+type Service struct {
+	// The configuration with which the service was started. This is immutable.
+	config *Config
+
+	// stateMu guards state. This is locked internally on public methods of Service that require access to state. The
+	// state might be recreated on service restart.
+	stateMu sync.Mutex
+	state   *state
+}
+
+// state is the runtime state of a running etcd member.
+type state struct {
+	etcd  *embed.Etcd
+	ready atomic.Bool
+
+	ca *ca.CA
+	// cl is an etcd client that loops back to the locally running etcd server. This runs over the client unix domain
+	// socket that etcd starts.
+	cl *clientv3.Client
+}
+
+type Config struct {
+	// Data directory (persistent, encrypted storage) for etcd.
+	Data *localstorage.DataEtcdDirectory
+	// Ephemeral directory for etcd.
+	Ephemeral *localstorage.EphemeralConsensusDirectory
+
+	// Name is the cluster name. This must be the same amongst all etcd members within one cluster.
+	Name string
+	// NewCluster selects whether the etcd member will start a new cluster and bootstrap a CA and the first member
+	// certificate, or load existing PKI certificates from disk.
+	NewCluster bool
+	// InitialCluster sets the initial cluster peer URLs when NewCluster is set, and is ignored otherwise. Usually this
+	// will be just the new, single server, and more members will be added later.
+	InitialCluster string
+	// ExternalHost is the IP address or hostname at which this cluster member is reachable to other cluster members.
+	ExternalHost string
+	// ListenHost is the IP address or hostname at which this cluster member will listen.
+	ListenHost string
+	// Port is the port at which this cluster member will listen for other members. If zero, defaults to the global
+	// Smalltown setting.
+	Port int
+}
+
+func New(config Config) *Service {
+	return &Service{
+		config: &config,
+	}
+}
+
+// configure transforms the service configuration into an embedded etcd configuration, creating the required data and
+// ephemeral directories in the process.
+func (s *Service) configure(ctx context.Context) (*embed.Config, error) {
+	if err := s.config.Ephemeral.MkdirAll(0700); err != nil {
+		return nil, fmt.Errorf("failed to create ephemeral directory: %w", err)
+	}
+	if err := s.config.Data.MkdirAll(0700); err != nil {
+		return nil, fmt.Errorf("failed to create data directory: %w", err)
+	}
+
+	port := s.config.Port
+	if port == 0 {
+		port = common.ConsensusPort
+	}
+
+	cfg := embed.NewConfig()
+
+	cfg.Name = s.config.Name
+	cfg.Dir = s.config.Data.Data.FullPath()
+	cfg.InitialClusterToken = DefaultClusterToken
+
+	cfg.PeerTLSInfo.CertFile = s.config.Data.PeerPKI.Certificate.FullPath()
+	cfg.PeerTLSInfo.KeyFile = s.config.Data.PeerPKI.Key.FullPath()
+	cfg.PeerTLSInfo.TrustedCAFile = s.config.Data.PeerPKI.CACertificate.FullPath()
+	cfg.PeerTLSInfo.ClientCertAuth = true
+	cfg.PeerTLSInfo.CRLFile = s.config.Data.PeerCRL.FullPath()
+
+	cfg.LCUrls = []url.URL{{
+		Scheme: "unix",
+		Path:   s.config.Ephemeral.ClientSocket.FullPath() + ":0",
+	}}
+	cfg.ACUrls = []url.URL{}
+	cfg.LPUrls = []url.URL{{
+		Scheme: "https",
+		Host:   fmt.Sprintf("%s:%d", s.config.ListenHost, port),
+	}}
+	cfg.APUrls = []url.URL{{
+		Scheme: "https",
+		Host:   fmt.Sprintf("%s:%d", s.config.ExternalHost, port),
+	}}
+
+	if s.config.NewCluster {
+		cfg.ClusterState = "new"
+		cfg.InitialCluster = cfg.InitialClusterFromName(cfg.Name)
+	} else if s.config.InitialCluster != "" {
+		cfg.ClusterState = "existing"
+		cfg.InitialCluster = s.config.InitialCluster
+	}
+
+	// TODO(q3k): pipe logs from etcd to supervisor.RawLogger via a file.
+	cfg.Logger = DefaultLogger
+	cfg.LogOutputs = []string{"stderr"}
+
+	return cfg, nil
+}
+
+// Run is a Supervisor runnable that starts the etcd member service. It will become healthy once the member joins the
+// cluster successfully.
+func (s *Service) Run(ctx context.Context) error {
+	st := &state{
+		ready: *atomic.NewBool(false),
+	}
+	s.stateMu.Lock()
+	s.state = st
+	s.stateMu.Unlock()
+
+	if s.config.NewCluster {
+		// Expect certificate to be absent from disk.
+		absent, err := s.config.Data.PeerPKI.AllAbsent()
+		if err != nil {
+			return fmt.Errorf("checking certificate existence: %w", err)
+		}
+		if !absent {
+			return fmt.Errorf("want new cluster, but certificates already exist on disk")
+		}
+
+		// Generate CA, keep in memory, write it down in etcd later.
+		st.ca, err = ca.New("Smalltown etcd peer Root CA")
+		if err != nil {
+			return fmt.Errorf("when creating new cluster's peer CA: %w", err)
+		}
+
+		ip := net.ParseIP(s.config.ExternalHost)
+		if ip == nil {
+			return fmt.Errorf("configured external host is not an IP address (got %q)", s.config.ExternalHost)
+		}
+
+		cert, key, err := st.ca.Issue(ctx, nil, s.config.Name, ip)
+		if err != nil {
+			return fmt.Errorf("when issuing new cluster's first certificate: %w", err)
+		}
+
+		if err := s.config.Data.PeerPKI.MkdirAll(0600); err != nil {
+			return fmt.Errorf("when creating PKI directory: %w", err)
+		}
+		if err := s.config.Data.PeerPKI.CACertificate.Write(pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: st.ca.CACertRaw}), 0600); err != nil {
+			return fmt.Errorf("when writing CA certificate to disk: %w", err)
+		}
+		if err := s.config.Data.PeerPKI.Certificate.Write(pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: cert}), 0600); err != nil {
+			return fmt.Errorf("when writing certificate to disk: %w", err)
+		}
+		if err := s.config.Data.PeerPKI.Key.Write(pem.EncodeToMemory(&pem.Block{Type: "PRIVATE KEY", Bytes: key}), 0600); err != nil {
+			return fmt.Errorf("when writing key to disk: %w", err)
+		}
+	} else {
+		// Expect certificate to be present on disk.
+		present, err := s.config.Data.PeerPKI.AllExist()
+		if err != nil {
+			return fmt.Errorf("checking certificate existence: %w", err)
+		}
+		if !present {
+			return fmt.Errorf("want existing cluster, but certificate is missing from disk")
+		}
+	}
+
+	if err := s.config.Data.MkdirAll(0600); err != nil {
+		return fmt.Errorf("failed to create data directory: %w", err)
+	}
+
+	cfg, err := s.configure(ctx)
+	if err != nil {
+		return fmt.Errorf("when configuring etcd: %w", err)
+	}
+
+	server, err := embed.StartEtcd(cfg)
+	keep := false
+	defer func() {
+		if !keep && server != nil {
+			server.Close()
+		}
+	}()
+	if err != nil {
+		return fmt.Errorf("failed to start etcd: %w", err)
+	}
+	st.etcd = server
+
+	supervisor.Logger(ctx).Info("waiting for etcd...")
+
+	okay := true
+	select {
+	case <-st.etcd.Server.ReadyNotify():
+	case <-ctx.Done():
+		okay = false
+	}
+
+	if !okay {
+		supervisor.Logger(ctx).Info("context done, aborting wait")
+		return ctx.Err()
+	}
+
+	socket := s.config.Ephemeral.ClientSocket.FullPath()
+	cl, err := clientv3.New(clientv3.Config{
+		Endpoints:   []string{fmt.Sprintf("unix://%s:0", socket)},
+		DialTimeout: time.Second,
+	})
+	if err != nil {
+		return fmt.Errorf("failed to connect to new etcd instance: %w", err)
+	}
+	st.cl = cl
+
+	if s.config.NewCluster {
+		if st.ca == nil {
+			panic("peerCA has not been generated")
+		}
+
+		// Save new CA into etcd.
+		err = st.ca.Save(ctx, cl.KV)
+		if err != nil {
+			return fmt.Errorf("failed to save new CA to etcd: %w", err)
+		}
+	} else {
+		// Load existing CA from etcd.
+		st.ca, err = ca.Load(ctx, cl.KV)
+		if err != nil {
+			return fmt.Errorf("failed to load CA from etcd: %w", err)
+		}
+	}
+
+	// Start CRL watcher.
+	if err := supervisor.Run(ctx, "crl", s.watchCRL); err != nil {
+		return fmt.Errorf("failed to start CRL watcher: %w", err)
+	}
+	// Start autopromoter.
+	if err := supervisor.Run(ctx, "autopromoter", s.autopromoter); err != nil {
+		return fmt.Errorf("failed to start autopromoter: %w", err)
+	}
+
+	supervisor.Logger(ctx).Info("etcd is now ready")
+	keep = true
+	st.ready.Store(true)
+	supervisor.Signal(ctx, supervisor.SignalHealthy)
+
+	<-ctx.Done()
+	st.etcd.Close()
+	return ctx.Err()
+}
+
+// watchCRL is a sub-runnable of the etcd cluster member service that keeps the CRL stored on local storage in sync
+// with the newest version available in etcd.
+func (s *Service) watchCRL(ctx context.Context) error {
+	s.stateMu.Lock()
+	cl := s.state.cl
+	ca := s.state.ca
+	s.stateMu.Unlock()
+
+	supervisor.Signal(ctx, supervisor.SignalHealthy)
+	for e := range ca.WaitCRLChange(ctx, cl.KV, cl.Watcher) {
+		if e.Err != nil {
+			return fmt.Errorf("watching CRL: %w", e.Err)
+		}
+
+		if err := s.config.Data.PeerCRL.Write(e.CRL, 0600); err != nil {
+			return fmt.Errorf("saving CRL: %w", err)
+		}
+	}
+
+	// unreachable
+	return nil
+}
+
+func (s *Service) autopromoter(ctx context.Context) error {
+	t := time.NewTicker(5 * time.Second)
+	defer t.Stop()
+
+	autopromote := func() {
+		s.stateMu.Lock()
+		st := s.state
+		s.stateMu.Unlock()
+
+		if st.etcd.Server.Leader() != st.etcd.Server.ID() {
+			return
+		}
+
+		for _, member := range st.etcd.Server.Cluster().Members() {
+			if !member.IsLearner {
+				continue
+			}
+
+			// We always call PromoteMember since the metadata necessary to decide if we should is private.
+			// Luckily etcd already does sanity checks internally and will refuse to promote nodes that aren't
+			// connected or are still behind on transactions.
+			if _, err := st.etcd.Server.PromoteMember(ctx, uint64(member.ID)); err != nil {
+				supervisor.Logger(ctx).Infof("Failed to promote consensus node %s: %v", member.Name, err)
+			} else {
+				supervisor.Logger(ctx).Infof("Promoted new consensus node %s", member.Name)
+			}
+		}
+	}
+
+	for {
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-t.C:
+			autopromote()
+		}
+	}
+}
+
+// IsReady returns whether etcd is ready and synced
+func (s *Service) IsReady() bool {
+	s.stateMu.Lock()
+	defer s.stateMu.Unlock()
+	if s.state == nil {
+		return false
+	}
+	return s.state.ready.Load()
+}
+
+func (s *Service) WaitReady(ctx context.Context) error {
+	// TODO(q3k): reimplement the atomic ready flag as an event synchronization mechanism
+	if s.IsReady() {
+		return nil
+	}
+	t := time.NewTicker(100 * time.Millisecond)
+	defer t.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-t.C:
+			if s.IsReady() {
+				return nil
+			}
+		}
+	}
+}
+
+// KV returns an etcd KV client interface to the etcd member/cluster, namespaced to the given module and space.
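+//
+// Usage sketch (module and space are free-form labels chosen by the caller):
+//
+//   kv := svc.KV("foo", "bar")
+//   _, err := kv.Put(ctx, "/some-key", "some-value")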
+func (s *Service) KV(module, space string) clientv3.KV {
+	s.stateMu.Lock()
+	defer s.stateMu.Unlock()
+	return namespace.NewKV(s.state.cl.KV, fmt.Sprintf("%s:%s", module, space))
+}
+
+func (s *Service) KVRoot() clientv3.KV {
+	s.stateMu.Lock()
+	defer s.stateMu.Unlock()
+	return s.state.cl.KV
+}
+
+func (s *Service) Cluster() clientv3.Cluster {
+	s.stateMu.Lock()
+	defer s.stateMu.Unlock()
+	return s.state.cl.Cluster
+}
+
+// MemberInfo returns information about this etcd cluster member: its ID and name. This will block until this
+// information is available (i.e. the cluster status is Ready).
+func (s *Service) MemberInfo(ctx context.Context) (id uint64, name string, err error) {
+	if err = s.WaitReady(ctx); err != nil {
+		err = fmt.Errorf("when waiting for cluster readiness: %w", err)
+		return
+	}
+
+	s.stateMu.Lock()
+	defer s.stateMu.Unlock()
+	id = uint64(s.state.etcd.Server.ID())
+	name = s.config.Name
+	return
+}
diff --git a/metropolis/node/core/consensus/consensus_test.go b/metropolis/node/core/consensus/consensus_test.go
new file mode 100644
index 0000000..e08bd29
--- /dev/null
+++ b/metropolis/node/core/consensus/consensus_test.go
@@ -0,0 +1,261 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package consensus
+
+import (
+	"bytes"
+	"context"
+	"crypto/x509"
+	"io/ioutil"
+	"net"
+	"os"
+	"testing"
+	"time"
+
+	"git.monogon.dev/source/nexantic.git/golibs/common"
+	"git.monogon.dev/source/nexantic.git/metropolis/node/common/supervisor"
+	"git.monogon.dev/source/nexantic.git/metropolis/node/core/localstorage"
+	"git.monogon.dev/source/nexantic.git/metropolis/node/core/localstorage/declarative"
+)
+
+type boilerplate struct {
+	ctx    context.Context
+	ctxC   context.CancelFunc
+	root   *localstorage.Root
+	tmpdir string
+}
+
+func prep(t *testing.T) *boilerplate {
+	ctx, ctxC := context.WithCancel(context.Background())
+	root := &localstorage.Root{}
+	tmp, err := ioutil.TempDir("", "smalltown-test")
+	if err != nil {
+		t.Fatal(err)
+	}
+	err = declarative.PlaceFS(root, tmp)
+	if err != nil {
+		t.Fatal(err)
+	}
+	os.MkdirAll(root.Data.Etcd.FullPath(), 0700)
+	os.MkdirAll(root.Ephemeral.Consensus.FullPath(), 0700)
+
+	return &boilerplate{
+		ctx:    ctx,
+		ctxC:   ctxC,
+		root:   root,
+		tmpdir: tmp,
+	}
+}
+
+func (b *boilerplate) close() {
+	b.ctxC()
+	os.RemoveAll(b.tmpdir)
+}
+
+func waitEtcd(t *testing.T, s *Service) {
+	deadline := time.Now().Add(5 * time.Second)
+	for {
+		if time.Now().After(deadline) {
+			t.Fatalf("etcd did not start up on time")
+		}
+		if s.IsReady() {
+			break
+		}
+		time.Sleep(100 * time.Millisecond)
+	}
+}
+
+func TestBootstrap(t *testing.T) {
+	b := prep(t)
+	defer b.close()
+	etcd := New(Config{
+		Data:           &b.root.Data.Etcd,
+		Ephemeral:      &b.root.Ephemeral.Consensus,
+		Name:           "test",
+		NewCluster:     true,
+		InitialCluster: "127.0.0.1",
+		ExternalHost:   "127.0.0.1",
+		ListenHost:     "127.0.0.1",
+		Port:           common.MustConsume(common.AllocateTCPPort()),
+	})
+
+	supervisor.New(b.ctx, etcd.Run)
+	waitEtcd(t, etcd)
+
+	kv := etcd.KV("foo", "bar")
+	if _, err := kv.Put(b.ctx, "/foo", "bar"); err != nil {
+		t.Fatalf("test key creation failed: %v", err)
+	}
+	if _, err := kv.Get(b.ctx, "/foo"); err != nil {
+		t.Fatalf("test key retrieval failed: %v", err)
+	}
+}
+
+func TestMemberInfo(t *testing.T) {
+	b := prep(t)
+	defer b.close()
+	etcd := New(Config{
+		Data:           &b.root.Data.Etcd,
+		Ephemeral:      &b.root.Ephemeral.Consensus,
+		Name:           "test",
+		NewCluster:     true,
+		InitialCluster: "127.0.0.1",
+		ExternalHost:   "127.0.0.1",
+		ListenHost:     "127.0.0.1",
+		Port:           common.MustConsume(common.AllocateTCPPort()),
+	})
+	supervisor.New(b.ctx, etcd.Run)
+	waitEtcd(t, etcd)
+
+	id, name, err := etcd.MemberInfo(b.ctx)
+	if err != nil {
+		t.Fatalf("MemberInfo: %v", err)
+	}
+
+	// Compare name with configured name.
+	if want, got := "test", name; want != got {
+		t.Errorf("MemberInfo returned name %q, wanted %q (per config)", got, want)
+	}
+
+	// Compare name with cluster information.
+	members, err := etcd.Cluster().MemberList(b.ctx)
+	if err != nil {
+		t.Errorf("MemberList: %v", err)
+	}
+
+	if want, got := 1, len(members.Members); want != got {
+		t.Fatalf("expected one cluster member, got %d", got)
+	}
+	if want, got := id, members.Members[0].ID; want != got {
+		t.Errorf("MemberInfo returned ID %d, Cluster endpoint says %d", want, got)
+	}
+	if want, got := name, members.Members[0].Name; want != got {
+		t.Errorf("MemberInfo returned name %q, Cluster endpoint says %q", want, got)
+	}
+}
+
+func TestRestartFromDisk(t *testing.T) {
+	b := prep(t)
+	defer b.close()
+
+	startEtcd := func(new bool) (*Service, context.CancelFunc) {
+		etcd := New(Config{
+			Data:           &b.root.Data.Etcd,
+			Ephemeral:      &b.root.Ephemeral.Consensus,
+			Name:           "test",
+			NewCluster:     new,
+			InitialCluster: "127.0.0.1",
+			ExternalHost:   "127.0.0.1",
+			ListenHost:     "127.0.0.1",
+			Port:           common.MustConsume(common.AllocateTCPPort()),
+		})
+		ctx, ctxC := context.WithCancel(b.ctx)
+		supervisor.New(ctx, etcd.Run)
+		waitEtcd(t, etcd)
+		kv := etcd.KV("foo", "bar")
+		if new {
+			if _, err := kv.Put(b.ctx, "/foo", "bar"); err != nil {
+				t.Fatalf("test key creation failed: %v", err)
+			}
+		}
+		if _, err := kv.Get(b.ctx, "/foo"); err != nil {
+			t.Fatalf("test key retrieval failed: %v", err)
+		}
+
+		return etcd, ctxC
+	}
+
+	etcd, ctxC := startEtcd(true)
+	etcd.stateMu.Lock()
+	firstCA := etcd.state.ca.CACertRaw
+	etcd.stateMu.Unlock()
+	ctxC()
+
+	etcd, ctxC = startEtcd(false)
+	etcd.stateMu.Lock()
+	secondCA := etcd.state.ca.CACertRaw
+	etcd.stateMu.Unlock()
+	ctxC()
+
+	if !bytes.Equal(firstCA, secondCA) {
+		t.Fatalf("wanted same, got different CAs across runs")
+	}
+}
+
+func TestCRL(t *testing.T) {
+	b := prep(t)
+	defer b.close()
+	etcd := New(Config{
+		Data:           &b.root.Data.Etcd,
+		Ephemeral:      &b.root.Ephemeral.Consensus,
+		Name:           "test",
+		NewCluster:     true,
+		InitialCluster: "127.0.0.1",
+		ExternalHost:   "127.0.0.1",
+		ListenHost:     "127.0.0.1",
+		Port:           common.MustConsume(common.AllocateTCPPort()),
+	})
+	supervisor.New(b.ctx, etcd.Run)
+	waitEtcd(t, etcd)
+
+	etcd.stateMu.Lock()
+	ca := etcd.state.ca
+	kv := etcd.state.cl.KV
+	etcd.stateMu.Unlock()
+
+	certRaw, _, err := ca.Issue(b.ctx, kv, "revoketest", net.ParseIP("1.2.3.4"))
+	if err != nil {
+		t.Fatalf("cert issue failed: %v", err)
+	}
+	cert, err := x509.ParseCertificate(certRaw)
+	if err != nil {
+		t.Fatalf("cert parse failed: %v", err)
+	}
+
+	if err := ca.Revoke(b.ctx, kv, "revoketest"); err != nil {
+		t.Fatalf("cert revoke failed: %v", err)
+	}
+
+	deadline := time.Now().Add(5 * time.Second)
+	for {
+		if time.Now().After(deadline) {
+			t.Fatalf("CRL did not get updated in time")
+		}
+		time.Sleep(100 * time.Millisecond)
+
+		crlRaw, err := b.root.Data.Etcd.PeerCRL.Read()
+		if err != nil {
+			// That's fine. Maybe it hasn't been written yet.
+			continue
+		}
+		crl, err := x509.ParseCRL(crlRaw)
+		if err != nil {
+			// That's fine. Maybe it hasn't been written yet.
+			continue
+		}
+
+		found := false
+		for _, revoked := range crl.TBSCertList.RevokedCertificates {
+			if revoked.SerialNumber.Cmp(cert.SerialNumber) == 0 {
+				found = true
+			}
+		}
+		if found {
+			break
+		}
+	}
+}