m/c/metroctl: implement TOFU for CA certificates

This implements trust-on-first-use (TOFU) for connecting to a Metropolis
cluster.

If no locally persisted CA is available, one will be retrieved from the
cluster. If it is then accepted, it will be persisted for future use.

To retrieve the Cluster CA certificate we implement a new
unauthenticated call in the CuratorLocal service. The alternative would
be to include the CA certificate in the served TLS chain, but that would
likely cause some backwards compatibility problems with existing client
software.

Full TOFU (with an SSH-style prompt) will be performed when the user
first takes ownership of a cluster. Otherwise, owner credentials
including a certificate will already be present locally, which allows
the process to be simplified: the remote CA is retrieved and accepted
only if the owner certificate's signature verifies against it.

Change-Id: I20002399935c2f13adc4526f5cceddad84b36a8f
Reviewed-on: https://review.monogon.dev/c/monogon/+/2743
Tested-by: Jenkins CI
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/metropolis/cli/metroctl/cmd_takeownership.go b/metropolis/cli/metroctl/cmd_takeownership.go
index 61d6c4e..2a93b57 100644
--- a/metropolis/cli/metroctl/cmd_takeownership.go
+++ b/metropolis/cli/metroctl/cmd_takeownership.go
@@ -31,6 +31,12 @@
 	if len(flags.clusterEndpoints) != 1 {
 		log.Fatalf("takeownership requires a single cluster endpoint to be provided with the --endpoints parameter.")
 	}
+	ctx := clicontext.WithInterrupt(context.Background())
+
+	ca, err := core.GetClusterCAWithTOFU(ctx, connectOptions())
+	if err != nil {
+		log.Fatalf("Could not retrieve cluster CA: %v", err)
+	}
 
 	// Retrieve the cluster owner's private key, and use it to construct
 	// ephemeral credentials. Then, dial the cluster.
@@ -41,12 +47,11 @@
 	if err != nil {
 		log.Fatalf("Couldn't get owner's key: %v", err)
 	}
-	ctx := clicontext.WithInterrupt(context.Background())
 	opts, err := core.DialOpts(ctx, connectOptions())
 	if err != nil {
 		log.Fatalf("While configuring cluster dial opts: %v", err)
 	}
-	creds, err := rpc.NewEphemeralCredentials(opk, rpc.WantInsecure())
+	creds, err := rpc.NewEphemeralCredentials(opk, rpc.WantRemoteCluster(ca))
 	if err != nil {
 		log.Fatalf("While generating ephemeral credentials: %v", err)
 	}
diff --git a/metropolis/cli/metroctl/core/BUILD.bazel b/metropolis/cli/metroctl/core/BUILD.bazel
index 7c1a0f4..1795765 100644
--- a/metropolis/cli/metroctl/core/BUILD.bazel
+++ b/metropolis/cli/metroctl/core/BUILD.bazel
@@ -3,6 +3,7 @@
 go_library(
     name = "core",
     srcs = [
+        "ca_tofu.go",
         "config.go",
         "core.go",
         "install.go",
@@ -12,6 +13,7 @@
     visibility = ["//visibility:public"],
     deps = [
         "//metropolis/node",
+        "//metropolis/node/core/curator/proto/api",
         "//metropolis/node/core/rpc",
         "//metropolis/node/core/rpc/resolver",
         "//metropolis/pkg/blockdev",
@@ -22,6 +24,7 @@
         "@io_k8s_client_go//tools/clientcmd",
         "@io_k8s_client_go//tools/clientcmd/api",
         "@org_golang_google_grpc//:go_default_library",
+        "@org_golang_google_grpc//credentials",
         "@org_golang_google_protobuf//proto",
         "@org_golang_x_net//proxy",
     ],
diff --git a/metropolis/cli/metroctl/core/ca_tofu.go b/metropolis/cli/metroctl/core/ca_tofu.go
new file mode 100644
index 0000000..fc30c6c
--- /dev/null
+++ b/metropolis/cli/metroctl/core/ca_tofu.go
@@ -0,0 +1,200 @@
+package core
+
+import (
+	"bufio"
+	"context"
+	"crypto/sha256"
+	"crypto/x509"
+	"encoding/hex"
+	"errors"
+	"fmt"
+	"io"
+	"os"
+	"strings"
+
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/credentials"
+
+	"source.monogon.dev/metropolis/node/core/rpc"
+	"source.monogon.dev/metropolis/node/core/rpc/resolver"
+
+	ipb "source.monogon.dev/metropolis/node/core/curator/proto/api"
+)
+
+// CertificateTOFU is an interface to different providers of a user interaction
+// to confirm the validity of a CA certificate.
+type CertificateTOFU interface {
+	// Ask is called whenever cert, the CA certificate retrieved from the cluster
+	// reached via the given ConnectOptions, needs to be confirmed. If true is
+	// returned, the certificate is accepted and persisted as the canonical CA
+	// certificate of that cluster. Returning false or an error rejects it.
+	Ask(ctx context.Context, connection *ConnectOptions, cert *x509.Certificate) (bool, error)
+}
+
+// TerminalTOFU implements CertificateTOFU in an interactive way, similar to SSH.
+type TerminalTOFU struct {
+	// Out will be used to output prompts to the user. If not set, defaults to
+	// os.Stdout.
+	Out io.Writer
+	// In will be used to read responses from the user. If not set, defaults to
+	// os.Stdin.
+	In io.Reader
+}
+
+// Ask prints the SHA-256 fingerprint of cert to Out and reads a one-line answer
+// from In. Only "yes" (in any case) accepts; cancelling ctx aborts the wait.
+func (i *TerminalTOFU) Ask(ctx context.Context, connection *ConnectOptions, cert *x509.Certificate) (bool, error) {
+	out := i.Out
+	if out == nil {
+		out = os.Stdout
+	}
+	in := i.In
+	if in == nil {
+		in = os.Stdin
+	}
+
+	clusterIdentity := fmt.Sprintf("at endpoints %s", strings.Join(connection.Endpoints, ", "))
+	if connection.ProxyServer != "" {
+		clusterIdentity += fmt.Sprintf(" via proxy %s", connection.ProxyServer)
+	}
+	fmt.Fprintf(out, "The authenticity of the cluster %s can't be established.\n", clusterIdentity)
+
+	sum := sha256.Sum256(cert.Raw)
+	fingerprint := "SHA256:" + hex.EncodeToString(sum[:])
+	fmt.Fprintf(out, "ED25519 key fingerprint of the cluster CA is %s.\n", fingerprint)
+
+	fmt.Fprintf(out, "Are you sure you want to continue connecting (yes/no)? ")
+	reader := bufio.NewReader(in)
+
+	resC := make(chan string, 1)
+	errC := make(chan error, 1)
+	go func() {
+		// This goroutine runs until a full newline-delimited string (or an error) is
+		// read from the input. The result channels are buffered so that, if the
+		// context was canceled in the meantime and nobody is receiving anymore, the
+		// send below does not block and the goroutine exits instead of leaking
+		// forever. Subsequent reads from the same input might still lose that line.
+		res, err := reader.ReadString('\n')
+		if err != nil {
+			errC <- err
+			return
+		}
+		resC <- res
+	}()
+
+	var res string
+	select {
+	case <-ctx.Done():
+		return false, ctx.Err()
+	case err := <-errC:
+		return false, err
+	case res = <-resC:
+	}
+	res = strings.ToLower(strings.TrimSpace(res))
+	return res == "yes", nil
+}
+
+// GetClusterCAWithTOFU returns the CA certificate of the cluster, performing
+// trust-on-first-use (TOFU) checks per ConnectOptions first if necessary.
+//
+// If no locally persisted CA is found, this will connect to the cluster and
+// retrieve it. Then, if no owner certificate is present, a TOFU prompt will be
+// shown to the user. Otherwise, the retrieved CA will be verified against the
+// local owner certificate.
+//
+// If the above logic accepts the CA it will be written to the configuration
+// directory and used automatically on subsequent connections.
+//
+// An error will be returned if the user rejects the certificate as part of the
+// TOFU process, if the returned CA does not match the persisted owner
+// certificate (if available) or if retrieving the certificate from the cluster
+// fails for some other reason.
+func GetClusterCAWithTOFU(ctx context.Context, c *ConnectOptions) (*x509.Certificate, error) {
+	ca, err := GetClusterCA(c.ConfigPath)
+	if err == nil {
+		return ca, nil
+	}
+	if !errors.Is(err, NoCACertificateError) {
+		return nil, err
+	}
+
+	// Connect to cluster with credentials. If possible, use owner credentials.
+	// Otherwise, use ephemeral credentials with owner key.
+	var creds credentials.TransportCredentials
+
+	// If we have an owner certificate, simplify TOFU by just checking the cluster CA
+	// against it, and don't ask the user.
+	var ocert *x509.Certificate
+	tlsc, err := GetOwnerTLSCredentials(c.ConfigPath)
+	if err != nil {
+		if errors.Is(err, NoCredentialsError) {
+			okey, err := GetOwnerKey(c.ConfigPath)
+			if err != nil {
+				return nil, err
+			}
+			creds, err = rpc.NewEphemeralCredentials(okey, rpc.WantInsecure())
+			if err != nil {
+				return nil, err
+			}
+		} else {
+			return nil, err
+		}
+	} else {
+		creds = rpc.NewAuthenticatedCredentials(*tlsc, rpc.WantInsecure())
+		ocert, err = x509.ParseCertificate(tlsc.Certificate[0])
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	opts, err := DialOpts(ctx, c)
+	if err != nil {
+		return nil, err
+	}
+	opts = append(opts, grpc.WithTransportCredentials(creds))
+	cc, err := grpc.Dial(resolver.MetropolisControlAddress, opts...)
+	if err != nil {
+		return nil, fmt.Errorf("while dialing cluster to retrieve CA: %w", err)
+	}
+	defer cc.Close() // Only used for the single call below; don't leak the connection.
+	cur := ipb.NewCuratorLocalClient(cc)
+	res, err := cur.GetCACertificate(ctx, &ipb.GetCACertificateRequest{})
+	if err != nil {
+		return nil, fmt.Errorf("while retrieving cluster CA certificate: %w", err)
+	}
+	if len(res.IdentityCaCertificate) == 0 {
+		return nil, fmt.Errorf("cluster returned empty CA certificate")
+	}
+
+	cert, err := x509.ParseCertificate(res.IdentityCaCertificate)
+	if err != nil {
+		return nil, fmt.Errorf("cluster returned invalid CA certificate: %w", err)
+	}
+
+	var okay bool
+	if ocert != nil {
+		// Simplified process: the owner certificate must be signed by the retrieved CA.
+		if err := ocert.CheckSignatureFrom(cert); err != nil {
+			return nil, fmt.Errorf("server CA doesn't match owner certificate: %w", err)
+		}
+		okay = true
+	} else {
+		// Full TOFU.
+		tofu := c.TOFU
+		if tofu == nil {
+			tofu = &TerminalTOFU{}
+		}
+		okay, err = tofu.Ask(ctx, c, cert)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	if !okay {
+		return nil, fmt.Errorf("cluster CA rejected by user")
+	}
+	if err := WriteCACertificate(c.ConfigPath, res.IdentityCaCertificate); err != nil {
+		return nil, err
+	}
+	return GetClusterCA(c.ConfigPath)
+}
diff --git a/metropolis/cli/metroctl/core/config.go b/metropolis/cli/metroctl/core/config.go
index 1307d61..92a8871 100644
--- a/metropolis/cli/metroctl/core/config.go
+++ b/metropolis/cli/metroctl/core/config.go
@@ -3,6 +3,7 @@
 import (
 	"crypto/ed25519"
 	"crypto/rand"
+	"crypto/tls"
 	"crypto/x509"
 	"encoding/pem"
 	"errors"
@@ -26,12 +27,17 @@
 	// OwnerCertificateFileName is the filename of the owner certificate in a
 	// metroctl config directory.
 	OwnerCertificateFileName = "owner.pem"
+	// CACertificateFileName is the filename of the cluster CA certificate in a
+	// metroctl config directory.
+	CACertificateFileName = "ca.pem"
 )
 
 // NoCredentialsError indicates that the requested datum (eg. owner key or owner
 // certificate) is not present in the requested directory.
 var NoCredentialsError = errors.New("owner certificate or key does not exist")
 
+var NoCACertificateError = errors.New("no cluster CA certificate while secure connection was requested")
+
 // A PEM block type for a Metropolis initial owner private key
 const ownerKeyType = "METROPOLIS INITIAL OWNER PRIVATE KEY"
 
@@ -67,6 +73,16 @@
 	return nil
 }
 
+// WriteCACertificate PEM-encodes the given DER-encoded X.509 certificate and
+// writes it to the given metroctl configuration directory path.
+func WriteCACertificate(path string, der []byte) error {
+	pemCert := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: der})
+	if err := os.WriteFile(filepath.Join(path, CACertificateFileName), pemCert, 0600); err != nil {
+		return fmt.Errorf("when saving CA certificate: %w", err)
+	}
+	return nil
+}
+
 // GetOwnerKey loads and returns a raw ED25519 private key from the saved owner
 // key in a given metroctl configuration directory path. If the owner key doesn't
 // exist, NoCredentialsError will be returned.
@@ -133,6 +149,43 @@
 	return
 }
 
+// GetOwnerTLSCredentials builds a client TLS Certificate for authenticating to
+// the metropolis cluster from whatever GetOwnerCredentials yields for the
+// metroctl configuration at the given path. Errors from GetOwnerCredentials
+// are passed through unchanged.
+func GetOwnerTLSCredentials(path string) (*tls.Certificate, error) {
+	cert, key, err := GetOwnerCredentials(path)
+	if err != nil {
+		return nil, err
+	}
+	chain := [][]byte{cert.Raw}
+	return &tls.Certificate{Certificate: chain, PrivateKey: key}, nil
+}
+
+// GetClusterCA returns the saved cluster CA certificate at the given metroctl
+// configuration path, or NoCACertificateError if none has been saved yet. This
+// does not perform TOFU if the certificate is not present.
+func GetClusterCA(path string) (cert *x509.Certificate, err error) {
+	caCertPEM, err := os.ReadFile(filepath.Join(path, CACertificateFileName))
+	if os.IsNotExist(err) {
+		return nil, NoCACertificateError
+	} else if err != nil {
+		return nil, fmt.Errorf("failed to load CA certificate: %w", err)
+	}
+	block, _ := pem.Decode(caCertPEM)
+	if block == nil {
+		return nil, errors.New("ca.pem contains invalid PEM armoring")
+	}
+	if block.Type != "CERTIFICATE" {
+		return nil, errors.New("ca.pem contains a PEM block that's not a CERTIFICATE")
+	}
+	cert, err = x509.ParseCertificate(block.Bytes)
+	if err != nil {
+		return nil, fmt.Errorf("ca.pem contains an invalid X.509 certificate: %w", err)
+	}
+	return cert, nil
+}
+
 // InstallKubeletConfig modifies the default kubelet kubeconfig of the host
 // system to be able to connect via a metroctl (and an associated ConnectOptions)
 // to a Kubernetes apiserver at IP address/hostname 'server'.
@@ -172,6 +225,7 @@
 	var u url.URL
 	u.Scheme = "https"
 	u.Host = net.JoinHostPort(server, node.KubernetesAPIWrappedPort.PortString())
+
 	config.Clusters[configName] = &clientapi.Cluster{
 		// MVP: This is insecure, but making this work would be wasted effort
 		// as all of it will be replaced by the identity system.
@@ -222,6 +276,10 @@
 	// ResolverLogger can be set to enable verbose logging of the Metropolis RPC
 	// resolver layer.
 	ResolverLogger ResolverLogger
+	// TOFU overrides the trust-on-first-use behaviour for CA certificates for the
+	// connection. If not set, TerminalTOFU is used which will interactively ask the
+	// user to accept a CA certificate using os.Stdin/Stdout.
+	TOFU CertificateTOFU
 }
 
 // ToFlags returns the metroctl flags corresponding to the options described by
diff --git a/metropolis/cli/metroctl/main.go b/metropolis/cli/metroctl/main.go
index 698dbbc..e3ae92b 100644
--- a/metropolis/cli/metroctl/main.go
+++ b/metropolis/cli/metroctl/main.go
@@ -1,6 +1,8 @@
 package main
 
 import (
+	"context"
+	"crypto/x509"
 	"log"
 	"path/filepath"
 
@@ -35,6 +37,10 @@
 	// output is an optional output file path the resulting data will be saved
 	// at. If unspecified, the data will be written to stdout.
 	output string
+	// acceptAnyCA will persist the first encountered (while connecting) CA
+	// certificate of the cluster as the trusted CA certificate for this cluster.
+	// This is unsafe and should only be used for testing.
+	acceptAnyCA bool
 }
 
 var flags metroctlFlags
@@ -47,6 +53,7 @@
 	rootCmd.PersistentFlags().StringVar(&flags.format, "format", "plaintext", "Data output format")
 	rootCmd.PersistentFlags().StringVar(&flags.filter, "filter", "", "The object filter applied to the output data")
 	rootCmd.PersistentFlags().StringVarP(&flags.output, "output", "o", "", "Redirects output to the specified file")
+	rootCmd.PersistentFlags().BoolVar(&flags.acceptAnyCA, "insecure-accept-and-persist-first-encountered-ca", false, "Accept the first encountered CA while connecting as the trusted CA for future metroctl connections with this config path. This is very insecure and should only be used for testing.")
 }
 
 // rpcLogger passes through the cluster resolver logs, if "--verbose" flag was
@@ -61,13 +68,24 @@
 	cobra.CheckErr(rootCmd.Execute())
 }
 
+type acceptall struct{} // core.CertificateTOFU that accepts every CA without asking.
+
+func (a *acceptall) Ask(ctx context.Context, _ *core.ConnectOptions, _ *x509.Certificate) (bool, error) {
+	return true, nil // Unconditionally accept the presented certificate.
+}
+
 // connectOptions returns core.ConnectOptions as defined by the metroctl flags
 // currently set.
 func connectOptions() *core.ConnectOptions {
+	var tofu core.CertificateTOFU // Stays nil unless the insecure accept-any-CA flag is set.
+	if flags.acceptAnyCA {
+		tofu = &acceptall{}
+	}
 	return &core.ConnectOptions{
 		ConfigPath:     flags.configPath,
 		ProxyServer:    flags.proxyAddr,
 		Endpoints:      flags.clusterEndpoints,
 		ResolverLogger: rpcLogger,
+		TOFU:           tofu,
 	}
 }
diff --git a/metropolis/cli/metroctl/rpc.go b/metropolis/cli/metroctl/rpc.go
index f1c27e6..164e2ee 100644
--- a/metropolis/cli/metroctl/rpc.go
+++ b/metropolis/cli/metroctl/rpc.go
@@ -23,12 +23,17 @@
 	if len(flags.clusterEndpoints) == 0 {
 		log.Fatal("Please provide at least one cluster endpoint using the --endpoint parameter.")
 	}
+
+	ca, err := core.GetClusterCAWithTOFU(ctx, connectOptions())
+	if err != nil {
+		log.Fatalf("Failed to get cluster CA: %v", err)
+	}
+
 	tlsc := tls.Certificate{
 		Certificate: [][]byte{ocert.Raw},
 		PrivateKey:  opkey,
 	}
-	// TODO(q3k): check remote CA
-	creds := rpc.NewAuthenticatedCredentials(tlsc, rpc.WantInsecure())
+	creds := rpc.NewAuthenticatedCredentials(tlsc, rpc.WantRemoteCluster(ca))
 	opts, err := core.DialOpts(ctx, connectOptions())
 	if err != nil {
 		log.Fatalf("While configuring dial options: %v", err)
@@ -37,7 +42,7 @@
 
 	cc, err := grpc.Dial(resolver.MetropolisControlAddress, opts...)
 	if err != nil {
-		log.Fatalf("While dialing the cluster: %v", err)
+		log.Fatalf("While dialing cluster: %v", err)
 	}
 	return cc
 }
diff --git a/metropolis/cli/metroctl/test/test.go b/metropolis/cli/metroctl/test/test.go
index 969b4cc..b031271 100644
--- a/metropolis/cli/metroctl/test/test.go
+++ b/metropolis/cli/metroctl/test/test.go
@@ -125,6 +125,7 @@
 	commonOpts := []string{
 		"--proxy=" + socksRemote,
 		"--config=.",
+		"--insecure-accept-and-persist-first-encountered-ca",
 	}
 
 	var endpointOpts []string