metropolis: add cluster domain config and metroctl param
This adds a --cluster parameter to metroctl and a cluster domain field
to the bootstrap configuration. It is not yet used anywhere, but later
the cluster domain will be used to identify the cluster.
The length of the cluster domain is limited to 80 characters, to allow
for constructing subdomains. This limit could be increased later if
needed, but it cannot easily be decreased, so I chose a conservative
value that should be enough in most cases.
Change-Id: I627cca8eb1d92c4b06e4dfd6b6926a013e8f33ae
Reviewed-on: https://review.monogon.dev/c/monogon/+/3508
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/cli/metroctl/cmd_install.go b/metropolis/cli/metroctl/cmd_install.go
index fff13e9..c40c9ed 100644
--- a/metropolis/cli/metroctl/cmd_install.go
+++ b/metropolis/cli/metroctl/cmd_install.go
@@ -17,6 +17,7 @@
"source.monogon.dev/metropolis/cli/flagdefs"
"source.monogon.dev/metropolis/cli/metroctl/core"
+ common "source.monogon.dev/metropolis/node"
"source.monogon.dev/osbase/blkio"
"source.monogon.dev/osbase/fat32"
)
@@ -44,6 +45,13 @@
var params *api.NodeParameters
if *bootstrap {
+ if flags.cluster == "" {
+ return nil, fmt.Errorf("when bootstrapping a cluster, the --cluster parameter is required")
+ }
+ if err := common.ValidateClusterDomain(flags.cluster); err != nil {
+ return nil, fmt.Errorf("invalid cluster domain: %w", err)
+ }
+
// TODO(lorenz): Have a key management story for this
priv, err := core.GetOrMakeOwnerKey(flags.configPath)
if err != nil {
@@ -55,6 +63,7 @@
ClusterBootstrap: &api.NodeParameters_ClusterBootstrap{
OwnerPublicKey: pub,
InitialClusterConfiguration: &cpb.ClusterConfiguration{
+ ClusterDomain: flags.cluster,
StorageSecurityPolicy: *bootstrapStorageSecurityPolicy,
TpmMode: *bootstrapTPMMode,
},
diff --git a/metropolis/cli/metroctl/main.go b/metropolis/cli/metroctl/main.go
index 21f87a6..6d26d4c 100644
--- a/metropolis/cli/metroctl/main.go
+++ b/metropolis/cli/metroctl/main.go
@@ -22,6 +22,8 @@
}
type metroctlFlags struct {
+ // cluster is the domain name identifying the target cluster.
+ cluster string
// clusterEndpoints is a list of the targeted cluster's endpoints, used by
// commands that perform RPC on it.
clusterEndpoints []string
@@ -53,6 +55,7 @@
var flags metroctlFlags
func init() {
+ rootCmd.PersistentFlags().StringVar(&flags.cluster, "cluster", "", "Cluster domain")
rootCmd.PersistentFlags().StringSliceVar(&flags.clusterEndpoints, "endpoints", nil, "A list of the target cluster's endpoints.")
rootCmd.PersistentFlags().StringVar(&flags.proxyAddr, "proxy", "", "SOCKS5 proxy address")
rootCmd.PersistentFlags().StringVar(&flags.configPath, "config", filepath.Join(xdg.ConfigHome, "metroctl"), "An alternative cluster config path")
diff --git a/metropolis/handbook/src/ch02-00-local-demo-cluster.md b/metropolis/handbook/src/ch02-00-local-demo-cluster.md
index 69e2e71..76a26c4 100644
--- a/metropolis/handbook/src/ch02-00-local-demo-cluster.md
+++ b/metropolis/handbook/src/ch02-00-local-demo-cluster.md
@@ -36,11 +36,11 @@
Let's generate the installer image that we'll use to install the first node of the upcoming cluster. To do that, use the *metroctl* tool in the following way:
```shell
-metroctl install genusb bootstrap-node-installer.img --bootstrap --bundle=<installation-bundle-path>
+metroctl install genusb bootstrap-node-installer.img --bootstrap --cluster=cluster.internal --bundle=<installation-bundle-path>
```
If you're going to install from a USB stick or other types of removable storage, supply metroctl with a device path:
```shell
-metroctl install genusb /dev/sdx --bootstrap --bundle=<installation-bundle-path>
+metroctl install genusb /dev/sdx --bootstrap --cluster=cluster.internal --bundle=<installation-bundle-path>
```
Since a new GPT will need to be generated for the target device, the image file cannot simply be copied into it.
**Caution:** make sure you'll be using the correct path. *metroctl* will overwrite data on the target device.
diff --git a/metropolis/node/BUILD.bazel b/metropolis/node/BUILD.bazel
index 2a7d296..f0f650f 100644
--- a/metropolis/node/BUILD.bazel
+++ b/metropolis/node/BUILD.bazel
@@ -12,6 +12,7 @@
"net_ips.go",
"net_protocols.go",
"ports.go",
+ "validation.go",
],
importpath = "source.monogon.dev/metropolis/node",
visibility = [
@@ -144,7 +145,10 @@
go_test(
name = "node_test",
- srcs = ["labels_test.go"],
+ srcs = [
+ "labels_test.go",
+ "validation_test.go",
+ ],
embed = [":node"],
deps = ["@io_k8s_apimachinery//pkg/util/validation"],
)
diff --git a/metropolis/node/core/curator/impl_leader_test.go b/metropolis/node/core/curator/impl_leader_test.go
index bbdf936..6550a30 100644
--- a/metropolis/node/core/curator/impl_leader_test.go
+++ b/metropolis/node/core/curator/impl_leader_test.go
@@ -1735,6 +1735,7 @@
t.Run(fmt.Sprintf("case %d", i), func(t *testing.T) {
cl := fakeLeader(t, &fakeLeaderOption{
icc: &cpb.ClusterConfiguration{
+ ClusterDomain: "cluster.test",
TpmMode: te.mode,
StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_ENCRYPTION_AND_AUTHENTICATION,
},
diff --git a/metropolis/node/core/curator/state_cluster.go b/metropolis/node/core/curator/state_cluster.go
index 02e6891..d5b6001 100644
--- a/metropolis/node/core/curator/state_cluster.go
+++ b/metropolis/node/core/curator/state_cluster.go
@@ -9,7 +9,9 @@
"google.golang.org/grpc/status"
"google.golang.org/protobuf/proto"
+ common "source.monogon.dev/metropolis/node"
"source.monogon.dev/metropolis/node/core/rpc"
+
cpb "source.monogon.dev/metropolis/proto/common"
)
@@ -20,6 +22,7 @@
// Cluster is the cluster's configuration, as (un)marshaled to/from
// common.ClusterConfiguration.
type Cluster struct {
+ ClusterDomain string
TPMMode cpb.ClusterConfiguration_TPMMode
StorageSecurityPolicy cpb.ClusterConfiguration_StorageSecurityPolicy
NodeLabelsToSynchronizeToKubernetes []*cpb.ClusterConfiguration_KubernetesConfig_NodeLabelsToSynchronize
@@ -30,6 +33,7 @@
// user.
func DefaultClusterConfiguration() *Cluster {
return &Cluster{
+ ClusterDomain: "cluster.internal",
TPMMode: cpb.ClusterConfiguration_TPM_MODE_REQUIRED,
StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_ENCRYPTION_AND_AUTHENTICATION,
NodeLabelsToSynchronizeToKubernetes: nil,
@@ -139,10 +143,19 @@
if err := proto.Unmarshal(data, &msg); err != nil {
return nil, fmt.Errorf("could not unmarshal proto: %w", err)
}
+ if msg.ClusterDomain == "" {
+ // Backward compatibility for clusters which did not have this field
+ // initially.
+ msg.ClusterDomain = "cluster.internal"
+ }
return clusterFromProto(&msg)
}
func clusterFromProto(cc *cpb.ClusterConfiguration) (*Cluster, error) {
+ if err := common.ValidateClusterDomain(cc.ClusterDomain); err != nil {
+ return nil, fmt.Errorf("invalid ClusterDomain: %w", err)
+ }
+
switch cc.TpmMode {
case cpb.ClusterConfiguration_TPM_MODE_REQUIRED:
case cpb.ClusterConfiguration_TPM_MODE_BEST_EFFORT:
@@ -161,6 +174,7 @@
}
c := &Cluster{
+ ClusterDomain: cc.ClusterDomain,
TPMMode: cc.TpmMode,
StorageSecurityPolicy: cc.StorageSecurityPolicy,
}
@@ -190,6 +204,7 @@
}
return &cpb.ClusterConfiguration{
+ ClusterDomain: c.ClusterDomain,
TpmMode: c.TPMMode,
StorageSecurityPolicy: c.StorageSecurityPolicy,
KubernetesConfig: &cpb.ClusterConfiguration_KubernetesConfig{
diff --git a/metropolis/node/validation.go b/metropolis/node/validation.go
new file mode 100644
index 0000000..cf6d520
--- /dev/null
+++ b/metropolis/node/validation.go
@@ -0,0 +1,61 @@
+package node
+
+import (
+ "errors"
+ "fmt"
+ "regexp"
+)
+
+const (
+ // domainNameMaxLength is the maximum length of a domain name supported by DNS
+ // when represented without a trailing dot.
+ domainNameMaxLength = 253
+
+ // clusterDomainMaxLength is the maximum length of a cluster domain. Limiting
+ // this to 80 allows for constructing subdomains of the cluster domain, where
+ // the subdomain part can have length up to 172. With the joining dot, this
+ // adds up to 253.
+ clusterDomainMaxLength = 80
+)
+
+var (
+ fmtDomainNameLabel = `[a-z0-9]([-a-z0-9]{0,61}[a-z0-9])?` // regexp fragment for one label: 1 to 63 characters, no '-' at either end
+ reDomainName = regexp.MustCompile(`^` + fmtDomainNameLabel + `(\.` + fmtDomainNameLabel + `)*$`) // one or more labels joined by '.', no leading or trailing dot
+ reDomainNameEndsInNumber = regexp.MustCompile(`(^|\.)([0-9]+|0x[0-9a-f]*)$`) // last label is all digits, or "0x" followed by zero or more hex digits
+
+ errDomainNameTooLong = fmt.Errorf("too long, must have length at most %d", domainNameMaxLength)
+ errDomainNameInvalid = errors.New("must consist of labels separated by '.', where each label has between 1 and 63 lowercase letters, digits or '-', and must not start or end with '-'")
+ errDomainNameEndsInNumber = errors.New("must not end in a number")
+
+ errClusterDomainTooLong = fmt.Errorf("too long, must have length at most %d", clusterDomainMaxLength)
+)
+
+// validateDomainName returns an error if d is not a valid domain name. The name
+// must be at most domainNameMaxLength bytes long and must not have a trailing
+// dot. Labels must consist of 1 to 63 lowercase letters, digits or '-', and must
+// not start or end with '-'. The last label must not be a number (decimal or
+// 0x-prefixed hex), so that the name won't be parsed as an IPv4 address.
+func validateDomainName(d string) error {
+ if len(d) > domainNameMaxLength {
+ return errDomainNameTooLong
+ }
+ // This implements RFC 1123 domain validation. Additionally, it does not allow
+ // uppercase, so that we don't need to implement case-insensitive matching.
+ if !reDomainName.MatchString(d) {
+ return errDomainNameInvalid
+ }
+ // This implements https://url.spec.whatwg.org/#ends-in-a-number-checker
+ if reDomainNameEndsInNumber.MatchString(d) {
+ return errDomainNameEndsInNumber
+ }
+ return nil
+}
+
+// ValidateClusterDomain returns an error if d is longer than
+// clusterDomainMaxLength bytes or is rejected by validateDomainName.
+func ValidateClusterDomain(d string) error {
+ if len(d) > clusterDomainMaxLength {
+ return errClusterDomainTooLong
+ }
+ return validateDomainName(d)
+}
diff --git a/metropolis/node/validation_test.go b/metropolis/node/validation_test.go
new file mode 100644
index 0000000..1a9765e
--- /dev/null
+++ b/metropolis/node/validation_test.go
@@ -0,0 +1,49 @@
+package node
+
+import (
+ "errors"
+ "testing"
+
+ "k8s.io/apimachinery/pkg/util/validation"
+)
+
+func TestValidateDomainName(t *testing.T) {
+ for _, te := range []struct {
+ in string
+ want error
+ }{
+ {"example.com", nil},
+ {"localhost", nil},
+ {"123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.1.example.com", nil}, // exactly domainNameMaxLength bytes
+ {"123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.123456789.12.example.com", errDomainNameTooLong}, // one byte over domainNameMaxLength
+ {"ex_ample.com", errDomainNameInvalid},
+ {"-.com", errDomainNameInvalid},
+ {"example-.com", errDomainNameInvalid},
+ {"-example.com", errDomainNameInvalid},
+ {"1-1.com", nil},
+ {"xn--h-0fa.com", nil}, // consecutive inner hyphens are allowed
+ {".", errDomainNameInvalid},
+ {"example..com", errDomainNameInvalid},
+ {"example.com.", errDomainNameInvalid},
+ {".example.com", errDomainNameInvalid},
+ {"0.example.com", nil}, // numeric labels are fine unless they come last
+ {"01.example.com", nil},
+ {"012345678901234567890123456789012345678901234567890123456789012.example.com", nil}, // 63-byte label: longest allowed
+ {"0123456789012345678901234567890123456789012345678901234567890123.example.com", errDomainNameInvalid}, // 64-byte label: too long
+ {"1.1.1.1", errDomainNameEndsInNumber},
+ {"example.123", errDomainNameEndsInNumber},
+ {"0123456789", errDomainNameEndsInNumber},
+ {"example.0x", errDomainNameEndsInNumber}, // "0x" with no hex digits still counts as a number
+ {"0x0123456789abcdef", errDomainNameEndsInNumber},
+ {"1.2.3.1a1", nil}, // last label is neither all digits nor 0x-prefixed hex
+ } {
+ if got := validateDomainName(te.in); !errors.Is(got, te.want) {
+ t.Errorf("%q: wanted %v, got %v", te.in, te.want, got)
+ }
+ if validateDomainName(te.in) == nil {
+ if errs := validation.IsDNS1123Subdomain(te.in); len(errs) > 0 {
+ t.Errorf("%q: is not a valid Kubernetes domain: %v", te.in, errs)
+ }
+ }
+ }
+}
diff --git a/metropolis/proto/common/common.proto b/metropolis/proto/common/common.proto
index 1e8d748..50ea4e4 100644
--- a/metropolis/proto/common/common.proto
+++ b/metropolis/proto/common/common.proto
@@ -270,6 +270,18 @@
// NodeParamaters.ClusterBootstrap), and then can be partially managed by
// management calls to the curator.
message ClusterConfiguration {
+ // cluster_domain is the domain name which identifies the cluster.
+ // It should be unique, and ideally a public DNS name, but one under
+ // .internal works too. The cluster domain is used for different purposes:
+ //
+ // - To identify the cluster in clients like metroctl.
+ // - To resolve control plane endpoints with DNS in clients.
+ // - As the SPIFFE trust domain name of the cluster. Every identity
+ // issued by the cluster is rooted under `spiffe://cluster_domain/`.
+ // - As the issuer of OpenID Connect identity tokens. The discovery
+ // document is thus hosted at https://cluster_domain/.well-known/openid-configuration
+ string cluster_domain = 4;
+
// tpm_mode defines the TPM usage policy for cluster nodes. When nodes
// register into the cluster (and then join into it) they will report their
// TPM availability, and in return the cluster will respond whether they
@@ -382,4 +394,4 @@
// The node has encrypted and authenticated storage. Its data
// partition is an XFS partition mounted through dm-integrity and dm-crypt.
NODE_STORAGE_SECURITY_AUTHENTICATED_ENCRYPTED = 3;
-}
\ No newline at end of file
+}
diff --git a/metropolis/test/e2e/suites/core/run_test.go b/metropolis/test/e2e/suites/core/run_test.go
index c8f654f..7a07b1e 100644
--- a/metropolis/test/e2e/suites/core/run_test.go
+++ b/metropolis/test/e2e/suites/core/run_test.go
@@ -81,6 +81,7 @@
NumNodes: 2,
LocalRegistry: lr,
InitialClusterConfiguration: &cpb.ClusterConfiguration{
+ ClusterDomain: "cluster.test",
TpmMode: cpb.ClusterConfiguration_TPM_MODE_DISABLED,
StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
},
diff --git a/metropolis/test/e2e/suites/ha_cold/run_test.go b/metropolis/test/e2e/suites/ha_cold/run_test.go
index 6a5c6e8..43ff689 100644
--- a/metropolis/test/e2e/suites/ha_cold/run_test.go
+++ b/metropolis/test/e2e/suites/ha_cold/run_test.go
@@ -42,6 +42,7 @@
NumNodes: 3,
NodeLogsToFiles: true,
InitialClusterConfiguration: &cpb.ClusterConfiguration{
+ ClusterDomain: "cluster.test",
TpmMode: cpb.ClusterConfiguration_TPM_MODE_DISABLED,
StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
},
diff --git a/metropolis/test/e2e/suites/kubernetes/run_test.go b/metropolis/test/e2e/suites/kubernetes/run_test.go
index 80a8292..f815c57 100644
--- a/metropolis/test/e2e/suites/kubernetes/run_test.go
+++ b/metropolis/test/e2e/suites/kubernetes/run_test.go
@@ -72,6 +72,7 @@
clusterOptions := mlaunch.ClusterOptions{
NumNodes: 2,
InitialClusterConfiguration: &cpb.ClusterConfiguration{
+ ClusterDomain: "cluster.test",
TpmMode: cpb.ClusterConfiguration_TPM_MODE_DISABLED,
StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
KubernetesConfig: &cpb.ClusterConfiguration_KubernetesConfig{
@@ -266,6 +267,7 @@
NumNodes: 2,
LocalRegistry: lr,
InitialClusterConfiguration: &cpb.ClusterConfiguration{
+ ClusterDomain: "cluster.test",
TpmMode: cpb.ClusterConfiguration_TPM_MODE_DISABLED,
StorageSecurityPolicy: cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE,
},
diff --git a/metropolis/test/launch/cli/launch-cluster/main.go b/metropolis/test/launch/cli/launch-cluster/main.go
index 859e17f..793cc28 100644
--- a/metropolis/test/launch/cli/launch-cluster/main.go
+++ b/metropolis/test/launch/cli/launch-cluster/main.go
@@ -148,6 +148,7 @@
var consensusMemberList, kubernetesControllerList, kubernetesWorkerList []int
flag.IntVar(&opts.NumNodes, "num-nodes", 3, "Number of cluster nodes")
+ flag.StringVar(&clusterConfig.ClusterDomain, "cluster-domain", "cluster.internal", "Cluster domain")
flagdefs.TPMModeVar(flag.CommandLine, &clusterConfig.TpmMode, "tpm-mode", cpb.ClusterConfiguration_TPM_MODE_REQUIRED, "TPM mode to set on cluster")
flagdefs.StorageSecurityPolicyVar(flag.CommandLine, &clusterConfig.StorageSecurityPolicy, "storage-security", cpb.ClusterConfiguration_STORAGE_SECURITY_POLICY_NEEDS_INSECURE, "Storage security policy to set on cluster")
flag.IntVar(&opts.Node.CPUs, "cpu", 1, "Number of virtual CPUs of each node")