m/node: clean up DNS service

The primary change in here is that CoreDNS now only listens on the
loopback interface by default.
This fixes #217 as it cannot be accessed from the outside anymore.
Since the containers do not share the host network namespace, they can
now no longer access the DNS service. This is solved by introducing a
new Network Service API to add listener IPs and using a link-local IP,
169.254.77.53 for the container DNS.
While at it, I cleaned up various parts of the DNS code.

Change-Id: Id7b618f62690032db335e8478b9de84410c210a1
Reviewed-on: https://review.monogon.dev/c/monogon/+/1759
Tested-by: Jenkins CI
Reviewed-by: Serge Bazanski <serge@monogon.tech>
diff --git a/metropolis/node/BUILD.bazel b/metropolis/node/BUILD.bazel
index 7ff504f..d737e37 100644
--- a/metropolis/node/BUILD.bazel
+++ b/metropolis/node/BUILD.bazel
@@ -9,6 +9,7 @@
     name = "node",
     srcs = [
         "ids.go",
+        "net_ips.go",
         "net_protocols.go",
         "ports.go",
     ],
diff --git a/metropolis/node/core/main.go b/metropolis/node/core/main.go
index 13d735f..dc50269 100644
--- a/metropolis/node/core/main.go
+++ b/metropolis/node/core/main.go
@@ -20,12 +20,14 @@
 	"context"
 	"fmt"
 	"io"
+	"net"
 	"os"
 	"strings"
 	"time"
 
 	"golang.org/x/sys/unix"
 
+	"source.monogon.dev/metropolis/node"
 	"source.monogon.dev/metropolis/node/core/cluster"
 	"source.monogon.dev/metropolis/node/core/localstorage"
 	"source.monogon.dev/metropolis/node/core/localstorage/declarative"
@@ -115,6 +117,7 @@
 
 	networkSvc := network.New(nil)
 	networkSvc.DHCPVendorClassID = "dev.monogon.metropolis.node.v1"
+	networkSvc.ExtraDNSListenerIPs = []net.IP{node.ContainerDNSIP}
 	timeSvc := timesvc.New()
 
 	// This function initializes a headless Delve if this is a debug build or
diff --git a/metropolis/node/core/network/dns/coredns.go b/metropolis/node/core/network/dns/coredns.go
index f865403..c539f44 100644
--- a/metropolis/node/core/network/dns/coredns.go
+++ b/metropolis/node/core/network/dns/coredns.go
@@ -21,6 +21,7 @@
 	"bytes"
 	"context"
 	"fmt"
+	"net"
 	"os"
 	"os/exec"
 	"strings"
@@ -45,6 +46,8 @@
 `
 
 type Service struct {
+	// Extra IPs the DNS service listens on and serves requests from.
+	ExtraListenerIPs      []net.IP
 	directiveRegistration chan *ExtraDirective
 	directives            map[string]ExtraDirective
 	cmd                   *exec.Cmd
@@ -54,7 +57,8 @@
 	stateMu sync.Mutex
 }
 
-// New creates a new CoreDNS service.
+// New creates a new CoreDNS service. By default it only listens on the loopback
+// IPs ::1/127.0.0.1, but extra listener IPs can be added in extraListenerIPs.
 // The given channel can then be used to dynamically register and unregister
 // directives in the configuaration.
 // To register a new directive, send an ExtraDirective on the channel. To
@@ -70,6 +74,11 @@
 func (s *Service) makeCorefile(fargs *fileargs.FileArgs) []byte {
 	corefile := bytes.Buffer{}
 	corefile.WriteString(corefileBase)
+	bindIPs := []string{"127.0.0.1", "::1"}
+	for _, ip := range s.ExtraListenerIPs {
+		bindIPs = append(bindIPs, ip.String())
+	}
+	fmt.Fprintf(&corefile, "\tbind %s\n", strings.Join(bindIPs, " "))
 	for _, dir := range s.directives {
 		resolvedDir := dir.directive
 		for fname, fcontent := range dir.files {
@@ -92,6 +101,16 @@
 // Run runs the DNS service consisting of the CoreDNS process and the directive
 // registration process
 func (s *Service) Run(ctx context.Context) error {
+	s.stateMu.Lock()
+	if s.args == nil {
+		args, err := fileargs.New()
+		if err != nil {
+			s.stateMu.Unlock()
+			return fmt.Errorf("failed to create fileargs: %w", err)
+		}
+		s.args = args
+	}
+	s.stateMu.Unlock()
 	supervisor.Run(ctx, "coredns", s.runCoreDNS)
 	supervisor.Run(ctx, "registration", s.runRegistration)
 	supervisor.Signal(ctx, supervisor.SignalHealthy)
@@ -102,21 +121,13 @@
 // runCoreDNS runs the CoreDNS proceess
 func (s *Service) runCoreDNS(ctx context.Context) error {
 	s.stateMu.Lock()
-	args, err := fileargs.New()
-	if err != nil {
-		s.stateMu.Unlock()
-		return fmt.Errorf("failed to create fileargs: %w", err)
-	}
-	defer args.Close()
-	s.args = args
-
 	s.cmd = exec.CommandContext(ctx, "/kubernetes/bin/coredns",
-		args.FileOpt("-conf", "Corefile", s.makeCorefile(args)),
+		s.args.FileOpt("-conf", "Corefile", s.makeCorefile(s.args)),
 	)
 
-	if args.Error() != nil {
+	if s.args.Error() != nil {
 		s.stateMu.Unlock()
-		return fmt.Errorf("failed to use fileargs: %w", err)
+		return fmt.Errorf("failed to use fileargs: %w", s.args.Error())
 	}
 
 	s.stateMu.Unlock()
diff --git a/metropolis/node/core/network/dns/directives.go b/metropolis/node/core/network/dns/directives.go
index ff25ae8..a9570e8 100644
--- a/metropolis/node/core/network/dns/directives.go
+++ b/metropolis/node/core/network/dns/directives.go
@@ -66,11 +66,13 @@
 // Service Discovery" compliant service under clusterDomain. The given
 // kubeconfig needs at least read access to services, endpoints and
 // endpointslices.
-//   [1] https://github.com/kubernetes/dns/blob/master/docs/specification.md
+//
+//	[1] https://github.com/kubernetes/dns/blob/master/docs/specification.md
 func NewKubernetesDirective(clusterDomain string, kubeconfig []byte) *ExtraDirective {
+	var prefix string
 	return &ExtraDirective{
 		ID:        "k8s-clusterdns",
-		directive: fmt.Sprintf(kubernetesDirective, clusterDomain),
+		directive: prefix + fmt.Sprintf(kubernetesDirective, clusterDomain),
 		files: map[string][]byte{
 			"kubeconfig": kubeconfig,
 		},
diff --git a/metropolis/node/core/network/main.go b/metropolis/node/core/network/main.go
index 466c076..956bb02 100644
--- a/metropolis/node/core/network/main.go
+++ b/metropolis/node/core/network/main.go
@@ -49,6 +49,13 @@
 	// autoconfiguration.
 	StaticConfig *netpb.Net
 
+	// List of IPs which get configured onto the loopback interface and the
+	// integrated DNS server is serving on. Cannot be changed at runtime.
+	// This is a hack to work around CoreDNS not being able to change listeners
+	// on-the-fly without breaking everything. This will go away once its
+	// frontend got replaced by something which can do that.
+	ExtraDNSListenerIPs []net.IP
+
 	// Vendor Class identifier of the system
 	DHCPVendorClassID string
 
@@ -111,6 +118,30 @@
 	s.dnsReg <- d
 }
 
+func singleIPtoNetlinkAddr(ip net.IP, label string) *netlink.Addr {
+	var mask net.IPMask
+	if ip.To4() == nil {
+		mask = net.CIDRMask(128, 128) // IPv6 /128
+	} else {
+		mask = net.CIDRMask(32, 32) // IPv4 /32
+	}
+	scope := netlink.SCOPE_UNIVERSE
+	if ip.IsLinkLocalUnicast() {
+		scope = netlink.SCOPE_LINK
+	}
+	if ip.IsLoopback() {
+		scope = netlink.SCOPE_HOST
+	}
+	return &netlink.Addr{
+		IPNet: &net.IPNet{
+			IP:   ip,
+			Mask: mask,
+		},
+		Label: label,
+		Scope: int(scope),
+	}
+}
+
 // nfifname converts an interface name into 16 bytes padded with zeroes (for
 // nftables)
 func nfifname(n string) []byte {
@@ -209,6 +240,7 @@
 
 func (s *Service) Run(ctx context.Context) error {
 	logger := supervisor.Logger(ctx)
+	s.dnsSvc.ExtraListenerIPs = s.ExtraDNSListenerIPs
 	supervisor.Run(ctx, "dns", s.dnsSvc.Run)
 
 	earlySysctlOpts := sysctlOptions{
@@ -297,6 +329,11 @@
 			if err := netlink.LinkSetUp(link); err != nil {
 				logger.Errorf("Failed to bring up loopback interface: %v", err)
 			}
+			for _, addr := range s.ExtraDNSListenerIPs {
+				if err := netlink.AddrAdd(link, singleIPtoNetlinkAddr(addr, "")); err != nil {
+					logger.Errorf("Failed to assign extra loopback IP: %v", err)
+				}
+			}
 		}
 	}
 	if len(ethernetLinks) != 1 {
diff --git a/metropolis/node/core/network/static.go b/metropolis/node/core/network/static.go
index 4d04ff3..a9a5176 100644
--- a/metropolis/node/core/network/static.go
+++ b/metropolis/node/core/network/static.go
@@ -36,10 +36,21 @@
 		return err
 	}
 
-	hostDevices, err := listHostDeviceIfaces()
+	hostDevices, loopbackLink, err := listHostDeviceIfaces()
 	if err != nil {
 		return err
 	}
+	if loopbackLink == nil {
+		return errors.New("No loopback interface present, weird/broken kernel?")
+	}
+	if err := netlink.LinkSetUp(loopbackLink); err != nil {
+		l.Error("Failed to enable loopback interface: %w", err)
+	}
+	for _, addr := range s.ExtraDNSListenerIPs {
+		if err := netlink.AddrAdd(loopbackLink, singleIPtoNetlinkAddr(addr, "")); err != nil {
+			l.Errorf("Failed to assign extra loopback IP: %v", err)
+		}
+	}
 
 	var hasIPv4Autoconfig bool
 
@@ -222,17 +233,23 @@
 	driver string
 }
 
-func listHostDeviceIfaces() ([]deviceIfData, error) {
+func listHostDeviceIfaces() ([]deviceIfData, netlink.Link, error) {
 	links, err := netlink.LinkList()
 	if err != nil {
-		return nil, fmt.Errorf("failed to list network links: %w", err)
+		return nil, nil, fmt.Errorf("failed to list network links: %w", err)
 	}
 
 	var hostDevices []deviceIfData
 
+	var loopbackLink netlink.Link
+
 	for _, link := range links {
-		if link.Attrs().EncapType == "loopback" {
-			netlink.LinkSetUp(link)
+		// Modern Linux versions always create a loopback device named "lo" with
+		// constant interface index 1 in every network namespace. Since Linux
+		// 3.6 there is a BUG_ON in the loopback driver, asserting that this is
+		// true for every loopback interface created.
+		if link.Attrs().Index == 1 {
+			loopbackLink = link
 		}
 		d, ok := link.(*netlink.Device)
 		if !ok {
@@ -248,7 +265,7 @@
 			driver: driver,
 		})
 	}
-	return hostDevices, nil
+	return hostDevices, loopbackLink, nil
 }
 
 func deviceIfaceFromSpec(it *netpb.Interface_Device, hostDevices []deviceIfData, l logtree.LeveledLogger) (*netlink.Device, error) {
diff --git a/metropolis/node/kubernetes/service_worker.go b/metropolis/node/kubernetes/service_worker.go
index d9f333e..2572dfa 100644
--- a/metropolis/node/kubernetes/service_worker.go
+++ b/metropolis/node/kubernetes/service_worker.go
@@ -86,6 +86,7 @@
 		ClusterDomain:      s.c.ClusterDomain,
 		KubeletDirectory:   &s.c.Root.Data.Kubernetes.Kubelet,
 		EphemeralDirectory: &s.c.Root.Ephemeral,
+		ClusterDNS:         []net.IP{node.ContainerDNSIP},
 	}
 
 	// Gather all required material to send over for certficiate issuance to the
@@ -177,43 +178,6 @@
 		c.informers = informers
 	}
 
-	// Sub-runnable which starts all parts of Kubernetes that depend on the
-	// machine's external IP address. If it changes, the runnable will exit.
-	// TODO(q3k): test this
-	supervisor.Run(ctx, "networked", func(ctx context.Context) error {
-		networkWatch := s.c.Network.Watch()
-		defer networkWatch.Close()
-
-		var status *network.Status
-
-		supervisor.Logger(ctx).Info("Waiting for node networking...")
-		for status == nil || status.ExternalAddress == nil {
-			status, err = networkWatch.Get(ctx)
-			if err != nil {
-				return fmt.Errorf("failed to get network status: %w", err)
-			}
-		}
-		address := status.ExternalAddress
-		supervisor.Logger(ctx).Info("Node has active networking, starting apiserver/kubelet")
-		kubelet.ClusterDNS = []net.IP{address}
-		err := supervisor.RunGroup(ctx, map[string]supervisor.Runnable{
-			"kubelet": kubelet.Run,
-		})
-		if err != nil {
-			return fmt.Errorf("when starting kubelet: %w", err)
-		}
-
-		supervisor.Signal(ctx, supervisor.SignalHealthy)
-
-		for status.ExternalAddress.Equal(address) {
-			status, err = networkWatch.Get(ctx)
-			if err != nil {
-				return fmt.Errorf("when watching for network changes: %w", err)
-			}
-		}
-		return fmt.Errorf("network configuration changed (%s -> %s)", address.String(), status.ExternalAddress.String())
-	})
-
 	csiPlugin := csiPluginServer{
 		KubeletDirectory: &s.c.Root.Data.Kubernetes.Kubelet,
 		VolumesDirectory: &s.c.Root.Data.Volumes,
@@ -250,6 +214,7 @@
 		{"clusternet", clusternet.Run},
 		{"nfproxy", nfproxy.Run},
 		{"kvmdeviceplugin", kvmDevicePlugin.Run},
+		{"kubelet", kubelet.Run},
 	} {
 		err := supervisor.Run(ctx, sub.name, sub.runnable)
 		if err != nil {
diff --git a/metropolis/node/net_ips.go b/metropolis/node/net_ips.go
new file mode 100644
index 0000000..27bc4b1
--- /dev/null
+++ b/metropolis/node/net_ips.go
@@ -0,0 +1,10 @@
+package node
+
+import "net"
+
+// These are IP addresses used by various parts of Metropolis.
+var (
+	// Used by //metropolis/node/kubernetes as the DNS server IP for containers.
+	// Link-local IP space, 77 for ASCII M(onogon), 53 for DNS port.
+	ContainerDNSIP = net.IPv4(169, 254, 77, 53)
+)