m/n/core: simplify network overlay
During the initial refactor into the role server the node external IP
(which was then used as the underlay target) got moved into the
published prefixes. This never did anything as the summary route into
clusternet does not contain it. Thus drop this and simplify the code.
Also remove the extra loopback IP introduced in 011dce6623 as it never
worked because control plane nodes do not even have a prefix to pick
such an IP from.
While at it, rename some variables to make them more accurate with how
this is eventually going to look.
Change-Id: I6a6a6964bd45d088c37f746d1c9514246d624093
Reviewed-on: https://review.monogon.dev/c/monogon/+/4524
Tested-by: Jenkins CI
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
diff --git a/metropolis/node/core/network/overlay/overlay.go b/metropolis/node/core/network/overlay/overlay.go
index c7979cc..9055609 100644
--- a/metropolis/node/core/network/overlay/overlay.go
+++ b/metropolis/node/core/network/overlay/overlay.go
@@ -16,25 +16,17 @@
// Service as ClusterNet. All destination addresses that should be carried by the
// mesh must thus be part of this single route. Otherwise, traffic will be able
// to flow into the node from other nodes, but will exit through another
-// interface. This is used in practice to allow other host nodes (whose external
-// addresses are outside the cluster network) to access the cluster network.
-//
-// Second, we have two hardcoded/purpose-specific sources of prefixes:
-// 1. Pod networking node prefixes from the kubelet
-// 2. The host's external IP address (as a /32) from the network service.
+// interface.
package overlay
import (
"context"
"fmt"
"net"
- "net/netip"
"slices"
"github.com/cenkalti/backoff/v4"
- "github.com/vishvananda/netlink"
- "source.monogon.dev/metropolis/node"
"source.monogon.dev/metropolis/node/core/curator/watcher"
"source.monogon.dev/metropolis/node/core/localstorage"
"source.monogon.dev/metropolis/node/core/network/ipam"
@@ -57,12 +49,9 @@
ClusterNet net.IPNet
// DataDirectory is where the WireGuard key of this node will be stored.
DataDirectory *localstorage.DataKubernetesClusterNetworkingDirectory
- // LocalKubernetesPodNetwork is an event.Value watched for prefixes that should
- // be announced into the mesh. This is to be Set by the Kubernetes service once
- // it knows about the local node's IPAM address assignment.
- LocalKubernetesPodNetwork event.Value[*ipam.Prefixes]
- // Network service used to get the local node's IP address to submit it as a /32.
- Network event.Value[*node.NetStatus]
+ // OverlayPrefixes is an event.Value watched for prefixes that should
+ // be announced into the mesh.
+ OverlayPrefixes event.Value[*ipam.Prefixes]
// wg is the interface to all the low-level interactions with WireGuard (and
// kernel routing). If not set, this defaults to a production implementation.
@@ -84,15 +73,7 @@
supervisor.Logger(ctx).Infof("Wireguard setup complete, starting updaters...")
- kubeC := make(chan *ipam.Prefixes)
- netC := make(chan *node.NetStatus)
- if err := supervisor.RunGroup(ctx, map[string]supervisor.Runnable{
- "source-kubernetes": event.Pipe(s.LocalKubernetesPodNetwork, kubeC),
- "source-network": event.Pipe(s.Network, netC),
- "push": func(ctx context.Context) error {
- return s.push(ctx, kubeC, netC)
- },
- }); err != nil {
+ if err := supervisor.Run(ctx, "push", s.push); err != nil {
return err
}
@@ -106,62 +87,22 @@
// push is the sub-runnable responsible for letting the Curator know about what
// prefixes that are originated by this node.
-func (s *Service) push(ctx context.Context, kubeC chan *ipam.Prefixes, netC chan *node.NetStatus) error {
+func (s *Service) push(ctx context.Context) error {
+ w := s.OverlayPrefixes.Watch()
supervisor.Signal(ctx, supervisor.SignalHealthy)
- var kubePrefixes *ipam.Prefixes
var prevKubePrefixes *ipam.Prefixes
-
- var localAddr net.IP
- var prevLocalAddr net.IP
-
for {
- kubeChanged := false
- localChanged := false
-
- select {
- case <-ctx.Done():
- return ctx.Err()
- case kubePrefixes = <-kubeC:
- if !kubePrefixes.Equal(prevKubePrefixes) {
- kubeChanged = true
- }
- case n := <-netC:
- localAddr = n.ExternalAddress
- if !localAddr.Equal(prevLocalAddr) {
- localChanged = true
- }
+ prefixes, err := w.Get(ctx)
+ if err != nil {
+ return err
}
-
- if kubeChanged {
- if err := configureKubeNetwork(prevKubePrefixes, kubePrefixes); err != nil {
- supervisor.Logger(ctx).Warningf("Could not configure cluster networking update: %v", err)
- }
- }
-
- // Ignore spurious updates.
- if !localChanged && !kubeChanged {
+ if prefixes.Equal(prevKubePrefixes) {
continue
}
+ supervisor.Logger(ctx).Infof("Submitting prefixes: %s", prefixes)
- // Prepare prefixes to submit to cluster.
- var prefixes ipam.Prefixes
-
- // Do we have a local node address? Add it to the prefixes.
- if len(localAddr) > 0 {
- addr, ok := netip.AddrFromSlice(localAddr)
- if ok {
- prefixes = append(prefixes, netip.PrefixFrom(addr, 32))
- }
- }
- // Do we have any kubelet prefixes? Add them, too.
- if kubePrefixes != nil {
- prefixes.Update(kubePrefixes)
- }
-
- supervisor.Logger(ctx).Infof("Submitting prefixes: %s (kube update: %v, local update: %v)", prefixes, kubeChanged, localChanged)
-
- err := backoff.Retry(func() error {
+ err = backoff.Retry(func() error {
_, err := s.Curator.UpdateNodeClusterNetworking(ctx, &apb.UpdateNodeClusterNetworkingRequest{
Clusternet: &cpb.NodeClusterNetworking{
WireguardPubkey: s.wg.key().PublicKey().String(),
@@ -177,75 +118,10 @@
return fmt.Errorf("couldn't update curator: %w", err)
}
- prevKubePrefixes = kubePrefixes
- prevLocalAddr = localAddr
-
+ prevKubePrefixes = prefixes
}
}
-// configureKubeNetwork configures the point-to-point peer IP address of the
-// node host network namespace (i.e. the one container P2P interfaces point to)
-// on its loopback interface to make it eligible to be used as a source IP
-// address for communication into the clusternet overlay.
-func configureKubeNetwork(oldPrefixes *ipam.Prefixes, newPrefixes *ipam.Prefixes) error {
- // diff maps prefixes to be removed to false
- // and prefixes to be added to true.
- diff := make(map[netip.Prefix]bool)
-
- if newPrefixes != nil {
- for _, newAddr := range *newPrefixes {
- diff[newAddr] = true
- }
- }
-
- if oldPrefixes != nil {
- for _, oldAddr := range *oldPrefixes {
- // Remove all prefixes in both the old
- // and new prefix sets from `diff`.
- if diff[oldAddr] {
- delete(diff, oldAddr)
- continue
- }
-
- // Mark all remaining (i.e. ones not in the new prefix set)
- // prefixes for removal.
- diff[oldAddr] = false
- }
- }
-
- loInterface, err := netlink.LinkByName("lo")
- if err != nil {
- return fmt.Errorf("while getting lo interface: %w", err)
- }
-
- for prefix, shouldAdd := range diff {
- // By CNI convention the first IP after the subnet base address is the
- // point-to-point partner for all pod veths. To make this IP eligible
- // to be used as a general host network namespace source IP we also add
- // it to the loopback interface. This ensures that the kernel picks it
- // as the source IP for traffic flowing into clusternet
- // (due to its preference for source IPs in the same subnet).
- addr := &netlink.Addr{
- IPNet: &net.IPNet{
- IP: prefix.Addr().Next().AsSlice(),
- Mask: net.CIDRMask(prefix.Addr().BitLen(), prefix.Addr().BitLen()),
- },
- }
-
- if shouldAdd {
- if err := netlink.AddrAdd(loInterface, addr); err != nil {
- return fmt.Errorf("assigning extra loopback IP: %w", err)
- }
- } else {
- if err := netlink.AddrDel(loInterface, addr); err != nil {
- return fmt.Errorf("removing extra loopback IP: %w", err)
- }
- }
- }
-
- return nil
-}
-
// pull is the sub-runnable responsible for fetching information about the
// cluster networking setup/status of other nodes, and programming it as
// WireGuard peers.
diff --git a/metropolis/node/core/network/overlay/overlay_test.go b/metropolis/node/core/network/overlay/overlay_test.go
index 44bdd8b..4409e09 100644
--- a/metropolis/node/core/network/overlay/overlay_test.go
+++ b/metropolis/node/core/network/overlay/overlay_test.go
@@ -94,9 +94,7 @@
defer cl.Close()
curator := apb.NewCuratorClient(cl)
- var nval memory.Value[*common.NetStatus]
-
- var podNetwork memory.Value[*ipam.Prefixes]
+ var overlayNetwork memory.Value[*ipam.Prefixes]
wg := &fakeWireguard{}
svc := Service{
Curator: curator,
@@ -104,9 +102,8 @@
IP: net.IP([]byte{10, 10, 0, 0}),
Mask: net.IPv4Mask(255, 255, 0, 0),
},
- DataDirectory: nil,
- LocalKubernetesPodNetwork: &podNetwork,
- Network: &nval,
+ DataDirectory: nil,
+ OverlayPrefixes: &overlayNetwork,
wg: wg,
}
diff --git a/metropolis/node/core/roleserve/roleserve.go b/metropolis/node/core/roleserve/roleserve.go
index ef84643..918b466 100644
--- a/metropolis/node/core/roleserve/roleserve.go
+++ b/metropolis/node/core/roleserve/roleserve.go
@@ -163,7 +163,6 @@
curatorConnection: &s.CuratorConnection,
podNetwork: s.Config.PodNetwork,
- network: s.Network,
}
s.hostsfile = &workerHostsfile{
diff --git a/metropolis/node/core/roleserve/worker_clusternet.go b/metropolis/node/core/roleserve/worker_clusternet.go
index 8a3bb8f..bbc842f 100644
--- a/metropolis/node/core/roleserve/worker_clusternet.go
+++ b/metropolis/node/core/roleserve/worker_clusternet.go
@@ -8,7 +8,6 @@
"net"
"source.monogon.dev/metropolis/node/core/localstorage"
- "source.monogon.dev/metropolis/node/core/network"
"source.monogon.dev/metropolis/node/core/network/ipam"
"source.monogon.dev/metropolis/node/core/network/overlay"
"source.monogon.dev/osbase/event/memory"
@@ -24,7 +23,6 @@
curatorConnection *memory.Value[*CuratorConnection]
// podNetwork will be read.
podNetwork *memory.Value[*ipam.Prefixes]
- network *network.Service
}
func (s *workerClusternet) run(ctx context.Context) error {
@@ -44,9 +42,8 @@
IP: []byte{10, 192, 0, 0},
Mask: net.IPMask{255, 224, 0, 0},
},
- DataDirectory: &s.storageRoot.Data.Kubernetes.ClusterNetworking,
- LocalKubernetesPodNetwork: s.podNetwork,
- Network: &s.network.Status,
+ DataDirectory: &s.storageRoot.Data.Kubernetes.ClusterNetworking,
+ OverlayPrefixes: s.podNetwork,
}
return svc.Run(ctx)
}