metropolis: use interface groups
This adds interface groups to all K8s pod interfaces via a CNI plugin
patch and corresponding configuration. It also adds an interface group
to the clusternet interface. Using these new interface groups, the
nftables rules for NAT can be simplified.
These will also be used by the network policy plugin later.
Change-Id: I4638a4349ccb12b8724ad28ae34bb61cac4b4ece
Reviewed-on: https://review.monogon.dev/c/monogon/+/3814
Tested-by: Jenkins CI
Reviewed-by: Jan Schär <jan@monogon.tech>
diff --git a/metropolis/node/core/clusternet/wireguard.go b/metropolis/node/core/clusternet/wireguard.go
index 9d14b98..cc5f941 100644
--- a/metropolis/node/core/clusternet/wireguard.go
+++ b/metropolis/node/core/clusternet/wireguard.go
@@ -86,7 +86,7 @@
}
}
- wgInterface := &netlink.Wireguard{LinkAttrs: netlink.LinkAttrs{Name: clusterNetDeviceName, Flags: net.FlagUp}}
+ wgInterface := &netlink.Wireguard{LinkAttrs: netlink.LinkAttrs{Name: clusterNetDeviceName, Flags: net.FlagUp, Group: common.LinkGroupClusternet}}
if err := netlink.LinkAdd(wgInterface); err != nil {
return fmt.Errorf("when adding network interface: %w", err)
}
diff --git a/metropolis/node/core/network/BUILD.bazel b/metropolis/node/core/network/BUILD.bazel
index 6809c58..103f033 100644
--- a/metropolis/node/core/network/BUILD.bazel
+++ b/metropolis/node/core/network/BUILD.bazel
@@ -13,6 +13,7 @@
deps = [
"//go/algorithm/toposort",
"//go/logging",
+ "//metropolis/node",
"//metropolis/node/core/network/dhcp4c",
"//metropolis/node/core/network/dhcp4c/callback",
"//osbase/event/memory",
@@ -22,6 +23,7 @@
"//osbase/supervisor",
"//osbase/sysctl",
"@com_github_google_nftables//:nftables",
+ "@com_github_google_nftables//binaryutil",
"@com_github_google_nftables//expr",
"@com_github_insomniacslk_dhcp//dhcpv4",
"@com_github_mdlayher_arp//:arp",
diff --git a/metropolis/node/core/network/main.go b/metropolis/node/core/network/main.go
index e51ed11..dd0023d 100644
--- a/metropolis/node/core/network/main.go
+++ b/metropolis/node/core/network/main.go
@@ -24,10 +24,12 @@
"strconv"
"github.com/google/nftables"
+ "github.com/google/nftables/binaryutil"
"github.com/google/nftables/expr"
"github.com/insomniacslk/dhcp/dhcpv4"
"github.com/vishvananda/netlink"
+ "source.monogon.dev/metropolis/node"
"source.monogon.dev/metropolis/node/core/network/dhcp4c"
dhcpcb "source.monogon.dev/metropolis/node/core/network/dhcp4c/callback"
"source.monogon.dev/osbase/event/memory"
@@ -274,41 +276,38 @@
Table: s.natTable,
Type: nftables.ChainTypeNAT,
})
- // SNAT/Masquerade all traffic coming from interfaces starting with
- // veth going to interfaces not starting with veth.
- // This NATs all container traffic going out of the host without
- // affecting anything else and without needing to care about specific
- // interfaces. Will need to be changed when we support L3 attachments
- // (BGP, ...).
+ // SNAT/Masquerade all traffic coming from pod interfaces (identified by
+ // group) not going to another pod, either local or over clusternet.
+ // Will need to be changed when we support L3 attachments (BGP, ...).
s.nftConn.AddRule(&nftables.Rule{
Table: s.natTable,
Chain: s.natPostroutingChain,
Exprs: []expr.Any{
&expr.Meta{
- Key: expr.MetaKeyIIFNAME,
- Register: 8, // covers registers 8-12 (16 bytes/4 regs)
+ Key: expr.MetaKeyIIFGROUP,
+ Register: 8,
},
- // Check if incoming interface starts with veth
+ // Check if incoming interface is a K8s pod
&expr.Cmp{
Op: expr.CmpOpEq,
Register: 8,
- Data: []byte{'v', 'e', 't', 'h'},
+ Data: binaryutil.NativeEndian.PutUint32(node.LinkGroupK8sPod),
},
&expr.Meta{
- Key: expr.MetaKeyOIFNAME,
- Register: 8, // covers registers 8-12
+ Key: expr.MetaKeyOIFGROUP,
+ Register: 8,
},
- // Check if outgoing interface doesn't start with veth
+ // Check if outgoing interface is not a K8s pod
&expr.Cmp{
Op: expr.CmpOpNeq,
Register: 8,
- Data: []byte{'v', 'e', 't', 'h'},
+ Data: binaryutil.NativeEndian.PutUint32(node.LinkGroupK8sPod),
},
// Check if outgoing interface isn't clusternet
&expr.Cmp{
Op: expr.CmpOpNeq,
Register: 8,
- Data: []byte{'c', 'l', 'u', 's', 't', 'e', 'r', 'n', 'e', 't'},
+ Data: binaryutil.NativeEndian.PutUint32(node.LinkGroupClusternet),
},
&expr.Masq{
FullyRandom: true,
diff --git a/metropolis/node/kubernetes/containerd/cnispec.gojson b/metropolis/node/kubernetes/containerd/cnispec.gojson
index d703ded..4fca790 100644
--- a/metropolis/node/kubernetes/containerd/cnispec.gojson
+++ b/metropolis/node/kubernetes/containerd/cnispec.gojson
@@ -6,6 +6,8 @@
{
"type": "ptp",
"mtu": 1420,
+ {{/* Must be node.LinkGroupK8sPod */}}
+ "linkGroup": 8,
"ipam": {
"type": "host-local",
"dataDir": "/ephemeral/containerd/ipam",
diff --git a/metropolis/node/net_protocols.go b/metropolis/node/net_protocols.go
index 2e005fb..26dd0c0 100644
--- a/metropolis/node/net_protocols.go
+++ b/metropolis/node/net_protocols.go
@@ -7,3 +7,13 @@
// creating/removing routes pointing to the clusternet interface.
ProtocolClusternet int = 129
)
+
+// Netlink link groups used for interface classification and traffic matching.
+const (
+ // LinkGroupK8sPod is set on all host side PtP interfaces going to K8s
+ // pods. Must match "linkGroup" in kubernetes/containerd/cnispec.gojson.
+ LinkGroupK8sPod uint32 = 8
+ // LinkGroupClusternet is set on all interfaces not needing SNAT from the
+ // K8s internal IPs.
+ LinkGroupClusternet uint32 = 9
+)