third_party/linux: fix LACP issues

This fixes two major issues with the Linux LACP implementation:
First, the bond interface indicates carrier availability before any port
is even in aggregation state. It pretty much only cares about underlying
port carrier state which is not meaningful in LACP-controlled
aggregation.
Second, individual ports are added to the list of transmitting ports
immediately after coming up. This causes packets to be transmitted
before the LACP state indicates that this should happen.

Fix both of these issues by only enabling ports when the LACP state
machine places them in collecting/distributing state and making the bond
carrier state dependent on ports being enabled. This makes the interface
also behave logically consistent, i.e. it can transmit packets when its
carrier is reported up and not when its carrier is reported down.

While in there, fix some timer-related annoyances which make convergence
unnecessarily slow.

This also comes with a ktest which can be used for testing and
verification of these changes.

Change-Id: I60d0ed483f4f4ccea4d582b80e2bb29ff741783d
Reviewed-on: https://review.monogon.dev/c/monogon/+/3073
Reviewed-by: Serge Bazanski <serge@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/test/ktest/init/main.go b/metropolis/test/ktest/init/main.go
index c5577fa..16ffa29 100644
--- a/metropolis/test/ktest/init/main.go
+++ b/metropolis/test/ktest/init/main.go
@@ -37,6 +37,7 @@
 		flags uintptr
 	}{
 		{"/sys", "sysfs", unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV},
+		{"/sys/kernel/debug", "debugfs", unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV},
 		{"/proc", "proc", unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV},
 		{"/dev", "devtmpfs", unix.MS_NOEXEC | unix.MS_NOSUID},
 		{"/dev/pts", "devpts", unix.MS_NOEXEC | unix.MS_NOSUID},
diff --git a/metropolis/test/lacp/BUILD.bazel b/metropolis/test/lacp/BUILD.bazel
new file mode 100644
index 0000000..d044de2
--- /dev/null
+++ b/metropolis/test/lacp/BUILD.bazel
@@ -0,0 +1,24 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//metropolis/test/ktest:ktest.bzl", "ktest")
+
+go_test(
+    name = "lacptest_test",
+    srcs = ["lacp_test.go"],
+    deps = [
+        "@com_github_vishvananda_netlink//:netlink",
+        "@org_golang_x_sys//unix",
+    ],
+)
+
+ktest(
+    tester = ":lacptest_test",
+)
+
+go_test(
+    name = "lacp_test",
+    srcs = ["lacp_test.go"],
+    deps = [
+        "@com_github_vishvananda_netlink//:netlink",
+        "@org_golang_x_sys//unix",
+    ],
+)
diff --git a/metropolis/test/lacp/lacp_test.go b/metropolis/test/lacp/lacp_test.go
new file mode 100644
index 0000000..dbcc79e
--- /dev/null
+++ b/metropolis/test/lacp/lacp_test.go
@@ -0,0 +1,194 @@
+// Package lacp contains an integration test for our custom LACP patches.
+// It tests relevant behavior that other parts of the Monogon network stack
+// rely on, like proper carrier state indications.
+package lacp
+
+import (
+	"hash/fnv"
+	"math"
+	"net"
+	"os"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/vishvananda/netlink"
+	"golang.org/x/sys/unix"
+)
+
+// createVethPair is a helper creating a pair of virtual network interfaces
+// acting as a cable (i.e. traffic going in one interface comes back out on
+// the other one). Both ends are returned.
+func createVethPair(t *testing.T, name string) (*netlink.Veth, *netlink.Veth) {
+	t.Helper()
+	vethLink := netlink.Veth{
+		LinkAttrs: netlink.LinkAttrs{
+			Name:    name + "a",
+			NetNsID: -1,
+			TxQLen:  -1,
+		},
+		PeerName: name + "b",
+	}
+	if err := netlink.LinkAdd(&vethLink); err != nil {
+		t.Fatalf("while creating veth pair: %v", err)
+	}
+	vethLinkB, err := netlink.LinkByName(name + "b")
+	if err != nil {
+		t.Fatalf("while creating veth pair: while getting veth peer: %v", err)
+	}
+
+	return &vethLink, vethLinkB.(*netlink.Veth)
+}
+
+// setupNetem is a helper for setting up Linux's network emulation queuing
+// discipline on a network interface, which can simulate various network
+// imperfections like extra latency, reordering or packet loss on packets
+// transmitted on the interface specified. As it is internally implemented
+// as a queuing discipline it only affects transmitted packets.
+func setupNetem(t *testing.T, link netlink.Link, conf netlink.NetemQdiscAttrs) *netlink.Netem {
+	t.Helper()
+	h := fnv.New32a()
+	h.Write([]byte(link.Attrs().Name))
+	qdisc := netlink.NewNetem(netlink.QdiscAttrs{
+		LinkIndex: link.Attrs().Index,
+		Handle:    netlink.MakeHandle(uint16(h.Sum32()%math.MaxUint16), 0),
+		Parent:    netlink.HANDLE_ROOT,
+	}, conf)
+	if err := netlink.QdiscAdd(qdisc); err != nil {
+		t.Fatalf("while setting up qdisc netem for %q: %v", link.Attrs().Name, err)
+	}
+	return qdisc
+}
+
+// changeNetem is a helper for reconfiguring an existing netem instance
+// on-the-fly with new parameters.
+func changeNetem(t *testing.T, qdisc *netlink.Netem, conf netlink.NetemQdiscAttrs) {
+	t.Helper()
+	changedQd := netlink.NewNetem(qdisc.QdiscAttrs, conf)
+	if err := netlink.QdiscChange(changedQd); err != nil {
+		t.Fatalf("while changing qdisc netem for link index %v: %v", qdisc.LinkIndex, err)
+	}
+}
+
+func createBond(t *testing.T, name string, links ...netlink.Link) *netlink.Bond {
+	t.Helper()
+	bondLink := netlink.NewLinkBond(netlink.LinkAttrs{
+		Name:    name,
+		NetNsID: -1,
+		TxQLen:  -1,
+		Flags:   net.FlagUp,
+	})
+	bondLink.Mode = netlink.BOND_MODE_802_3AD
+	bondLink.LacpRate = netlink.BOND_LACP_RATE_FAST
+	bondLink.MinLinks = 1
+	bondLink.AdSelect = netlink.BOND_AD_SELECT_BANDWIDTH
+	if err := netlink.LinkAdd(bondLink); err != nil {
+		t.Fatalf("while creating bond: %v", err)
+	}
+	for _, l := range links {
+		if err := netlink.LinkSetBondSlave(l, bondLink); err != nil {
+			t.Fatalf("while enslaving link to bond %q: %v", name, err)
+		}
+	}
+	return bondLink
+}
+
+// assertRunning is a helper for asserting an interface's IFF_RUNNING state.
+func assertRunning(t *testing.T, l netlink.Link, expected bool) {
+	t.Helper()
+	linkCurrent, err := netlink.LinkByIndex(l.Attrs().Index)
+	if err != nil {
+		t.Fatalf("while checking if link %q is running: %v", l.Attrs().Name, err)
+	}
+	is := linkCurrent.Attrs().RawFlags&unix.IFF_RUNNING != 0
+	if expected != is {
+		t.Errorf("expected interface %q running state to be %v, is %v", l.Attrs().Name, expected, is)
+	}
+}
+
+func TestLACP(t *testing.T) {
+	if os.Getenv("IN_KTEST") != "true" {
+		t.Skip("Not in ktest")
+	}
+
+	if err := os.WriteFile("/sys/kernel/debug/dynamic_debug/control", []byte("module bonding +p"), 0); err != nil {
+		t.Fatal(err)
+	}
+	// Log dynamic debug to console
+	if err := os.WriteFile("/proc/sys/kernel/printk", []byte("8"), 0); err != nil {
+		t.Fatal(err)
+	}
+
+	link1a, link1b := createVethPair(t, "link1")
+	link2a, link2b := createVethPair(t, "link2")
+
+	// Drop all traffic
+	l1aq := setupNetem(t, link1a, netlink.NetemQdiscAttrs{Loss: 100.0})
+	l1bq := setupNetem(t, link1b, netlink.NetemQdiscAttrs{Loss: 100.0})
+	l2aq := setupNetem(t, link2a, netlink.NetemQdiscAttrs{Loss: 100.0})
+	l2bq := setupNetem(t, link2b, netlink.NetemQdiscAttrs{Loss: 100.0})
+
+	bondA := createBond(t, "bonda", link1a, link2a)
+	bondB := createBond(t, "bondb", link1b, link2b)
+
+	time.Sleep(5 * time.Second)
+
+	// Bonds should not come up with links dropping all traffic
+	assertRunning(t, bondA, false)
+	assertRunning(t, bondB, false)
+
+	changeNetem(t, l1aq, netlink.NetemQdiscAttrs{Loss: 0.0})
+	changeNetem(t, l1bq, netlink.NetemQdiscAttrs{Loss: 0.0})
+	t.Log("Enabled L1")
+
+	time.Sleep(5 * time.Second)
+
+	// Bonds should come up with one link working
+	assertRunning(t, bondA, true)
+	assertRunning(t, bondB, true)
+
+	changeNetem(t, l2aq, netlink.NetemQdiscAttrs{Loss: 0.0})
+	changeNetem(t, l2bq, netlink.NetemQdiscAttrs{Loss: 0.0})
+	t.Log("Enabled L2")
+
+	time.Sleep(3 * time.Second)
+
+	// Bonds be up with both links
+	assertRunning(t, bondA, true)
+	assertRunning(t, bondB, true)
+
+	bondAState, err := os.ReadFile("/proc/net/bonding/bonda")
+	if err != nil {
+		panic(err)
+	}
+	t.Log(string(bondAState))
+	if !strings.Contains(string(bondAState), "Number of ports: 2") {
+		t.Errorf("bonda aggregator should contain two ports")
+	}
+	if !strings.Contains(string(bondAState), "port state: 63") {
+		t.Errorf("bonda port state should be 63")
+	}
+	t.Log("------------")
+	bondBState, err := os.ReadFile("/proc/net/bonding/bondb")
+	if err != nil {
+		panic(err)
+	}
+	t.Log(string(bondBState))
+	if !strings.Contains(string(bondBState), "Number of ports: 2") {
+		t.Errorf("bondb aggregator should contain two ports")
+	}
+	if !strings.Contains(string(bondBState), "port state: 63") {
+		t.Errorf("bondb port state should be 63")
+	}
+	changeNetem(t, l1aq, netlink.NetemQdiscAttrs{Loss: 100.0})
+	changeNetem(t, l1bq, netlink.NetemQdiscAttrs{Loss: 100.0})
+	changeNetem(t, l2aq, netlink.NetemQdiscAttrs{Loss: 100.0})
+	changeNetem(t, l2bq, netlink.NetemQdiscAttrs{Loss: 100.0})
+	t.Log("Disabled both links")
+
+	time.Sleep(5 * time.Second)
+
+	// Bonds should be back down
+	assertRunning(t, bondA, false)
+	assertRunning(t, bondB, false)
+}
diff --git a/third_party/linux/external.bzl b/third_party/linux/external.bzl
index 84ba704..87a777a 100644
--- a/third_party/linux/external.bzl
+++ b/third_party/linux/external.bzl
@@ -28,6 +28,7 @@
             "//third_party/linux/external:0001-block-partition-expose-PARTUUID-through-uevent.patch",
             "//third_party/linux/external:disable-static-ifs.patch",
             "//third_party/linux/external:enable-pmsg.patch",
+            "//third_party/linux/external:lacp_fix.patch",
         ],
         sha256 = sums[version],
         strip_prefix = "linux-" + version,
diff --git a/third_party/linux/external/lacp_fix.patch b/third_party/linux/external/lacp_fix.patch
new file mode 100644
index 0000000..b1eecfb
--- /dev/null
+++ b/third_party/linux/external/lacp_fix.patch
@@ -0,0 +1,102 @@
+From dd369885e1c0e3a6eff97db6acc5c765ee5dd421 Mon Sep 17 00:00:00 2001
+From: Lorenz Brun <lorenz@monogon.tech>
+Date: Thu, 2 May 2024 10:17:14 +0200
+Subject: [PATCH] bonding: 3ad: fix carrier and tx with no active ports
+
+bond carrier state should not be be up when no ports are in
+aggregation. Ports should only be enabled when they are in aggregation.
+
+While in there, set the default template to fast mode to quickly recover
+links and shorten the non-standard aggregator selection timer.
+---
+ drivers/net/bonding/bond_3ad.c | 39 ++++++++++++++++++----------------
+ 1 file changed, 21 insertions(+), 18 deletions(-)
+
+diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
+index c99ffe6c683a..a310c27ea659 100644
+--- a/drivers/net/bonding/bond_3ad.c
++++ b/drivers/net/bonding/bond_3ad.c
+@@ -699,6 +699,22 @@ static int __agg_active_ports(struct aggregator *agg)
+ 	return active;
+ }
+ 
++// __agg_enabled_ports counts the number of ports which are currently in
++// aggregation, different from __agg_active_ports which counts ports which
++// are up (but not in aggregation because of LACP).
++static int __agg_enabled_ports(struct aggregator *agg) {
++	struct port *port;
++	int en_count = 0;
++
++	for (port = agg->lag_ports; port;
++	     port = port->next_port_in_aggregator) {
++		if (__port_is_enabled(port))
++			en_count++;
++	}
++
++	return en_count;
++}
++
+ /**
+  * __get_agg_bandwidth - get the total bandwidth of an aggregator
+  * @aggregator: the aggregator we're looking at
+@@ -1086,6 +1102,8 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr)
+ 			break;
+ 		}
+ 	}
++	// Update carrier state as we might have enabled/disabled ports
++	bond_3ad_set_carrier(__get_bond_by_port(port));
+ }
+ 
+ /**
+@@ -1780,21 +1798,6 @@ static void ad_agg_selection_logic(struct aggregator *agg,
+ 		*update_slave_arr = true;
+ 	}
+ 
+-	/* if the selected aggregator is of join individuals
+-	 * (partner_system is NULL), enable their ports
+-	 */
+-	active = __get_active_agg(origin);
+-
+-	if (active) {
+-		if (!__agg_has_partner(active)) {
+-			for (port = active->lag_ports; port;
+-			     port = port->next_port_in_aggregator) {
+-				__enable_port(port);
+-			}
+-			*update_slave_arr = true;
+-		}
+-	}
+-
+ 	rcu_read_unlock();
+ 
+ 	bond_3ad_set_carrier(bond);
+@@ -1852,7 +1855,7 @@ static void ad_initialize_port(struct port *port, int lacp_fast)
+ 		.key             = 1,
+ 		.port_number     = 1,
+ 		.port_priority   = 0xff,
+-		.port_state      = 1,
++		.port_state      = LACP_STATE_LACP_ACTIVITY | LACP_STATE_LACP_TIMEOUT,
+ 	};
+ 	static const struct lacpdu lacpdu = {
+ 		.subtype		= 0x01,
+@@ -1994,7 +1997,7 @@ static void ad_marker_response_received(struct bond_marker *marker,
+ /* ========= AD exported functions to the main bonding code ========= */
+ 
+ /* Check aggregators status in team every T seconds */
+-#define AD_AGGREGATOR_SELECTION_TIMER  8
++#define AD_AGGREGATOR_SELECTION_TIMER  3
+ 
+ /**
+  * bond_3ad_initiate_agg_selection - initate aggregator selection
+@@ -2619,7 +2622,7 @@ int bond_3ad_set_carrier(struct bonding *bond)
+ 	active = __get_active_agg(&(SLAVE_AD_INFO(first_slave)->aggregator));
+ 	if (active) {
+ 		/* are enough slaves available to consider link up? */
+-		if (__agg_active_ports(active) < bond->params.min_links) {
++		if (__agg_enabled_ports(active) < bond->params.min_links) {
+ 			if (netif_carrier_ok(bond->dev)) {
+ 				netif_carrier_off(bond->dev);
+ 				goto out;
+-- 
+2.42.0
+
diff --git a/third_party/linux/linux-metropolis.config b/third_party/linux/linux-metropolis.config
index 07f150e..59e8d6f 100644
--- a/third_party/linux/linux-metropolis.config
+++ b/third_party/linux/linux-metropolis.config
@@ -1285,7 +1285,7 @@
 # CONFIG_NET_SCH_ETF is not set
 # CONFIG_NET_SCH_TAPRIO is not set
 # CONFIG_NET_SCH_GRED is not set
-# CONFIG_NET_SCH_NETEM is not set
+CONFIG_NET_SCH_NETEM=y
 # CONFIG_NET_SCH_DRR is not set
 # CONFIG_NET_SCH_MQPRIO is not set
 # CONFIG_NET_SCH_SKBPRIO is not set