m/node: switch to cgroupv2

This switches us from legacy cgroup (v1) to cgroup v2 aka unified
cgroup. Our versions of Kubernetes, containerd and runc/gVisor all
support this by now.

cgroup_bpf needs to be enabled in the kernel for containerd with cgroup
v2. Also enable swap, as this now works with cgroup v2; this gets rid of
a warning for every pod being started.

We are not really using cgroups ourselves, but as the root cgroup in v2
is special, move our own process into a subgroup at startup.

Change-Id: I8d63b2ad672568c052c3fe1a2306182f033667fa
Reviewed-on: https://review.monogon.dev/c/monogon/+/3207
Tested-by: Jenkins CI
Reviewed-by: Jan Schär <jan@monogon.tech>
diff --git a/metropolis/node/core/BUILD.bazel b/metropolis/node/core/BUILD.bazel
index 938a7d7..9c80741 100644
--- a/metropolis/node/core/BUILD.bazel
+++ b/metropolis/node/core/BUILD.bazel
@@ -45,6 +45,7 @@
         "@com_github_cenkalti_backoff_v4//:backoff",
         "@com_github_containerd_containerd//:containerd",
         "@com_github_containerd_containerd//namespaces",
+        "@com_github_opencontainers_runc//libcontainer/cgroups",
         "@org_golang_google_grpc//:go_default_library",
         "@org_golang_google_grpc//codes",
         "@org_golang_google_grpc//status",
diff --git a/metropolis/node/core/mounts.go b/metropolis/node/core/mounts.go
index cb8351d..047552b 100644
--- a/metropolis/node/core/mounts.go
+++ b/metropolis/node/core/mounts.go
@@ -19,8 +19,8 @@
 import (
 	"fmt"
 	"os"
-	"strings"
 
+	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"golang.org/x/sys/unix"
 )
 
@@ -50,39 +50,15 @@
 		}
 	}
 
-	// Mount all available CGroups for v1 (v2 uses a single unified hierarchy
-	// and is not supported by our runtimes yet)
-	if err := unix.Mount("tmpfs", "/sys/fs/cgroup", "tmpfs", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV, ""); err != nil {
+	if err := unix.Mount("cgroup2", "/sys/fs/cgroup", "cgroup2", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV, "nsdelegate,memory_recursiveprot"); err != nil {
 		panic(err)
 	}
-	cgroupsRaw, err := os.ReadFile("/proc/cgroups")
-	if err != nil {
+	// Create main cgroup "everything" and move ourselves into it.
+	if err := os.Mkdir("/sys/fs/cgroup/everything", 0755); err != nil {
 		panic(err)
 	}
-
-	cgroupLines := strings.Split(string(cgroupsRaw), "\n")
-	for _, cgroupLine := range cgroupLines {
-		if cgroupLine == "" || strings.HasPrefix(cgroupLine, "#") {
-			continue
-		}
-		cgroupParts := strings.Split(cgroupLine, "\t")
-		cgroupName := cgroupParts[0]
-		if err := os.Mkdir("/sys/fs/cgroup/"+cgroupName, 0755); err != nil {
-			panic(err)
-		}
-		if err := unix.Mount("cgroup", "/sys/fs/cgroup/"+cgroupName, "cgroup", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV, cgroupName); err != nil {
-			panic(err)
-		}
-	}
-
-	// Enable hierarchical memory accounting
-	useMemoryHierarchy, err := os.OpenFile("/sys/fs/cgroup/memory/memory.use_hierarchy", os.O_RDWR, 0)
-	if err != nil {
+	if err := cgroups.WriteCgroupProc("/sys/fs/cgroup/everything", os.Getpid()); err != nil {
 		panic(err)
 	}
-	if _, err := useMemoryHierarchy.WriteString("1"); err != nil {
-		panic(err)
-	}
-	useMemoryHierarchy.Close()
 	return nil
 }
diff --git a/metropolis/node/kubernetes/kubelet.go b/metropolis/node/kubernetes/kubelet.go
index 19a79b2..2c46080 100644
--- a/metropolis/node/kubernetes/kubelet.go
+++ b/metropolis/node/kubernetes/kubelet.go
@@ -102,8 +102,12 @@
 		EnableControllerAttachDetach: reconciler.False(),
 		HairpinMode:                  "none",
 		MakeIPTablesUtilChains:       reconciler.False(), // We don't have iptables
-		FailSwapOn:                   reconciler.False(), // Our kernel doesn't have swap enabled which breaks Kubelet's detection
-		CgroupRoot:                   "/",
+		FailSwapOn:                   reconciler.False(),
+		MemorySwap: kubeletconfig.MemorySwapConfiguration{
+			// Only allow burstable pods to use swap
+			SwapBehavior: "LimitedSwap",
+		},
+		CgroupRoot: "/",
 		KubeReserved: map[string]string{
 			"cpu":    "200m",
 			"memory": "300Mi",
@@ -114,7 +118,7 @@
 		VolumePluginDir: s.EphemeralDirectory.FlexvolumePlugins.FullPath(),
 		// Currently we allocate a /24 per node, so we can have a maximum of
 		// 253 pods per node.
-		MaxPods: 253,
+		MaxPods:    253,
 		PodLogsDir: "/data/kubelet/logs",
 	}
 }
diff --git a/third_party/linux/linux-metropolis.config b/third_party/linux/linux-metropolis.config
index 780db80..8f75141 100644
--- a/third_party/linux/linux-metropolis.config
+++ b/third_party/linux/linux-metropolis.config
@@ -109,8 +109,10 @@
 #
 # BPF subsystem
 #
-# CONFIG_BPF_SYSCALL is not set
+CONFIG_BPF_SYSCALL=y
 # CONFIG_BPF_JIT is not set
+CONFIG_BPF_UNPRIV_DEFAULT_OFF=y
+# CONFIG_BPF_PRELOAD is not set
 # end of BPF subsystem
 
 CONFIG_PREEMPT_BUILD=y
@@ -148,6 +150,7 @@
 CONFIG_TASKS_RCU_GENERIC=y
 CONFIG_TASKS_RCU=y
 CONFIG_TASKS_RUDE_RCU=y
+CONFIG_TASKS_TRACE_RCU=y
 CONFIG_RCU_STALL_COMMON=y
 CONFIG_RCU_NEED_SEGCBLIST=y
 # end of RCU Subsystem
@@ -193,8 +196,10 @@
 CONFIG_CGROUP_DEVICE=y
 CONFIG_CGROUP_CPUACCT=y
 CONFIG_CGROUP_PERF=y
+CONFIG_CGROUP_BPF=y
 CONFIG_CGROUP_MISC=y
 # CONFIG_CGROUP_DEBUG is not set
+CONFIG_SOCK_CGROUP_DATA=y
 CONFIG_NAMESPACES=y
 CONFIG_UTS_NS=y
 CONFIG_TIME_NS=y
@@ -514,6 +519,7 @@
 #
 # CONFIG_SUSPEND is not set
 CONFIG_HIBERNATE_CALLBACKS=y
+# CONFIG_HIBERNATION is not set
 CONFIG_PM_SLEEP=y
 CONFIG_PM_SLEEP_SMP=y
 # CONFIG_PM_AUTOSLEEP is not set
@@ -936,7 +942,11 @@
 #
 # Memory Management options
 #
-# CONFIG_SWAP is not set
+CONFIG_SWAP=y
+# CONFIG_ZSWAP is not set
+CONFIG_ZSMALLOC=y
+# CONFIG_ZSMALLOC_STAT is not set
+CONFIG_ZSMALLOC_CHAIN_SIZE=8
 
 #
 # SLAB allocator options
@@ -1044,6 +1054,7 @@
 # CONFIG_TLS is not set
 # CONFIG_XFRM_USER is not set
 # CONFIG_NET_KEY is not set
+# CONFIG_XDP_SOCKETS is not set
 CONFIG_INET=y
 CONFIG_IP_MULTICAST=y
 # CONFIG_IP_ADVANCED_ROUTER is not set
@@ -1124,6 +1135,7 @@
 CONFIG_NETFILTER_EGRESS=y
 CONFIG_NETFILTER_SKIP_EGRESS=y
 CONFIG_NETFILTER_NETLINK=y
+CONFIG_NETFILTER_BPF_LINK=y
 # CONFIG_NETFILTER_NETLINK_HOOK is not set
 CONFIG_NETFILTER_NETLINK_ACCT=y
 CONFIG_NETFILTER_NETLINK_QUEUE=y
@@ -1361,6 +1373,7 @@
 # CONFIG_CGROUP_NET_CLASSID is not set
 CONFIG_NET_RX_BUSY_POLL=y
 CONFIG_BQL=y
+# CONFIG_BPF_STREAM_PARSER is not set
 CONFIG_NET_FLOW_LIMIT=y
 
 #
@@ -1389,6 +1402,7 @@
 CONFIG_DST_CACHE=y
 CONFIG_GRO_CELLS=y
 CONFIG_NET_SELFTESTS=y
+CONFIG_NET_SOCK_MSG=y
 CONFIG_NET_DEVLINK=y
 CONFIG_PAGE_POOL=y
 # CONFIG_PAGE_POOL_STATS is not set
@@ -1592,7 +1606,13 @@
 # CONFIG_BLK_DEV_NULL_BLK is not set
 # CONFIG_BLK_DEV_FD is not set
 # CONFIG_BLK_DEV_PCIESSD_MTIP32XX is not set
-# CONFIG_ZRAM is not set
+CONFIG_ZRAM=y
+CONFIG_ZRAM_DEF_COMP_ZSTD=y
+# CONFIG_ZRAM_DEF_COMP_LZ4 is not set
+CONFIG_ZRAM_DEF_COMP="zstd"
+# CONFIG_ZRAM_WRITEBACK is not set
+# CONFIG_ZRAM_MEMORY_TRACKING is not set
+# CONFIG_ZRAM_MULTI_COMP is not set
 CONFIG_BLK_DEV_LOOP=y
 CONFIG_BLK_DEV_LOOP_MIN_COUNT=0
 # CONFIG_BLK_DEV_DRBD is not set
@@ -3188,6 +3208,7 @@
 #
 # HID-BPF support
 #
+# CONFIG_HID_BPF is not set
 # end of HID-BPF support
 
 #
@@ -4809,6 +4830,7 @@
 # CONFIG_PROFILE_ANNOTATED_BRANCHES is not set
 # CONFIG_BLK_DEV_IO_TRACE is not set
 CONFIG_UPROBE_EVENTS=y
+CONFIG_BPF_EVENTS=y
 CONFIG_DYNAMIC_EVENTS=y
 CONFIG_PROBE_EVENTS=y
 CONFIG_FTRACE_MCOUNT_RECORD=y