m/node: switch to cgroupv2
This switches us from the legacy cgroup hierarchy (v1) to cgroup v2, aka
the unified cgroup hierarchy. Our versions of Kubernetes, containerd and
runc/gVisor all support this by now.
cgroup_bpf (CONFIG_CGROUP_BPF) needs to be enabled in the kernel for
containerd with cgroup v2. Also enable swap, as it now works with cgroup
v2; this gets rid of a warning for every pod being started.
We are not really using cgroups ourselves, but as the root cgroup in v2
is special, move our own process into a subgroup at startup.
Change-Id: I8d63b2ad672568c052c3fe1a2306182f033667fa
Reviewed-on: https://review.monogon.dev/c/monogon/+/3207
Tested-by: Jenkins CI
Reviewed-by: Jan Schär <jan@monogon.tech>
diff --git a/metropolis/node/core/BUILD.bazel b/metropolis/node/core/BUILD.bazel
index 938a7d7..9c80741 100644
--- a/metropolis/node/core/BUILD.bazel
+++ b/metropolis/node/core/BUILD.bazel
@@ -45,6 +45,7 @@
"@com_github_cenkalti_backoff_v4//:backoff",
"@com_github_containerd_containerd//:containerd",
"@com_github_containerd_containerd//namespaces",
+ "@com_github_opencontainers_runc//libcontainer/cgroups",
"@org_golang_google_grpc//:go_default_library",
"@org_golang_google_grpc//codes",
"@org_golang_google_grpc//status",
diff --git a/metropolis/node/core/mounts.go b/metropolis/node/core/mounts.go
index cb8351d..047552b 100644
--- a/metropolis/node/core/mounts.go
+++ b/metropolis/node/core/mounts.go
@@ -19,8 +19,8 @@
import (
"fmt"
"os"
- "strings"
+ "github.com/opencontainers/runc/libcontainer/cgroups"
"golang.org/x/sys/unix"
)
@@ -50,39 +50,15 @@
}
}
- // Mount all available CGroups for v1 (v2 uses a single unified hierarchy
- // and is not supported by our runtimes yet)
- if err := unix.Mount("tmpfs", "/sys/fs/cgroup", "tmpfs", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV, ""); err != nil {
+ if err := unix.Mount("cgroup2", "/sys/fs/cgroup", "cgroup2", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV, "nsdelegate,memory_recursiveprot"); err != nil {
panic(err)
}
- cgroupsRaw, err := os.ReadFile("/proc/cgroups")
- if err != nil {
+ // Create main cgroup "everything" and move ourselves into it.
+ if err := os.Mkdir("/sys/fs/cgroup/everything", 0755); err != nil {
panic(err)
}
-
- cgroupLines := strings.Split(string(cgroupsRaw), "\n")
- for _, cgroupLine := range cgroupLines {
- if cgroupLine == "" || strings.HasPrefix(cgroupLine, "#") {
- continue
- }
- cgroupParts := strings.Split(cgroupLine, "\t")
- cgroupName := cgroupParts[0]
- if err := os.Mkdir("/sys/fs/cgroup/"+cgroupName, 0755); err != nil {
- panic(err)
- }
- if err := unix.Mount("cgroup", "/sys/fs/cgroup/"+cgroupName, "cgroup", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV, cgroupName); err != nil {
- panic(err)
- }
- }
-
- // Enable hierarchical memory accounting
- useMemoryHierarchy, err := os.OpenFile("/sys/fs/cgroup/memory/memory.use_hierarchy", os.O_RDWR, 0)
- if err != nil {
+ if err := cgroups.WriteCgroupProc("/sys/fs/cgroup/everything", os.Getpid()); err != nil {
panic(err)
}
- if _, err := useMemoryHierarchy.WriteString("1"); err != nil {
- panic(err)
- }
- useMemoryHierarchy.Close()
return nil
}
diff --git a/metropolis/node/kubernetes/kubelet.go b/metropolis/node/kubernetes/kubelet.go
index 19a79b2..2c46080 100644
--- a/metropolis/node/kubernetes/kubelet.go
+++ b/metropolis/node/kubernetes/kubelet.go
@@ -102,8 +102,12 @@
EnableControllerAttachDetach: reconciler.False(),
HairpinMode: "none",
MakeIPTablesUtilChains: reconciler.False(), // We don't have iptables
- FailSwapOn: reconciler.False(), // Our kernel doesn't have swap enabled which breaks Kubelet's detection
- CgroupRoot: "/",
+ FailSwapOn: reconciler.False(),
+ MemorySwap: kubeletconfig.MemorySwapConfiguration{
+ // Only allow burstable pods to use swap
+ SwapBehavior: "LimitedSwap",
+ },
+ CgroupRoot: "/",
KubeReserved: map[string]string{
"cpu": "200m",
"memory": "300Mi",
@@ -114,7 +118,7 @@
VolumePluginDir: s.EphemeralDirectory.FlexvolumePlugins.FullPath(),
// Currently we allocate a /24 per node, so we can have a maximum of
// 253 pods per node.
- MaxPods: 253,
+ MaxPods: 253,
PodLogsDir: "/data/kubelet/logs",
}
}
diff --git a/third_party/linux/linux-metropolis.config b/third_party/linux/linux-metropolis.config
index 780db80..8f75141 100644
--- a/third_party/linux/linux-metropolis.config
+++ b/third_party/linux/linux-metropolis.config
@@ -109,8 +109,10 @@
#
# BPF subsystem
#
-# CONFIG_BPF_SYSCALL is not set
+CONFIG_BPF_SYSCALL=y
# CONFIG_BPF_JIT is not set
+CONFIG_BPF_UNPRIV_DEFAULT_OFF=y
+# CONFIG_BPF_PRELOAD is not set
# end of BPF subsystem
CONFIG_PREEMPT_BUILD=y
@@ -148,6 +150,7 @@
CONFIG_TASKS_RCU_GENERIC=y
CONFIG_TASKS_RCU=y
CONFIG_TASKS_RUDE_RCU=y
+CONFIG_TASKS_TRACE_RCU=y
CONFIG_RCU_STALL_COMMON=y
CONFIG_RCU_NEED_SEGCBLIST=y
# end of RCU Subsystem
@@ -193,8 +196,10 @@
CONFIG_CGROUP_DEVICE=y
CONFIG_CGROUP_CPUACCT=y
CONFIG_CGROUP_PERF=y
+CONFIG_CGROUP_BPF=y
CONFIG_CGROUP_MISC=y
# CONFIG_CGROUP_DEBUG is not set
+CONFIG_SOCK_CGROUP_DATA=y
CONFIG_NAMESPACES=y
CONFIG_UTS_NS=y
CONFIG_TIME_NS=y
@@ -514,6 +519,7 @@
#
# CONFIG_SUSPEND is not set
CONFIG_HIBERNATE_CALLBACKS=y
+# CONFIG_HIBERNATION is not set
CONFIG_PM_SLEEP=y
CONFIG_PM_SLEEP_SMP=y
# CONFIG_PM_AUTOSLEEP is not set
@@ -936,7 +942,11 @@
#
# Memory Management options
#
-# CONFIG_SWAP is not set
+CONFIG_SWAP=y
+# CONFIG_ZSWAP is not set
+CONFIG_ZSMALLOC=y
+# CONFIG_ZSMALLOC_STAT is not set
+CONFIG_ZSMALLOC_CHAIN_SIZE=8
#
# SLAB allocator options
@@ -1044,6 +1054,7 @@
# CONFIG_TLS is not set
# CONFIG_XFRM_USER is not set
# CONFIG_NET_KEY is not set
+# CONFIG_XDP_SOCKETS is not set
CONFIG_INET=y
CONFIG_IP_MULTICAST=y
# CONFIG_IP_ADVANCED_ROUTER is not set
@@ -1124,6 +1135,7 @@
CONFIG_NETFILTER_EGRESS=y
CONFIG_NETFILTER_SKIP_EGRESS=y
CONFIG_NETFILTER_NETLINK=y
+CONFIG_NETFILTER_BPF_LINK=y
# CONFIG_NETFILTER_NETLINK_HOOK is not set
CONFIG_NETFILTER_NETLINK_ACCT=y
CONFIG_NETFILTER_NETLINK_QUEUE=y
@@ -1361,6 +1373,7 @@
# CONFIG_CGROUP_NET_CLASSID is not set
CONFIG_NET_RX_BUSY_POLL=y
CONFIG_BQL=y
+# CONFIG_BPF_STREAM_PARSER is not set
CONFIG_NET_FLOW_LIMIT=y
#
@@ -1389,6 +1402,7 @@
CONFIG_DST_CACHE=y
CONFIG_GRO_CELLS=y
CONFIG_NET_SELFTESTS=y
+CONFIG_NET_SOCK_MSG=y
CONFIG_NET_DEVLINK=y
CONFIG_PAGE_POOL=y
# CONFIG_PAGE_POOL_STATS is not set
@@ -1592,7 +1606,13 @@
# CONFIG_BLK_DEV_NULL_BLK is not set
# CONFIG_BLK_DEV_FD is not set
# CONFIG_BLK_DEV_PCIESSD_MTIP32XX is not set
-# CONFIG_ZRAM is not set
+CONFIG_ZRAM=y
+CONFIG_ZRAM_DEF_COMP_ZSTD=y
+# CONFIG_ZRAM_DEF_COMP_LZ4 is not set
+CONFIG_ZRAM_DEF_COMP="zstd"
+# CONFIG_ZRAM_WRITEBACK is not set
+# CONFIG_ZRAM_MEMORY_TRACKING is not set
+# CONFIG_ZRAM_MULTI_COMP is not set
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_LOOP_MIN_COUNT=0
# CONFIG_BLK_DEV_DRBD is not set
@@ -3188,6 +3208,7 @@
#
# HID-BPF support
#
+# CONFIG_HID_BPF is not set
# end of HID-BPF support
#
@@ -4809,6 +4830,7 @@
# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set
# CONFIG_BLK_DEV_IO_TRACE is not set
CONFIG_UPROBE_EVENTS=y
+CONFIG_BPF_EVENTS=y
CONFIG_DYNAMIC_EVENTS=y
CONFIG_PROBE_EVENTS=y
CONFIG_FTRACE_MCOUNT_RECORD=y