m/node: switch to cgroupv2
This switches the node from the legacy cgroup hierarchy (v1) to cgroup v2,
also known as the unified cgroup hierarchy. Our versions of Kubernetes,
containerd and runc/gVisor all support this by now.
cgroup_bpf needs to be enabled in the kernel for containerd to work with
cgroup v2. Also enable swap, as this now works with cgroup v2; doing so
gets rid of a warning for every pod being started.
We are not really using cgroups ourselves, but as the root cgroup in v2
is special, move our own process into a subgroup at startup.
Change-Id: I8d63b2ad672568c052c3fe1a2306182f033667fa
Reviewed-on: https://review.monogon.dev/c/monogon/+/3207
Tested-by: Jenkins CI
Reviewed-by: Jan Schär <jan@monogon.tech>
diff --git a/metropolis/node/core/BUILD.bazel b/metropolis/node/core/BUILD.bazel
index 938a7d7..9c80741 100644
--- a/metropolis/node/core/BUILD.bazel
+++ b/metropolis/node/core/BUILD.bazel
@@ -45,6 +45,7 @@
"@com_github_cenkalti_backoff_v4//:backoff",
"@com_github_containerd_containerd//:containerd",
"@com_github_containerd_containerd//namespaces",
+ "@com_github_opencontainers_runc//libcontainer/cgroups",
"@org_golang_google_grpc//:go_default_library",
"@org_golang_google_grpc//codes",
"@org_golang_google_grpc//status",
diff --git a/metropolis/node/core/mounts.go b/metropolis/node/core/mounts.go
index cb8351d..047552b 100644
--- a/metropolis/node/core/mounts.go
+++ b/metropolis/node/core/mounts.go
@@ -19,8 +19,8 @@
import (
"fmt"
"os"
- "strings"
+ "github.com/opencontainers/runc/libcontainer/cgroups"
"golang.org/x/sys/unix"
)
@@ -50,39 +50,15 @@
}
}
- // Mount all available CGroups for v1 (v2 uses a single unified hierarchy
- // and is not supported by our runtimes yet)
- if err := unix.Mount("tmpfs", "/sys/fs/cgroup", "tmpfs", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV, ""); err != nil {
+ if err := unix.Mount("cgroup2", "/sys/fs/cgroup", "cgroup2", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV, "nsdelegate,memory_recursiveprot"); err != nil {
panic(err)
}
- cgroupsRaw, err := os.ReadFile("/proc/cgroups")
- if err != nil {
+ // Create main cgroup "everything" and move ourselves into it.
+ if err := os.Mkdir("/sys/fs/cgroup/everything", 0755); err != nil {
panic(err)
}
-
- cgroupLines := strings.Split(string(cgroupsRaw), "\n")
- for _, cgroupLine := range cgroupLines {
- if cgroupLine == "" || strings.HasPrefix(cgroupLine, "#") {
- continue
- }
- cgroupParts := strings.Split(cgroupLine, "\t")
- cgroupName := cgroupParts[0]
- if err := os.Mkdir("/sys/fs/cgroup/"+cgroupName, 0755); err != nil {
- panic(err)
- }
- if err := unix.Mount("cgroup", "/sys/fs/cgroup/"+cgroupName, "cgroup", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV, cgroupName); err != nil {
- panic(err)
- }
- }
-
- // Enable hierarchical memory accounting
- useMemoryHierarchy, err := os.OpenFile("/sys/fs/cgroup/memory/memory.use_hierarchy", os.O_RDWR, 0)
- if err != nil {
+ if err := cgroups.WriteCgroupProc("/sys/fs/cgroup/everything", os.Getpid()); err != nil {
panic(err)
}
- if _, err := useMemoryHierarchy.WriteString("1"); err != nil {
- panic(err)
- }
- useMemoryHierarchy.Close()
return nil
}
diff --git a/metropolis/node/kubernetes/kubelet.go b/metropolis/node/kubernetes/kubelet.go
index 19a79b2..2c46080 100644
--- a/metropolis/node/kubernetes/kubelet.go
+++ b/metropolis/node/kubernetes/kubelet.go
@@ -102,8 +102,12 @@
EnableControllerAttachDetach: reconciler.False(),
HairpinMode: "none",
MakeIPTablesUtilChains: reconciler.False(), // We don't have iptables
- FailSwapOn: reconciler.False(), // Our kernel doesn't have swap enabled which breaks Kubelet's detection
- CgroupRoot: "/",
+ FailSwapOn: reconciler.False(),
+ MemorySwap: kubeletconfig.MemorySwapConfiguration{
+ // Only allow burstable pods to use swap
+ SwapBehavior: "LimitedSwap",
+ },
+ CgroupRoot: "/",
KubeReserved: map[string]string{
"cpu": "200m",
"memory": "300Mi",
@@ -114,7 +118,7 @@
VolumePluginDir: s.EphemeralDirectory.FlexvolumePlugins.FullPath(),
// Currently we allocate a /24 per node, so we can have a maximum of
// 253 pods per node.
- MaxPods: 253,
+ MaxPods: 253,
PodLogsDir: "/data/kubelet/logs",
}
}