node/core: add sysctls

Change-Id: I47b0d639a62f73f134430c5164a35eef2b5622d7
Reviewed-on: https://review.monogon.dev/c/monogon/+/2273
Tested-by: Jenkins CI
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/metropolis/node/core/BUILD.bazel b/metropolis/node/core/BUILD.bazel
index e8367f9..9043d8e 100644
--- a/metropolis/node/core/BUILD.bazel
+++ b/metropolis/node/core/BUILD.bazel
@@ -9,6 +9,7 @@
         "nodeparams.go",
         "panichandler.go",
         "pstore.go",
+        "sysctl.go",
     ] + select({
         "//metropolis/node:debug_build": [
             "debug_service_enabled.go",
@@ -36,6 +37,7 @@
         "//metropolis/pkg/logtree",
         "//metropolis/pkg/pstore",
         "//metropolis/pkg/supervisor",
+        "//metropolis/pkg/sysctl",
         "//metropolis/pkg/tpm",
         "//metropolis/proto/api",
         "@com_github_cenkalti_backoff_v4//:backoff",
diff --git a/metropolis/node/core/main.go b/metropolis/node/core/main.go
index 3eb9024..583d72f 100644
--- a/metropolis/node/core/main.go
+++ b/metropolis/node/core/main.go
@@ -184,6 +184,9 @@
 		if err := supervisor.Run(ctx, "pstore", dumpAndCleanPstore); err != nil {
 			return fmt.Errorf("when starting pstore: %w", err)
 		}
+		if err := supervisor.Run(ctx, "sysctl", nodeSysctls); err != nil {
+			return fmt.Errorf("when applying sysctls: %w", err)
+		}
 
 		// The kernel does of course not run in this runnable, only the log pipe
 		// runs in it.
diff --git a/metropolis/node/core/network/BUILD.bazel b/metropolis/node/core/network/BUILD.bazel
index 1500eff..4f5a148 100644
--- a/metropolis/node/core/network/BUILD.bazel
+++ b/metropolis/node/core/network/BUILD.bazel
@@ -17,6 +17,7 @@
         "//metropolis/pkg/event/memory",
         "//metropolis/pkg/logtree",
         "//metropolis/pkg/supervisor",
+        "//metropolis/pkg/sysctl",
         "//net/proto",
         "@com_github_google_nftables//:nftables",
         "@com_github_google_nftables//expr",
diff --git a/metropolis/node/core/network/main.go b/metropolis/node/core/network/main.go
index 3cfe0f5..6532404 100644
--- a/metropolis/node/core/network/main.go
+++ b/metropolis/node/core/network/main.go
@@ -20,10 +20,7 @@
 	"context"
 	"fmt"
 	"net"
-	"os"
-	"path"
 	"strconv"
-	"strings"
 
 	"github.com/google/nftables"
 	"github.com/google/nftables/expr"
@@ -36,6 +33,8 @@
 	"source.monogon.dev/metropolis/pkg/event"
 	"source.monogon.dev/metropolis/pkg/event/memory"
 	"source.monogon.dev/metropolis/pkg/supervisor"
+	"source.monogon.dev/metropolis/pkg/sysctl"
+
 	netpb "source.monogon.dev/net/proto"
 )
 
@@ -194,27 +193,6 @@
 	return nil
 }
 
-// sysctlOptions contains sysctl options to apply
-type sysctlOptions map[string]string
-
-// apply attempts to apply all options in sysctlOptions. It aborts on the first
-// one which returns an error when applying.
-func (o sysctlOptions) apply() error {
-	for name, value := range o {
-		filePath := path.Join("/proc/sys/", strings.ReplaceAll(name, ".", "/"))
-		optionFile, err := os.OpenFile(filePath, os.O_WRONLY, 0)
-		if err != nil {
-			return fmt.Errorf("failed to set option %v: %w", name, err)
-		}
-		if _, err := optionFile.WriteString(value + "\n"); err != nil {
-			optionFile.Close()
-			return fmt.Errorf("failed to set option %v: %w", name, err)
-		}
-		optionFile.Close() // In a loop, defer'ing could open a lot of FDs
-	}
-	return nil
-}
-
 // RFC2474 Section 4.2.2.1 with reference to RFC791 Section 3.1 (Network
 // Control Precedence)
 const dscpCS7 = 0x7 << 3
@@ -224,7 +202,7 @@
 	s.dnsSvc.ExtraListenerIPs = s.ExtraDNSListenerIPs
 	supervisor.Run(ctx, "dns", s.dnsSvc.Run)
 
-	earlySysctlOpts := sysctlOptions{
+	earlySysctlOpts := sysctl.Options{
 		// Enable strict reverse path filtering on all interfaces (important
 		// for spoofing prevention from Pods with CAP_NET_ADMIN)
 		"net.ipv4.conf.all.rp_filter": "1",
@@ -239,7 +217,7 @@
 		// Make neighbor discovery use DSCP CS7 without ECN
 		"net.ipv6.conf.all.ndisc_tclass": strconv.Itoa(dscpCS7 << 2),
 	}
-	if err := earlySysctlOpts.apply(); err != nil {
+	if err := earlySysctlOpts.Apply(); err != nil {
 		logger.Fatalf("Error configuring early sysctl options: %v", err)
 	}
 	// Choose between autoconfig and static config runnables
@@ -308,7 +286,7 @@
 		logger.Fatalf("Failed to set up nftables nat chain: %v", err)
 	}
 
-	sysctlOpts := sysctlOptions{
+	sysctlOpts := sysctl.Options{
 		// Enable IP forwarding for our pods
 		"net.ipv4.ip_forward": "1",
 
@@ -319,7 +297,7 @@
 		"net.ipv4.tcp_rmem": "4096 87380 16777216",
 		"net.ipv4.tcp_wmem": "4096 87380 16777216",
 	}
-	if err := sysctlOpts.apply(); err != nil {
+	if err := sysctlOpts.Apply(); err != nil {
 		logger.Fatalf("Failed to set up kernel network config: %v", err)
 	}
 
diff --git a/metropolis/node/core/network/static.go b/metropolis/node/core/network/static.go
index ffecf7d..1752810 100644
--- a/metropolis/node/core/network/static.go
+++ b/metropolis/node/core/network/static.go
@@ -21,6 +21,8 @@
 	"source.monogon.dev/metropolis/node/core/network/dns"
 	"source.monogon.dev/metropolis/pkg/logtree"
 	"source.monogon.dev/metropolis/pkg/supervisor"
+	"source.monogon.dev/metropolis/pkg/sysctl"
+
 	netpb "source.monogon.dev/net/proto"
 )
 
@@ -123,9 +125,9 @@
 			hasIPv4Autoconfig = true
 		}
 		if i.Ipv6Autoconfig != nil {
-			err := sysctlOptions{
+			err := sysctl.Options{
 				"net.ipv6.conf." + newLink.Attrs().Name + ".accept_ra": "1",
-			}.apply()
+			}.Apply()
 			if err != nil {
 				return fmt.Errorf("failed enabling accept_ra for interface %q: %w", newLink.Attrs().Name, err)
 			}
diff --git a/metropolis/node/core/sysctl.go b/metropolis/node/core/sysctl.go
new file mode 100644
index 0000000..eb72aa3
--- /dev/null
+++ b/metropolis/node/core/sysctl.go
@@ -0,0 +1,32 @@
+package main
+
+import (
+	"context"
+	"strconv"
+
+	"source.monogon.dev/metropolis/pkg/supervisor"
+	"source.monogon.dev/metropolis/pkg/sysctl"
+)
+
+// nodeSysctls applies the node-wide sysctl options and then signals the
+// supervisor that it is healthy and done. It is run as the "sysctl"
+// supervisor runnable. An error from applying an option is returned
+// unchanged.
+func nodeSysctls(ctx context.Context) error {
+	// '<<' binds tighter than '-' in Go, so this is (2<<30)-1 = 2^31-1.
+	const maxMapCount = 2<<30 - 1
+
+	// The max mmap count is raised to nearly the maximum, as it gets
+	// accounted by the cgroup memory limit.
+	nodeOpts := sysctl.Options{
+		"vm.max_map_count": strconv.Itoa(maxMapCount),
+	}
+	err := nodeOpts.Apply()
+	if err != nil {
+		return err
+	}
+
+	supervisor.Signal(ctx, supervisor.SignalHealthy)
+	supervisor.Signal(ctx, supervisor.SignalDone)
+	return nil
+}
diff --git a/metropolis/pkg/sysctl/BUILD.bazel b/metropolis/pkg/sysctl/BUILD.bazel
new file mode 100644
index 0000000..a945a03
--- /dev/null
+++ b/metropolis/pkg/sysctl/BUILD.bazel
@@ -0,0 +1,8 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+    name = "sysctl",
+    srcs = ["options.go"],
+    importpath = "source.monogon.dev/metropolis/pkg/sysctl",
+    visibility = ["//visibility:public"],
+)
diff --git a/metropolis/pkg/sysctl/options.go b/metropolis/pkg/sysctl/options.go
new file mode 100644
index 0000000..b5e1e36
--- /dev/null
+++ b/metropolis/pkg/sysctl/options.go
@@ -0,0 +1,45 @@
+package sysctl
+
+import (
+	"fmt"
+	"os"
+	"path"
+	"strings"
+)
+
+// Options maps sysctl names in dotted notation (for example
+// "net.ipv4.ip_forward") to the values that should be written to them.
+type Options map[string]string
+
+// Apply writes each option to its file below /proc/sys. Options are visited
+// in Go map iteration order, which is unspecified. Apply stops at the first
+// option that cannot be opened or written and returns that error wrapped
+// with the option name; options written before the failure stay applied.
+//
+// NOTE(review): every "." in a name is turned into a path separator, so a
+// name with a component that itself contains a dot does not resolve to the
+// intended file.
+func (o Options) Apply() error {
+	for key, val := range o {
+		if err := writeOption(key, val); err != nil {
+			return fmt.Errorf("failed to set option %v: %w", key, err)
+		}
+	}
+	return nil
+}
+
+// writeOption writes value, followed by a newline, to the /proc/sys file
+// that corresponds to the dotted sysctl name.
+func writeOption(name, value string) error {
+	target := path.Join("/proc/sys/", strings.ReplaceAll(name, ".", "/"))
+	f, err := os.OpenFile(target, os.O_WRONLY, 0)
+	if err != nil {
+		return err
+	}
+	// The deferred Close runs when this helper returns, i.e. once per
+	// option, so file descriptors do not pile up across the loop in Apply.
+	defer f.Close()
+
+	_, err = f.WriteString(value + "\n")
+	return err
+}