m/node: implement container networking ourselves

This change gets rid of the CNI mechanism for configuring container
networking in favour of a split approach where the network service is
extended by a gRPC workload network service which handles all of the
work as well as a library which exposes just enough of go-cni's
interface to be a drop-in replacement in containerd, which then talks
to the workload network service.

This is a rather unconventional approach to doing things as CNI itself
is a pluggable interface. The reason for doing it this way is that the
binary executing interface of CNI has a huge spec which is also horrible
to convert into decent Go types and being a binary-calling interface has
inherent lifecycle, complexity and image size disadvantages. The part of
CNI that is actually used by containerd is tiny and its arguments are
well-specified and have decent Go types. It also avoids the whole CNI
caching mechanic which adds further unnecessary complexity.

The reason for the split service model instead of implementing
everything in cniproxy is to allow for more complex logic and Monogon
control plane interfacing from the workload network service. Also this
will allow offloading the actual service to things like DPUs.

Right now there is some ugliness left to make this self-contained. Two
obvious examples are the piping through of the pod network event value
and the exclusion of the first (non-network) IP from the IP allocator.
These will eventually go away but are necessary to get this to work as a
standalone change.

Change-Id: I46c604b7dfd58da9e6ddd0a29241680d25a2a745
Reviewed-on: https://review.monogon.dev/c/monogon/+/4496
Reviewed-by: Jan Schär <jan@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/node/core/BUILD.bazel b/metropolis/node/core/BUILD.bazel
index de07e8c..3d92543 100644
--- a/metropolis/node/core/BUILD.bazel
+++ b/metropolis/node/core/BUILD.bazel
@@ -24,6 +24,7 @@
         "//go/logging",
         "//metropolis/node",
         "//metropolis/node/core/cluster",
+        "//metropolis/node/core/clusternet",
         "//metropolis/node/core/devmgr",
         "//metropolis/node/core/localstorage",
         "//metropolis/node/core/localstorage/declarative",
@@ -38,6 +39,7 @@
         "//metropolis/node/core/update",
         "//metropolis/proto/api",
         "//osbase/bringup",
+        "//osbase/event/memory",
         "//osbase/logtree",
         "//osbase/net/dns",
         "//osbase/supervisor",
diff --git a/metropolis/node/core/localstorage/directory_root.go b/metropolis/node/core/localstorage/directory_root.go
index d6d9995..32aa2d8 100644
--- a/metropolis/node/core/localstorage/directory_root.go
+++ b/metropolis/node/core/localstorage/directory_root.go
@@ -48,7 +48,7 @@
 	// TODO(q3k): do this automatically?
 	for _, d := range []declarative.DirectoryPlacement{
 		r.Ephemeral.Consensus,
-		r.Ephemeral.Containerd, r.Ephemeral.Containerd.Tmp, r.Ephemeral.Containerd.RunSC, r.Ephemeral.Containerd.IPAM,
+		r.Ephemeral.Containerd, r.Ephemeral.Containerd.Tmp, r.Ephemeral.Containerd.RunSC,
 		r.Ephemeral.FlexvolumePlugins,
 		r.ESP.Metropolis,
 	} {
diff --git a/metropolis/node/core/localstorage/storage.go b/metropolis/node/core/localstorage/storage.go
index 029ce0c..d21cad4 100644
--- a/metropolis/node/core/localstorage/storage.go
+++ b/metropolis/node/core/localstorage/storage.go
@@ -168,9 +168,6 @@
 	RunSCLogsFIFO declarative.File      `file:"runsc-logs.fifo"`
 	Tmp           declarative.Directory `dir:"tmp"`
 	RunSC         declarative.Directory `dir:"runsc"`
-	IPAM          declarative.Directory `dir:"ipam"`
-	CNI           declarative.Directory `dir:"cni"`
-	CNICache      declarative.Directory `dir:"cni-cache"` // Hardcoded @com_github_containernetworking_cni via patch
 }
 
 type TmpDirectory struct {
diff --git a/metropolis/node/core/main.go b/metropolis/node/core/main.go
index 2d80698..df9dd78 100644
--- a/metropolis/node/core/main.go
+++ b/metropolis/node/core/main.go
@@ -12,6 +12,7 @@
 
 	"source.monogon.dev/go/logging"
 	"source.monogon.dev/metropolis/node/core/cluster"
+	"source.monogon.dev/metropolis/node/core/clusternet"
 	"source.monogon.dev/metropolis/node/core/devmgr"
 	"source.monogon.dev/metropolis/node/core/localstorage"
 	"source.monogon.dev/metropolis/node/core/localstorage/declarative"
@@ -24,6 +25,7 @@
 	timesvc "source.monogon.dev/metropolis/node/core/time"
 	"source.monogon.dev/metropolis/node/core/update"
 	"source.monogon.dev/osbase/bringup"
+	"source.monogon.dev/osbase/event/memory"
 	"source.monogon.dev/osbase/logtree"
 	"source.monogon.dev/osbase/net/dns"
 	"source.monogon.dev/osbase/supervisor"
@@ -119,7 +121,8 @@
 	}
 
 	metrics.CoreRegistry.MustRegister(dns.MetricsRegistry)
-	networkSvc := network.New(nil, []string{"hosts", "kubernetes"})
+	var podNetwork memory.Value[*clusternet.Prefixes]
+	networkSvc := network.New(nil, []string{"hosts", "kubernetes"}, &podNetwork)
 	networkSvc.DHCPVendorClassID = "dev.monogon.metropolis.node.v1"
 	timeSvc := timesvc.New()
 	devmgrSvc := devmgr.New()
@@ -195,6 +198,7 @@
 		Resolver:    res,
 		LogTree:     supervisor.LogTree(ctx),
 		Update:      updateSvc,
+		PodNetwork:  &podNetwork,
 	})
 	if err := supervisor.Run(ctx, "role", rs.Run); err != nil {
 		return fmt.Errorf("failed to start role service: %w", err)
diff --git a/metropolis/node/core/network/BUILD.bazel b/metropolis/node/core/network/BUILD.bazel
index 22efbb4..b1a5e41 100644
--- a/metropolis/node/core/network/BUILD.bazel
+++ b/metropolis/node/core/network/BUILD.bazel
@@ -16,9 +16,12 @@
         "//go/algorithm/toposort",
         "//go/logging",
         "//metropolis/node",
+        "//metropolis/node/core/clusternet",
         "//metropolis/node/core/network/dhcp4c",
         "//metropolis/node/core/network/dhcp4c/callback",
+        "//metropolis/node/core/network/workloads",
         "//metropolis/node/core/productinfo",
+        "//osbase/event",
         "//osbase/event/memory",
         "//osbase/net/dns",
         "//osbase/net/dns/forward",
diff --git a/metropolis/node/core/network/main.go b/metropolis/node/core/network/main.go
index 3e069c3..623e62f 100644
--- a/metropolis/node/core/network/main.go
+++ b/metropolis/node/core/network/main.go
@@ -17,8 +17,11 @@
 	"github.com/vishvananda/netlink"
 
 	"source.monogon.dev/metropolis/node"
+	"source.monogon.dev/metropolis/node/core/clusternet"
 	"source.monogon.dev/metropolis/node/core/network/dhcp4c"
 	dhcpcb "source.monogon.dev/metropolis/node/core/network/dhcp4c/callback"
+	"source.monogon.dev/metropolis/node/core/network/workloads"
+	"source.monogon.dev/osbase/event"
 	"source.monogon.dev/osbase/event/memory"
 	"source.monogon.dev/osbase/net/dns"
 	"source.monogon.dev/osbase/net/dns/forward"
@@ -59,6 +62,8 @@
 
 	// Status is the current status of the network as seen by the service.
 	Status memory.Value[*node.NetStatus]
+
+	workloadSvc *workloads.Service
 }
 
 // New instantiates a new network service. If autoconfiguration is desired,
@@ -67,15 +72,21 @@
 // If dnsHandlerNames is non-nil, DNS handlers with these names must be set
 // on the DNS service with s.DNS.SetHandler. When serving DNS queries, they
 // will be tried in the order they appear here before forwarding.
-func New(staticConfig *netpb.Net, dnsHandlerNames []string) *Service {
+func New(staticConfig *netpb.Net, dnsHandlerNames []string, ipamPrefixSrc event.Value[*clusternet.Prefixes]) *Service {
 	dnsSvc := dns.New(slices.Concat(dnsHandlerNames, []string{"forward"}))
 	dnsForward := forward.New()
 	dnsSvc.SetHandler("forward", dnsForward)
 
+	var wlSvc *workloads.Service
+	if ipamPrefixSrc != nil {
+		wlSvc = workloads.New(ipamPrefixSrc)
+	}
+
 	return &Service{
 		DNS:          dnsSvc,
 		dnsForward:   dnsForward,
 		StaticConfig: staticConfig,
+		workloadSvc:  wlSvc,
 	}
 }
 
@@ -243,6 +254,10 @@
 	supervisor.Run(ctx, "dns", s.DNS.Run)
 	supervisor.Run(ctx, "dns-forward", s.dnsForward.Run)
 
+	if s.workloadSvc != nil {
+		supervisor.Run(ctx, "workloads", s.workloadSvc.Run)
+	}
+
 	s.natTable = s.nftConn.AddTable(&nftables.Table{
 		Family: nftables.TableFamilyIPv4,
 		Name:   "nat",
diff --git a/metropolis/node/core/network/workloads/BUILD.bazel b/metropolis/node/core/network/workloads/BUILD.bazel
new file mode 100644
index 0000000..357b892
--- /dev/null
+++ b/metropolis/node/core/network/workloads/BUILD.bazel
@@ -0,0 +1,21 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+    name = "workloads",
+    srcs = ["workloads.go"],
+    importpath = "source.monogon.dev/metropolis/node/core/network/workloads",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//metropolis/node",
+        "//metropolis/node/core/clusternet",
+        "//metropolis/node/core/network/workloads/spec",
+        "//osbase/event",
+        "//osbase/supervisor",
+        "@com_github_vishvananda_netlink//:netlink",
+        "@com_github_vishvananda_netns//:netns",
+        "@org_golang_google_grpc//:grpc",
+        "@org_golang_google_grpc//codes",
+        "@org_golang_google_grpc//status",
+        "@org_golang_x_sys//unix",
+    ],
+)
diff --git a/metropolis/node/core/network/workloads/spec/BUILD.bazel b/metropolis/node/core/network/workloads/spec/BUILD.bazel
new file mode 100644
index 0000000..5c6c5c6
--- /dev/null
+++ b/metropolis/node/core/network/workloads/spec/BUILD.bazel
@@ -0,0 +1,27 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
+load("@rules_proto//proto:defs.bzl", "proto_library")
+
+proto_library(
+    name = "metropolis_node_core_network_workloads_proto",
+    srcs = ["workload.proto"],
+    visibility = ["//visibility:public"],
+)
+
+go_proto_library(
+    name = "metropolis_node_core_network_workloads_go_proto",
+    compilers = [
+        "@io_bazel_rules_go//proto:go_proto",
+        "@io_bazel_rules_go//proto:go_grpc_v2",
+    ],
+    importpath = "source.monogon.dev/metropolis/node/core/network/workloads/spec",
+    proto = ":metropolis_node_core_network_workloads_proto",
+    visibility = ["//visibility:public"],
+)
+
+go_library(
+    name = "spec",
+    embed = [":metropolis_node_core_network_workloads_go_proto"],
+    importpath = "source.monogon.dev/metropolis/node/core/network/workloads/spec",
+    visibility = ["//visibility:public"],
+)
diff --git a/metropolis/node/core/network/workloads/spec/gomod-generated-placeholder.go b/metropolis/node/core/network/workloads/spec/gomod-generated-placeholder.go
new file mode 100644
index 0000000..f09cd57
--- /dev/null
+++ b/metropolis/node/core/network/workloads/spec/gomod-generated-placeholder.go
@@ -0,0 +1 @@
+package spec
diff --git a/metropolis/node/core/network/workloads/spec/workload.proto b/metropolis/node/core/network/workloads/spec/workload.proto
new file mode 100644
index 0000000..235763d
--- /dev/null
+++ b/metropolis/node/core/network/workloads/spec/workload.proto
@@ -0,0 +1,66 @@
+syntax = "proto3";
+
+package metropolis.node.core.network.workloads;
+
+message NetNSAttachment {
+    // Path to either a nsfs mountpoint or /proc/$pid/ns/net of the network
+    // namespace the workload to be attached will/is running in.
+    string netns_path = 1;
+    // Name of the network interface created in the given network namespace.
+    string if_name = 2;
+}
+
+message AttachRequest {
+    // Workload ID is the identity of the workload to attach to the network.
+    // Right now this is just the Kubernetes Pod ID.
+    string workload_id = 1;
+
+    // This will be extended to support things like vhost-user
+    // and PCIe VFs for DPUs.
+    NetNSAttachment netns = 2;
+}
+
+message AttachResponse {
+    repeated bytes ip = 1;
+}
+
+message DetachRequest {
+    // Workload ID is the identity of the workload to detach from the network.
+    // Right now this is just the Kubernetes Pod ID.
+    string workload_id = 1;
+
+    // This will be extended to support things like vhost-user
+    // and PCIe VFs for DPUs.
+    NetNSAttachment netns = 2;
+}
+
+message DetachResponse {
+
+}
+
+message StatusRequest {}
+message StatusResponse {}
+
+// The workload networking service attaches workloads to the network.
+// The service is served over a unix socket.
+// It is called by containerd, and possibly other workload runtimes later.
+// It is a replacement for the Container Network Interface (CNI);
+// see https://github.com/containernetworking/cni/blob/main/SPEC.md
+//
+// Concurrent calls are allowed if they don't have the same workload_id.
+// For a specific workload_id, Attach may not be followed by another Attach
+// without a Detach in-between. Detach may be called multiple times.
+service WorkloadNetworking {
+    // Attach the workload to the network. This allocates the workload IP
+    // addresses (at most one for IPv4 and IPv6), and sets up the main network
+    // interface. For network namespaces, it also enables the loopback
+    // interface.
+    rpc Attach(AttachRequest) returns (AttachResponse);
+    // Detach removes the interfaces and IP address allocation of the workload.
+    // It succeeds even if some or all resources don't exist, and removes those
+    // which do exist.
+    rpc Detach(DetachRequest) returns (DetachResponse);
+    // Status returns an error if the service is not ready.
+    rpc Status(StatusRequest) returns (StatusResponse);
+}
+
diff --git a/metropolis/node/core/network/workloads/workloads.go b/metropolis/node/core/network/workloads/workloads.go
new file mode 100644
index 0000000..2b4f5d3
--- /dev/null
+++ b/metropolis/node/core/network/workloads/workloads.go
@@ -0,0 +1,330 @@
+// Copyright The Monogon Project Authors.
+// SPDX-License-Identifier: Apache-2.0
+
+package workloads
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net"
+	"net/netip"
+	"os"
+	"sync"
+
+	"github.com/vishvananda/netlink"
+	"github.com/vishvananda/netns"
+	"golang.org/x/sys/unix"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+
+	"source.monogon.dev/metropolis/node"
+	"source.monogon.dev/metropolis/node/core/clusternet"
+	wlapi "source.monogon.dev/metropolis/node/core/network/workloads/spec"
+	"source.monogon.dev/osbase/event"
+	"source.monogon.dev/osbase/supervisor"
+)
+
+var (
+	firstHopV4 = net.IPv4(169, 254, 77, 1)
+	firstHopV6 = net.ParseIP("fe80::1")
+	// TODO: Replace prefix with Monogon OUI once we have it, right now
+	// it's just a random locally-administered MAC.
+	firstHopMAC = net.HardwareAddr{0x02, 0x9c, 0x52, 0xfe, 0x6d, 0x0a}
+)
+
+type Service struct {
+	mux          sync.Mutex
+	workloadNets []netip.Prefix
+	attachments  map[netip.Addr]string
+	// workloadToIntf maps workload name to short interface name.
+	workloadToIntf map[string]string
+	// intfUsed is the set of allocated short interface names.
+	intfUsed map[string]struct{}
+
+	k8sNodePrefix event.Value[*clusternet.Prefixes]
+}
+
+func New(k8sNodePrefix event.Value[*clusternet.Prefixes]) *Service {
+	return &Service{
+		workloadNets:   []netip.Prefix{},
+		attachments:    make(map[netip.Addr]string),
+		workloadToIntf: make(map[string]string),
+		intfUsed:       make(map[string]struct{}),
+		k8sNodePrefix:  k8sNodePrefix,
+	}
+}
+
+func (s *Service) allocateIPs(workloadId string) ([]net.IP, error) {
+	s.mux.Lock()
+	defer s.mux.Unlock()
+	// This is a really simple allocator as it won't stay for long. It just
+	// walks the entire prefix and finds the first free IP. The size of the
+	// map is bound to 2x256 (max pods per node) for its life so this is fine.
+	var addrs []netip.Addr
+	for _, wlNet := range s.workloadNets {
+		candidateAddr := wlNet.Addr()
+		// The second address is reserved by clusternet for the host loopback,
+		// this will go away with the clusternet refactor.
+		reservedForHost := wlNet.Addr().Next()
+		for s.attachments[candidateAddr] != "" || candidateAddr == reservedForHost {
+			candidateAddr = candidateAddr.Next()
+		}
+		// Allocator ran off the prefix
+		if !wlNet.Contains(candidateAddr) {
+			return nil, fmt.Errorf("no free IP addresses in prefix %v", wlNet)
+		}
+		addrs = append(addrs, candidateAddr)
+	}
+	var addrsOut []net.IP
+	for _, addr := range addrs {
+		s.attachments[addr] = workloadId
+		addrsOut = append(addrsOut, net.IP(addr.AsSlice()))
+	}
+	return addrsOut, nil
+}
+
+func (s *Service) deallocateIPs(workloadId string) {
+	s.mux.Lock()
+	defer s.mux.Unlock()
+	for ip, wlId := range s.attachments {
+		if wlId == workloadId {
+			delete(s.attachments, ip)
+		}
+	}
+}
+
+// allocateIntfName allocates a short interface name for the workload, since
+// names are limited to 15 characters. IDs under 8 bytes are used untruncated.
+func (s *Service) allocateIntfName(workloadId string) (string, error) {
+	s.mux.Lock()
+	defer s.mux.Unlock()
+	if _, ok := s.workloadToIntf[workloadId]; ok {
+		return "", fmt.Errorf("workload %q already has an interface", workloadId)
+	}
+	intfPrefix := "wk" + workloadId[:min(len(workloadId), 8)]
+	intf := intfPrefix
+	for i := 0; ; i++ {
+		if _, ok := s.intfUsed[intf]; !ok {
+			break
+		}
+		if i > 0xffff {
+			return "", fmt.Errorf("too many interface name collisions for workload %q", workloadId)
+		}
+		intf = fmt.Sprintf("%s-%04x", intfPrefix, i)
+	}
+	s.workloadToIntf[workloadId] = intf
+	s.intfUsed[intf] = struct{}{}
+	return intf, nil
+}
+
+func (s *Service) getIntfName(workloadId string) (string, bool) {
+	s.mux.Lock()
+	defer s.mux.Unlock()
+	intf, ok := s.workloadToIntf[workloadId]
+	return intf, ok
+}
+
+func (s *Service) deallocateIntfName(workloadId string) {
+	s.mux.Lock()
+	defer s.mux.Unlock()
+	intf, ok := s.workloadToIntf[workloadId]
+	if !ok {
+		return
+	}
+	delete(s.workloadToIntf, workloadId)
+	delete(s.intfUsed, intf)
+}
+
+// Run serves the workload networking gRPC API and follows pod prefix updates.
+func (s *Service) Run(ctx context.Context) error {
+	l := supervisor.Logger(ctx)
+
+	srv := grpc.NewServer()
+	wlapi.RegisterWorkloadNetworkingServer(srv, s)
+	os.Remove("/ephemeral/workloadnet.sock")
+	lis, err := net.ListenUnix("unix", &net.UnixAddr{Net: "unix", Name: "/ephemeral/workloadnet.sock"})
+	if err != nil {
+		return fmt.Errorf("failed to listen: %w", err)
+	}
+	supervisor.Run(ctx, "api", supervisor.GRPCServer(srv, lis, true))
+	w := s.k8sNodePrefix.Watch()
+	defer w.Close()
+
+	lo, err := netlink.LinkByIndex(1)
+	if err != nil {
+		return fmt.Errorf("failed to get loopback interface: %w", err)
+	}
+	if err := netlink.AddrAdd(lo, &netlink.Addr{
+		IPNet: &net.IPNet{IP: firstHopV4, Mask: net.CIDRMask(32, 32)},
+		Label: "Router",
+		Scope: unix.RT_SCOPE_LINK,
+	}); err != nil && !errors.Is(err, unix.EEXIST) {
+		l.Errorf("Unable to add router IP: %v", err)
+	}
+
+	supervisor.Signal(ctx, supervisor.SignalHealthy)
+	// With K8s IPAM it is undefined what happens when the workloadNets change,
+	// so for now new workloads are just assigned to the new prefixes. The
+	// Monogon IPAM implementation will give this defined behavior.
+	for {
+		prefixes, err := w.Get(ctx)
+		if err != nil {
+			return err
+		}
+		if prefixes != nil {
+			s.mux.Lock()
+			s.workloadNets = *prefixes
+			s.mux.Unlock()
+		}
+	}
+}
+
+// Attach wires the workload's network namespace to the host with a veth pair,
+// its allocated IPs and routes. A missing netns attachment yields an error.
+func (s *Service) Attach(ctx context.Context, req *wlapi.AttachRequest) (*wlapi.AttachResponse, error) {
+	intf, err := s.allocateIntfName(req.WorkloadId)
+	if err != nil {
+		return nil, status.Errorf(codes.AlreadyExists, "cannot add interface: %v", err)
+	}
+	workloadAddrs, err := s.allocateIPs(req.WorkloadId)
+	if err != nil {
+		return nil, status.Errorf(codes.ResourceExhausted, "cannot allocate IPs: %v", err)
+	}
+
+	linkAttrs := netlink.NewLinkAttrs()
+	linkAttrs.Group = node.LinkGroupK8sPod
+	linkAttrs.Name = intf
+	linkAttrs.HardwareAddr = firstHopMAC
+
+	netns, err := netns.GetFromPath(req.GetNetns().GetNetnsPath())
+	if err != nil {
+		return nil, fmt.Errorf("cannot open network namespace: %w", err)
+	}
+	defer netns.Close()
+
+	nsHandle, err := netlink.NewHandleAt(netns, unix.NETLINK_ROUTE)
+	if err != nil {
+		return nil, fmt.Errorf("unable to get ns handle: %w", err)
+	}
+	defer nsHandle.Close()
+
+	hostIf := netlink.Veth{LinkAttrs: linkAttrs, PeerName: req.GetNetns().GetIfName(), PeerNamespace: netlink.NsFd(netns)}
+	if err := netlink.LinkAdd(&hostIf); err != nil {
+		return nil, fmt.Errorf("unable to create veth pair: %w", err)
+	}
+	// Linux is currently unable to assign aliases on interface creation.
+	if err := netlink.LinkSetAlias(&hostIf, "wk"+req.WorkloadId); err != nil {
+		return nil, fmt.Errorf("failed to assign alias: %w", err)
+	}
+	if err := netlink.LinkSetUp(&hostIf); err != nil {
+		return nil, fmt.Errorf("failed to set host up: %w", err)
+	}
+
+	// Loopback is always at index 1 by convention
+	loIf, err := nsHandle.LinkByIndex(1)
+	if err != nil {
+		return nil, fmt.Errorf("unable to get loopback interface in namespace: %w", err)
+	}
+	if err := nsHandle.LinkSetUp(loIf); err != nil {
+		return nil, fmt.Errorf("failed to set loopback up: %w", err)
+	}
+
+	workloadIf, err := nsHandle.LinkByName(req.GetNetns().GetIfName())
+	if err != nil {
+		return nil, fmt.Errorf("unable to get just-created peer interface in namespace: %w", err)
+	}
+	if err := nsHandle.LinkSetUp(workloadIf); err != nil {
+		return nil, fmt.Errorf("failed to set peer up: %w", err)
+	}
+	var outAddrs [][]byte
+	for _, workloadIP := range workloadAddrs {
+		outAddrs = append(outAddrs, workloadIP)
+		defaultMask := net.CIDRMask(0, 32) // /0
+		zeroIP := net.IPv4zero
+		hostMask := net.CIDRMask(32, 32) // /32
+		firstHop := firstHopV4
+		if workloadIP.To4() == nil {
+			defaultMask = net.CIDRMask(0, 128) // /0
+			zeroIP = net.IPv6zero
+			hostMask = net.CIDRMask(128, 128) // /128
+			firstHop = firstHopV6
+		}
+
+		if err := netlink.RouteAdd(&netlink.Route{
+			Dst:       &net.IPNet{IP: workloadIP, Mask: hostMask},
+			LinkIndex: hostIf.Index,
+			Scope:     netlink.SCOPE_UNIVERSE,
+			Protocol:  unix.RTPROT_STATIC,
+		}); err != nil {
+			return nil, fmt.Errorf("failed to add host to workload route: %w", err)
+		}
+
+		if err := nsHandle.AddrAdd(workloadIf, &netlink.Addr{
+			IPNet: &net.IPNet{IP: workloadIP, Mask: hostMask},
+		}); err != nil {
+			return nil, fmt.Errorf("failed to add address: %w", err)
+		}
+		// Use dedicated on-link route instead of RTNH_F_ONLINK which gVisor
+		// doesn't understand.
+		if err := nsHandle.RouteAdd(&netlink.Route{
+			Dst:       &net.IPNet{IP: firstHop, Mask: hostMask},
+			Scope:     netlink.SCOPE_LINK,
+			Protocol:  unix.RTPROT_STATIC,
+			LinkIndex: workloadIf.Attrs().Index,
+		}); err != nil {
+			return nil, fmt.Errorf("failed to add peer route: %w", err)
+		}
+		if err := nsHandle.RouteAdd(&netlink.Route{
+			Dst:       &net.IPNet{IP: zeroIP, Mask: defaultMask},
+			Gw:        firstHop,
+			Scope:     netlink.SCOPE_UNIVERSE,
+			Protocol:  unix.RTPROT_STATIC,
+			LinkIndex: workloadIf.Attrs().Index,
+			Src:       workloadIP,
+		}); err != nil {
+			return nil, fmt.Errorf("failed to add default route: %w", err)
+		}
+	}
+	return &wlapi.AttachResponse{Ip: outAddrs}, nil
+}
+
+// Detach deletes the workload's host interface and releases its name and IPs.
+func (s *Service) Detach(ctx context.Context, req *wlapi.DetachRequest) (*wlapi.DetachResponse, error) {
+	defer s.deallocateIntfName(req.WorkloadId)
+	defer s.deallocateIPs(req.WorkloadId)
+	intf, ok := s.getIntfName(req.WorkloadId)
+	if !ok {
+		return &wlapi.DetachResponse{}, nil
+	}
+
+	hostIf, err := netlink.LinkByName(intf)
+	if errors.As(err, &netlink.LinkNotFoundError{}) {
+		// CNI requires that DEL calls return success if the interface in
+		// question does not exist.
+		return &wlapi.DetachResponse{}, nil
+	}
+	if err != nil {
+		return nil, status.Errorf(codes.Unavailable, "error getting interface for deletion: %v", err)
+	}
+	if hostIf.Attrs().Group != node.LinkGroupK8sPod {
+		return nil, status.Errorf(codes.InvalidArgument, "refusing to delete interface not belonging to workload, has group %d", hostIf.Attrs().Group)
+	}
+	// Linux already cleans up routes and addresses when the link is deleted.
+	if err := netlink.LinkDel(hostIf); err != nil {
+		return nil, status.Errorf(codes.Unavailable, "unable to delete veth interface: %v", err)
+	}
+	return &wlapi.DetachResponse{}, nil
+}
+
+func (s *Service) Status(ctx context.Context, req *wlapi.StatusRequest) (*wlapi.StatusResponse, error) {
+	s.mux.Lock()
+	defer s.mux.Unlock()
+
+	if len(s.workloadNets) == 0 {
+		return nil, status.Errorf(codes.Unavailable, "no prefixes available")
+	}
+
+	return &wlapi.StatusResponse{}, nil
+}
diff --git a/metropolis/node/core/roleserve/roleserve.go b/metropolis/node/core/roleserve/roleserve.go
index 1f9604c..4c1f610 100644
--- a/metropolis/node/core/roleserve/roleserve.go
+++ b/metropolis/node/core/roleserve/roleserve.go
@@ -66,6 +66,8 @@
 	// Network is a handle to the network service, used by workloads.
 	Network *network.Service
 
+	PodNetwork *memory.Value[*clusternet.Prefixes]
+
 	// resolver is the main, long-lived, authenticated cluster resolver that is used
 	// for all subsequent gRPC calls by the subordinates of the roleserver. It is
 	// created early in the roleserver lifecycle, and is seeded with node
@@ -86,7 +88,6 @@
 	KubernetesStatus      memory.Value[*KubernetesStatus]
 	bootstrapData         memory.Value[*BootstrapData]
 	LocalRoles            memory.Value[*cpb.NodeRoles]
-	podNetwork            memory.Value[*clusternet.Prefixes]
 	clusterDirectorySaved memory.Value[bool]
 	localControlPlane     memory.Value[*localControlPlane]
 	CuratorConnection     memory.Value[*CuratorConnection]
@@ -141,7 +142,7 @@
 		curatorConnection: &s.CuratorConnection,
 
 		kubernetesStatus: &s.KubernetesStatus,
-		podNetwork:       &s.podNetwork,
+		podNetwork:       s.Config.PodNetwork,
 	}
 
 	s.rolefetch = &workerRoleFetch{
@@ -161,7 +162,7 @@
 		storageRoot: s.StorageRoot,
 
 		curatorConnection: &s.CuratorConnection,
-		podNetwork:        &s.podNetwork,
+		podNetwork:        s.Config.PodNetwork,
 		network:           s.Network,
 	}
 
diff --git a/metropolis/node/core/update/e2e/testos/main.go b/metropolis/node/core/update/e2e/testos/main.go
index 0b40127..87db99f 100644
--- a/metropolis/node/core/update/e2e/testos/main.go
+++ b/metropolis/node/core/update/e2e/testos/main.go
@@ -30,7 +30,7 @@
 
 func testosRunnable(ctx context.Context) error {
 	supervisor.Logger(ctx).Info("TESTOS_VARIANT=" + Variant)
-	networkSvc := network.New(nil, nil)
+	networkSvc := network.New(nil, nil, nil)
 	networkSvc.DHCPVendorClassID = "dev.monogon.testos.v1"
 	supervisor.Run(ctx, "networking", networkSvc.Run)