m/node: implement container networking ourselves

This change gets rid of the CNI mechanism for configuring container
networking in favour of a split approach where the network service is
extended by a gRPC workload network service which handles all of the
work as well as a library which exposes just enough of go-cni's
interface to be a drop-in replacement in containerd, which then talks
to the workload network service.

This is a rather unconventional approach to doing things, as CNI itself
is a pluggable interface. The reason for doing it this way is that the
binary-executing interface of CNI has a huge spec which is also horrible
to convert into decent Go types, and being a binary-calling interface it
has inherent lifecycle, complexity and image size disadvantages. The part
of CNI that is actually used by containerd is tiny and its arguments are
well-specified and have decent Go types. This approach also avoids the
whole CNI caching mechanism, which adds further unnecessary complexity.

The reason for the split service model instead of implementing
everything in cniproxy is to allow for more complex logic and Monogon
control plane interfacing from the workload network service. Also this
will allow offloading the actual service to things like DPUs.

Right now there is some ugliness left to make this self-contained. Two
obvious examples are the piping through of the pod network event value
and the exclusion of the first (non-network) IP from the IP allocator.
These will eventually go away but are necessary to get this to work as a
standalone change.

Change-Id: I46c604b7dfd58da9e6ddd0a29241680d25a2a745
Reviewed-on: https://review.monogon.dev/c/monogon/+/4496
Reviewed-by: Jan Schär <jan@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/node/BUILD.bazel b/metropolis/node/BUILD.bazel
index 479f4d9..d568e37 100644
--- a/metropolis/node/BUILD.bazel
+++ b/metropolis/node/BUILD.bazel
@@ -86,17 +86,11 @@
         # Containerd config files
         "/containerd/conf/runsc.toml": "//metropolis/node/kubernetes/containerd:runsc.toml",
         "/containerd/conf/config.toml": "//metropolis/node/kubernetes/containerd:config.toml",
-        "/containerd/conf/cnispec.gojson": "//metropolis/node/kubernetes/containerd:cnispec.gojson",
 
         # Containerd preseed bundles
         "/containerd/preseed/k8s.io/preseedtest.tar": "//metropolis/test/e2e/preseedtest:preseedtest_tarball",
         "/containerd/preseed/k8s.io/pause.tar": "//metropolis/node/kubernetes/pause:pause_tarball",
 
-        # CNI Plugins
-        "/containerd/bin/cni/loopback": "@com_github_containernetworking_plugins//plugins/main/loopback",
-        "/containerd/bin/cni/ptp": "@com_github_containernetworking_plugins//plugins/main/ptp",
-        "/containerd/bin/cni/host-local": "@com_github_containernetworking_plugins//plugins/ipam/host-local",
-
         # Delve
         "/dlv": "@com_github_go_delve_delve//cmd/dlv:dlv",
 
diff --git a/metropolis/node/core/BUILD.bazel b/metropolis/node/core/BUILD.bazel
index de07e8c..3d92543 100644
--- a/metropolis/node/core/BUILD.bazel
+++ b/metropolis/node/core/BUILD.bazel
@@ -24,6 +24,7 @@
         "//go/logging",
         "//metropolis/node",
         "//metropolis/node/core/cluster",
+        "//metropolis/node/core/clusternet",
         "//metropolis/node/core/devmgr",
         "//metropolis/node/core/localstorage",
         "//metropolis/node/core/localstorage/declarative",
@@ -38,6 +39,7 @@
         "//metropolis/node/core/update",
         "//metropolis/proto/api",
         "//osbase/bringup",
+        "//osbase/event/memory",
         "//osbase/logtree",
         "//osbase/net/dns",
         "//osbase/supervisor",
diff --git a/metropolis/node/core/localstorage/directory_root.go b/metropolis/node/core/localstorage/directory_root.go
index d6d9995..32aa2d8 100644
--- a/metropolis/node/core/localstorage/directory_root.go
+++ b/metropolis/node/core/localstorage/directory_root.go
@@ -48,7 +48,7 @@
 	// TODO(q3k): do this automatically?
 	for _, d := range []declarative.DirectoryPlacement{
 		r.Ephemeral.Consensus,
-		r.Ephemeral.Containerd, r.Ephemeral.Containerd.Tmp, r.Ephemeral.Containerd.RunSC, r.Ephemeral.Containerd.IPAM,
+		r.Ephemeral.Containerd, r.Ephemeral.Containerd.Tmp, r.Ephemeral.Containerd.RunSC,
 		r.Ephemeral.FlexvolumePlugins,
 		r.ESP.Metropolis,
 	} {
diff --git a/metropolis/node/core/localstorage/storage.go b/metropolis/node/core/localstorage/storage.go
index 029ce0c..d21cad4 100644
--- a/metropolis/node/core/localstorage/storage.go
+++ b/metropolis/node/core/localstorage/storage.go
@@ -168,9 +168,6 @@
 	RunSCLogsFIFO declarative.File      `file:"runsc-logs.fifo"`
 	Tmp           declarative.Directory `dir:"tmp"`
 	RunSC         declarative.Directory `dir:"runsc"`
-	IPAM          declarative.Directory `dir:"ipam"`
-	CNI           declarative.Directory `dir:"cni"`
-	CNICache      declarative.Directory `dir:"cni-cache"` // Hardcoded @com_github_containernetworking_cni via patch
 }
 
 type TmpDirectory struct {
diff --git a/metropolis/node/core/main.go b/metropolis/node/core/main.go
index 2d80698..df9dd78 100644
--- a/metropolis/node/core/main.go
+++ b/metropolis/node/core/main.go
@@ -12,6 +12,7 @@
 
 	"source.monogon.dev/go/logging"
 	"source.monogon.dev/metropolis/node/core/cluster"
+	"source.monogon.dev/metropolis/node/core/clusternet"
 	"source.monogon.dev/metropolis/node/core/devmgr"
 	"source.monogon.dev/metropolis/node/core/localstorage"
 	"source.monogon.dev/metropolis/node/core/localstorage/declarative"
@@ -24,6 +25,7 @@
 	timesvc "source.monogon.dev/metropolis/node/core/time"
 	"source.monogon.dev/metropolis/node/core/update"
 	"source.monogon.dev/osbase/bringup"
+	"source.monogon.dev/osbase/event/memory"
 	"source.monogon.dev/osbase/logtree"
 	"source.monogon.dev/osbase/net/dns"
 	"source.monogon.dev/osbase/supervisor"
@@ -119,7 +121,8 @@
 	}
 
 	metrics.CoreRegistry.MustRegister(dns.MetricsRegistry)
-	networkSvc := network.New(nil, []string{"hosts", "kubernetes"})
+	var podNetwork memory.Value[*clusternet.Prefixes]
+	networkSvc := network.New(nil, []string{"hosts", "kubernetes"}, &podNetwork)
 	networkSvc.DHCPVendorClassID = "dev.monogon.metropolis.node.v1"
 	timeSvc := timesvc.New()
 	devmgrSvc := devmgr.New()
@@ -195,6 +198,7 @@
 		Resolver:    res,
 		LogTree:     supervisor.LogTree(ctx),
 		Update:      updateSvc,
+		PodNetwork:  &podNetwork,
 	})
 	if err := supervisor.Run(ctx, "role", rs.Run); err != nil {
 		return fmt.Errorf("failed to start role service: %w", err)
diff --git a/metropolis/node/core/network/BUILD.bazel b/metropolis/node/core/network/BUILD.bazel
index 22efbb4..b1a5e41 100644
--- a/metropolis/node/core/network/BUILD.bazel
+++ b/metropolis/node/core/network/BUILD.bazel
@@ -16,9 +16,12 @@
         "//go/algorithm/toposort",
         "//go/logging",
         "//metropolis/node",
+        "//metropolis/node/core/clusternet",
         "//metropolis/node/core/network/dhcp4c",
         "//metropolis/node/core/network/dhcp4c/callback",
+        "//metropolis/node/core/network/workloads",
         "//metropolis/node/core/productinfo",
+        "//osbase/event",
         "//osbase/event/memory",
         "//osbase/net/dns",
         "//osbase/net/dns/forward",
diff --git a/metropolis/node/core/network/main.go b/metropolis/node/core/network/main.go
index 3e069c3..623e62f 100644
--- a/metropolis/node/core/network/main.go
+++ b/metropolis/node/core/network/main.go
@@ -17,8 +17,11 @@
 	"github.com/vishvananda/netlink"
 
 	"source.monogon.dev/metropolis/node"
+	"source.monogon.dev/metropolis/node/core/clusternet"
 	"source.monogon.dev/metropolis/node/core/network/dhcp4c"
 	dhcpcb "source.monogon.dev/metropolis/node/core/network/dhcp4c/callback"
+	"source.monogon.dev/metropolis/node/core/network/workloads"
+	"source.monogon.dev/osbase/event"
 	"source.monogon.dev/osbase/event/memory"
 	"source.monogon.dev/osbase/net/dns"
 	"source.monogon.dev/osbase/net/dns/forward"
@@ -59,6 +62,8 @@
 
 	// Status is the current status of the network as seen by the service.
 	Status memory.Value[*node.NetStatus]
+
+	workloadSvc *workloads.Service
 }
 
 // New instantiates a new network service. If autoconfiguration is desired,
@@ -67,15 +72,21 @@
 // If dnsHandlerNames is non-nil, DNS handlers with these names must be set
 // on the DNS service with s.DNS.SetHandler. When serving DNS queries, they
 // will be tried in the order they appear here before forwarding.
-func New(staticConfig *netpb.Net, dnsHandlerNames []string) *Service {
+func New(staticConfig *netpb.Net, dnsHandlerNames []string, ipamPrefixSrc event.Value[*clusternet.Prefixes]) *Service {
 	dnsSvc := dns.New(slices.Concat(dnsHandlerNames, []string{"forward"}))
 	dnsForward := forward.New()
 	dnsSvc.SetHandler("forward", dnsForward)
 
+	var wlSvc *workloads.Service
+	if ipamPrefixSrc != nil {
+		wlSvc = workloads.New(ipamPrefixSrc)
+	}
+
 	return &Service{
 		DNS:          dnsSvc,
 		dnsForward:   dnsForward,
 		StaticConfig: staticConfig,
+		workloadSvc:  wlSvc,
 	}
 }
 
@@ -243,6 +254,10 @@
 	supervisor.Run(ctx, "dns", s.DNS.Run)
 	supervisor.Run(ctx, "dns-forward", s.dnsForward.Run)
 
+	if s.workloadSvc != nil {
+		supervisor.Run(ctx, "workloads", s.workloadSvc.Run)
+	}
+
 	s.natTable = s.nftConn.AddTable(&nftables.Table{
 		Family: nftables.TableFamilyIPv4,
 		Name:   "nat",
diff --git a/metropolis/node/core/network/workloads/BUILD.bazel b/metropolis/node/core/network/workloads/BUILD.bazel
new file mode 100644
index 0000000..357b892
--- /dev/null
+++ b/metropolis/node/core/network/workloads/BUILD.bazel
@@ -0,0 +1,21 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+    name = "workloads",
+    srcs = ["workloads.go"],
+    importpath = "source.monogon.dev/metropolis/node/core/network/workloads",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//metropolis/node",
+        "//metropolis/node/core/clusternet",
+        "//metropolis/node/core/network/workloads/spec",
+        "//osbase/event",
+        "//osbase/supervisor",
+        "@com_github_vishvananda_netlink//:netlink",
+        "@com_github_vishvananda_netns//:netns",
+        "@org_golang_google_grpc//:grpc",
+        "@org_golang_google_grpc//codes",
+        "@org_golang_google_grpc//status",
+        "@org_golang_x_sys//unix",
+    ],
+)
diff --git a/metropolis/node/core/network/workloads/spec/BUILD.bazel b/metropolis/node/core/network/workloads/spec/BUILD.bazel
new file mode 100644
index 0000000..5c6c5c6
--- /dev/null
+++ b/metropolis/node/core/network/workloads/spec/BUILD.bazel
@@ -0,0 +1,27 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
+load("@rules_proto//proto:defs.bzl", "proto_library")
+
+proto_library(
+    name = "metropolis_node_core_network_workloads_proto",
+    srcs = ["workload.proto"],
+    visibility = ["//visibility:public"],
+)
+
+go_proto_library(
+    name = "metropolis_node_core_network_workloads_go_proto",
+    compilers = [
+        "@io_bazel_rules_go//proto:go_proto",
+        "@io_bazel_rules_go//proto:go_grpc_v2",
+    ],
+    importpath = "source.monogon.dev/metropolis/node/core/network/workloads/spec",
+    proto = ":metropolis_node_core_network_workloads_proto",
+    visibility = ["//visibility:public"],
+)
+
+go_library(
+    name = "spec",
+    embed = [":metropolis_node_core_network_workloads_go_proto"],
+    importpath = "source.monogon.dev/metropolis/node/core/network/workloads/spec",
+    visibility = ["//visibility:public"],
+)
diff --git a/metropolis/node/core/network/workloads/spec/gomod-generated-placeholder.go b/metropolis/node/core/network/workloads/spec/gomod-generated-placeholder.go
new file mode 100644
index 0000000..f09cd57
--- /dev/null
+++ b/metropolis/node/core/network/workloads/spec/gomod-generated-placeholder.go
@@ -0,0 +1 @@
+package spec
diff --git a/metropolis/node/core/network/workloads/spec/workload.proto b/metropolis/node/core/network/workloads/spec/workload.proto
new file mode 100644
index 0000000..235763d
--- /dev/null
+++ b/metropolis/node/core/network/workloads/spec/workload.proto
@@ -0,0 +1,66 @@
+syntax = "proto3";
+
+package metropolis.node.core.network.workloads;
+
+message NetNSAttachment {
+    // Path to either an nsfs mountpoint or /proc/$pid/ns/net of the network
+    // namespace in which the workload to be attached is or will be running.
+    string netns_path = 1;
+    // Name of the network interface created in the given network namespace.
+    string if_name = 2;
+}
+
+message AttachRequest {
+    // Workload ID is the identity of the workload to attach to the network.
+    // Right now this is just the Kubernetes Pod ID.
+    string workload_id = 1;
+
+    // This will be extended to support things like vhost-user
+    // and PCIe VFs for DPUs.
+    NetNSAttachment netns = 2;
+}
+
+message AttachResponse {
+    repeated bytes ip = 1;
+}
+
+message DetachRequest {
+    // Workload ID is the identity of the workload to detach from the network.
+    // Right now this is just the Kubernetes Pod ID.
+    string workload_id = 1;
+
+    // This will be extended to support things like vhost-user
+    // and PCIe VFs for DPUs.
+    NetNSAttachment netns = 2;
+}
+
+message DetachResponse {
+
+}
+
+message StatusRequest {}
+message StatusResponse {}
+
+// The workload networking service attaches workloads to the network.
+// The service is served over a unix socket.
+// It is called by containerd, and possibly other workload runtimes later.
+// It is a replacement for the Container Network Interface (CNI);
+// see https://github.com/containernetworking/cni/blob/main/SPEC.md
+//
+// Concurrent calls are allowed if they don't have the same workload_id.
+// For a specific workload_id, Attach may not be followed by another Attach
+// without a Detach in-between. Detach may be called multiple times.
+service WorkloadNetworking {
+    // Attach the workload to the network. This allocates the workload IP
+    // addresses (at most one for IPv4 and IPv6), and sets up the main network
+    // interface. For network namespaces, it also enables the loopback
+    // interface.
+    rpc Attach(AttachRequest) returns (AttachResponse);
+    // Detach removes the interfaces and IP address allocation of the workload.
+    // It succeeds even if some or all resources don't exist, and removes those
+    // which do exist.
+    rpc Detach(DetachRequest) returns (DetachResponse);
+    // Status returns an error if the service is not ready.
+    rpc Status(StatusRequest) returns (StatusResponse);
+}
+
diff --git a/metropolis/node/core/network/workloads/workloads.go b/metropolis/node/core/network/workloads/workloads.go
new file mode 100644
index 0000000..2b4f5d3
--- /dev/null
+++ b/metropolis/node/core/network/workloads/workloads.go
@@ -0,0 +1,330 @@
+// Copyright The Monogon Project Authors.
+// SPDX-License-Identifier: Apache-2.0
+
+package workloads
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net"
+	"net/netip"
+	"os"
+	"sync"
+
+	"github.com/vishvananda/netlink"
+	"github.com/vishvananda/netns"
+	"golang.org/x/sys/unix"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+
+	"source.monogon.dev/metropolis/node"
+	"source.monogon.dev/metropolis/node/core/clusternet"
+	wlapi "source.monogon.dev/metropolis/node/core/network/workloads/spec"
+	"source.monogon.dev/osbase/event"
+	"source.monogon.dev/osbase/supervisor"
+)
+
+var (
+	firstHopV4 = net.IPv4(169, 254, 77, 1)
+	firstHopV6 = net.ParseIP("fe80::1")
+	// TODO: Replace prefix with Monogon OUI once we have it, right now
+	// it's just a random locally-administered MAC.
+	firstHopMAC = net.HardwareAddr{0x02, 0x9c, 0x52, 0xfe, 0x6d, 0x0a}
+)
+
+type Service struct {
+	mux          sync.Mutex
+	workloadNets []netip.Prefix
+	attachments  map[netip.Addr]string
+	// workloadToIntf maps workload name to short interface name.
+	workloadToIntf map[string]string
+	// intfUsed is the set of allocated short interface names.
+	intfUsed map[string]struct{}
+
+	k8sNodePrefix event.Value[*clusternet.Prefixes]
+}
+
+func New(k8sNodePrefix event.Value[*clusternet.Prefixes]) *Service {
+	return &Service{
+		workloadNets:   []netip.Prefix{},
+		attachments:    make(map[netip.Addr]string),
+		workloadToIntf: make(map[string]string),
+		intfUsed:       make(map[string]struct{}),
+		k8sNodePrefix:  k8sNodePrefix,
+	}
+}
+
+func (s *Service) allocateIPs(workloadId string) ([]net.IP, error) {
+	s.mux.Lock()
+	defer s.mux.Unlock()
+	// This is a really simple allocator as it won't stay for long. It just
+	// walks the entire prefix and finds the first free IP. The size of the
+	// map is bound to 2x256 (max pods per node) for its life so this is fine.
+	var addrs []netip.Addr
+	for _, wlNet := range s.workloadNets {
+		candidateAddr := wlNet.Addr()
+		// The second address is reserved by clusternet for the host loopback,
+		// this will go away with the clusternet refactor.
+		reservedForHost := wlNet.Addr().Next()
+		for s.attachments[candidateAddr] != "" || candidateAddr == reservedForHost {
+			candidateAddr = candidateAddr.Next()
+		}
+		// Allocator ran off the prefix
+		if !wlNet.Contains(candidateAddr) {
+			return nil, fmt.Errorf("no free IP addresses in prefix %v", wlNet)
+		}
+		addrs = append(addrs, candidateAddr)
+	}
+	var addrsOut []net.IP
+	for _, addr := range addrs {
+		s.attachments[addr] = workloadId
+		addrsOut = append(addrsOut, net.IP(addr.AsSlice()))
+	}
+	return addrsOut, nil
+}
+
+func (s *Service) deallocateIPs(workloadId string) {
+	s.mux.Lock()
+	defer s.mux.Unlock()
+	for ip, wlId := range s.attachments {
+		if wlId == workloadId {
+			delete(s.attachments, ip)
+		}
+	}
+}
+
+// allocateIntfName allocates a short interface name for the workload. This is
+// needed because interface names are limited to 15 characters.
+func (s *Service) allocateIntfName(workloadId string) (string, error) {
+	s.mux.Lock()
+	defer s.mux.Unlock()
+	if _, ok := s.workloadToIntf[workloadId]; ok {
+		return "", fmt.Errorf("workload %q already has an interface", workloadId)
+	}
+	intfPrefix := "wk" + workloadId[:min(len(workloadId), 8)] // clamp: IDs shorter than 8 bytes must not panic
+	intf := intfPrefix
+	for i := 0; ; i++ {
+		if _, ok := s.intfUsed[intf]; !ok {
+			break
+		}
+		if i > 0xffff {
+			return "", fmt.Errorf("too many interface name collisions for workload %q", workloadId)
+		}
+		intf = fmt.Sprintf("%s-%04x", intfPrefix, i)
+	}
+	s.workloadToIntf[workloadId] = intf
+	s.intfUsed[intf] = struct{}{}
+	return intf, nil
+}
+
+func (s *Service) getIntfName(workloadId string) (string, bool) {
+	s.mux.Lock()
+	defer s.mux.Unlock()
+	intf, ok := s.workloadToIntf[workloadId]
+	return intf, ok
+}
+
+func (s *Service) deallocateIntfName(workloadId string) {
+	s.mux.Lock()
+	defer s.mux.Unlock()
+	intf, ok := s.workloadToIntf[workloadId]
+	if !ok {
+		return
+	}
+	delete(s.workloadToIntf, workloadId)
+	delete(s.intfUsed, intf)
+}
+
+func (s *Service) Run(ctx context.Context) error {
+	l := supervisor.Logger(ctx)
+
+	srv := grpc.NewServer()
+	wlapi.RegisterWorkloadNetworkingServer(srv, s)
+	os.Remove("/ephemeral/workloadnet.sock")
+	lis, err := net.ListenUnix("unix", &net.UnixAddr{Net: "unix", Name: "/ephemeral/workloadnet.sock"})
+	if err != nil {
+		return fmt.Errorf("failed to listen: %w", err)
+	}
+	supervisor.Run(ctx, "api", supervisor.GRPCServer(srv, lis, true))
+	w := s.k8sNodePrefix.Watch()
+	defer w.Close()
+
+	lo, err := netlink.LinkByIndex(1)
+	if err != nil {
+		panic(err)
+	}
+	if err := netlink.AddrAdd(lo, &netlink.Addr{
+		IPNet: &net.IPNet{IP: firstHopV4, Mask: net.CIDRMask(32, 32)},
+		Label: "Router",
+		Scope: unix.RT_SCOPE_LINK,
+	}); err != nil && !errors.Is(err, unix.EEXIST) {
+		l.Errorf("Unable to add router IP: %v", err)
+	}
+
+	supervisor.Signal(ctx, supervisor.SignalHealthy)
+	// It's undefined what happens when the workloadNets actually change right
+	// now with K8s IPAM. So just assign new workloads to the new prefixes for
+	// now. With the Monogon IPAM implementation this will have defined
+	// behavior.
+	for {
+		prefixes, err := w.Get(ctx)
+		if err != nil {
+			return err
+		}
+		if prefixes != nil {
+			s.mux.Lock()
+			s.workloadNets = *prefixes
+			s.mux.Unlock()
+		}
+	}
+}
+
+func (s *Service) Attach(ctx context.Context, req *wlapi.AttachRequest) (*wlapi.AttachResponse, error) {
+	intf, err := s.allocateIntfName(req.WorkloadId)
+	if err != nil {
+		return nil, status.Errorf(codes.AlreadyExists, "cannot add interface: %v", err)
+	}
+	workloadAddrs, err := s.allocateIPs(req.WorkloadId)
+	if err != nil {
+		return nil, status.Errorf(codes.ResourceExhausted, "cannot allocate IPs: %v", err)
+	}
+
+	linkAttrs := netlink.NewLinkAttrs()
+	linkAttrs.Group = node.LinkGroupK8sPod
+	linkAttrs.Name = intf
+	linkAttrs.HardwareAddr = firstHopMAC
+
+	netns, err := netns.GetFromPath(req.GetNetns().NetnsPath)
+	if err != nil {
+		return nil, fmt.Errorf("cannot open network namespace: %w", err)
+	}
+	defer netns.Close()
+
+	nsHandle, err := netlink.NewHandleAt(netns, unix.NETLINK_ROUTE)
+	if err != nil {
+		return nil, fmt.Errorf("unable to get ns handle: %w", err)
+	}
+	defer nsHandle.Close()
+
+	hostIf := netlink.Veth{LinkAttrs: linkAttrs, PeerName: req.GetNetns().IfName, PeerNamespace: netlink.NsFd(netns)}
+	if err := netlink.LinkAdd(&hostIf); err != nil {
+		return nil, fmt.Errorf("unable to create veth pair: %w", err)
+	}
+	// Linux is currently unable to assign aliases on interface creation.
+	if err := netlink.LinkSetAlias(&hostIf, "wk"+req.WorkloadId); err != nil {
+		return nil, fmt.Errorf("failed to assign alias: %w", err)
+	}
+	if err := netlink.LinkSetUp(&hostIf); err != nil {
+		return nil, fmt.Errorf("failed to set host up: %w", err)
+	}
+
+	// Loopback is always at index 1 by convention
+	loIf, err := nsHandle.LinkByIndex(1)
+	if err != nil {
+		return nil, fmt.Errorf("unable to get loopback interface in namespace: %w", err)
+	}
+	if err := nsHandle.LinkSetUp(loIf); err != nil {
+		return nil, fmt.Errorf("failed to set loopback up: %w", err)
+	}
+
+	workloadIf, err := nsHandle.LinkByName(req.GetNetns().IfName)
+	if err != nil {
+		return nil, fmt.Errorf("unable to get just-created peer interface in namespace: %w", err)
+	}
+
+	if err := nsHandle.LinkSetUp(workloadIf); err != nil {
+		return nil, fmt.Errorf("failed to set peer up: %w", err)
+	}
+	var outAddrs [][]byte
+	for _, workloadIP := range workloadAddrs {
+		outAddrs = append(outAddrs, workloadIP)
+
+		defaultMask := net.CIDRMask(0, 32) // /0
+		zeroIP := net.IPv4zero
+		hostMask := net.CIDRMask(32, 32) // /32
+		firstHop := firstHopV4
+		if workloadIP.To4() == nil {
+			defaultMask = net.CIDRMask(0, 128) // /0
+			zeroIP = net.IPv6zero
+			hostMask = net.CIDRMask(128, 128) // /128
+			firstHop = firstHopV6
+		}
+
+		if err := netlink.RouteAdd(&netlink.Route{
+			Dst:       &net.IPNet{IP: workloadIP, Mask: hostMask},
+			LinkIndex: hostIf.Index,
+			Scope:     netlink.SCOPE_UNIVERSE,
+			Protocol:  unix.RTPROT_STATIC,
+		}); err != nil {
+			return nil, fmt.Errorf("failed to add host to workload route: %w", err)
+		}
+
+		if err := nsHandle.AddrAdd(workloadIf, &netlink.Addr{
+			IPNet: &net.IPNet{IP: workloadIP, Mask: hostMask},
+		}); err != nil {
+			return nil, fmt.Errorf("failed to add address: %w", err)
+		}
+		// Use dedicated on-link route instead of RTNH_F_ONLINK which gVisor
+		// doesn't understand.
+		if err := nsHandle.RouteAdd(&netlink.Route{
+			Dst:       &net.IPNet{IP: firstHop, Mask: hostMask},
+			Scope:     netlink.SCOPE_LINK,
+			Protocol:  unix.RTPROT_STATIC,
+			LinkIndex: workloadIf.Attrs().Index,
+		}); err != nil {
+			return nil, fmt.Errorf("failed to add peer route: %w", err)
+		}
+		if err := nsHandle.RouteAdd(&netlink.Route{
+			Dst:       &net.IPNet{IP: zeroIP, Mask: defaultMask},
+			Gw:        firstHop,
+			Scope:     netlink.SCOPE_UNIVERSE,
+			Protocol:  unix.RTPROT_STATIC,
+			LinkIndex: workloadIf.Attrs().Index,
+			Src:       workloadIP,
+		}); err != nil {
+			return nil, fmt.Errorf("failed to add default route: %w", err)
+		}
+	}
+	return &wlapi.AttachResponse{Ip: outAddrs}, nil
+}
+
+func (s *Service) Detach(ctx context.Context, req *wlapi.DetachRequest) (*wlapi.DetachResponse, error) {
+	defer s.deallocateIntfName(req.WorkloadId)
+	defer s.deallocateIPs(req.WorkloadId)
+	intf, ok := s.getIntfName(req.WorkloadId)
+	if !ok {
+		return &wlapi.DetachResponse{}, nil
+	}
+
+	hostIf, err := netlink.LinkByName(intf)
+	if errors.As(err, &netlink.LinkNotFoundError{}) {
+		// CNI requires that DEL calls return success if the interface in
+		// question does not exist.
+		return &wlapi.DetachResponse{}, nil
+	}
+	if err != nil {
+		return nil, status.Errorf(codes.Unavailable, "error getting interface for deletion: %v", err)
+	}
+	if hostIf.Attrs().Group != node.LinkGroupK8sPod {
+		return nil, status.Errorf(codes.InvalidArgument, "refusing to delete interface not belonging to workload, has group %d", hostIf.Attrs().Group)
+	}
+	// Routes and addresses do not need to be cleaned up as Linux already takes
+	// care of that when the link is deleted.
+	if err := netlink.LinkDel(hostIf); err != nil {
+		return nil, status.Errorf(codes.Unavailable, "unable to delete veth interface: %v", err)
+	}
+	return &wlapi.DetachResponse{}, nil
+}
+
+func (s *Service) Status(ctx context.Context, req *wlapi.StatusRequest) (*wlapi.StatusResponse, error) {
+	s.mux.Lock()
+	defer s.mux.Unlock()
+
+	if len(s.workloadNets) == 0 {
+		return nil, status.Errorf(codes.Unavailable, "no prefixes available")
+	}
+
+	return &wlapi.StatusResponse{}, nil
+}
diff --git a/metropolis/node/core/roleserve/roleserve.go b/metropolis/node/core/roleserve/roleserve.go
index 1f9604c..4c1f610 100644
--- a/metropolis/node/core/roleserve/roleserve.go
+++ b/metropolis/node/core/roleserve/roleserve.go
@@ -66,6 +66,8 @@
 	// Network is a handle to the network service, used by workloads.
 	Network *network.Service
 
+	PodNetwork *memory.Value[*clusternet.Prefixes]
+
 	// resolver is the main, long-lived, authenticated cluster resolver that is used
 	// for all subsequent gRPC calls by the subordinates of the roleserver. It is
 	// created early in the roleserver lifecycle, and is seeded with node
@@ -86,7 +88,6 @@
 	KubernetesStatus      memory.Value[*KubernetesStatus]
 	bootstrapData         memory.Value[*BootstrapData]
 	LocalRoles            memory.Value[*cpb.NodeRoles]
-	podNetwork            memory.Value[*clusternet.Prefixes]
 	clusterDirectorySaved memory.Value[bool]
 	localControlPlane     memory.Value[*localControlPlane]
 	CuratorConnection     memory.Value[*CuratorConnection]
@@ -141,7 +142,7 @@
 		curatorConnection: &s.CuratorConnection,
 
 		kubernetesStatus: &s.KubernetesStatus,
-		podNetwork:       &s.podNetwork,
+		podNetwork:       s.Config.PodNetwork,
 	}
 
 	s.rolefetch = &workerRoleFetch{
@@ -161,7 +162,7 @@
 		storageRoot: s.StorageRoot,
 
 		curatorConnection: &s.CuratorConnection,
-		podNetwork:        &s.podNetwork,
+		podNetwork:        s.Config.PodNetwork,
 		network:           s.Network,
 	}
 
diff --git a/metropolis/node/core/update/e2e/testos/main.go b/metropolis/node/core/update/e2e/testos/main.go
index 0b40127..87db99f 100644
--- a/metropolis/node/core/update/e2e/testos/main.go
+++ b/metropolis/node/core/update/e2e/testos/main.go
@@ -30,7 +30,7 @@
 
 func testosRunnable(ctx context.Context) error {
 	supervisor.Logger(ctx).Info("TESTOS_VARIANT=" + Variant)
-	networkSvc := network.New(nil, nil)
+	networkSvc := network.New(nil, nil, nil)
 	networkSvc.DHCPVendorClassID = "dev.monogon.testos.v1"
 	supervisor.Run(ctx, "networking", networkSvc.Run)
 
diff --git a/metropolis/node/kubernetes/containerd/cniproxy/BUILD.bazel b/metropolis/node/kubernetes/containerd/cniproxy/BUILD.bazel
new file mode 100644
index 0000000..a53db4d
--- /dev/null
+++ b/metropolis/node/kubernetes/containerd/cniproxy/BUILD.bazel
@@ -0,0 +1,18 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+    name = "cniproxy",
+    srcs = [
+        "cniproxy.go",
+        "cnitypes.go",
+        "opts.go",
+    ],
+    importpath = "source.monogon.dev/metropolis/node/kubernetes/containerd/cniproxy",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//metropolis/node/core/network/workloads/spec",
+        "@com_github_containernetworking_cni//pkg/types",
+        "@org_golang_google_grpc//:grpc",
+        "@org_golang_google_grpc//credentials/insecure",
+    ],
+)
diff --git a/metropolis/node/kubernetes/containerd/cniproxy/cniproxy.go b/metropolis/node/kubernetes/containerd/cniproxy/cniproxy.go
new file mode 100644
index 0000000..eabcfd3
--- /dev/null
+++ b/metropolis/node/kubernetes/containerd/cniproxy/cniproxy.go
@@ -0,0 +1,161 @@
+// Copyright The Monogon Project Authors.
+// SPDX-License-Identifier: Apache-2.0
+
+// Package cni implements an adapter between the go-cni interface and the
+// Monogon gRPC Workload Attachment interface. As we do not intend to actually
+// implement a CNI-compliant plugin, it makes more sense to cut out as much
+// unnecessary logic as possible and take over at the containerd API boundary.
+package cni
+
+import (
+	"context"
+	"fmt"
+	"net"
+
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/credentials/insecure"
+
+	wlapi "source.monogon.dev/metropolis/node/core/network/workloads/spec"
+)
+
+// New returns a CNI that talks to the workload network service over its unix
+// socket. The options exist for go-cni API compatibility and are ignored.
+func New(_ ...Opt) (CNI, error) {
+	conn, err := grpc.NewClient("unix:/ephemeral/workloadnet.sock", grpc.WithTransportCredentials(insecure.NewCredentials()))
+	if err != nil {
+		// Report the failure through the error return instead of panicking.
+		return nil, fmt.Errorf("creating workload network client: %w", err)
+	}
+	return &adapter{client: wlapi.NewWorkloadNetworkingClient(conn)}, nil
+}
+
+type NamespaceOpts func(n *Namespace) error
+
+// Namespace differs significantly from upstream as we do not have the actual
+// underlying CNI interface and thus we do not need to transform the data into
+// JSON keys.
+type Namespace struct {
+	labels      map[string]string
+	annotations map[string]string
+	portMapping []PortMapping
+	bandwidth   BandWidth
+	dns         DNS
+	cgroupPath  string
+}
+
+func WithLabels(labels map[string]string) NamespaceOpts {
+	return func(n *Namespace) error {
+		n.labels = labels
+		return nil
+	}
+}
+
+func WithCapability(name string, capability interface{}) NamespaceOpts {
+	return func(n *Namespace) error {
+		if name == "io.kubernetes.cri.pod-annotations" {
+			n.annotations = capability.(map[string]string)
+		}
+		return nil
+	}
+}
+
+func WithCapabilityPortMap(portMapping []PortMapping) NamespaceOpts {
+	return func(c *Namespace) error {
+		c.portMapping = portMapping
+		return nil
+	}
+}
+
+func WithCapabilityBandWidth(bandWidth BandWidth) NamespaceOpts {
+	return func(c *Namespace) error {
+		c.bandwidth = bandWidth
+		return nil
+	}
+}
+
+func WithCapabilityDNS(dns DNS) NamespaceOpts {
+	return func(c *Namespace) error {
+		c.dns = dns
+		return nil
+	}
+}
+
+func WithCapabilityCgroupPath(cgroupPath string) NamespaceOpts {
+	return func(c *Namespace) error {
+		c.cgroupPath = cgroupPath
+		return nil
+	}
+}
+
+type adapter struct {
+	client wlapi.WorkloadNetworkingClient
+}
+
+func (s *adapter) Setup(ctx context.Context, id string, path string, opts ...NamespaceOpts) (*Result, error) {
+	var n Namespace
+	for _, opt := range opts {
+		opt(&n)
+	}
+	res, err := s.client.Attach(ctx, &wlapi.AttachRequest{
+		WorkloadId: n.labels["K8S_POD_UID"],
+		Netns: &wlapi.NetNSAttachment{
+			NetnsPath: path,
+			IfName:    "eth0",
+		},
+	})
+	if err != nil {
+		return nil, fmt.Errorf("while requesting workload network attachment: %w", err)
+	}
+	// Provide IP to containerd/CRI, rest is ignored anyways.
+	var ipConfigs []*IPConfig
+	for _, ip := range res.Ip {
+		ipConfigs = append(ipConfigs, &IPConfig{IP: net.IP(ip)})
+	}
+	return &Result{
+		Interfaces: map[string]*Config{
+			"eth0": {
+				IPConfigs: ipConfigs,
+			},
+		},
+	}, nil
+}
+
+func (s *adapter) SetupSerially(ctx context.Context, id string, path string, opts ...NamespaceOpts) (*Result, error) {
+	// We do not support multiple plugins, the distinction between serial or
+	// parallel does not exist. Just forward the call.
+	return s.Setup(ctx, id, path, opts...)
+}
+
+func (s *adapter) Remove(ctx context.Context, id string, path string, opts ...NamespaceOpts) error {
+	var n Namespace
+	for _, opt := range opts {
+		opt(&n)
+	}
+
+	_, err := s.client.Detach(ctx, &wlapi.DetachRequest{
+		WorkloadId: n.labels["K8S_POD_UID"],
+		Netns: &wlapi.NetNSAttachment{
+			NetnsPath: path,
+			IfName:    "eth0",
+		},
+	})
+	return err
+}
+
+func (s *adapter) Check(ctx context.Context, id string, path string, opts ...NamespaceOpts) error {
+	return nil
+}
+
+func (s *adapter) Load(opts ...Opt) error {
+	// Stub, we do not actually have any CNI config.
+	return nil
+}
+
+func (s *adapter) Status() error {
+	_, err := s.client.Status(context.Background(), &wlapi.StatusRequest{})
+	return err
+}
+
+func (s *adapter) GetConfig() *ConfigResult {
+	return &ConfigResult{}
+}
diff --git a/metropolis/node/kubernetes/containerd/cniproxy/cnitypes.go b/metropolis/node/kubernetes/containerd/cniproxy/cnitypes.go
new file mode 100644
index 0000000..27bac36
--- /dev/null
+++ b/metropolis/node/kubernetes/containerd/cniproxy/cnitypes.go
@@ -0,0 +1,81 @@
+// Copyright The Monogon Project Authors.
+// Copyright The containerd Authors.
+// SPDX-License-Identifier: Apache-2.0
+
+package cni
+
+// This file contains types mostly or entirely lifted from go-cni but copied
+// here to allow API compatibility. Redefining these is not viable as their
+// references to other types would point to go-cni's types.
+
+import (
+	"context"
+	"net"
+
+	"github.com/containernetworking/cni/pkg/types"
+)
+
+type CNI interface {
+	// Setup sets up the network for the namespace.
+	Setup(ctx context.Context, id string, path string, opts ...NamespaceOpts) (*Result, error)
+	// SetupSerially sets up each of the network interfaces for the namespace in serial.
+	SetupSerially(ctx context.Context, id string, path string, opts ...NamespaceOpts) (*Result, error)
+	// Remove tears down the network of the namespace.
+	Remove(ctx context.Context, id string, path string, opts ...NamespaceOpts) error
+	// Check checks if the network is still in the desired state.
+	Check(ctx context.Context, id string, path string, opts ...NamespaceOpts) error
+	// Load loads the CNI network config.
+	Load(opts ...Opt) error
+	// Status checks the status of the CNI initialization.
+	Status() error
+	// GetConfig returns a copy of the CNI plugin configurations as parsed by CNI.
+	GetConfig() *ConfigResult
+}
+
+type PortMapping struct {
+	HostPort      int32
+	ContainerPort int32
+	Protocol      string
+	HostIP        string
+}
+
+// BandWidth defines the ingress/egress rate and burst limits
+type BandWidth struct {
+	IngressRate  uint64
+	IngressBurst uint64
+	EgressRate   uint64
+	EgressBurst  uint64
+}
+
+// DNS defines the dns config
+type DNS struct {
+	// List of DNS servers of the cluster.
+	Servers []string
+	// List of DNS search domains of the cluster.
+	Searches []string
+	// List of DNS options.
+	Options []string
+}
+
+type IPConfig struct {
+	IP      net.IP
+	Gateway net.IP
+}
+
+type Config struct {
+	IPConfigs  []*IPConfig
+	Mac        string
+	Sandbox    string
+	PciID      string
+	SocketPath string
+}
+
+type Result struct {
+	Interfaces map[string]*Config
+	DNS        []types.DNS
+	Routes     []*types.Route
+}
+
+// ConfigResult is not used by containerd and is a complex type, so it is left
+// as an empty struct for now.
+type ConfigResult struct{}
diff --git a/metropolis/node/kubernetes/containerd/cniproxy/opts.go b/metropolis/node/kubernetes/containerd/cniproxy/opts.go
new file mode 100644
index 0000000..0316276
--- /dev/null
+++ b/metropolis/node/kubernetes/containerd/cniproxy/opts.go
@@ -0,0 +1,58 @@
+// Copyright The Monogon Project Authors.
+// SPDX-License-Identifier: Apache-2.0
+
+package cni
+
+// Opt doesn't do anything as all configuration is ignored.
+type Opt func() error
+
+func noopOpt() error {
+	return nil
+}
+
+func WithConf(bytes []byte) Opt {
+	return noopOpt
+}
+
+func WithConfFile(fileName string) Opt {
+	return noopOpt
+}
+
+func WithConfIndex(bytes []byte, index int) Opt {
+	return noopOpt
+}
+
+func WithConfListBytes(bytes []byte) Opt {
+	return noopOpt
+}
+
+func WithConfListFile(fileName string) Opt {
+	return noopOpt
+}
+
+func WithInterfacePrefix(prefix string) Opt {
+	return noopOpt
+}
+
+func WithMinNetworkCount(count int) Opt {
+	return noopOpt
+}
+
+func WithPluginConfDir(dir string) Opt {
+	return noopOpt
+}
+
+func WithPluginDir(dirs []string) Opt {
+	return noopOpt
+}
+
+func WithPluginMaxConfNum(max int) Opt {
+	return noopOpt
+}
+
+func WithDefaultConf() error {
+	return nil
+}
+func WithLoNetwork() error {
+	return nil
+}
diff --git a/metropolis/node/kubernetes/containerd/cnispec.gojson b/metropolis/node/kubernetes/containerd/cnispec.gojson
deleted file mode 100644
index 4fca790..0000000
--- a/metropolis/node/kubernetes/containerd/cnispec.gojson
+++ /dev/null
@@ -1,31 +0,0 @@
-{{- /*gotype: github.com/containerd/cri/pkg/server.cniConfigTemplate*/ -}}
-{
-    "name": "k8s-pod-network",
-    "cniVersion": "0.3.1",
-    "plugins": [
-        {
-            "type": "ptp",
-            "mtu": 1420,
-            {{/* Must be node.LinkGroupK8sPod */}}
-            "linkGroup": 8,
-            "ipam": {
-                "type": "host-local",
-                "dataDir": "/ephemeral/containerd/ipam",
-                "ranges": [
-                    {{range $i, $range := .PodCIDRRanges}}{{if $i}},
-            {{end}}[
-            {
-                "subnet": "{{$range}}"
-            }
-        ]
-        {{end}}
-    ],
-    "routes": [
-        {{range $i, $route := .Routes}}{{if $i}},
-    {{end}}{
-    "dst": "{{$route}}"
-}{{end}}]
-}
-}
-]
-}
\ No newline at end of file
diff --git a/metropolis/node/kubernetes/containerd/config.toml b/metropolis/node/kubernetes/containerd/config.toml
index 88d9ea0..eff2c3d 100644
--- a/metropolis/node/kubernetes/containerd/config.toml
+++ b/metropolis/node/kubernetes/containerd/config.toml
@@ -142,11 +142,11 @@
             TypeUrl = "io.containerd.runsc.v1.options"
 
     [plugins."io.containerd.cri.v1.runtime".cni]
-      bin_dir = "/containerd/bin/cni"
-      conf_dir = "/ephemeral/containerd/cni"
+      bin_dir = ""
+      conf_dir = ""
       max_conf_num = 0
       setup_serially = false
-      conf_template = "/containerd/conf/cnispec.gojson"
+      conf_template = ""
       ip_pref = ""
       use_internal_loopback = false
 
diff --git a/metropolis/node/tools/tools.go b/metropolis/node/tools/tools.go
index 5377496..b021495 100644
--- a/metropolis/node/tools/tools.go
+++ b/metropolis/node/tools/tools.go
@@ -9,9 +9,6 @@
 import (
 	_ "github.com/containerd/containerd/v2/cmd/containerd"
 	_ "github.com/containerd/containerd/v2/cmd/containerd-shim-runc-v2"
-	_ "github.com/containernetworking/plugins/plugins/ipam/host-local"
-	_ "github.com/containernetworking/plugins/plugins/main/loopback"
-	_ "github.com/containernetworking/plugins/plugins/main/ptp"
 	_ "github.com/go-delve/delve/cmd/dlv"
 	_ "github.com/opencontainers/runc"
 	_ "github.com/prometheus/node_exporter"