Add service proxy

This adds a Kubernetes service proxy based on nfproxy, together with the changes to service IP allocation required to make it work.
It also adds masquerading of traffic leaving through the external interface to provide outbound network connectivity.

Test Plan:
Currently manually tested by creating an alpine pod and running 'apk add curl && curl -k https://192.168.188.1:443/'.
Will be covered later by CTS.

Bug: T810

X-Origin-Diff: phab/D580
GitOrigin-RevId: cace863fd8c2f045560f8abf84c40cc77bc275d4
diff --git a/build/fietsje/main.go b/build/fietsje/main.go
index f227d9b..c62d6c7 100644
--- a/build/fietsje/main.go
+++ b/build/fietsje/main.go
@@ -110,6 +110,13 @@
 		"github.com/mdlayher/genetlink",
 	)
 
+	p.collect(
+		"github.com/sbezverk/nfproxy", "7fac5f39824e7f34228b08ba8b7640770ca6a9f4",
+		patches("nfproxy.patch"),
+	).use(
+		"github.com/sbezverk/nftableslib",
+	)
+
 	// First generate the repositories starlark rule into memory. This is because rendering will lock all unlocked
 	// dependencies, which might take a while. If a use were to interrupt it now, they would end up with an incomplete
 	// repositories.bzl and would have to restore from git.
diff --git a/core/internal/kubernetes/BUILD.bazel b/core/internal/kubernetes/BUILD.bazel
index 3bcbe6a..6b5d652 100644
--- a/core/internal/kubernetes/BUILD.bazel
+++ b/core/internal/kubernetes/BUILD.bazel
@@ -17,6 +17,7 @@
         "//core/internal/common:go_default_library",
         "//core/internal/common/supervisor:go_default_library",
         "//core/internal/kubernetes/clusternet:go_default_library",
+        "//core/internal/kubernetes/nfproxy:go_default_library",
         "//core/internal/kubernetes/pki:go_default_library",
         "//core/internal/kubernetes/reconciler:go_default_library",
         "//core/internal/localstorage:go_default_library",
diff --git a/core/internal/kubernetes/clusternet/clusternet.go b/core/internal/kubernetes/clusternet/clusternet.go
index 5c42bb8..e41ba8a 100644
--- a/core/internal/kubernetes/clusternet/clusternet.go
+++ b/core/internal/kubernetes/clusternet/clusternet.go
@@ -108,6 +108,7 @@
 		}
 		allowedIPs = append(allowedIPs, *podNet)
 	}
+	allowedIPs = append(allowedIPs, net.IPNet{IP: internalIP, Mask: net.CIDRMask(32, 32)})
 	s.logger.Debug("Adding/Updating WireGuard peer node", zap.String("node", newNode.Name),
 		zap.String("endpointIP", internalIP.String()), zap.Any("allowedIPs", allowedIPs))
 	// WireGuard's kernel side has create/update semantics on peers by default. So we can just add the peer multiple
diff --git a/core/internal/kubernetes/nfproxy/BUILD.bazel b/core/internal/kubernetes/nfproxy/BUILD.bazel
new file mode 100644
index 0000000..4bc7ab7
--- /dev/null
+++ b/core/internal/kubernetes/nfproxy/BUILD.bazel
@@ -0,0 +1,22 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+    name = "go_default_library",
+    srcs = ["nfproxy.go"],
+    importpath = "git.monogon.dev/source/nexantic.git/core/internal/kubernetes/nfproxy",
+    visibility = ["//core:__subpackages__"],
+    deps = [
+        "//core/internal/common/supervisor:go_default_library",
+        "@com_github_sbezverk_nfproxy//pkg/controller:go_default_library",
+        "@com_github_sbezverk_nfproxy//pkg/nftables:go_default_library",
+        "@com_github_sbezverk_nfproxy//pkg/proxy:go_default_library",
+        "@io_k8s_api//core/v1:go_default_library",
+        "@io_k8s_apimachinery//pkg/apis/meta/v1:go_default_library",
+        "@io_k8s_apimachinery//pkg/labels:go_default_library",
+        "@io_k8s_apimachinery//pkg/selection:go_default_library",
+        "@io_k8s_client_go//informers:go_default_library",
+        "@io_k8s_client_go//kubernetes:go_default_library",
+        "@io_k8s_client_go//kubernetes/scheme:go_default_library",
+        "@io_k8s_client_go//tools/record:go_default_library",
+    ],
+)
diff --git a/core/internal/kubernetes/nfproxy/nfproxy.go b/core/internal/kubernetes/nfproxy/nfproxy.go
new file mode 100644
index 0000000..25962bf
--- /dev/null
+++ b/core/internal/kubernetes/nfproxy/nfproxy.go
@@ -0,0 +1,104 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package nfproxy is a Kubernetes Service IP proxy based exclusively on the Linux nftables interface.
+// It uses netfilter's NAT capabilities to accept traffic on service IPs and DNAT it to the respective endpoint.
+package nfproxy
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net"
+	"os"
+	"time"
+
+	"git.monogon.dev/source/nexantic.git/core/internal/common/supervisor"
+
+	"github.com/sbezverk/nfproxy/pkg/controller"
+	"github.com/sbezverk/nfproxy/pkg/nftables"
+	"github.com/sbezverk/nfproxy/pkg/proxy"
+	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/selection"
+	kubeinformers "k8s.io/client-go/informers"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/kubernetes/scheme"
+	"k8s.io/client-go/tools/record"
+)
+
+type Service struct { // Service holds the configuration consumed by Run.
+	// ClusterCIDR is the cluster network (IPv4 or IPv6). Traffic in it is assumed to originate inside the cluster and will not be SNATed.
+	ClusterCIDR net.IPNet
+	// ClientSet is a Kubernetes ClientSet with read access to endpoints and services; Run uses it to watch Services and EndpointSlices.
+	ClientSet kubernetes.Interface
+}
+
+func (s *Service) Run(ctx context.Context) error { // Run sets up nftables and the nfproxy controllers, signals the supervisor and returns.
+	var ipv4ClusterCIDR string // stays empty unless ClusterCIDR is IPv4
+	var ipv6ClusterCIDR string // stays empty unless ClusterCIDR is IPv6
+	if s.ClusterCIDR.IP.To4() == nil && s.ClusterCIDR.IP.To16() != nil { // no 4-byte form but a valid 16-byte one: IPv6
+		ipv6ClusterCIDR = s.ClusterCIDR.String()
+	} else if s.ClusterCIDR.IP.To4() != nil {
+		ipv4ClusterCIDR = s.ClusterCIDR.String()
+	} else { // neither form exists, e.g. the IP was never set
+		return errors.New("invalid ClusterCIDR")
+	}
+	nfti, err := nftables.InitNFTables(ipv4ClusterCIDR, ipv6ClusterCIDR) // exactly one of the two arguments is non-empty
+	if err != nil {
+		return fmt.Errorf("failed to initialize nftables with error: %w", err)
+	}
+
+	// Create the event recorder handed to the proxy below; events are attributed to component "nfproxy" on this host.
+	hostname, err := os.Hostname()
+	if err != nil {
+		return fmt.Errorf("failed to get local host name with error: %w", err)
+	}
+	eventBroadcaster := record.NewBroadcaster() // NOTE(review): no sink is attached in this function — confirm events are delivered anywhere
+	recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "nfproxy", Host: hostname})
+
+	// Create new proxy controller with endpoint slices enabled (the trailing `true` argument)
+	// https://kubernetes.io/docs/concepts/services-networking/endpoint-slices/
+	nfproxy := proxy.NewProxy(nfti, hostname, recorder, true)
+
+	// Build a label selector matching only objects without the v1.IsHeadlessService label, so headless services are not tracked
+	noHeadlessEndpoints, err := labels.NewRequirement(v1.IsHeadlessService, selection.DoesNotExist, nil)
+	if err != nil {
+		return fmt.Errorf("failed to create Requirement for noHeadlessEndpoints: %w", err)
+	}
+	labelSelector := labels.NewSelector()
+	labelSelector = labelSelector.Add(*noHeadlessEndpoints)
+
+	kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(s.ClientSet, time.Minute*5, // 5 minute resync period
+		kubeinformers.WithTweakListOptions(func(options *metav1.ListOptions) {
+			options.LabelSelector = labelSelector.String() // set on the factory, so presumably applied to both informers below
+		}))
+
+	svcController := controller.NewServiceController(nfproxy, s.ClientSet, kubeInformerFactory.Core().V1().Services())
+	ep := controller.NewEndpointSliceController(nfproxy, s.ClientSet, kubeInformerFactory.Discovery().V1beta1().EndpointSlices())
+	kubeInformerFactory.Start(ctx.Done()) // ctx.Done() is the stop channel for the informers
+
+	if err = svcController.Start(ctx.Done()); err != nil { // presumably non-blocking, as the signals below are sent afterwards — TODO confirm
+		return fmt.Errorf("error running Service controller: %w", err)
+	}
+	if err = ep.Start(ctx.Done()); err != nil {
+		return fmt.Errorf("error running endpoint controller: %w", err)
+	}
+	supervisor.Signal(ctx, supervisor.SignalHealthy)
+	supervisor.Signal(ctx, supervisor.SignalDone) // NOTE(review): presumably marks the nil return below as intentional — confirm in supervisor docs
+	return nil
+}
diff --git a/core/internal/kubernetes/pki/kubernetes.go b/core/internal/kubernetes/pki/kubernetes.go
index 48ce6e9..0de8f6d 100644
--- a/core/internal/kubernetes/pki/kubernetes.go
+++ b/core/internal/kubernetes/pki/kubernetes.go
@@ -103,7 +103,7 @@
 			"kubernetes.default.svc.cluster.local",
 			"localhost",
 		},
-		[]net.IP{{127, 0, 0, 1}}, // TODO(q3k): add service network internal apiserver address
+		[]net.IP{{10, 0, 255, 1}, {127, 0, 0, 1}}, // TODO(q3k): add service network internal apiserver address
 	))
 	make(IdCA, KubeletClient, Client("smalltown:apiserver-kubelet-client", nil))
 	make(IdCA, ControllerManagerClient, Client("system:kube-controller-manager", nil))
diff --git a/core/internal/kubernetes/service.go b/core/internal/kubernetes/service.go
index 2396066..a22b6b9 100644
--- a/core/internal/kubernetes/service.go
+++ b/core/internal/kubernetes/service.go
@@ -33,6 +33,7 @@
 
 	"git.monogon.dev/source/nexantic.git/core/internal/common/supervisor"
 	"git.monogon.dev/source/nexantic.git/core/internal/kubernetes/clusternet"
+	"git.monogon.dev/source/nexantic.git/core/internal/kubernetes/nfproxy"
 	"git.monogon.dev/source/nexantic.git/core/internal/kubernetes/pki"
 	"git.monogon.dev/source/nexantic.git/core/internal/kubernetes/reconciler"
 	"git.monogon.dev/source/nexantic.git/core/internal/localstorage"
@@ -156,6 +157,11 @@
 		DataDirectory:   &s.c.Root.Data.Kubernetes.ClusterNetworking,
 	}
 
+	nfproxy := nfproxy.Service{
+		ClusterCIDR: s.c.ClusterNet,
+		ClientSet:   clientSet,
+	}
+
 	for _, sub := range []struct {
 		name     string
 		runnable supervisor.Runnable
@@ -168,6 +174,7 @@
 		{"csi-plugin", csiPlugin.Run},
 		{"csi-provisioner", csiProvisioner.Run},
 		{"clusternet", clusternet.Run},
+		{"nfproxy", nfproxy.Run},
 	} {
 		err := supervisor.Run(ctx, sub.name, sub.runnable)
 		if err != nil {
diff --git a/core/internal/network/BUILD.bazel b/core/internal/network/BUILD.bazel
index 9eefc1b..ad7de74 100644
--- a/core/internal/network/BUILD.bazel
+++ b/core/internal/network/BUILD.bazel
@@ -8,6 +8,8 @@
     deps = [
         "//core/internal/common/supervisor:go_default_library",
         "//core/internal/network/dhcp:go_default_library",
+        "@com_github_google_nftables//:go_default_library",
+        "@com_github_google_nftables//expr:go_default_library",
         "@com_github_vishvananda_netlink//:go_default_library",
         "@org_golang_x_sys//unix:go_default_library",
         "@org_uber_go_zap//:go_default_library",
diff --git a/core/internal/network/main.go b/core/internal/network/main.go
index ac9ce46..c92b21a 100644
--- a/core/internal/network/main.go
+++ b/core/internal/network/main.go
@@ -23,6 +23,9 @@
 	"net"
 	"os"
 
+	"github.com/google/nftables"
+	"github.com/google/nftables/expr"
+
 	"github.com/vishvananda/netlink"
 	"go.uber.org/zap"
 	"golang.org/x/sys/unix"
@@ -97,6 +100,13 @@
 	return nil
 }
 
+// nfifname converts an interface name into 16 bytes padded with zeroes (for nftables); names of 16+ bytes are cut to 16 with no NUL.
+func nfifname(n string) []byte {
+	b := make([]byte, 16)
+	copy(b, []byte(n+"\x00")) // make already zero-fills b, so the explicit NUL is redundant but harmless
+	return b
+}
+
 func (s *Service) useInterface(ctx context.Context, iface netlink.Link) error {
 	err := supervisor.Run(ctx, "dhcp", s.dhcp.Run(iface))
 	if err != nil {
@@ -115,6 +125,40 @@
 		s.logger.Warn("failed to add routes", zap.Error(err))
 	}
 
+	c := nftables.Conn{}
+
+	nat := c.AddTable(&nftables.Table{
+		Family: nftables.TableFamilyIPv4,
+		Name:   "nat",
+	})
+
+	postrouting := c.AddChain(&nftables.Chain{
+		Name:     "postrouting",
+		Hooknum:  nftables.ChainHookPostrouting,
+		Priority: nftables.ChainPriorityNATSource,
+		Table:    nat,
+		Type:     nftables.ChainTypeNAT,
+	})
+
+	// Masquerade/SNAT all traffic going out of the external interface
+	c.AddRule(&nftables.Rule{
+		Table: nat,
+		Chain: postrouting,
+		Exprs: []expr.Any{
+			&expr.Meta{Key: expr.MetaKeyOIFNAME, Register: 1},
+			&expr.Cmp{
+				Op:       expr.CmpOpEq,
+				Register: 1,
+				Data:     nfifname(iface.Attrs().Name),
+			},
+			&expr.Masq{},
+		},
+	})
+
+	if err := c.Flush(); err != nil {
+		panic(err)
+	}
+
 	return nil
 }
 
diff --git a/nogo_config.json b/nogo_config.json
index f3f47ff..19e74f5 100644
--- a/nogo_config.json
+++ b/nogo_config.json
@@ -76,7 +76,8 @@
     "exclude_files": {
       "external/io_k8s_kubernetes/": "third_party",
       "external/runc/vendor/github.com/vishvananda/netlink": "third_party",
-      "external/com_github_google_gvisor/": "third_party"
+      "external/com_github_google_gvisor/": "third_party",
+      "external/com_github_sbezverk_nfproxy/": "third_party"
     }
   },
   "structtag": {
diff --git a/third_party/go/patches/nfproxy.patch b/third_party/go/patches/nfproxy.patch
new file mode 100644
index 0000000..307d3d4
--- /dev/null
+++ b/third_party/go/patches/nfproxy.patch
@@ -0,0 +1,36 @@
+Copyright 2020 The Monogon Project Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+
+Fix nfproxy compatibility with our version of Kubernetes/utilproxy
+--- com_github_sbezverk_nfproxy.orig/pkg/proxy/proxy_service.go	2020-07-16 14:24:06.901176302 +0200
++++ com_github_sbezverk_nfproxy/pkg/proxy/proxy_service.go	2020-07-16 14:08:34.118927035 +0200
+@@ -22,7 +22,6 @@
+ 	utilnftables "github.com/google/nftables"
+ 	"github.com/sbezverk/nfproxy/pkg/nftables"
+ 	v1 "k8s.io/api/core/v1"
+-	"k8s.io/apimachinery/pkg/types"
+ 	"k8s.io/klog"
+ 	utilproxy "k8s.io/kubernetes/pkg/proxy/util"
+ 	utilnet "k8s.io/utils/net"
+@@ -44,8 +43,7 @@
+ 		stickySeconds := int(*svc.Spec.SessionAffinityConfig.ClientIP.TimeoutSeconds)
+ 		klog.V(5).Infof("Service %s/%s has SessionAffinity set for %d seconds", svc.Namespace, svc.Name, stickySeconds)
+ 	}
+-	svcName := types.NamespacedName{Namespace: svc.Namespace, Name: svc.Name}
+-	if utilproxy.ShouldSkipService(svcName, svc) {
++	if utilproxy.ShouldSkipService(svc) {
+ 		return
+ 	}
+ 	for i := range svc.Spec.Ports {
diff --git a/third_party/go/repositories.bzl b/third_party/go/repositories.bzl
index c4e21c6..042aadb 100644
--- a/third_party/go/repositories.bzl
+++ b/third_party/go/repositories.bzl
@@ -1258,6 +1258,22 @@
         sum = "h1:0U2s5loxrTy6/VgfVoLuVLFJcURKLH49ie0zSch7gh4=",
     )
     go_repository(
+        name = "com_github_sbezverk_nfproxy",
+        importpath = "github.com/sbezverk/nfproxy",
+        version = "v0.0.0-20200514180651-7fac5f39824e",
+        sum = "h1:fJ2lHQ7ZUjmgJbvVQ509ioBmrGHcbvlwfjUieExw/dU=",
+        patches = [
+            "//third_party/go/patches:nfproxy.patch",
+        ],
+        patch_args = ["-p1"],
+    )
+    go_repository(
+        name = "com_github_sbezverk_nftableslib",
+        importpath = "github.com/sbezverk/nftableslib",
+        version = "v0.0.0-20200402150358-c20bed91f482",
+        sum = "h1:k7gEZ/EwJhHDTRXFUZQlE4/p1cmoha7zL7PWCDG3ZHQ=",
+    )
+    go_repository(
         name = "com_github_seccomp_libseccomp_golang",
         importpath = "github.com/seccomp/libseccomp-golang",
         version = "v0.9.1",
diff --git a/third_party/go/shelf.pb.text b/third_party/go/shelf.pb.text
index 85372be..9d77503 100644
--- a/third_party/go/shelf.pb.text
+++ b/third_party/go/shelf.pb.text
@@ -1959,6 +1959,20 @@
   semver: "v0.2.1-0.20190427202633-1595213edefa"
 >
 entry: <
+  import_path: "github.com/sbezverk/nfproxy"
+  version: "7fac5f39824e7f34228b08ba8b7640770ca6a9f4"
+  bazel_name: "com_github_sbezverk_nfproxy"
+  sum: "h1:fJ2lHQ7ZUjmgJbvVQ509ioBmrGHcbvlwfjUieExw/dU="
+  semver: "v0.0.0-20200514180651-7fac5f39824e"
+>
+entry: <
+  import_path: "github.com/sbezverk/nftableslib"
+  version: "v0.0.0-20200402150358-c20bed91f482"
+  bazel_name: "com_github_sbezverk_nftableslib"
+  sum: "h1:k7gEZ/EwJhHDTRXFUZQlE4/p1cmoha7zL7PWCDG3ZHQ="
+  semver: "v0.0.0-20200402150358-c20bed91f482"
+>
+entry: <
   import_path: "github.com/seccomp/libseccomp-golang"
   version: "689e3c1541a84461afc49c1c87352a6cedf72e9c"
   bazel_name: "com_github_seccomp_libseccomp_golang"
diff --git a/third_party/linux/linux-smalltown.config b/third_party/linux/linux-smalltown.config
index b4f2df1..e409a05 100644
--- a/third_party/linux/linux-smalltown.config
+++ b/third_party/linux/linux-smalltown.config
@@ -974,6 +974,8 @@
 CONFIG_NFT_REJECT=y
 CONFIG_NFT_REJECT_INET=y
 CONFIG_NFT_HASH=y
+CONFIG_NFT_FIB=y
+# CONFIG_NFT_FIB_INET is not set
 CONFIG_NFT_SOCKET=y
 CONFIG_NFT_OSF=y
 CONFIG_NFT_TPROXY=y
@@ -981,6 +983,7 @@
 CONFIG_NF_DUP_NETDEV=y
 CONFIG_NFT_DUP_NETDEV=y
 CONFIG_NFT_FWD_NETDEV=y
+# CONFIG_NFT_FIB_NETDEV is not set
 CONFIG_NF_FLOW_TABLE_INET=y
 CONFIG_NF_FLOW_TABLE=y
 # CONFIG_NETFILTER_XTABLES is not set
@@ -997,11 +1000,11 @@
 CONFIG_NF_TPROXY_IPV4=y
 CONFIG_NF_TABLES_IPV4=y
 CONFIG_NFT_REJECT_IPV4=y
-# CONFIG_NFT_DUP_IPV4 is not set
-# CONFIG_NFT_FIB_IPV4 is not set
+CONFIG_NFT_DUP_IPV4=y
+CONFIG_NFT_FIB_IPV4=y
 # CONFIG_NF_TABLES_ARP is not set
 # CONFIG_NF_FLOW_TABLE_IPV4 is not set
-# CONFIG_NF_DUP_IPV4 is not set
+CONFIG_NF_DUP_IPV4=y
 # CONFIG_NF_LOG_ARP is not set
 # CONFIG_NF_LOG_IPV4 is not set
 CONFIG_NF_REJECT_IPV4=y
@@ -1016,10 +1019,10 @@
 CONFIG_NF_TPROXY_IPV6=y
 CONFIG_NF_TABLES_IPV6=y
 CONFIG_NFT_REJECT_IPV6=y
-# CONFIG_NFT_DUP_IPV6 is not set
-# CONFIG_NFT_FIB_IPV6 is not set
+CONFIG_NFT_DUP_IPV6=y
+CONFIG_NFT_FIB_IPV6=y
 # CONFIG_NF_FLOW_TABLE_IPV6 is not set
-# CONFIG_NF_DUP_IPV6 is not set
+CONFIG_NF_DUP_IPV6=y
 CONFIG_NF_REJECT_IPV6=y
 # CONFIG_NF_LOG_IPV6 is not set
 # CONFIG_IP6_NF_IPTABLES is not set