m/node/core: fix up resolver keepalives

We set up keepalives in the resolver so that we can quickly detect that a
Curator is no longer available.

This fixes two bugs related to their use:

 1. When the resolver's curator connection dies (e.g. when we get kicked
    off by the curator for keepaliving too often), we recreate it from
    scratch and thus lose the 'keepalive backoff' state that the client
    connection carries, preventing that backoff mechanism from kicking in
    as intended.
 2. The server-side enforcement limits for client keepalives were
    stricter than the keepalive interval the resolver library uses, so
    the curator would kick resolvers off for pinging too often (see the
    sketch after this list).
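
As an illustration of the mismatch in point 2, here is a minimal Go
sketch of how the client keepalive parameters and the server-side
enforcement policy relate (placeholder address and insecure credentials;
the durations mirror the ones used in this change):

    package main

    import (
        "time"

        "google.golang.org/grpc"
        "google.golang.org/grpc/credentials/insecure"
        "google.golang.org/grpc/keepalive"
    )

    func main() {
        // Server side: permit keepalive pings as often as once per second,
        // even when the client has no RPC stream open.
        _ = grpc.NewServer(grpc.KeepaliveEnforcementPolicy(keepalive.EnforcementPolicy{
            MinTime:             time.Second,
            PermitWithoutStream: true,
        }))

        // Client side: ping every 10s, declare the connection dead after 5s
        // without an ack. Time must not be shorter than the server's MinTime,
        // otherwise the server kicks the client off with ENHANCE_YOUR_CALM
        // ("too_many_pings").
        cl, err := grpc.Dial("example.invalid:1234",
            grpc.WithTransportCredentials(insecure.NewCredentials()),
            grpc.WithKeepaliveParams(keepalive.ClientParameters{
                Time:    10 * time.Second,
                Timeout: 5 * time.Second,
            }))
        if err != nil {
            panic(err)
        }
        defer cl.Close()
    }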

Change-Id: If2e53e20a1462e9f71a3723b92d346aff795d84c
Reviewed-on: https://review.monogon.dev/c/monogon/+/1321
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/node/core/curator/BUILD.bazel b/metropolis/node/core/curator/BUILD.bazel
index 972beca..be23b55 100644
--- a/metropolis/node/core/curator/BUILD.bazel
+++ b/metropolis/node/core/curator/BUILD.bazel
@@ -42,6 +42,7 @@
         "@io_etcd_go_etcd_client_v3//concurrency",
         "@org_golang_google_grpc//:go_default_library",
         "@org_golang_google_grpc//codes",
+        "@org_golang_google_grpc//keepalive",
         "@org_golang_google_grpc//status",
         "@org_golang_google_protobuf//proto",
         "@org_golang_google_protobuf//types/known/durationpb",
diff --git a/metropolis/node/core/curator/listener.go b/metropolis/node/core/curator/listener.go
index 00b3ca1..e093d31 100644
--- a/metropolis/node/core/curator/listener.go
+++ b/metropolis/node/core/curator/listener.go
@@ -4,8 +4,10 @@
 	"context"
 	"fmt"
 	"net"
+	"time"
 
 	"google.golang.org/grpc"
+	"google.golang.org/grpc/keepalive"
 
 	"source.monogon.dev/metropolis/node"
 	"source.monogon.dev/metropolis/node/core/consensus"
@@ -77,7 +79,12 @@
 
 	// Prepare a gRPC server and listener.
 	logger := supervisor.MustSubLogger(ctx, "rpc")
-	srv := grpc.NewServer(sec.GRPCOptions(logger)...)
+	opts := sec.GRPCOptions(logger)
+	opts = append(opts, grpc.KeepaliveEnforcementPolicy(keepalive.EnforcementPolicy{
+		MinTime:             time.Second,
+		PermitWithoutStream: true,
+	}))
+	srv := grpc.NewServer(opts...)
 	lis, err := net.Listen("tcp", fmt.Sprintf(":%d", node.CuratorServicePort))
 	if err != nil {
 		return fmt.Errorf("failed to listen on curator socket: %w", err)
diff --git a/metropolis/node/core/rpc/resolver/resolver.go b/metropolis/node/core/rpc/resolver/resolver.go
index 3b8b6a6..dfac477 100644
--- a/metropolis/node/core/rpc/resolver/resolver.go
+++ b/metropolis/node/core/rpc/resolver/resolver.go
@@ -209,21 +209,22 @@
 	bo.MaxElapsedTime = 0
 	bo.MaxInterval = 10 * time.Second
 
-	return backoff.RetryNotify(func() error {
-		// Use a keepalive to make sure we time out fast if the node we're connecting to
-		// partitions.
-		opts = append(opts, grpc.WithResolvers(r), grpc.WithKeepaliveParams(keepalive.ClientParameters{
-			Time:    10 * time.Second,
-			Timeout: 5 * time.Second,
-		}))
-		cl, err := grpc.Dial(MetropolisControlAddress, opts...)
-		if err != nil {
-			// This generally shouldn't happen.
-			return fmt.Errorf("could not dial gRPC: %v", err)
-		}
-		defer cl.Close()
+	// Use a keepalive to make sure we time out fast if the node we're connecting to
+	// partitions.
+	opts = append(opts, grpc.WithResolvers(r), grpc.WithKeepaliveParams(keepalive.ClientParameters{
+		Time:    10 * time.Second,
+		Timeout: 5 * time.Second,
+	}))
+	cl, err := grpc.Dial(MetropolisControlAddress, opts...)
+	if err != nil {
+		// This generally shouldn't happen.
+		return fmt.Errorf("could not dial gRPC: %v", err)
+	}
+	defer cl.Close()
 
-		cur := apb.NewCuratorClient(cl)
+	cur := apb.NewCuratorClient(cl)
+
+	return backoff.RetryNotify(func() error {
 		w, err := cur.Watch(ctx, &apb.WatchRequest{
 			Kind: &apb.WatchRequest_NodesInCluster_{
 				NodesInCluster: &apb.WatchRequest_NodesInCluster{},