m/n/c/r/resolver: enable keepalive on updaters

This prevents the resolver from getting stuck waiting for a TCP timeout
when the node it's connected to becomes partitioned. This was observed a
few times in manual testing when restarting nodes.

Change-Id: I7126888b77e9e1dfbcfcfc009f04639e65119fa6
Reviewed-on: https://review.monogon.dev/c/monogon/+/815
Tested-by: Jenkins CI
Reviewed-by: Mateusz Zalega <mateusz@monogon.tech>
diff --git a/metropolis/node/core/rpc/resolver/BUILD.bazel b/metropolis/node/core/rpc/resolver/BUILD.bazel
index 092ac19..6db036f 100644
--- a/metropolis/node/core/rpc/resolver/BUILD.bazel
+++ b/metropolis/node/core/rpc/resolver/BUILD.bazel
@@ -15,6 +15,7 @@
         "//metropolis/proto/common",
         "@com_github_cenkalti_backoff_v4//:backoff",
         "@org_golang_google_grpc//:go_default_library",
+        "@org_golang_google_grpc//keepalive",
         "@org_golang_google_grpc//resolver",
     ],
 )
diff --git a/metropolis/node/core/rpc/resolver/resolver.go b/metropolis/node/core/rpc/resolver/resolver.go
index f5c011f..3b8b6a6 100644
--- a/metropolis/node/core/rpc/resolver/resolver.go
+++ b/metropolis/node/core/rpc/resolver/resolver.go
@@ -11,6 +11,7 @@
 
 	"github.com/cenkalti/backoff/v4"
 	"google.golang.org/grpc"
+	"google.golang.org/grpc/keepalive"
 
 	common "source.monogon.dev/metropolis/node"
 	apb "source.monogon.dev/metropolis/node/core/curator/proto/api"
@@ -209,7 +210,12 @@
 	bo.MaxInterval = 10 * time.Second
 
 	return backoff.RetryNotify(func() error {
-		opts = append(opts, grpc.WithResolvers(r))
+		// Use a keepalive to make sure we time out fast if the node we're connecting to
+		// partitions.
+		opts = append(opts, grpc.WithResolvers(r), grpc.WithKeepaliveParams(keepalive.ClientParameters{
+			Time:    10 * time.Second,
+			Timeout: 5 * time.Second,
+		}))
 		cl, err := grpc.Dial(MetropolisControlAddress, opts...)
 		if err != nil {
 			// This generally shouldn't happen.
@@ -301,6 +307,15 @@
 // successful. This is used by retry logic to figure out whether to wait before
 // retrying or not.
 func (r *Resolver) watchLeaderVia(ctx context.Context, via string, opts []grpc.DialOption) bool {
+	// Use a keepalive to make sure we time out fast if the node we're connecting to
+	// partitions. This is particularly critical for the leader updater, as we want
+	// to know as early as possible that this happened, so that we can move over to
+	// another node.
+	opts = append(opts, grpc.WithKeepaliveParams(keepalive.ClientParameters{
+		Time:                10 * time.Second,
+		Timeout:             5 * time.Second,
+		PermitWithoutStream: true,
+	}))
 	cl, err := grpc.Dial(via, opts...)
 	if err != nil {
 		r.logger("WATCHLEADER: dialing %s failed: %v", via, err)