m/n/c/r/resolver: enable keepalive on updaters
This prevents the resolver from getting stuck waiting for a TCP timeout
when the node it's connected to becomes partitioned. This was observed a few times in manual testing when restarting nodes.
Change-Id: I7126888b77e9e1dfbcfcfc009f04639e65119fa6
Reviewed-on: https://review.monogon.dev/c/monogon/+/815
Tested-by: Jenkins CI
Reviewed-by: Mateusz Zalega <mateusz@monogon.tech>
diff --git a/metropolis/node/core/rpc/resolver/BUILD.bazel b/metropolis/node/core/rpc/resolver/BUILD.bazel
index 092ac19..6db036f 100644
--- a/metropolis/node/core/rpc/resolver/BUILD.bazel
+++ b/metropolis/node/core/rpc/resolver/BUILD.bazel
@@ -15,6 +15,7 @@
"//metropolis/proto/common",
"@com_github_cenkalti_backoff_v4//:backoff",
"@org_golang_google_grpc//:go_default_library",
+ "@org_golang_google_grpc//keepalive",
"@org_golang_google_grpc//resolver",
],
)
diff --git a/metropolis/node/core/rpc/resolver/resolver.go b/metropolis/node/core/rpc/resolver/resolver.go
index f5c011f..3b8b6a6 100644
--- a/metropolis/node/core/rpc/resolver/resolver.go
+++ b/metropolis/node/core/rpc/resolver/resolver.go
@@ -11,6 +11,7 @@
"github.com/cenkalti/backoff/v4"
"google.golang.org/grpc"
+ "google.golang.org/grpc/keepalive"
common "source.monogon.dev/metropolis/node"
apb "source.monogon.dev/metropolis/node/core/curator/proto/api"
@@ -209,7 +210,12 @@
bo.MaxInterval = 10 * time.Second
return backoff.RetryNotify(func() error {
- opts = append(opts, grpc.WithResolvers(r))
+ // Use a keepalive to make sure we time out fast if the node we're connecting to
+ // partitions.
+ opts = append(opts, grpc.WithResolvers(r), grpc.WithKeepaliveParams(keepalive.ClientParameters{
+ Time: 10 * time.Second,
+ Timeout: 5 * time.Second,
+ }))
cl, err := grpc.Dial(MetropolisControlAddress, opts...)
if err != nil {
// This generally shouldn't happen.
@@ -301,6 +307,15 @@
// successful. This is used by retry logic to figure out whether to wait before
// retrying or not.
func (r *Resolver) watchLeaderVia(ctx context.Context, via string, opts []grpc.DialOption) bool {
+ // Use a keepalive to make sure we time out fast if the node we're connecting to
+ // partitions. This is particularly critical for the leader updater, as we want
+ // to know as early as possible that this happened, so that we can move over to
+ // another node.
+ opts = append(opts, grpc.WithKeepaliveParams(keepalive.ClientParameters{
+ Time: 10 * time.Second,
+ Timeout: 5 * time.Second,
+ PermitWithoutStream: true,
+ }))
cl, err := grpc.Dial(via, opts...)
if err != nil {
r.logger("WATCHLEADER: dialing %s failed: %v", via, err)