metropolis/resolver: more logging
This should make HA connectivity issues easier to diagnose in the
future. We were mostly missing log messages for connection attempts to
control plane nodes and for the leader information received from them.
Change-Id: I88f3e4b289561e7b31fcbb59d26b674d8b6aea39
Reviewed-on: https://review.monogon.dev/c/monogon/+/2067
Tested-by: Jenkins CI
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/metropolis/node/core/rpc/resolver/resolver.go b/metropolis/node/core/rpc/resolver/resolver.go
index 6ab3bbd..36ad180 100644
--- a/metropolis/node/core/rpc/resolver/resolver.go
+++ b/metropolis/node/core/rpc/resolver/resolver.go
@@ -311,9 +311,10 @@
bo.MaxElapsedTime = 0
bo.MaxInterval = 10 * time.Second
- return backoff.RetryNotify(func() error {
+ err := backoff.RetryNotify(func() error {
curMap := r.curatorMap()
for _, endpoint := range curMap.candidates() {
+ r.logger("FINDLEADER: trying via %s...", endpoint)
ok := r.watchLeaderVia(ctx, endpoint, opts)
if ok {
bo.Reset()
@@ -323,6 +324,8 @@
}, backoff.WithContext(bo, ctx), func(err error, t time.Duration) {
r.logger("FINDLEADER: error in loop: %v, retrying in %s...", err, t.String())
})
+ r.logger("FINDLEADER: exiting: %v", err)
+ return err
}
// watchLeaderVia connects to the endpoint defined by 'via' and attempts to
@@ -358,6 +361,7 @@
}
ok := false
for {
+ r.logger("WATCHLEADER: receiving...")
leaderInfo, err := cur.Recv()
if err == io.EOF {
r.logger("WATCHLEADER: connection with %s closed", via)
@@ -367,6 +371,7 @@
r.logger("WATCHLEADER: connection with %s failed: %v", via, err)
return ok
}
+ r.logger("WATCHLEADER: received: %+v", leaderInfo)
curMap := r.curatorMap()