m/n/core/rpc/resolver: log current processor state on watcher error
Some resolvers seem to be getting stuck in production, like so:
I0626 09:52:39.708844 resolver.go:275] CURUPDATE: error in loop: when receiving node: rpc error: code = Unimplemented desc = unknown service metropolis.node.core.curator.proto.api.Curator
I0626 09:52:39.708850 resolver.go:276] CURUPDATE: retrying in 10.040771699s...
This introduces extra logging that should help us figure out what
exactly broke, or at least bring us a bit closer to an answer.
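Going by the format strings below, a failure will now log roughly the
following (placeholder values; the actual node IDs and endpoints depend
on the cluster):

CURUPDATE: error in loop: <err>, retrying in <backoff>...
CURUPDATE: processor state: curator: <nodeID>/<endpoint>, seed: <address>, leader: <nodeID>/<endpoint>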
Change-Id: I658cff6ae86e9124141b5d2c36dceafa377842e9
Reviewed-on: https://review.monogon.dev/c/monogon/+/1852
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/node/core/rpc/resolver/processor.go b/metropolis/node/core/rpc/resolver/processor.go
index ef9d4f6..975b2df 100644
--- a/metropolis/node/core/rpc/resolver/processor.go
+++ b/metropolis/node/core/rpc/resolver/processor.go
@@ -23,6 +23,7 @@
ds *requestDialOptionsSet
sub *requestSubscribe
unsub *requestUnsubscribe
+ dbg *requestDebug
}
// requestCuratorMapGet is received from any subsystem which wants to
@@ -110,6 +111,22 @@
resC chan *responseSubscribe
}
+// requestDebug is received from the curator updater on failure, and is used to
+// provide the user with up-to-date debug information about the processor's
+// state at the time of failure.
+type requestDebug struct {
+ resC chan *responseDebug
+}
+
+type responseDebug struct {
+ // curmap is a copy of the curator map as seen by the processor at the time
+ // of the request.
+ curmap *curatorMap
+ // leader is the current leader, if any, as seen by the processor at the time of
+ // the request.
+ leader *requestLeaderUpdate
+}
+
type responseSubscribe struct {
// id is the ID of the subscription, used to cancel the subscription by the
// subscriber.
@@ -241,6 +258,20 @@
close(subscribers[req.unsub.id])
delete(subscribers, req.unsub.id)
}
+ case req.dbg != nil:
+ // Debug: reply with a snapshot (deep copy) of the processor's state, so
+ var leader2 *requestLeaderUpdate
+ if leader != nil {
+ endpoint := *leader.endpoint
+ leader2 = &requestLeaderUpdate{
+ nodeID: leader.nodeID,
+ endpoint: &endpoint,
+ }
+ }
+ req.dbg.resC <- &responseDebug{
+ curmap: curMap.copy(),
+ leader: leader2,
+ }
default:
panic(fmt.Sprintf("unhandled request: %+v", req))
}
diff --git a/metropolis/node/core/rpc/resolver/resolver.go b/metropolis/node/core/rpc/resolver/resolver.go
index 8b695b0..6ab3bbd 100644
--- a/metropolis/node/core/rpc/resolver/resolver.go
+++ b/metropolis/node/core/rpc/resolver/resolver.go
@@ -284,8 +284,22 @@
}
}
}, backoff.WithContext(bo, ctx), func(err error, t time.Duration) {
- r.logger("CURUPDATE: error in loop: %v", err)
- r.logger("CURUPDATE: retrying in %s...", t.String())
+ c := make(chan *responseDebug)
+ r.reqC <- &request{dbg: &requestDebug{resC: c}}
+ dbg := <-c
+ var msg []string
+ for k, v := range dbg.curmap.curators {
+ msg = append(msg, fmt.Sprintf("curator: %s/%s", k, v.endpoint))
+ }
+ for k := range dbg.curmap.seeds {
+ msg = append(msg, fmt.Sprintf("seed: %s", k))
+ }
+ if dbg.leader != nil {
+ msg = append(msg, fmt.Sprintf("leader: %s/%s", dbg.leader.nodeID, dbg.leader.endpoint.endpoint))
+ }
+
+ r.logger("CURUPDATE: error in loop: %v, retrying in %s...", err, t.String())
+ r.logger("CURUPDATE: processor state: %s", strings.Join(msg, ", "))
})
}
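For context, the processor serializes all access to its state through
its request channel, and requestDebug follows the same request/response
pattern as the existing request types: the requester supplies a channel,
the processor loop replies on it with a copy of its state. A minimal,
self-contained sketch of that pattern (hypothetical names and values,
not the actual Metropolis code):

package main

import "fmt"

// requestDebug mirrors the pattern from processor.go: the requester
// hands the processor a response channel and waits for a snapshot.
type requestDebug struct {
	resC chan *responseDebug
}

type responseDebug struct {
	leader string // copied state, never shared with the processor
}

// runProcessor owns the state; every reader goes through reqC, so no
// locking is needed and responses are always internally consistent.
func runProcessor(reqC <-chan *requestDebug, stop <-chan struct{}) {
	leader := "node-a/10.0.0.1:7835" // hypothetical current leader
	for {
		select {
		case req := <-reqC:
			// Send back a copy of the state at this instant.
			req.resC <- &responseDebug{leader: leader}
		case <-stop:
			return
		}
	}
}

func main() {
	reqC := make(chan *requestDebug)
	stop := make(chan struct{})
	go runProcessor(reqC, stop)

	// This is what the backoff callback in resolver.go now does with
	// requestDebug: round-trip through the processor for its state.
	c := make(chan *responseDebug)
	reqC <- &requestDebug{resC: c}
	fmt.Println("processor state:", (<-c).leader)
	close(stop)
}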