m/n/core/rpc/resolver: log current processor state on watcher error

We seem to have some resolvers getting stuck in production, like so:

I0626 09:52:39.708844 resolver.go:275] CURUPDATE: error in loop: when receiving node: rpc error: code = Unimplemented desc = unknown service metropolis.node.core.curator.proto.api.Curator
I0626 09:52:39.708850 resolver.go:276] CURUPDATE: retrying in 10.040771699s...

This introduces extra logging of the processor's state on watcher
errors, which should help us figure out what exactly broke, or at least
bring us a bit closer to an answer.
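
For illustration, the retry path now logs the error and backoff delay on
one line, followed by a summary of the processor state, roughly of this
shape (placeholders instead of real node IDs and endpoints):

CURUPDATE: error in loop: <error>, retrying in 10.040771699s...
CURUPDATE: processor state: curator: <node-id>/<endpoint>, seed: <address>, leader: <node-id>/<endpoint>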

Change-Id: I658cff6ae86e9124141b5d2c36dceafa377842e9
Reviewed-on: https://review.monogon.dev/c/monogon/+/1852
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/node/core/rpc/resolver/processor.go b/metropolis/node/core/rpc/resolver/processor.go
index ef9d4f6..975b2df 100644
--- a/metropolis/node/core/rpc/resolver/processor.go
+++ b/metropolis/node/core/rpc/resolver/processor.go
@@ -23,6 +23,7 @@
 	ds    *requestDialOptionsSet
 	sub   *requestSubscribe
 	unsub *requestUnsubscribe
+	dbg   *requestDebug
 }
 
 // requestCuratorMapGet is received from any subsystem which wants to
@@ -110,6 +111,22 @@
 	resC chan *responseSubscribe
 }
 
+// requestDebug is received from the curator updater on failure, and is used to
+// provide the user with up-to-date debug information about the processor's
+// state at the time of the failure.
+type requestDebug struct {
+	resC chan *responseDebug
+}
+
+type responseDebug struct {
+	// curmap is a copy of the curator map as seen by the processor at the time
+	// of the request.
+	curmap *curatorMap
+	// leader is the current leader, if any, as seen by the processor at the time of
+	// the request.
+	leader *requestLeaderUpdate
+}
+
 type responseSubscribe struct {
 	// id is the ID of the subscription, used to cancel the subscription by the
 	// subscriber.
@@ -241,6 +258,20 @@
 				close(subscribers[req.unsub.id])
 				delete(subscribers, req.unsub.id)
 			}
+		case req.dbg != nil:
+			// Debug: respond with a copy of the processor's current state.
+			var leader2 *requestLeaderUpdate
+			if leader != nil {
+				endpoint := *leader.endpoint
+				leader2 = &requestLeaderUpdate{
+					nodeID:   leader.nodeID,
+					endpoint: &endpoint,
+				}
+			}
+			req.dbg.resC <- &responseDebug{
+				curmap: curMap.copy(),
+				leader: leader2,
+			}
 		default:
 			panic(fmt.Sprintf("unhandled request: %+v", req))
 		}
diff --git a/metropolis/node/core/rpc/resolver/resolver.go b/metropolis/node/core/rpc/resolver/resolver.go
index 8b695b0..6ab3bbd 100644
--- a/metropolis/node/core/rpc/resolver/resolver.go
+++ b/metropolis/node/core/rpc/resolver/resolver.go
@@ -284,8 +284,22 @@
 			}
 		}
 	}, backoff.WithContext(bo, ctx), func(err error, t time.Duration) {
-		r.logger("CURUPDATE: error in loop: %v", err)
-		r.logger("CURUPDATE: retrying in %s...", t.String())
+		c := make(chan *responseDebug)
+		r.reqC <- &request{dbg: &requestDebug{resC: c}}
+		dbg := <-c
+		var msg []string
+		for k, v := range dbg.curmap.curators {
+			msg = append(msg, fmt.Sprintf("curator: %s/%s", k, v.endpoint))
+		}
+		for k := range dbg.curmap.seeds {
+			msg = append(msg, fmt.Sprintf("seed: %s", k))
+		}
+		if dbg.leader != nil {
+			msg = append(msg, fmt.Sprintf("leader: %s/%s", dbg.leader.nodeID, dbg.leader.endpoint.endpoint))
+		}
+
+		r.logger("CURUPDATE: error in loop: %v, retrying in %s...", err, t.String())
+		r.logger("CURUPDATE: processor state: %s", strings.Join(msg, ", "))
 	})
 }
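
For reference, a minimal self-contained sketch of the
request/response-over-channel pattern that requestDebug follows: the
processor goroutine owns all state, and callers query it by sending a
request that carries a response channel. All names below are simplified
and hypothetical, not the actual resolver types.

package main

import "fmt"

// debugRequest asks the processor for a snapshot of its state; the
// answer is delivered on resC.
type debugRequest struct {
	resC chan string
}

func main() {
	reqC := make(chan debugRequest)

	// Processor goroutine: sole owner of its state, answering debug
	// requests with a copy of that state.
	go func() {
		state := "leader: <node-id>/<endpoint>"
		for req := range reqC {
			req.resC <- state
		}
	}()

	// Caller side (what the retry callback does): round-trip a request
	// to read the processor's state without sharing memory.
	c := make(chan string)
	reqC <- debugRequest{resC: c}
	fmt.Println(<-c)
}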