metropolis/node/curator: restart listener on status changes
This change ensures that a Curator's gRPC listener restart not only when
it switches from leader to follower (and vice versa), but also when the
curator instance keeps being a leader/follower but the leadership status
changes in an important way.
Restarting the listener is important when the Curator is a re-elected
leader, as otherwise the listener will continue running with the old
leadership data which includes the old leadership election key/revision,
subsequently failing any leadership transactions when serving gRPC
requests.
Similarly, we need to restart the follower listener when the we the
leader changes to some other node, otherwise we end up serving old
leadership data in GetCurrentLeader.
An alternative fix would be to move the leadership status to an Event
value that gets updated and passed through to the listeners. This would
avoid somewhat unnecessary listener restarts on leadership changes, but
this will do for now.
This should fix issue #276 and perhaps some related flakiness. I'll try
to write a test for this on top of this CR, but this will require a new
kind of Curator test which we don't have yet.
Change-Id: I20ef147adeb0c976b904ef55dbadb9b461111e94
Reviewed-on: https://review.monogon.dev/c/monogon/+/2722
Tested-by: Jenkins CI
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/metropolis/node/core/curator/curator.go b/metropolis/node/core/curator/curator.go
index bfaa90d..e88493a 100644
--- a/metropolis/node/core/curator/curator.go
+++ b/metropolis/node/core/curator/curator.go
@@ -91,10 +91,30 @@
lockRev int64
}
+func (e *electionStatusLeader) equal(o *electionStatusLeader) bool {
+ if e.lockKey != o.lockKey {
+ return false
+ }
+ if e.lockRev != o.lockRev {
+ return false
+ }
+ return true
+}
+
type electionStatusFollower struct {
lock *ppb.LeaderElectionValue
}
+func (e *electionStatusFollower) equal(o *electionStatusFollower) bool {
+ if e.lock.NodeId != o.lock.NodeId {
+ return false
+ }
+ if e.lock.Ttl != o.lock.Ttl {
+ return false
+ }
+ return true
+}
+
// buildLockValue returns a serialized etcd value that will be set by the
// instance when it becomes a leader. This value is a serialized
// LeaderElectionValue from private/storage.proto.
diff --git a/metropolis/node/core/curator/listener.go b/metropolis/node/core/curator/listener.go
index 77df684..4639729 100644
--- a/metropolis/node/core/curator/listener.go
+++ b/metropolis/node/core/curator/listener.go
@@ -148,6 +148,9 @@
if nst.leader == nil {
return fmt.Errorf("this curator stopped being a leader, quitting")
}
+ if !st.leader.equal(nst.leader) {
+ return fmt.Errorf("this curator got re-elected, quitting")
+ }
}
case st.follower != nil:
supervisor.Logger(ctx).Infof("Follower running until leadership change.")
@@ -159,6 +162,9 @@
if nst.follower == nil {
return fmt.Errorf("this curator stopped being a follower, quitting")
}
+ if !st.follower.equal(nst.follower) {
+ return fmt.Errorf("leader changed, quitting")
+ }
}
default:
panic("unreachable")