m/n/c/curator: clean up stale leader election after reboot
This is useful if we reboot a leader and it comes back while its' old
stale leader election key/value is present.
Without this, other nodes would continue to connect to the newly
rebooted leader even though it is not a leader anymore, confusing
themselves and cluster operators in the process.
Change-Id: I306e7040550084ef39ab20c3c289a3137145a2d9
Reviewed-on: https://review.monogon.dev/c/monogon/+/1845
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/node/core/curator/curator.go b/metropolis/node/core/curator/curator.go
index 336b7d6..758da27 100644
--- a/metropolis/node/core/curator/curator.go
+++ b/metropolis/node/core/curator/curator.go
@@ -19,10 +19,12 @@
 	"fmt"
 	"time"
 
+	clientv3 "go.etcd.io/etcd/client/v3"
 	"go.etcd.io/etcd/client/v3/concurrency"
 	"google.golang.org/protobuf/proto"
 
 	"source.monogon.dev/metropolis/node/core/consensus"
+	"source.monogon.dev/metropolis/node/core/consensus/client"
 	ppb "source.monogon.dev/metropolis/node/core/curator/proto/private"
 	"source.monogon.dev/metropolis/node/core/identity"
 	"source.monogon.dev/metropolis/pkg/event/memory"
@@ -135,6 +137,10 @@
 		return fmt.Errorf("getting consensus client failed: %w", err)
 	}
 
+	if err := s.cleanupPreviousLifetime(ctx, cl); err != nil {
+		supervisor.Logger(ctx).Warningf("Failed to cleanup previous lifetime: %v", err)
+	}
+
 	// Establish a lease/session with etcd.
 	session, err := concurrency.NewSession(cl.ThinClient(ctx),
 		concurrency.WithContext(ctx),
@@ -219,6 +225,70 @@
 
 }
 
+// cleanupPreviousLifetime checks if we just started up after ungracefully losing
+// our leadership, and attempts to clean up if so.
+//
+// Having the rest of the cluster assume this node is still the leader is not a
+// problem from a correctness point of view (as the node will refuse to serve
+// leader requests with Unimplemented), but it is quite an eyesore to operators,
+// as all nodes just end up having a ton of client processes all complain with
+// odd 'Unimplemented' errors.
+func (s *Service) cleanupPreviousLifetime(ctx context.Context, cl client.Namespaced) error {
+	// Get the active leader key and value.
+	resp, err := cl.Get(ctx, electionPrefix, clientv3.WithFirstCreate()...)
+	if err != nil {
+		return err
+	}
+	if len(resp.Kvs) < 1 {
+		return nil
+	}
+
+	// Check that this key belonged to use by comparing the embedded node ID with our
+	// own node ID.
+	key := string(resp.Kvs[0].Key)
+	rev := resp.Kvs[0].ModRevision
+	var lock ppb.LeaderElectionValue
+	if err := proto.Unmarshal(resp.Kvs[0].Value, &lock); err != nil {
+		return fmt.Errorf("parsing existing lock value failed: %w", err)
+	}
+
+	// Not our node? Nothing to do.
+	if lock.NodeId != s.config.NodeCredentials.ID() {
+		return nil
+	}
+
+	// Now here's the sketchy part: removing the leader election key if we think it
+	// used to be ours.
+	supervisor.Logger(ctx).Infof("Detecting our own stale lock, attempting to remove...")
+
+	// Just removing the key should be correct, as the key encodes the original
+	// session ID that created the leadership key, and the session ID is unique per
+	// leader. So if the key exists, it is guaranteed to have been created and updated
+	// by exactly one session. And having read it earlier, we know it is a session
+	// that was owned by this node, as it proclaimed this node as the leader.
+	//
+	// The only scenario in which this can fail is if we have the same node ID
+	// running more than one curator service and thus more than one leader election.
+	// But that would be a serious programming/design bug and other things would
+	// likely break at this point, anyway.
+	//
+	// For safety, we add a check that it is still the same ModRevision as when we
+	// checked it earlier on, but that's probably unnecessary.
+	txn := cl.Txn(ctx).If(clientv3.Compare(clientv3.ModRevision(key), "=", rev))
+	txn = txn.Then(clientv3.OpDelete(key))
+	resp2, err := txn.Commit()
+	if err != nil {
+		return err
+	}
+	if resp2.Succeeded {
+		supervisor.Logger(ctx).Infof("Cleanup successful")
+	} else {
+		// This will happen if the key expired by itself already.
+		supervisor.Logger(ctx).Warningf("Cleanup failed - maybe our old lease already expired...")
+	}
+	return nil
+}
+
 func (s *Service) Run(ctx context.Context) error {
 	// Start local election watcher. This logs what this curator knows about its own
 	// leadership.