m/node/core: remove etcd membership before removing consensus role

When removing the consensus role, we also need to remove etcd
membership. It is safer to remove membership first, and then the role,
because otherwise, the etcd cluster is in a degraded state during the
time where etcd on the node has been stopped, but the node is still
counted as a voting member by etcd.

If the membership is removed, but then removing the role fails, the
cluster ends up in an inconsistent state. If the affected node was the
curator or etcd leader, that will almost certainly happen. In this case,
the request can just be retried until it succeeds, and then the cluster
state is consistent again between etcd membership and roles.

Change-Id: I1ab526470a4201e76817e8ca0a597996fb903d1f
Reviewed-on: https://review.monogon.dev/c/monogon/+/3437
Tested-by: Jenkins CI
Reviewed-by: Serge Bazanski <serge@monogon.tech>
diff --git a/metropolis/test/e2e/suites/ha/BUILD.bazel b/metropolis/test/e2e/suites/ha/BUILD.bazel
index 3d9c688..a88d4f7 100644
--- a/metropolis/test/e2e/suites/ha/BUILD.bazel
+++ b/metropolis/test/e2e/suites/ha/BUILD.bazel
@@ -16,6 +16,8 @@
         "xTestImagesManifestPath": "$(rlocationpath //metropolis/test/e2e:testimages_manifest )",
     },
     deps = [
+        "//metropolis/node/core/curator/proto/api",
+        "//metropolis/proto/api",
         "//metropolis/test/launch",
         "//metropolis/test/localregistry",
         "//metropolis/test/util",
diff --git a/metropolis/test/e2e/suites/ha/run_test.go b/metropolis/test/e2e/suites/ha/run_test.go
index f2f7cc2..7dcd228 100644
--- a/metropolis/test/e2e/suites/ha/run_test.go
+++ b/metropolis/test/e2e/suites/ha/run_test.go
@@ -13,6 +13,9 @@
 	"source.monogon.dev/metropolis/test/localregistry"
 	"source.monogon.dev/metropolis/test/util"
 	"source.monogon.dev/osbase/test/launch"
+
+	cpb "source.monogon.dev/metropolis/node/core/curator/proto/api"
+	apb "source.monogon.dev/metropolis/proto/api"
 )
 
 var (
@@ -105,4 +108,47 @@
 			return nil
 		})
 	}
+
+	// Test node role removal.
+	curC, err := cluster.CuratorClient()
+	if err != nil {
+		t.Fatalf("Could not get CuratorClient: %v", err)
+	}
+	mgmt := apb.NewManagementClient(curC)
+	cur := cpb.NewCuratorClient(curC)
+
+	util.MustTestEventual(t, "Remove KubernetesController role", ctx, 10*time.Second, func(ctx context.Context) error {
+		fa := false
+		_, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
+			Node: &apb.UpdateNodeRolesRequest_Id{
+				Id: cluster.NodeIDs[0],
+			},
+			KubernetesController: &fa,
+		})
+		return err
+	})
+	util.MustTestEventual(t, "Remove ConsensusMember role", ctx, time.Minute, func(ctx context.Context) error {
+		fa := false
+		_, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
+			Node: &apb.UpdateNodeRolesRequest_Id{
+				Id: cluster.NodeIDs[0],
+			},
+			ConsensusMember: &fa,
+		})
+		return err
+	})
+
+	// Test that removing the ConsensusMember role from a node removed the
+	// corresponding etcd member from the cluster.
+	var st *cpb.GetConsensusStatusResponse
+	util.MustTestEventual(t, "Get ConsensusStatus", ctx, time.Minute, func(ctx context.Context) error {
+		st, err = cur.GetConsensusStatus(ctx, &cpb.GetConsensusStatusRequest{})
+		return err
+	})
+
+	for _, member := range st.EtcdMember {
+		if member.Id == cluster.NodeIDs[0] {
+			t.Errorf("member still present in etcd")
+		}
+	}
 }