m/n/c/consensus: fix startup after removing a cluster node
The consensus service was waiting for all initial peers to be DNS
resolvable before starting etcd. However, the list of initial peers is
never updated. If an etcd member is removed from the cluster, it is no
longer resolvable, but may still be contained in initial peer lists. The
consensus service then fails to start, as it is blocked forever waiting
for the removed peer to become resolvable.
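
For illustration, the removed wait amounted to a retry loop of roughly the
following shape (a minimal sketch; the function name, peer list and retry
interval are assumed here, not the actual consensus code):

    import (
        "context"
        "net"
        "time"
    )

    // Illustrative sketch only; not the actual consensus service code.
    // waitForInitialPeers blocks until every initial peer has a DNS record.
    // A member that was removed from the cluster never becomes resolvable
    // again, so the inner loop retries forever and etcd is never started.
    func waitForInitialPeers(ctx context.Context, peers []string) error {
        for _, peer := range peers {
            for {
                if _, err := net.DefaultResolver.LookupHost(ctx, peer); err == nil {
                    break // This peer resolves; move on to the next one.
                }
                select {
                case <-ctx.Done():
                    return ctx.Err() // Only cancellation ends the wait.
                case <-time.After(time.Second):
                    // Retry; a removed peer keeps failing here indefinitely.
                }
            }
        }
        return nil
    }
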
The wait for resolvability was added in c1cb37ce9c43 with this
explanation:
> It also makes the consensus service wait for DNS resolvability before
> attempting to join an existing cluster, which makes etcd startup much
> cleaner (as etcd will itself crash if it cannot immediately resolve
> its ExistingPeers in startup).
This does not appear to be needed anymore. I did not observe etcd
crashes after removing the wait for resolvability.
I extended the e2e test to cover this scenario. After removing the
consensus role, it also deletes the node and reboots the remaining
nodes. I moved these tests to the ha_cold suite, because with encryption
enabled, we currently cannot reboot a node in a 2-node cluster.
Change-Id: If811c79ea127550fa9ca750014272fa885767c77
Reviewed-on: https://review.monogon.dev/c/monogon/+/3454
Tested-by: Jenkins CI
Reviewed-by: Serge Bazanski <serge@monogon.tech>
diff --git a/metropolis/test/e2e/suites/ha/BUILD.bazel b/metropolis/test/e2e/suites/ha/BUILD.bazel
index a88d4f7..3d9c688 100644
--- a/metropolis/test/e2e/suites/ha/BUILD.bazel
+++ b/metropolis/test/e2e/suites/ha/BUILD.bazel
@@ -16,8 +16,6 @@
"xTestImagesManifestPath": "$(rlocationpath //metropolis/test/e2e:testimages_manifest )",
},
deps = [
- "//metropolis/node/core/curator/proto/api",
- "//metropolis/proto/api",
"//metropolis/test/launch",
"//metropolis/test/localregistry",
"//metropolis/test/util",
diff --git a/metropolis/test/e2e/suites/ha/run_test.go b/metropolis/test/e2e/suites/ha/run_test.go
index 7dcd228..f2f7cc2 100644
--- a/metropolis/test/e2e/suites/ha/run_test.go
+++ b/metropolis/test/e2e/suites/ha/run_test.go
@@ -13,9 +13,6 @@
"source.monogon.dev/metropolis/test/localregistry"
"source.monogon.dev/metropolis/test/util"
"source.monogon.dev/osbase/test/launch"
-
- cpb "source.monogon.dev/metropolis/node/core/curator/proto/api"
- apb "source.monogon.dev/metropolis/proto/api"
)
var (
@@ -108,47 +105,4 @@
return nil
})
}
-
- // Test node role removal.
- curC, err := cluster.CuratorClient()
- if err != nil {
- t.Fatalf("Could not get CuratorClient: %v", err)
- }
- mgmt := apb.NewManagementClient(curC)
- cur := cpb.NewCuratorClient(curC)
-
- util.MustTestEventual(t, "Remove KubernetesController role", ctx, 10*time.Second, func(ctx context.Context) error {
- fa := false
- _, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
- Node: &apb.UpdateNodeRolesRequest_Id{
- Id: cluster.NodeIDs[0],
- },
- KubernetesController: &fa,
- })
- return err
- })
- util.MustTestEventual(t, "Remove ConsensusMember role", ctx, time.Minute, func(ctx context.Context) error {
- fa := false
- _, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
- Node: &apb.UpdateNodeRolesRequest_Id{
- Id: cluster.NodeIDs[0],
- },
- ConsensusMember: &fa,
- })
- return err
- })
-
- // Test that removing the ConsensusMember role from a node removed the
- // corresponding etcd member from the cluster.
- var st *cpb.GetConsensusStatusResponse
- util.MustTestEventual(t, "Get ConsensusStatus", ctx, time.Minute, func(ctx context.Context) error {
- st, err = cur.GetConsensusStatus(ctx, &cpb.GetConsensusStatusRequest{})
- return err
- })
-
- for _, member := range st.EtcdMember {
- if member.Id == cluster.NodeIDs[0] {
- t.Errorf("member still present in etcd")
- }
- }
}
diff --git a/metropolis/test/e2e/suites/ha_cold/BUILD.bazel b/metropolis/test/e2e/suites/ha_cold/BUILD.bazel
index c2cfa0d..2ace798 100644
--- a/metropolis/test/e2e/suites/ha_cold/BUILD.bazel
+++ b/metropolis/test/e2e/suites/ha_cold/BUILD.bazel
@@ -15,6 +15,8 @@
"resources:ram:7000",
],
deps = [
+ "//metropolis/node/core/curator/proto/api",
+ "//metropolis/proto/api",
"//metropolis/proto/common",
"//metropolis/test/launch",
"//metropolis/test/util",
diff --git a/metropolis/test/e2e/suites/ha_cold/run_test.go b/metropolis/test/e2e/suites/ha_cold/run_test.go
index 419d290..6a5c6e8 100644
--- a/metropolis/test/e2e/suites/ha_cold/run_test.go
+++ b/metropolis/test/e2e/suites/ha_cold/run_test.go
@@ -10,6 +10,8 @@
"source.monogon.dev/metropolis/test/util"
"source.monogon.dev/osbase/test/launch"
+ ipb "source.monogon.dev/metropolis/node/core/curator/proto/api"
+ apb "source.monogon.dev/metropolis/proto/api"
cpb "source.monogon.dev/metropolis/proto/common"
)
@@ -83,4 +85,74 @@
}
// Check if the cluster comes back up.
util.TestEventual(t, "Heartbeat test successful", ctx, 60*time.Second, cluster.AllNodesHealthy)
+
+ // Test node role removal.
+ curC, err := cluster.CuratorClient()
+ if err != nil {
+ t.Fatalf("Could not get CuratorClient: %v", err)
+ }
+ mgmt := apb.NewManagementClient(curC)
+ cur := ipb.NewCuratorClient(curC)
+
+ util.MustTestEventual(t, "Remove KubernetesController role", ctx, 10*time.Second, func(ctx context.Context) error {
+ fa := false
+ _, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
+ Node: &apb.UpdateNodeRolesRequest_Id{
+ Id: cluster.NodeIDs[0],
+ },
+ KubernetesController: &fa,
+ })
+ return err
+ })
+ util.MustTestEventual(t, "Remove ConsensusMember role", ctx, time.Minute, func(ctx context.Context) error {
+ fa := false
+ _, err := mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
+ Node: &apb.UpdateNodeRolesRequest_Id{
+ Id: cluster.NodeIDs[0],
+ },
+ ConsensusMember: &fa,
+ })
+ return err
+ })
+
+ // Test that removing the ConsensusMember role from a node removed the
+ // corresponding etcd member from the cluster.
+ var st *ipb.GetConsensusStatusResponse
+ util.MustTestEventual(t, "Get ConsensusStatus", ctx, time.Minute, func(ctx context.Context) error {
+ st, err = cur.GetConsensusStatus(ctx, &ipb.GetConsensusStatusRequest{})
+ return err
+ })
+
+ for _, member := range st.EtcdMember {
+ if member.Id == cluster.NodeIDs[0] {
+ t.Errorf("member still present in etcd")
+ }
+ }
+
+	// Test that the cluster still works after deleting the first node and
+ // restarting the remaining nodes.
+ util.MustTestEventual(t, "Delete first node", ctx, 10*time.Second, func(ctx context.Context) error {
+ _, err := mgmt.DeleteNode(ctx, &apb.DeleteNodeRequest{
+ Node: &apb.DeleteNodeRequest_Id{
+ Id: cluster.NodeIDs[0],
+ },
+ SafetyBypassNotDecommissioned: &apb.DeleteNodeRequest_SafetyBypassNotDecommissioned{},
+ })
+ return err
+ })
+
+ // Shut every remaining node down.
+ for i := 1; i < clusterOptions.NumNodes; i++ {
+ if err := cluster.ShutdownNode(i); err != nil {
+ t.Fatalf("Could not shutdown node %d", i)
+ }
+ }
+ // Start every remaining node back up.
+ for i := 1; i < clusterOptions.NumNodes; i++ {
+ if err := cluster.StartNode(i); err != nil {
+			t.Fatalf("Could not start node %d", i)
+ }
+ }
+ // Check if the cluster comes back up.
+ util.TestEventual(t, "Heartbeat test successful", ctx, 60*time.Second, cluster.AllNodesHealthy)
}