metropolis: implement node Deletion and framework for Decommissioning
This implements the basic ability to remove nodes from a cluster.
We prepare for a more complex workflow involving multi-stage
decommissioning, but first implement the 'worst case' workflow, in which
a node needs to be deleted if it hasn't been gracefully decommissioned.
This is what we currently need most in practice, as we have node
failures we'd like to deal with.
The Delete functionality is not yet fully complete, though, as we still
accept client certificates from decommissioned nodes. We'll fix that in
an upcoming CR.
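
As a rough, illustrative sketch (not part of this change), removing a
failed node with the worst-case workflow could look like this from
operator tooling, where conn, ctx and the node ID are placeholders and
apb stands for the generated metropolis/proto/api package:

    // deleteBrokenNode force-deletes a node that cannot be decommissioned
    // gracefully, e.g. because its hardware has failed.
    func deleteBrokenNode(ctx context.Context, conn *grpc.ClientConn, id string) error {
        mgmt := apb.NewManagementClient(conn)
        _, err := mgmt.DeleteNode(ctx, &apb.DeleteNodeRequest{
            Node: &apb.DeleteNodeRequest_Id{Id: id},
            // The node never went through DecommissionNode, so bypass the
            // decommissioning safety check. Any roles should still be
            // removed via UpdateNodeRoles first.
            SafetyBypassNotDecommissioned: &apb.DeleteNodeRequest_SafetyBypassNotDecommissioned{},
        })
        return err
    }
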
Change-Id: I7322cb1464a9e5bc924363321534033dcc8a6246
Reviewed-on: https://review.monogon.dev/c/monogon/+/2270
Tested-by: Jenkins CI
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/metropolis/node/core/curator/impl_leader_management.go b/metropolis/node/core/curator/impl_leader_management.go
index eac4635..0f0bfe1 100644
--- a/metropolis/node/core/curator/impl_leader_management.go
+++ b/metropolis/node/core/curator/impl_leader_management.go
@@ -379,3 +379,83 @@
}
return &apb.UpdateNodeRolesResponse{}, nil
}
+
+func (l *leaderManagement) DecommissionNode(ctx context.Context, req *apb.DecommissionNodeRequest) (*apb.DecommissionNodeResponse, error) {
+ // Decommissioning is currently unimplemented. We'll get to that soon. For now,
+ // use DeleteNode with SafetyBypassNotDecommissioned instead.
+ return nil, status.Error(codes.Unimplemented, "unimplemented")
+}
+
+func (l *leaderManagement) DeleteNode(ctx context.Context, req *apb.DeleteNodeRequest) (*apb.DeleteNodeResponse, error) {
+ bypassRoles := req.SafetyBypassHasRoles != nil
+ bypassDecommissioned := req.SafetyBypassNotDecommissioned != nil
+
+ // Nodes are identifiable by either their public key or their (string) node ID.
+ // In case a public key was provided, convert it to the corresponding node ID
+ // here.
+ var id string
+ switch rid := req.Node.(type) {
+ case *apb.DeleteNodeRequest_Pubkey:
+ if len(rid.Pubkey) != ed25519.PublicKeySize {
+ return nil, status.Errorf(codes.InvalidArgument, "pubkey must be %d bytes long", ed25519.PublicKeySize)
+ }
+ // Convert the pubkey into node ID.
+ id = identity.NodeID(rid.Pubkey)
+ case *apb.DeleteNodeRequest_Id:
+ id = rid.Id
+ default:
+ return nil, status.Errorf(codes.InvalidArgument, "exactly one of pubkey or id must be set")
+ }
+
+ // Take l.muNodes before deleting the node.
+ l.muNodes.Lock()
+ defer l.muNodes.Unlock()
+
+ // Find the node matching the requested identifier.
+ node, err := nodeLoad(ctx, l.leadership, id)
+ if err == errNodeNotFound {
+ return nil, status.Errorf(codes.NotFound, "node %s not found", id)
+ }
+ if err != nil {
+ return nil, status.Errorf(codes.InvalidArgument, "while loading node %s: %v", id, err)
+ }
+
+ // Check safety assertions.
+ if !bypassRoles {
+ if node.consensusMember != nil {
+ return nil, status.Error(codes.FailedPrecondition, "node still has ConsensusMember role")
+ }
+ if node.kubernetesController != nil {
+ return nil, status.Error(codes.FailedPrecondition, "node still has KubernetesController role")
+ }
+ if node.kubernetesWorker != nil {
+ return nil, status.Error(codes.FailedPrecondition, "node still has KubernetesWorker role")
+ }
+ }
+ switch node.state {
+ case cpb.NodeState_NODE_STATE_NEW:
+ // Okay to remove, NEW node didn't yet receive any data.
+ case cpb.NodeState_NODE_STATE_STANDBY:
+ // Okay to remove, STANDBY node didn't yet receive any data.
+ case cpb.NodeState_NODE_STATE_UP:
+ if !bypassDecommissioned {
+ return nil, status.Error(codes.FailedPrecondition, "node must be decommissioned first")
+ }
+ case cpb.NodeState_NODE_STATE_DECOMMISSIONED:
+ // Always okay to remove a decommissioned node.
+ default:
+ return nil, status.Error(codes.Internal, "node has an invalid internal state")
+
+ }
+
+ // TODO(q3k): ensure deleted nodes are rejected by the leader. Currently the
+ // server-side authentication middleware is completely offline. We should either:
+ //
+ // 1. emit a revocation and distribute it to all nodes
+ // 2. add some additional middleware to the leader that performs online
+ // verification (which is okay to do on the leader, as the leader always has
+ // access to cluster data).
+
+ if err := nodeDestroy(ctx, l.leadership, node); err != nil {
+ return nil, err
+ }
+ return &apb.DeleteNodeResponse{}, nil
+}
diff --git a/metropolis/node/core/curator/impl_leader_test.go b/metropolis/node/core/curator/impl_leader_test.go
index 08af401..87bd36c 100644
--- a/metropolis/node/core/curator/impl_leader_test.go
+++ b/metropolis/node/core/curator/impl_leader_test.go
@@ -1231,6 +1231,105 @@
}
}
+// TestDeleteNode exercises management.DeleteNode.
+func TestDeleteNode(t *testing.T) {
+ cl := fakeLeader(t)
+ ctx, ctxC := context.WithCancel(context.Background())
+ defer ctxC()
+
+ // Create the test nodes.
+ var tn []*Node
+ tn = append(tn, putNode(t, ctx, cl.l, func(n *Node) { n.state = cpb.NodeState_NODE_STATE_UP }))
+ tn = append(tn, putNode(t, ctx, cl.l, func(n *Node) { n.state = cpb.NodeState_NODE_STATE_UP }))
+ tn = append(tn, putNode(t, ctx, cl.l, func(n *Node) { n.state = cpb.NodeState_NODE_STATE_UP }))
+
+ mgmt := apb.NewManagementClient(cl.mgmtConn)
+
+ // Removing all nodes should currently fail because they're not decommissioned.
+ for i, n := range tn {
+ _, err := mgmt.DeleteNode(ctx, &apb.DeleteNodeRequest{
+ Node: &apb.DeleteNodeRequest_Id{
+ Id: n.ID(),
+ },
+ })
+ if err == nil {
+ t.Fatalf("Removing node %d should've failed", i)
+ }
+ }
+
+ // Remove first node with safety bypass.
+ _, err := mgmt.DeleteNode(ctx, &apb.DeleteNodeRequest{
+ Node: &apb.DeleteNodeRequest_Id{
+ Id: tn[0].ID(),
+ },
+ SafetyBypassNotDecommissioned: &apb.DeleteNodeRequest_SafetyBypassNotDecommissioned{},
+ })
+ if err != nil {
+ t.Fatalf("Removing node 0 should've succeeded with safety bypass, got: %v", err)
+ }
+
+ // Give the second node a role.
+ yes := true
+ _, err = mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
+ Node: &apb.UpdateNodeRolesRequest_Id{
+ Id: tn[1].ID(),
+ },
+ KubernetesWorker: &yes,
+ })
+ if err != nil {
+ t.Fatalf("Adding node 1 a role failed: %v", err)
+ }
+
+ // Removing the second node with the NotDecommissioned safety bypass should now
+ // fail, because the node has roles attached to it.
+ _, err = mgmt.DeleteNode(ctx, &apb.DeleteNodeRequest{
+ Node: &apb.DeleteNodeRequest_Id{
+ Id: tn[1].ID(),
+ },
+ SafetyBypassNotDecommissioned: &apb.DeleteNodeRequest_SafetyBypassNotDecommissioned{},
+ })
+ if err == nil {
+ t.Fatal("Removing node 1 should've failed")
+ }
+
+ // Finally, try again with the HasRoles bypass, which should remove the node.
+ _, err = mgmt.DeleteNode(ctx, &apb.DeleteNodeRequest{
+ Node: &apb.DeleteNodeRequest_Id{
+ Id: tn[1].ID(),
+ },
+ SafetyBypassNotDecommissioned: &apb.DeleteNodeRequest_SafetyBypassNotDecommissioned{},
+ SafetyBypassHasRoles: &apb.DeleteNodeRequest_SafetyBypassHasRoles{},
+ })
+ if err != nil {
+ t.Fatalf("Removing node 1 should've succeeded with safety bypass, got: %v", err)
+ }
+
+ // The cluster should not know anything about nodes 0 and 1 now.
+ nodes, err := mgmt.GetNodes(ctx, &apb.GetNodesRequest{})
+ if err != nil {
+ t.Fatalf("Could not get nodes: %v", err)
+ }
+ foundN2 := false
+ for {
+ n, err := nodes.Recv()
+ if err == io.EOF {
+ break
+ }
+ if err != nil {
+ t.Fatalf("Could not receive node: %v", err)
+ }
+ if n.Id == tn[0].ID() || n.Id == tn[1].ID() {
+ t.Errorf("Found deleted node ID in GetNodes result")
+ }
+ if n.Id == tn[2].ID() {
+ foundN2 = true
+ }
+ }
+ if !foundN2 {
+ t.Errorf("Didn't find node 2 which shouldn't have been deleted")
+ }
+}
+
// TestGetCurrentLeader ensures that a leader responds with its own information
// when asked for information about the current leader.
func TestGetCurrentLeader(t *testing.T) {
diff --git a/metropolis/node/core/curator/state_node.go b/metropolis/node/core/curator/state_node.go
index 1b53dee..d580e87 100644
--- a/metropolis/node/core/curator/state_node.go
+++ b/metropolis/node/core/curator/state_node.go
@@ -432,6 +432,40 @@
return nil
}
+// nodeDestroy removes all traces of a node from etcd. It does not first check
+// whether the node is safe to be removed.
+func nodeDestroy(ctx context.Context, l *leadership, n *Node) error {
+ // Get the node's ID, used for logging and to build the etcd keys below.
+ id := n.ID()
+ rpc.Trace(ctx).Printf("nodeDestroy(%s)...", id)
+
+ // Get paths for node data and join key.
+ nkey, err := nodeEtcdPrefix.Key(id)
+ if err != nil {
+ rpc.Trace(ctx).Printf("invalid node id: %v", err)
+ return status.Errorf(codes.InvalidArgument, "invalid node id")
+ }
+ jkey, err := n.etcdJoinKeyPath()
+ if err != nil {
+ rpc.Trace(ctx).Printf("invalid node join key: %v", err)
+ return status.Error(codes.InvalidArgument, "invalid node join key")
+ }
+
+ // Delete both.
+ _, err = l.txnAsLeader(ctx,
+ clientv3.OpDelete(nkey),
+ clientv3.OpDelete(jkey),
+ )
+ if err != nil {
+ if rpcErr, ok := rpcError(err); ok {
+ return rpcErr
+ }
+ rpc.Trace(ctx).Printf("could not destroy node: %v", err)
+ return status.Error(codes.Unavailable, "could not destroy node")
+ }
+
+ // TODO(q3k): remove node's data from PKI.
+
+ rpc.Trace(ctx).Printf("nodeDestroy(%s): destroy ok", id)
+ return nil
+}
+
// nodeIdByJoinKey attempts to fetch a Node ID corresponding to the given Join
// Key from etcd, within a given active leadership. All returned errors are
// gRPC statuses that are safe to return to untrusted callers. If the given
diff --git a/metropolis/proto/api/management.proto b/metropolis/proto/api/management.proto
index 14184b6..a08f5cb 100644
--- a/metropolis/proto/api/management.proto
+++ b/metropolis/proto/api/management.proto
@@ -70,6 +70,45 @@
need: PERMISSION_UPDATE_NODE_ROLES
};
}
+
+ // Decommissioning a node takes it from UP through the following states:
+ //
+ // 1. DECOMMISSION_REQUESTED
+ // The node will detect this state on the cluster and begin a cleanup
+ // process which consists of either removing key material or zeroing out
+ // the data partition, depending on cluster policy. It will report
+ // to the cluster that it has begun the process, which will take it to
+ // the next state:
+ //
+ // 2. DECOMMISSIONING
+ // The node will continue cleanup. After cleanup is successful, it will
+ // report back to the cluster which will take it to DECOMMISSIONED. The
+ // node then reboots, and never comes back.
+ //
+ // 3. DECOMMISSIONED
+ // The node can be removed with a subsequent DeleteNode call.
+ //
+ // TODO(q3k): implement this, possibly iron out the state machine involved.
+ //
+ // The node cannot have any roles assigned to it when it is being
+ // decommissioned: none may be assigned when the decommissioning process is
+ // requested, and none may be added to it while it is decommissioning.
+ rpc DecommissionNode(DecommissionNodeRequest) returns (DecommissionNodeResponse) {
+ option (metropolis.proto.ext.authorization) = {
+ need: PERMISSION_DECOMMISSION_NODE
+ };
+ }
+
+ // Delete a node from the cluster. By default the node must be in the
+ // DECOMMISSIONED state and may not have any roles assigned. However, some
+ // safety bypasses are available for nodes which have become unavailable and
+ // thus cannot be decommissioned correctly - see the request documentation
+ // for more details.
+ rpc DeleteNode(DeleteNodeRequest) returns (DeleteNodeResponse) {
+ option (metropolis.proto.ext.authorization) = {
+ need: PERMISSION_DELETE_NODE
+ };
+ }
}
message GetRegisterTicketRequest {
@@ -194,6 +233,90 @@
message UpdateNodeRolesResponse {
}
+message DecommissionNodeRequest {
+ // node uniquely identifies the node subject to this request.
+ oneof node {
+ // pubkey is the Ed25519 public key of this node, which can be used to
+ // generate the node's ID.
+ bytes pubkey = 1;
+ // id is the human-readable identifier of the node, based on its public
+ // key.
+ string id = 4;
+ }
+}
+
+message DecommissionNodeResponse {
+}
+
+message DeleteNodeRequest {
+ // node uniquely identifies the node subject to this request.
+ oneof node {
+ // pubkey is the Ed25519 public key of this node, which can be used to
+ // generate the node's ID.
+ bytes pubkey = 1;
+ // id is the human-readable identifier of the node, based on its public
+ // key.
+ string id = 2;
+ }
+
+ message SafetyBypassHasRoles {
+ }
+ // If set, safety_bypass_has_roles allows the removal of nodes which still have
+ // roles assigned.
+ //
+ // Danger: removing nodes which still have roles assigned might leave the
+ // cluster in an inconsistent state. Unassigning roles from a node via
+ // UpdateNodeRoles ensures consistency.
+ //
+ // It's also advised to never use this option in automated workflows:
+ // avoiding it prevents runaway automation from removing nodes that are
+ // still used for actual work.
+ //
+ // Removing nodes which have broken down or otherwise become unreachable
+ // shouldn't require this option, as unassigning a role from a node does not
+ // require the node to be healthy.
+ //
+ // A short summary of how to deal with possible inconsistencies after removing
+ // a node with roles still assigned:
+ //
+ // 1. KubernetesWorker: remove the node from the Kubernetes cluster via kubectl
+ // (kubectl delete node metropolis-xxx).
+ // 2. KubernetesController: no cleanup should be necessary.
+ // 3. ConsensusMember:
+ // a. the cluster still has quorum: remove the node from etcd.
+ // TODO(q3k): document this
+ // b. the cluster has no quorum: rebuild the cluster.
+ SafetyBypassHasRoles safety_bypass_has_roles = 3;
+
+ message SafetyBypassNotDecommissioned {
+ }
+ // If set, safety_bypass_not_decommissioned allows the removal of nodes that
+ // haven't yet been decommissioned.
+ //
+ // Danger: removing nodes which haven't been decommissioned via
+ // DecommissionNode can leave nodes attempting to reconnect to the cluster,
+ // and does not fully clean up cryptographic material from the node.
+ //
+ // This option will need to be used when a node has broken down, as it's
+ // impossible to move a node from UP to DECOMMISSIONED if that node is
+ // unreachable.
+ //
+ // To clean up after using this option:
+ //
+ // 1. Make sure that the node does not boot back up. The cluster will prevent
+ // the node from rejoining, but the node will keep crashing and rebooting
+ // on its own as it gets rejected by the cluster.
+ // 2. Zero out the node's ESP to remove any leftover cryptographic secrets.
+ // These secrets are safeguarded according to the cluster's
+ // StorageSecurityPolicy and NodeTPMUsage. Depending on the settings,
+ // cleaning up these secrets before letting other systems access the node
+ // might be critical to maintaining cluster security.
+ SafetyBypassNotDecommissioned safety_bypass_not_decommissioned = 4;
+}
+
+message DeleteNodeResponse {
+}
+
// NodeManagement runs on every node of the cluster and provides management
// and troubleshooting RPCs to operators. All requests must be authenticated.
service NodeManagement {
diff --git a/metropolis/proto/ext/authorization.proto b/metropolis/proto/ext/authorization.proto
index 208e4b6..1a0e759 100644
--- a/metropolis/proto/ext/authorization.proto
+++ b/metropolis/proto/ext/authorization.proto
@@ -26,6 +26,8 @@
PERMISSION_UPDATE_NODE_ROLES = 5;
PERMISSION_READ_NODE_LOGS = 6;
PERMISSION_UPDATE_NODE = 7;
+ PERMISSION_DECOMMISSION_NODE = 8;
+ PERMISSION_DELETE_NODE = 9;
}
// Authorization policy for an RPC method. This message/API does not have the