metropolis: implement node Deletion and framework for Decommissioning This implements the basic ability to remove nodes from a cluster. We prepare for a more complex workflow involving multi-stage decommissioning, but first implement the 'worst case' workflow, in which a node needs to be deleted if it hasn't been gracefully decommissioned. This is what we currently need most in practice, as we have node failures we'd like to deal with. The Delete functionality is still not fully complete though, as we're still accepting client certificates from decommissioned nodes. But we'll fix that in an upcoming CR. Change-Id: I7322cb1464a9e5bc924363321534033dcc8a6246 Reviewed-on: https://review.monogon.dev/c/monogon/+/2270 Tested-by: Jenkins CI Reviewed-by: Lorenz Brun <lorenz@monogon.tech>

commit: 8456ddf02aea2e1015805f18ef1871812c5cb7f6 [log] [tgz]
author: Serge Bazanski <serge@monogon.tech> Mon Oct 30 18:56:59 2023 +0100
committer: Serge Bazanski <serge@monogon.tech> Mon Oct 30 22:12:01 2023 +0000
tree: 7ea30de34d2ed42e2da1c044fb62576464b4e2cc
parent: 7acd92dae19109fff8e6036d0a7fcd64aa1851c1 [diff]
diff --git a/metropolis/proto/api/management.proto b/metropolis/proto/api/management.proto
index 14184b6..a08f5cb 100644
--- a/metropolis/proto/api/management.proto
+++ b/metropolis/proto/api/management.proto

@@ -70,6 +70,45 @@
             need: PERMISSION_UPDATE_NODE_ROLES
         };
     }
+
+    // Decommissioning a node takes it from UP, through
+    //
+    //   1. DECOMMISSION_REQUESTED
+    //      The node will detect this state on the cluster and begin a cleanup
+    //      process which consists of removing either key material or zeroing
+    //      out the data partition, depending on cluster policy. It will report
+    //      to the cluster that it has begun the process, which will take it to
+    //      the next state:
+    //
+    //   2. DECOMMISSIONING
+    //      The node will continue cleanup. After cleanup is successful, it will
+    //      report back to the cluster which will take it to DECOMMISSIONED. The
+    //      node then reboots, and never comes back.
+    //
+    //   3. DECOMMISSIONED
+    //      The node can be removed with a subsequent DeleteNode call.
+    //
+    // TODO(q3k): implement this, possibly iron out the state machine involved.
+    //
+    // The node cannot have any roles assigned to it when it is being
+    // decommissioned: none may be assigned when the decommissioning process is
+    // requested, and none may be added to it while it is decommissioning.
+    rpc DecommissionNode(DecommissionNodeRequest) returns (DecommissionNodeResponse) {
+        option (metropolis.proto.ext.authorization) = {
+            need: PERMISSION_DECOMMISSION_NODE
+        };
+    }
+
+    // Delete a node from the cluster. By default the node must be in the
+    // DECOMMISSIONED state and may not have any roles assigned. However, some
+    // safety bypasses are available for nodes which have become unavailable and
+    // thus cannot be decommissioned correctly - see the request documentation
+    // for more details.
+    rpc DeleteNode(DeleteNodeRequest) returns (DeleteNodeResponse) {
+        option (metropolis.proto.ext.authorization) = {
+            need: PERMISSION_DELETE_NODE
+        };
+    }
 }
 
 message GetRegisterTicketRequest {
@@ -194,6 +233,90 @@
 message UpdateNodeRolesResponse {
 }
 
+message DecommissionNodeRequest {
+  // node uniquely identifies the node subject to this request.
+  oneof node {
+    // pubkey is the Ed25519 public key of this node, which can be used to
+    // generate the node's ID.
+    bytes pubkey = 1;
+    // id is the human-readable identifier of the node, based on its public
+    // key.
+    string id = 4;
+  }
+}
+
+message DecommissionNodeResponse {
+}
+
+message DeleteNodeRequest {
+  // node uniquely identifies the node subject to this request.
+  oneof node {
+    // pubkey is the Ed25519 public key of this node, which can be used to
+    // generate the node's ID.
+    bytes pubkey = 1;
+    // id is the human-readable identifier of the node, based on its public
+    // key.
+    string id = 2;
+  }
+
+  message SafetyBypassHasRoles {
+  }
+  // If set, safety_bypass_has_roles allows the removal of nodes which still have
+  // roles assigned.
+  //
+  // Danger: removing nodes which still have roles assigned might leave the
+  // cluster in an inconsistent state. Unassigning roles from a node via
+  // UpdateNodeRoles ensures consistency.
+  //
+  // It's also advised to never use this option in automated workflows, as this
+  // prevents a runaway automation from removing nodes that are still used for
+  // actual work.
+  //
+  // Nodes which broke down or otherwise became unreachable shouldn't need to
+  // enable this option, as unassigning the role from a node does not require it
+  // to be healthy.
+  //
+  // A short summary of how to deal with possible inconsistencies after removing
+  // a node with roles still assigned:
+  //
+  // 1. KubernetesWorker: remove the node from the Kubernetes cluster via kubectl
+  //    (kubectl delete node metropolis-xxx).
+  // 2. KubernetesController: no cleanup should be necessary.
+  // 3. ConsensusMember:
+  //     a. the cluster still has quorum: remove the node from etcd.
+  //        TODO(q3k): document this
+  //     b. the cluster has no quorum: rebuild the cluster
+  SafetyBypassHasRoles safety_bypass_has_roles = 3;
+
+  message SafetyBypassNotDecommissioned {
+  }
+  // If set, safety_bypass_not_decommissioned allows the removal of nodes that
+  // haven't yet been decommissioned.
+  //
+  // Danger: removing nodes which haven't been decommissioned via
+  // DecommissionNode can leave nodes attempting to reconnect to the cluster,
+  // and does not fully clean up cryptographic material from the node.
+  //
+  // This option will need to be used when a node has broken down, as it's
+  // impossible to move a node from UP to DECOMMISSIONED if that node is
+  // unreachable.
+  //
+  // To clean up after using this option:
+  //
+  // 1. Make sure that the node does not boot back up. The cluster will prevent
+  //    the node from rejoining the cluster, but the node will by itself
+  //    continue to crash and reboot due to a rejection by the cluster.
+  // 2. Zero out the node's ESP to remove any leftover cryptographic secrets.
+  //    These secrets are safeguarded according to the cluster's
+  //    StorageSecurityPolicy and NodeTPMUsage. Depending on the settings,
+  //    cleaning up these secrets before letting other systems access the node
+  //    might be critical to maintaining cluster security.
+  SafetyBypassNotDecommissioned safety_bypass_not_decommissioned = 4;
+}
+
+message DeleteNodeResponse {
+}
+
 // NodeManagement runs on every node of the cluster and provides management
 // and troubleshooting RPCs to operators. All requests must be authenticated.
 service NodeManagement {

diff --git a/metropolis/proto/ext/authorization.proto b/metropolis/proto/ext/authorization.proto
index 208e4b6..1a0e759 100644
--- a/metropolis/proto/ext/authorization.proto
+++ b/metropolis/proto/ext/authorization.proto

@@ -26,6 +26,8 @@
     PERMISSION_UPDATE_NODE_ROLES = 5;
     PERMISSION_READ_NODE_LOGS = 6;
     PERMISSION_UPDATE_NODE = 7;
+    PERMISSION_DECOMMISSION_NODE = 8;
+    PERMISSION_DELETE_NODE = 9;
 }
 
 // Authorization policy for an RPC method. This message/API does not have the
commit	8456ddf02aea2e1015805f18ef1871812c5cb7f6	[log] [tgz]
author	Serge Bazanski <serge@monogon.tech>	Mon Oct 30 18:56:59 2023 +0100
committer	Serge Bazanski <serge@monogon.tech>	Mon Oct 30 22:12:01 2023 +0000
tree	7ea30de34d2ed42e2da1c044fb62576464b4e2cc
parent	7acd92dae19109fff8e6036d0a7fcd64aa1851c1 [diff]