// metropolis/proto/api/management.proto - monogon - Gitiles

 syntax = "proto3";
 package metropolis.proto.api;
 option go_package = "source.monogon.dev/metropolis/proto/api";

 import "google/protobuf/duration.proto";
 import "google/protobuf/timestamp.proto";

 import "metropolis/proto/common/common.proto";
 import "metropolis/proto/ext/authorization.proto";

 // Management service available to Cluster Managers, allowing operational work
 // to be performed on the cluster (eg. adding nodes, retrieving information
 // about a running cluster, etc.).
 //
 // Every RPC below declares the permission it needs through the
 // metropolis.proto.ext.authorization method option.
 service Management {
     // GetRegisterTicket retrieves the current RegisterTicket which is required
     // for new nodes to register into the cluster. Presenting this ticket on
     // registration does not automatically grant access to arbitrary node
     // registration. Instead, it is used to guard the API surface of the
     // Register RPC from potential denial of service attacks, and can be
     // regenerated at any time in case it leaks.
     rpc GetRegisterTicket(GetRegisterTicketRequest) returns (GetRegisterTicketResponse) {
         option (metropolis.proto.ext.authorization) = {
             need: PERMISSION_GET_REGISTER_TICKET
         };
     }

     // GetClusterInfo retrieves publicly available summary information about
     // this cluster, notably data required for nodes to register into a cluster
     // or join it (other than the Register Ticket, which is gated by an
     // additional permission).
     rpc GetClusterInfo(GetClusterInfoRequest) returns (GetClusterInfoResponse) {
         option (metropolis.proto.ext.authorization) = {
             need: PERMISSION_READ_CLUSTER_STATUS
         };
     }

     // GetNodes retrieves information about nodes in the cluster. It streams
     // one Node message, containing all available data, for every node that
     // matches the filter given in the request (or for every node in the
     // cluster if the filter is empty).
     rpc GetNodes(GetNodesRequest) returns (stream Node) {
         option (metropolis.proto.ext.authorization) = {
             need: PERMISSION_READ_CLUSTER_STATUS
         };
     }

     // ApproveNode progresses a node's registration process by changing its state
     // in the cluster from NEW to STANDBY, if not yet STANDBY. This is required
     // for the node to fully become part of the cluster (ie. have an UP state),
     // and is required to be called by a manager manually.
     //
     // Managers can find out what nodes require approval by performing
     // a GetNodes call and filtering for nodes in the NEW state. This call is
     // idempotent and can be executed multiple times, and is a no-op if the node
     // is already in the STANDBY or even UP states.
     //
     // In the future, the approval process will be governed by cluster policy,
     // but currently any node can be approved by a manager, and the manager is
     // responsible for performing an out-of-band attestation of the node being
     // approved (eg. by verifying that the node that is being approved has the
     // same public key as what the registering node displays in its startup
     // logs).
     rpc ApproveNode(ApproveNodeRequest) returns (ApproveNodeResponse) {
         option (metropolis.proto.ext.authorization) = {
             need: PERMISSION_APPROVE_NODE
         };
     }

     // UpdateNodeRoles updates a single node's roles.
     rpc UpdateNodeRoles(UpdateNodeRolesRequest) returns (UpdateNodeRolesResponse) {
         option (metropolis.proto.ext.authorization) = {
             need: PERMISSION_UPDATE_NODE_ROLES
         };
     }

     // DecommissionNode starts the decommissioning of a node. Decommissioning
     // takes a node from UP, through the following states:
     //
     //   1. DECOMMISSION_REQUESTED
     //      The node will detect this state on the cluster and begin a cleanup
     //      process which consists of removing either key material or zeroing
     //      out the data partition, depending on cluster policy. It will report
     //      to the cluster that it has begun the process, which will take it to
     //      the next state:
     //
     //   2. DECOMMISSIONING
     //      The node will continue cleanup. After cleanup is successful, it will
     //      report back to the cluster which will take it to DECOMMISSIONED. The
     //      node then reboots, and never comes back.
     //
     //   3. DECOMMISSIONED
     //      The node can be removed with a subsequent DeleteNode call.
     //
     // TODO(q3k): implement this, possibly iron out the state machine involved.
     //
     // The node cannot have any roles assigned to it when it is being
     // decommissioned: none may be assigned when the decommissioning process is
     // requested, and none may be added to it while it is decommissioning.
     rpc DecommissionNode(DecommissionNodeRequest) returns (DecommissionNodeResponse) {
         option (metropolis.proto.ext.authorization) = {
             need: PERMISSION_DECOMMISSION_NODE
         };
     }

     // DeleteNode deletes a node from the cluster. By default the node must be
     // in the DECOMMISSIONED state and may not have any roles assigned. However,
     // some safety bypasses are available for nodes which have become
     // unavailable and thus cannot be decommissioned correctly - see the
     // DeleteNodeRequest documentation for more details.
     rpc DeleteNode(DeleteNodeRequest) returns (DeleteNodeResponse) {
         option (metropolis.proto.ext.authorization) = {
             need: PERMISSION_DELETE_NODE
         };
     }
 }

 // GetRegisterTicketRequest is the request of Management.GetRegisterTicket. It
 // currently carries no fields.
 message GetRegisterTicketRequest {}

 // GetRegisterTicketResponse is the response of Management.GetRegisterTicket.
 message GetRegisterTicketResponse {
   // ticket holds the opaque bytes that comprise the RegisterTicket.
   bytes ticket = 1;
 }

 // GetClusterInfoRequest is the request of Management.GetClusterInfo. It
 // currently carries no fields.
 message GetClusterInfoRequest {}

 // GetClusterInfoResponse is the response of Management.GetClusterInfo.
 message GetClusterInfoResponse {
   // cluster_directory describes the individual nodes of the cluster in a way
   // that can be used to dial the cluster's services.
   metropolis.proto.common.ClusterDirectory cluster_directory = 1;

   // ca_certificate is the CA certificate of the cluster, as DER-encoded x509.
   bytes ca_certificate = 2;

   // cluster_configuration is the cluster's configuration, as described by
   // metropolis.proto.common.ClusterConfiguration.
   metropolis.proto.common.ClusterConfiguration cluster_configuration = 3;
 }

 // GetNodesRequest is the request of Management.GetNodes.
 message GetNodesRequest {
   // filter is a CEL expression used to limit the set of nodes returned by
   // GetNodes. Each processed node protobuf message is exposed to the
   // expression as the "node" variable, while the related state and health enum
   // constants are anchored in the root namespace, eg. NODE_STATE_UP, or
   // HEARTBEAT_TIMEOUT. A node is returned whenever the expression evaluates to
   // true for it. If the filter is empty, all nodes are returned.
   string filter = 1;
 }

 // Node describes a single node of a Metropolis cluster, as streamed by
 // Management.GetNodes. One such message is emitted for each node in the
 // cluster, and contains information about that node.
 //
 // Only fields which PERMISSION_READ_CLUSTER_STATUS allows access to are
 // present, ie. 'non-private' fields: ones that might be internal to the
 // cluster and possibly considered sensitive information about the
 // infrastructure, but whose knowledge does not allow escalating privileges
 // within the cluster.
 message Node {
   // pubkey is the raw Ed25519 public key of this node, which can be used to
   // generate the node's ID. This is always set.
   bytes pubkey = 1;
   // id is the node ID calculated from pubkey, ie. 'metropolis-123456'.
   string id = 7;
   // state is the state of the node from the point of view of the cluster.
   // This is always set.
   metropolis.proto.common.NodeState state = 2;
   // status is the last status reported by the node. It is absent if the node
   // hasn't yet reported its status.
   metropolis.proto.common.NodeStatus status = 3;
   // roles are the roles assigned to the node by the cluster. This is always
   // set.
   metropolis.proto.common.NodeRoles roles = 4;

   // Health describes a node's health as seen from the cluster perspective.
   enum Health {
     // INVALID is the zero value of this enum.
     INVALID = 0;
     // UNKNOWN is used whenever there were no heartbeats received from a
     // given node AND too little time has passed since the last Curator leader
     // election to know whether the node is actually timing out. UNKNOWN
     // is also returned for nodes whose NodeState does not equal
     // NODE_STATE_UP.
     UNKNOWN = 1;
     // HEALTHY describes nodes that have sent a heartbeat recently.
     HEALTHY = 2;
     // HEARTBEAT_TIMEOUT describes nodes that have not sent a heartbeat in
     // the interval specified by curator.HeartbeatTimeout.
     HEARTBEAT_TIMEOUT = 3;
   }
   // health is the health of the node, see Health.
   Health health = 5;
   // time_since_heartbeat is the time elapsed since the last of the node's
   // heartbeats was received, expressed as a google.protobuf.Duration. It is
   // only valid with a health status of either HEALTHY or HEARTBEAT_TIMEOUT.
   google.protobuf.Duration time_since_heartbeat = 6;

   // tpm_usage describes whether this node has a TPM 2.0 and whether it is
   // being actively used as part of its membership in the Metropolis cluster.
   //
   // Currently, the TPM 2.0 is only used to seal the local part of the disk
   // encryption key and the early join credentials of the node. Depending on
   // future cluster configuration settings, this might also indicate that the
   // node has actually passed high assurance hardware attestation against the
   // cluster.
   metropolis.proto.common.NodeTPMUsage tpm_usage = 8;
 }

 // ApproveNodeRequest is the request of Management.ApproveNode.
 message ApproveNodeRequest {
   // pubkey is the raw public key of the node being approved. It has to
   // correspond to a node currently in the cluster.
   bytes pubkey = 1;
 }

 // ApproveNodeResponse is the response of Management.ApproveNode. It currently
 // carries no fields.
 message ApproveNodeResponse {}

 // UpdateNodeRolesRequest updates roles of a single node, identified by either
 // its pubkey or its id. All role fields are optional, and no change will
 // result if they're either unset or if their value matches existing state.
 message UpdateNodeRolesRequest {
   // node uniquely identifies the node subject to this request.
   oneof node {
     // pubkey is the Ed25519 public key of this node, which can be used to
     // generate the node's ID.
     bytes pubkey = 1;
     // id is the human-readable identifier of the node, based on its public
     // key.
     string id = 4;
   }

   // kubernetesWorker adjusts the appropriate role when set.
   optional bool kubernetesWorker = 2;
   // kubernetesController adjusts the appropriate role when set. Nodes performing
   // this role must also be consensus members.
   optional bool kubernetesController = 5;
   // consensusMember adjusts the appropriate role when set.
   optional bool consensusMember = 3;
 }

 // UpdateNodeRolesResponse is the response of Management.UpdateNodeRoles. It
 // currently carries no fields.
 message UpdateNodeRolesResponse {}

 // DecommissionNodeRequest is the request of Management.DecommissionNode. It
 // identifies the node to decommission by either its pubkey or its id.
 message DecommissionNodeRequest {
   // node uniquely identifies the node subject to this request.
   oneof node {
     // pubkey is the Ed25519 public key of this node, which can be used to
     // generate the node's ID.
     bytes pubkey = 1;
     // id is the human-readable identifier of the node, based on its public
     // key.
     string id = 4;
   }
 }

 // DecommissionNodeResponse is the response of Management.DecommissionNode. It
 // currently carries no fields.
 message DecommissionNodeResponse {}

 // DeleteNodeRequest is the request of Management.DeleteNode. It identifies the
 // node to delete and optionally enables safety bypasses for the deletion.
 message DeleteNodeRequest {
   // node uniquely identifies the node subject to this request.
   oneof node {
     // pubkey is the Ed25519 public key of this node, which can be used to
     // generate the node's ID.
     bytes pubkey = 1;
     // id is the human-readable identifier of the node, based on its public
     // key.
     string id = 2;
   }

   // SafetyBypassHasRoles is an empty marker message: the bypass is enabled by
   // the presence of the safety_bypass_has_roles field.
   message SafetyBypassHasRoles {
   }
   // If set, safety_bypass_has_roles allows the removal of nodes which still have
   // roles assigned.
   //
   // Danger: removing nodes which still have roles assigned might leave the
   // cluster in an inconsistent state. Unassigning roles from a node via
   // UpdateNodeRoles ensures consistency.
   //
   // It's also advised to never use this option in automated workflows, as
   // leaving it unset prevents a runaway automation from removing nodes that
   // are still used for actual work.
   //
   // Nodes which broke down or otherwise became unreachable shouldn't need to
   // enable this option, as unassigning the role from a node does not require it
   // to be healthy.
   //
   // A short summary of how to deal with possible inconsistencies after removing
   // a node with roles still assigned:
   //
   // 1. KubernetesWorker: remove the node from the Kubernetes cluster via kubectl
   //    (kubectl delete node metropolis-xxx).
   // 2. KubernetesController: no cleanup should be necessary.
   // 3. ConsensusMember:
   //     a. the cluster still has quorum: remove the node from etcd.
   //        TODO(q3k): document this
   //     b. the cluster has no quorum: rebuild the cluster
   SafetyBypassHasRoles safety_bypass_has_roles = 3;

   // SafetyBypassNotDecommissioned is an empty marker message: the bypass is
   // enabled by the presence of the safety_bypass_not_decommissioned field.
   message SafetyBypassNotDecommissioned {
   }
   // If set, safety_bypass_not_decommissioned allows the removal of nodes that
   // haven't yet been decommissioned.
   //
   // Danger: removing nodes which haven't been decommissioned via
   // DecommissionNode can leave nodes attempting to reconnect to the cluster,
   // and does not fully clean up cryptographic material from the node.
   //
   // This option will need to be used when a node has broken down, as it's
   // impossible to move a node from UP to DECOMMISSIONED if that node is
   // unreachable.
   //
   // To clean up after using this option:
   //
   // 1. Make sure that the node does not boot back up. The cluster will prevent
   //    the node from rejoining the cluster, but the node will by itself
   //    continue to crash and reboot due to a rejection by the cluster.
   // 2. Zero out the node's ESP to remove any leftover cryptographic secrets.
   //    These secrets are safeguarded according to the cluster's
   //    StorageSecurityPolicy and NodeTPMUsage. Depending on the settings,
   //    cleaning up these secrets before letting other systems access the node
   //    might be critical to maintaining cluster security.
   SafetyBypassNotDecommissioned safety_bypass_not_decommissioned = 4;
 }

 // DeleteNodeResponse is the response of Management.DeleteNode. It currently
 // carries no fields.
 message DeleteNodeResponse {}

 // NodeManagement runs on every node of the cluster and provides management
 // and troubleshooting RPCs to operators. All requests must be authenticated.
 service NodeManagement {
   // Logs returns historical and/or streaming logs for a given DN with given
   // filters from the system global LogTree.
   //
   // For more information about this API, see //metropolis/pkg/logtree. But, in
   // summary:
   //   - All logging is performed to a DN (distinguished name), which is a
   //     dot-delimited string like foo.bar.baz.
   //   - Log entries can be either raw (coming from unstructured logging from
   //     an external service, like a running process) or leveled (emitted by
   //     Metropolis code with a source line, timestamp, and severity).
   //   - The DNs form a tree of logging nodes - and when requesting logs, a
   //     given subtree of DNs can be requested, instead of just a given DN.
   //   - All supervised processes live at `root.<supervisor DN>`. For more
   //     example paths, see the console logs of a running Metropolis node, or
   //     request all logs (at DN "").
   //
   rpc Logs(GetLogsRequest) returns (stream GetLogsResponse) {
     option (metropolis.proto.ext.authorization) = {
       need: PERMISSION_READ_NODE_LOGS
     };
   }
   // UpdateNode updates the node operating system to a new version.
   //
   // Metropolis uses a side-by-side (A/B) update process. This method installs
   // the OS from the given bundle into the inactive slot, activates that slot
   // and then (optionally, see UpdateNodeRequest.activation_mode) reboots to
   // activate it.
   rpc UpdateNode(UpdateNodeRequest) returns (UpdateNodeResponse) {
     option (metropolis.proto.ext.authorization) = {
       need: PERMISSION_UPDATE_NODE
     };
   }
 }

 // GetLogsRequest is the request of NodeManagement.Logs. It selects which logs
 // to return, and whether to return historic data, streamed data, or both.
 message GetLogsRequest {
   // DN from which to request logs. All supervised runnables live at `root.`,
   // the init code lives at `init.`.
   string dn = 1;
   // Filters to apply to returned data.
   repeated metropolis.proto.common.LogFilter filters = 2;

   // BacklogMode selects how much historic (backlog) data is returned.
   enum BacklogMode {
     // Zero value of this enum.
     BACKLOG_INVALID = 0;
     // No historic data will be returned.
     BACKLOG_DISABLE = 1;
     // All available historic data will be returned.
     BACKLOG_ALL = 2;
     // At most backlog_count entries will be returned, if available.
     BACKLOG_COUNT = 3;
   }
   // backlog_mode selects how much historic data is returned, see BacklogMode.
   BacklogMode backlog_mode = 3;
   // backlog_count is the maximum number of historic entries returned when
   // backlog_mode is BACKLOG_COUNT.
   int64 backlog_count = 4;

   // StreamMode selects whether new entries are streamed after the backlog.
   enum StreamMode {
     // Zero value of this enum.
     STREAM_INVALID = 0;
     // No streaming entries, gRPC stream will be closed as soon as all backlog
     // data is served.
     STREAM_DISABLE = 1;
     // Entries will be streamed as early as available right after all backlog
     // data is served.
     STREAM_UNBUFFERED = 2;
   }
   // stream_mode selects whether entries are streamed, see StreamMode.
   StreamMode stream_mode = 5;
 }

 // GetLogsResponse is a single message of the stream returned by
 // NodeManagement.Logs.
 message GetLogsResponse {
   // Entries from the requested historical data (as selected by
   // GetLogsRequest.backlog_mode). They will all be served before the first
   // stream_entries are served (if any).
   repeated metropolis.proto.common.LogEntry backlog_entries = 1;
   // Entries streamed as they arrive. Currently no server-side buffering is
   // enabled, instead every line is served as early as it arrives. However, this
   // might change in the future, so this behaviour cannot be depended upon.
   repeated metropolis.proto.common.LogEntry stream_entries = 2;
 }

 // ActivationMode specifies how an updated OS image should be activated, see
 // UpdateNodeRequest.activation_mode.
 enum ActivationMode {
   // Zero value of this enum.
   ACTIVATION_INVALID = 0;
   // The new bundle is not activated immediately. It gets activated on the next
   // reboot/reset.
   ACTIVATION_NONE = 1;
   // The node is rebooted immediately to activate the new image.
   ACTIVATION_REBOOT = 2;
   // The node uses kexec to activate the new image immediately without fully
   // rebooting.
   ACTIVATION_KEXEC = 3;
 }

 // UpdateNodeRequest is the request of NodeManagement.UpdateNode.
 message UpdateNodeRequest {
   // An HTTPS URL to a Metropolis bundle containing the new OS to install.
   string bundle_url = 1;

   // Field number 2 must not be reused.
   // NOTE(review): presumably held by a field that has since been removed -
   // confirm against the file history.
   reserved 2;

   // Specifies how the updated image should be activated.
   ActivationMode activation_mode = 3;
 }

 // UpdateNodeResponse is the response of NodeManagement.UpdateNode. It
 // currently carries no fields.
 message UpdateNodeResponse {}
	syntax = "proto3";
	package metropolis.proto.api;
	option go_package = "source.monogon.dev/metropolis/proto/api";

	import "google/protobuf/duration.proto";
	import "google/protobuf/timestamp.proto";

	import "metropolis/proto/common/common.proto";
	import "metropolis/proto/ext/authorization.proto";

	// Management is the service available to Cluster Managers. It allows
	// operational work to be performed on the cluster (eg. adding nodes,
	// retrieving information about a running cluster, etc.).
	//
	// Every RPC below declares the permission it needs through the
	// metropolis.proto.ext.authorization method option.
	service Management {
	  // GetRegisterTicket retrieves the current RegisterTicket which is required
	  // for new nodes to register into the cluster. Presenting this ticket on
	  // registration does not automatically grant access to arbitrary node
	  // registration. Instead, it is used to guard the API surface of the
	  // Register RPC from potential denial of service attacks, and can be
	  // regenerated at any time in case it leaks.
	  rpc GetRegisterTicket(GetRegisterTicketRequest)
	      returns (GetRegisterTicketResponse) {
	    option (metropolis.proto.ext.authorization) = {
	      need: PERMISSION_GET_REGISTER_TICKET
	    };
	  }

	  // GetClusterInfo retrieves publicly available summary information about
	  // this cluster, notably data required for nodes to register into a cluster
	  // or join it (other than the Register Ticket, which is gated by an
	  // additional permission).
	  rpc GetClusterInfo(GetClusterInfoRequest) returns (GetClusterInfoResponse) {
	    option (metropolis.proto.ext.authorization) = {
	      need: PERMISSION_READ_CLUSTER_STATUS
	    };
	  }

	  // GetNodes retrieves information about nodes in the cluster. It streams
	  // one Node message, containing all available data, for every node that
	  // matches the filter given in the request (or for every node in the
	  // cluster if the filter is empty).
	  rpc GetNodes(GetNodesRequest) returns (stream Node) {
	    option (metropolis.proto.ext.authorization) = {
	      need: PERMISSION_READ_CLUSTER_STATUS
	    };
	  }

	  // ApproveNode progresses a node's registration process by changing its
	  // state in the cluster from NEW to STANDBY, if not yet STANDBY. This is
	  // required for the node to fully become part of the cluster (ie. have an
	  // UP state), and is required to be called by a manager manually.
	  //
	  // Managers can find out what nodes require approval by performing
	  // a GetNodes call and filtering for nodes in the NEW state. This call is
	  // idempotent and can be executed multiple times, and is a no-op if the
	  // node is already in the STANDBY or even UP states.
	  //
	  // In the future, the approval process will be governed by cluster policy,
	  // but currently any node can be approved by a manager, and the manager is
	  // responsible for performing an out-of-band attestation of the node being
	  // approved (eg. by verifying that the node that is being approved has the
	  // same public key as what the registering node displays in its startup
	  // logs).
	  rpc ApproveNode(ApproveNodeRequest) returns (ApproveNodeResponse) {
	    option (metropolis.proto.ext.authorization) = {
	      need: PERMISSION_APPROVE_NODE
	    };
	  }

	  // UpdateNodeRoles updates a single node's roles.
	  rpc UpdateNodeRoles(UpdateNodeRolesRequest)
	      returns (UpdateNodeRolesResponse) {
	    option (metropolis.proto.ext.authorization) = {
	      need: PERMISSION_UPDATE_NODE_ROLES
	    };
	  }

	  // DecommissionNode starts the decommissioning of a node. Decommissioning
	  // takes a node from UP, through the following states:
	  //
	  //   1. DECOMMISSION_REQUESTED
	  //      The node will detect this state on the cluster and begin a cleanup
	  //      process which consists of removing either key material or zeroing
	  //      out the data partition, depending on cluster policy. It will report
	  //      to the cluster that it has begun the process, which will take it to
	  //      the next state:
	  //
	  //   2. DECOMMISSIONING
	  //      The node will continue cleanup. After cleanup is successful, it
	  //      will report back to the cluster which will take it to
	  //      DECOMMISSIONED. The node then reboots, and never comes back.
	  //
	  //   3. DECOMMISSIONED
	  //      The node can be removed with a subsequent DeleteNode call.
	  //
	  // TODO(q3k): implement this, possibly iron out the state machine involved.
	  //
	  // The node cannot have any roles assigned to it when it is being
	  // decommissioned: none may be assigned when the decommissioning process is
	  // requested, and none may be added to it while it is decommissioning.
	  rpc DecommissionNode(DecommissionNodeRequest)
	      returns (DecommissionNodeResponse) {
	    option (metropolis.proto.ext.authorization) = {
	      need: PERMISSION_DECOMMISSION_NODE
	    };
	  }

	  // DeleteNode deletes a node from the cluster. By default the node must be
	  // in the DECOMMISSIONED state and may not have any roles assigned. However,
	  // some safety bypasses are available for nodes which have become
	  // unavailable and thus cannot be decommissioned correctly - see the
	  // DeleteNodeRequest documentation for more details.
	  rpc DeleteNode(DeleteNodeRequest) returns (DeleteNodeResponse) {
	    option (metropolis.proto.ext.authorization) = {
	      need: PERMISSION_DELETE_NODE
	    };
	  }
	}

	// GetRegisterTicketRequest is the request of Management.GetRegisterTicket. It
	// currently carries no fields.
	message GetRegisterTicketRequest {}

	// GetRegisterTicketResponse is the response of Management.GetRegisterTicket.
	message GetRegisterTicketResponse {
	  // ticket holds the opaque bytes that comprise the RegisterTicket.
	  bytes ticket = 1;
	}

	// GetClusterInfoRequest is the request of Management.GetClusterInfo. It
	// currently carries no fields.
	message GetClusterInfoRequest {}

	// GetClusterInfoResponse is the response of Management.GetClusterInfo.
	message GetClusterInfoResponse {
	  // cluster_directory describes the individual nodes of the cluster in a way
	  // that can be used to dial the cluster's services.
	  metropolis.proto.common.ClusterDirectory cluster_directory = 1;

	  // ca_certificate is the CA certificate of the cluster, as DER-encoded x509.
	  bytes ca_certificate = 2;

	  // cluster_configuration is the cluster's configuration, as described by
	  // metropolis.proto.common.ClusterConfiguration.
	  metropolis.proto.common.ClusterConfiguration cluster_configuration = 3;
	}

	// GetNodesRequest is the request of Management.GetNodes.
	message GetNodesRequest {
	  // filter is a CEL expression used to limit the set of nodes returned by
	  // GetNodes. Each processed node protobuf message is exposed to the
	  // expression as the "node" variable, while the related state and health
	  // enum constants are anchored in the root namespace, eg. NODE_STATE_UP, or
	  // HEARTBEAT_TIMEOUT. A node is returned whenever the expression evaluates
	  // to true for it. If the filter is empty, all nodes are returned.
	  string filter = 1;
	}

	// Node describes a single node of a Metropolis cluster, as streamed by
	// Management.GetNodes. One such message is emitted for each node in the
	// cluster, and contains information about that node.
	//
	// Only fields which PERMISSION_READ_CLUSTER_STATUS allows access to are
	// present, ie. 'non-private' fields: ones that might be internal to the
	// cluster and possibly considered sensitive information about the
	// infrastructure, but whose knowledge does not allow escalating privileges
	// within the cluster.
	message Node {
	  // pubkey is the raw Ed25519 public key of this node, which can be used to
	  // generate the node's ID. This is always set.
	  bytes pubkey = 1;
	  // id is the node ID calculated from pubkey, ie. 'metropolis-123456'.
	  string id = 7;
	  // state is the state of the node from the point of view of the cluster.
	  // This is always set.
	  metropolis.proto.common.NodeState state = 2;
	  // status is the last status reported by the node. It is absent if the node
	  // hasn't yet reported its status.
	  metropolis.proto.common.NodeStatus status = 3;
	  // roles are the roles assigned to the node by the cluster. This is always
	  // set.
	  metropolis.proto.common.NodeRoles roles = 4;

	  // Health describes a node's health as seen from the cluster perspective.
	  enum Health {
	    // INVALID is the zero value of this enum.
	    INVALID = 0;
	    // UNKNOWN is used whenever there were no heartbeats received from a
	    // given node AND too little time has passed since the last Curator
	    // leader election to know whether the node is actually timing out.
	    // UNKNOWN is also returned for nodes whose NodeState does not equal
	    // NODE_STATE_UP.
	    UNKNOWN = 1;
	    // HEALTHY describes nodes that have sent a heartbeat recently.
	    HEALTHY = 2;
	    // HEARTBEAT_TIMEOUT describes nodes that have not sent a heartbeat in
	    // the interval specified by curator.HeartbeatTimeout.
	    HEARTBEAT_TIMEOUT = 3;
	  }
	  // health is the health of the node, see Health.
	  Health health = 5;
	  // time_since_heartbeat is the time elapsed since the last of the node's
	  // heartbeats was received, expressed as a google.protobuf.Duration. It is
	  // only valid with a health status of either HEALTHY or HEARTBEAT_TIMEOUT.
	  google.protobuf.Duration time_since_heartbeat = 6;

	  // tpm_usage describes whether this node has a TPM 2.0 and whether it is
	  // being actively used as part of its membership in the Metropolis cluster.
	  //
	  // Currently, the TPM 2.0 is only used to seal the local part of the disk
	  // encryption key and the early join credentials of the node. Depending on
	  // future cluster configuration settings, this might also indicate that the
	  // node has actually passed high assurance hardware attestation against the
	  // cluster.
	  metropolis.proto.common.NodeTPMUsage tpm_usage = 8;
	}

	// ApproveNodeRequest is the request of Management.ApproveNode.
	message ApproveNodeRequest {
	  // pubkey is the raw public key of the node being approved. It has to
	  // correspond to a node currently in the cluster.
	  bytes pubkey = 1;
	}

	// ApproveNodeResponse is the response of Management.ApproveNode. It currently
	// carries no fields.
	message ApproveNodeResponse {}

	// UpdateNodeRolesRequest updates roles of a single node, identified by either
	// its pubkey or its id. All role fields are optional, and no change will
	// result if they're either unset or if their value matches existing state.
	message UpdateNodeRolesRequest {
	  // node uniquely identifies the node subject to this request.
	  oneof node {
	    // pubkey is the Ed25519 public key of this node, which can be used to
	    // generate the node's ID.
	    bytes pubkey = 1;
	    // id is the human-readable identifier of the node, based on its public
	    // key.
	    string id = 4;
	  }

	  // kubernetesWorker adjusts the appropriate role when set.
	  optional bool kubernetesWorker = 2;
	  // kubernetesController adjusts the appropriate role when set. Nodes
	  // performing this role must also be consensus members.
	  optional bool kubernetesController = 5;
	  // consensusMember adjusts the appropriate role when set.
	  optional bool consensusMember = 3;
	}

	// UpdateNodeRolesResponse is the response of Management.UpdateNodeRoles. It
	// currently carries no fields.
	message UpdateNodeRolesResponse {}

	// DecommissionNodeRequest is the request of Management.DecommissionNode. It
	// identifies the node to decommission by either its pubkey or its id.
	message DecommissionNodeRequest {
	  // node uniquely identifies the node subject to this request.
	  oneof node {
	    // pubkey is the Ed25519 public key of this node, which can be used to
	    // generate the node's ID.
	    bytes pubkey = 1;
	    // id is the human-readable identifier of the node, based on its public
	    // key.
	    string id = 4;
	  }
	}

	// DecommissionNodeResponse is the response of Management.DecommissionNode. It
	// currently carries no fields.
	message DecommissionNodeResponse {}

	// DeleteNodeRequest is the request of Management.DeleteNode. It identifies the
	// node to delete and optionally enables safety bypasses for the deletion.
	message DeleteNodeRequest {
	  // node uniquely identifies the node subject to this request.
	  oneof node {
	    // pubkey is the Ed25519 public key of this node, which can be used to
	    // generate the node's ID.
	    bytes pubkey = 1;
	    // id is the human-readable identifier of the node, based on its public
	    // key.
	    string id = 2;
	  }

	  // SafetyBypassHasRoles is an empty marker message: the bypass is enabled by
	  // the presence of the safety_bypass_has_roles field.
	  message SafetyBypassHasRoles {}
	  // If set, safety_bypass_has_roles allows the removal of nodes which still
	  // have roles assigned.
	  //
	  // Danger: removing nodes which still have roles assigned might leave the
	  // cluster in an inconsistent state. Unassigning roles from a node via
	  // UpdateNodeRoles ensures consistency.
	  //
	  // It's also advised to never use this option in automated workflows, as
	  // leaving it unset prevents a runaway automation from removing nodes that
	  // are still used for actual work.
	  //
	  // Nodes which broke down or otherwise became unreachable shouldn't need to
	  // enable this option, as unassigning the role from a node does not require
	  // it to be healthy.
	  //
	  // A short summary of how to deal with possible inconsistencies after
	  // removing a node with roles still assigned:
	  //
	  // 1. KubernetesWorker: remove the node from the Kubernetes cluster via
	  //    kubectl (kubectl delete node metropolis-xxx).
	  // 2. KubernetesController: no cleanup should be necessary.
	  // 3. ConsensusMember:
	  //     a. the cluster still has quorum: remove the node from etcd.
	  //        TODO(q3k): document this
	  //     b. the cluster has no quorum: rebuild the cluster
	  SafetyBypassHasRoles safety_bypass_has_roles = 3;

	  // SafetyBypassNotDecommissioned is an empty marker message: the bypass is
	  // enabled by the presence of the safety_bypass_not_decommissioned field.
	  message SafetyBypassNotDecommissioned {}
	  // If set, safety_bypass_not_decommissioned allows the removal of nodes that
	  // haven't yet been decommissioned.
	  //
	  // Danger: removing nodes which haven't been decommissioned via
	  // DecommissionNode can leave nodes attempting to reconnect to the cluster,
	  // and does not fully clean up cryptographic material from the node.
	  //
	  // This option will need to be used when a node has broken down, as it's
	  // impossible to move a node from UP to DECOMMISSIONED if that node is
	  // unreachable.
	  //
	  // To clean up after using this option:
	  //
	  // 1. Make sure that the node does not boot back up. The cluster will
	  //    prevent the node from rejoining the cluster, but the node will by
	  //    itself continue to crash and reboot due to a rejection by the cluster.
	  // 2. Zero out the node's ESP to remove any leftover cryptographic secrets.
	  //    These secrets are safeguarded according to the cluster's
	  //    StorageSecurityPolicy and NodeTPMUsage. Depending on the settings,
	  //    cleaning up these secrets before letting other systems access the node
	  //    might be critical to maintaining cluster security.
	  SafetyBypassNotDecommissioned safety_bypass_not_decommissioned = 4;
	}

	// DeleteNodeResponse is the response of Management.DeleteNode. It currently
	// carries no fields.
	message DeleteNodeResponse {}

	// NodeManagement runs on every node of the cluster and provides management
	// and troubleshooting RPCs to operators. All requests must be authenticated.
	service NodeManagement {
	  // Logs returns historical and/or streaming logs for a given DN with given
	  // filters from the system global LogTree.
	  //
	  // For more information about this API, see //metropolis/pkg/logtree. But,
	  // in summary:
	  //   - All logging is performed to a DN (distinguished name), which is a
	  //     dot-delimited string like foo.bar.baz.
	  //   - Log entries can be either raw (coming from unstructured logging from
	  //     an external service, like a running process) or leveled (emitted by
	  //     Metropolis code with a source line, timestamp, and severity).
	  //   - The DNs form a tree of logging nodes - and when requesting logs, a
	  //     given subtree of DNs can be requested, instead of just a given DN.
	  //   - All supervised processes live at `root.<supervisor DN>`. For more
	  //     example paths, see the console logs of a running Metropolis node, or
	  //     request all logs (at DN "").
	  //
	  rpc Logs(GetLogsRequest) returns (stream GetLogsResponse) {
	    option (metropolis.proto.ext.authorization) = {
	      need: PERMISSION_READ_NODE_LOGS
	    };
	  }
	  // UpdateNode updates the node operating system to a new version.
	  //
	  // Metropolis uses a side-by-side (A/B) update process. This method installs
	  // the OS from the given bundle into the inactive slot, activates that slot
	  // and then (optionally, see UpdateNodeRequest.activation_mode) reboots to
	  // activate it.
	  rpc UpdateNode(UpdateNodeRequest) returns (UpdateNodeResponse) {
	    option (metropolis.proto.ext.authorization) = {
	      need: PERMISSION_UPDATE_NODE
	    };
	  }
	}

	// GetLogsRequest is the request of NodeManagement.Logs. It selects which logs
	// to return, and whether to return historic data, streamed data, or both.
	message GetLogsRequest {
	  // dn is the DN from which to request logs. All supervised runnables live at
	  // `root.`, the init code lives at `init.`.
	  string dn = 1;
	  // filters are applied to the returned data.
	  repeated metropolis.proto.common.LogFilter filters = 2;

	  // BacklogMode selects how much historic (backlog) data is returned.
	  enum BacklogMode {
	    // Zero value of this enum.
	    BACKLOG_INVALID = 0;
	    // No historic data will be returned.
	    BACKLOG_DISABLE = 1;
	    // All available historic data will be returned.
	    BACKLOG_ALL = 2;
	    // At most backlog_count entries will be returned, if available.
	    BACKLOG_COUNT = 3;
	  }
	  // backlog_mode selects how much historic data is returned, see BacklogMode.
	  BacklogMode backlog_mode = 3;
	  // backlog_count is the maximum number of historic entries returned when
	  // backlog_mode is BACKLOG_COUNT.
	  int64 backlog_count = 4;

	  // StreamMode selects whether new entries are streamed after the backlog.
	  enum StreamMode {
	    // Zero value of this enum.
	    STREAM_INVALID = 0;
	    // No streaming entries, gRPC stream will be closed as soon as all backlog
	    // data is served.
	    STREAM_DISABLE = 1;
	    // Entries will be streamed as early as available right after all backlog
	    // data is served.
	    STREAM_UNBUFFERED = 2;
	  }
	  // stream_mode selects whether entries are streamed, see StreamMode.
	  StreamMode stream_mode = 5;
	}

	// GetLogsResponse is a single message of the stream returned by
	// NodeManagement.Logs.
	message GetLogsResponse {
	  // backlog_entries are entries from the requested historical data (as
	  // selected by GetLogsRequest.backlog_mode). They will all be served before
	  // the first stream_entries are served (if any).
	  repeated metropolis.proto.common.LogEntry backlog_entries = 1;
	  // stream_entries are entries streamed as they arrive. Currently no
	  // server-side buffering is enabled, instead every line is served as early
	  // as it arrives. However, this might change in the future, so this
	  // behaviour cannot be depended upon.
	  repeated metropolis.proto.common.LogEntry stream_entries = 2;
	}

	// ActivationMode specifies how an updated OS image should be activated, see
	// UpdateNodeRequest.activation_mode.
	enum ActivationMode {
	  // Zero value of this enum.
	  ACTIVATION_INVALID = 0;
	  // The new bundle is not activated immediately. It gets activated on the
	  // next reboot/reset.
	  ACTIVATION_NONE = 1;
	  // The node is rebooted immediately to activate the new image.
	  ACTIVATION_REBOOT = 2;
	  // The node uses kexec to activate the new image immediately without fully
	  // rebooting.
	  ACTIVATION_KEXEC = 3;
	}

	// UpdateNodeRequest is the request of NodeManagement.UpdateNode.
	message UpdateNodeRequest {
	  // bundle_url is an HTTPS URL to a Metropolis bundle containing the new OS
	  // to install.
	  string bundle_url = 1;

	  // Field number 2 must not be reused.
	  // NOTE(review): presumably held by a field that has since been removed -
	  // confirm against the file history.
	  reserved 2;

	  // activation_mode specifies how the updated image should be activated.
	  ActivationMode activation_mode = 3;
	}

	// UpdateNodeResponse is the response of NodeManagement.UpdateNode. It
	// currently carries no fields.
	message UpdateNodeResponse {}