| syntax = "proto3"; | 
 | package metropolis.proto.api; | 
 | option go_package = "source.monogon.dev/metropolis/proto/api"; | 
 |  | 
 | import "google/protobuf/duration.proto"; | 
 | import "google/protobuf/timestamp.proto"; | 
 |  | 
 | import "metropolis/proto/common/common.proto"; | 
 | import "metropolis/proto/ext/authorization.proto"; | 
 |  | 
 | // Management service available to Cluster Managers, allowing operational work | 
 | // to be performed on the cluster (eg. adding nodes, retrieving information | 
 | // about a running cluster, etc.). | 
 | service Management { | 
 |     // GetRegisterTicket retrieves the current RegisterTicket which is required | 
 |     // for new nodes to register into the cluster. Presenting this ticket on | 
 |     // registration does not automatically grant access to arbitrary node | 
 |     // registration. Instead, it is used to guard the API surface of the | 
 |     // Register RPC from potential denial of service attacks, and can be | 
 |     // regenerated at any time in case it leaks. | 
 |     rpc GetRegisterTicket(GetRegisterTicketRequest) returns (GetRegisterTicketResponse) { | 
 |         option (metropolis.proto.ext.authorization) = { | 
 |             need: PERMISSION_GET_REGISTER_TICKET | 
 |         }; | 
 |     } | 
 |  | 
 |     // GetClusterInfo retrieves publicly available summary information about | 
 |     // this cluster, notably data required for nodes to register into a cluster | 
 |     // or join it (other than the Register Ticket, which is gated by an | 
 |     // additional permission). | 
 |     rpc GetClusterInfo(GetClusterInfoRequest) returns (GetClusterInfoResponse) { | 
 |         option (metropolis.proto.ext.authorization) = { | 
 |             need: PERMISSION_READ_CLUSTER_STATUS | 
 |         }; | 
 |     } | 
 |  | 
 |     // GetNodes retrieves information about nodes in the cluster. Currently, | 
 |     // it returns all available data about all nodes. | 
 |     rpc GetNodes(GetNodesRequest) returns (stream Node) { | 
 |         option (metropolis.proto.ext.authorization) = { | 
 |             need: PERMISSION_READ_CLUSTER_STATUS | 
 |         }; | 
 |     } | 
 |  | 
 |     // ApproveNode progresses a node's registration process by changing its state | 
 |     // in the cluster from NEW to STANDBY, if not yet STANDBY. This is required | 
 |     // for the node to fully become part of the cluster (ie. have an UP state), | 
 |     // and is required to be called by a manager manually. | 
 |     // | 
 |     // Managers can find out what nodes require approval by performing | 
 |     // a GetNodes call and filtering for nodes in the NEW state. This call is | 
 |     // idempotent and can be executed multiple times, and is a no-op if the node | 
 |     // is already in the STANDBY or even UP states. | 
 |     // | 
 |     // In the future, the approval process will be governed by cluster policy, | 
 |     // but currently any node can be approved by a manager, and the manager is | 
 |     // responsible for performing an out-of-band attestation of the node being | 
 |     // approved (eg. by verifying that the node that is being approved has the | 
 |     // same public key as what the registering node displays in its startup | 
 |     // logs). | 
 |     rpc ApproveNode(ApproveNodeRequest) returns (ApproveNodeResponse) { | 
 |         option (metropolis.proto.ext.authorization) = { | 
 |             need: PERMISSION_APPROVE_NODE | 
 |         }; | 
 |     } | 
 |  | 
 |     // UpdateNodeRoles updates a single node's roles. | 
 |     rpc UpdateNodeRoles(UpdateNodeRolesRequest) returns (UpdateNodeRolesResponse) { | 
 |         option (metropolis.proto.ext.authorization) = { | 
 |             need: PERMISSION_UPDATE_NODE_ROLES | 
 |         }; | 
 |     } | 
 |  | 
 |     // DecommissionNode requests the decommissioning of a node. Decommissioning | 
 |     // a node takes it from UP, through | 
 |     // | 
 |     //   1. DECOMMISSION_REQUESTED | 
 |     //      The node will detect this state on the cluster and begin a cleanup | 
 |     //      process which consists of removing either key material or zeroing | 
 |     //      out the data partition, depending on cluster policy. It will report | 
 |     //      to the cluster that it has begun the process, which will take it to | 
 |     //      the next state: | 
 |     // | 
 |     //   2. DECOMMISSIONING | 
 |     //      The node will continue cleanup. After cleanup is successful, it will | 
 |     //      report back to the cluster which will take it to DECOMMISSIONED. The | 
 |     //      node then reboots, and never comes back. | 
 |     // | 
 |     //   3. DECOMMISSIONED | 
 |     //      The node can be removed with a subsequent DeleteNode call. | 
 |     // | 
 |     // TODO(q3k): implement this, possibly iron out the state machine involved. | 
 |     // | 
 |     // The node cannot have any roles assigned to it when it is being | 
 |     // decommissioned: none may be assigned when the decommissioning process is | 
 |     // requested, and none may be added to it while it is decommissioning. | 
 |     rpc DecommissionNode(DecommissionNodeRequest) returns (DecommissionNodeResponse) { | 
 |         option (metropolis.proto.ext.authorization) = { | 
 |             need: PERMISSION_DECOMMISSION_NODE | 
 |         }; | 
 |     } | 
 |  | 
 |     // DeleteNode deletes a node from the cluster. By default the node must be | 
 |     // in the DECOMMISSIONED state and may not have any roles assigned. However, | 
 |     // some safety bypasses are available for nodes which have become | 
 |     // unavailable and thus cannot be decommissioned correctly - see the request | 
 |     // documentation for more details. | 
 |     rpc DeleteNode(DeleteNodeRequest) returns (DeleteNodeResponse) { | 
 |         option (metropolis.proto.ext.authorization) = { | 
 |             need: PERMISSION_DELETE_NODE | 
 |         }; | 
 |     } | 
 | } | 
 |  | 
 | // GetRegisterTicketRequest is the request message of | 
 | // Management.GetRegisterTicket. It currently carries no fields. | 
 | message GetRegisterTicketRequest { | 
 | } | 
 |  | 
 | // GetRegisterTicketResponse is the response message of | 
 | // Management.GetRegisterTicket. | 
 | message GetRegisterTicketResponse { | 
 |     // Opaque bytes that comprise the RegisterTicket. | 
 |     bytes ticket = 1; | 
 | } | 
 |  | 
 | // GetClusterInfoRequest is the request message of Management.GetClusterInfo. | 
 | // It currently carries no fields. | 
 | message GetClusterInfoRequest { | 
 | } | 
 |  | 
 | // GetClusterInfoResponse is the response message of | 
 | // Management.GetClusterInfo. | 
 | message GetClusterInfoResponse { | 
 |     // cluster_directory contains information about individual nodes in the | 
 |     // cluster that can be used to dial the cluster's services. | 
 |     metropolis.proto.common.ClusterDirectory cluster_directory = 1; | 
 |  | 
 |     // ca_certificate is the x509 DER encoded CA certificate of the cluster. | 
 |     bytes ca_certificate = 2; | 
 |  | 
 |     // cluster_configuration is the configuration of the cluster; its contents | 
 |     // are defined by metropolis.proto.common.ClusterConfiguration. | 
 |     metropolis.proto.common.ClusterConfiguration cluster_configuration = 3; | 
 | } | 
 |  | 
 | // GetNodesRequest is the request message of Management.GetNodes. | 
 | message GetNodesRequest { | 
 |     // filter is a CEL expression used to limit the count of GetNodes results. | 
 |     // Each processed node protobuf message is exposed to the filter as the | 
 |     // "node" variable, while related state and health enum constants are | 
 |     // anchored in the root namespace, eg. NODE_STATE_UP, or HEARTBEAT_TIMEOUT. | 
 |     // A node is returned each time the expression evaluates to true. If | 
 |     // empty, all nodes are returned. | 
 |     string filter = 1; | 
 | } | 
 |  | 
 | // Node in a Metropolis cluster, streamed by Management.GetNodes. For each node | 
 | // in the cluster, this message will be emitted and will contain information | 
 | // about that node. | 
 | // | 
 | // The fields contained are node fields that PERMISSION_READ_CLUSTER_STATUS | 
 | // allows access to, ie. 'non-private' fields, ones that might be internal to | 
 | // the cluster and possibly considered sensitive information about the | 
 | // infrastructure, but whose knowledge does not allow to escalate privileges | 
 | // within the cluster. | 
 | message Node { | 
 |     // Raw Ed25519 public key of this node, which can be used to generate | 
 |     // the node's ID. This is always set. | 
 |     bytes pubkey = 1; | 
 |     // Node ID calculated from pubkey, ie. 'metropolis-123456'. | 
 |     string id = 7; | 
 |     // State of the node from the point of view of the cluster. This is | 
 |     // always set. | 
 |     metropolis.proto.common.NodeState state = 2; | 
 |     // Last reported status by the Node, absent if a node hasn't yet reported | 
 |     // its status. | 
 |     metropolis.proto.common.NodeStatus status = 3; | 
 |     // Roles assigned by the cluster. This is always set. | 
 |     metropolis.proto.common.NodeRoles roles = 4; | 
 |  | 
 |     // Health describes node's health as seen from the cluster perspective. | 
 |     enum Health { | 
 |       // INVALID is the zero (default) value of this enum. | 
 |       INVALID = 0; | 
 |       // UNKNOWN is used whenever there were no heartbeats received from a | 
 |       // given node AND too little time has passed since last Curator leader | 
 |       // election to know whether the node is actually timing out. UNKNOWN | 
 |       // is also returned for nodes whose NodeState does not equal | 
 |       // NODE_STATE_UP. | 
 |       UNKNOWN = 1; | 
 |       // HEALTHY describes nodes that have sent a heartbeat recently. | 
 |       HEALTHY = 2; | 
 |       // HEARTBEAT_TIMEOUT describes nodes that have not sent a heartbeat in | 
 |       // the interval specified by curator.HeartbeatTimeout. | 
 |       HEARTBEAT_TIMEOUT = 3; | 
 |     } | 
 |     // health is the health of this node, see Health. | 
 |     Health health = 5; | 
 |     // time_since_heartbeat is the duration since the last of the node's | 
 |     // heartbeats was received, expressed in nanoseconds. It is only valid with | 
 |     // the health status of either HEALTHY or HEARTBEAT_TIMEOUT. | 
 |     google.protobuf.Duration time_since_heartbeat = 6; | 
 |  | 
 |     // tpm_usage describes whether this node has a TPM 2.0 and whether it is | 
 |     // being actively used as part of its membership in the Metropolis cluster. | 
 |     // | 
 |     // Currently, the TPM 2.0 is only used to seal the local part of the disk | 
 |     // encryption key and the early join credentials of the node. Depending on | 
 |     // future cluster configuration settings, this might also indicate that the | 
 |     // node has actually passed high assurance hardware attestation against the | 
 |     // cluster. | 
 |     metropolis.proto.common.NodeTPMUsage tpm_usage = 8; | 
 | } | 
 |  | 
 | // ApproveNodeRequest is the request message of Management.ApproveNode. | 
 | message ApproveNodeRequest { | 
 |     // Raw public key of the node being approved, has to correspond to a node | 
 |     // currently in the cluster. | 
 |     bytes pubkey = 1; | 
 | } | 
 |  | 
 | // ApproveNodeResponse is the response message of Management.ApproveNode. It | 
 | // currently carries no fields. | 
 | message ApproveNodeResponse { | 
 | } | 
 |  | 
 | // UpdateNodeRolesRequest updates roles of a single node identified by either | 
 | // its pubkey or its id. All role fields are optional, and no change will | 
 | // result if they're either unset or if their value matches existing state. | 
 | message UpdateNodeRolesRequest { | 
 |   // node uniquely identifies the node subject to this request. | 
 |   oneof node { | 
 |     // pubkey is the Ed25519 public key of this node, which can be used to | 
 |     // generate the node's ID. | 
 |     bytes pubkey = 1; | 
 |     // id is the human-readable identifier of the node, based on its public | 
 |     // key. | 
 |     string id = 4; | 
 |   } | 
 |  | 
 |   // kubernetesWorker adjusts the appropriate role when set. | 
 |   optional bool kubernetesWorker = 2; | 
 |   // kubernetesController adjusts the appropriate role when set. Nodes performing | 
 |   // this role must also be consensus members. | 
 |   optional bool kubernetesController = 5; | 
 |   // consensusMember adjusts the appropriate role when set. | 
 |   optional bool consensusMember = 3; | 
 | } | 
 |  | 
 | // UpdateNodeRolesResponse is the response message of | 
 | // Management.UpdateNodeRoles. It currently carries no fields. | 
 | message UpdateNodeRolesResponse { | 
 | } | 
 |  | 
 | // DecommissionNodeRequest is the request message of | 
 | // Management.DecommissionNode. | 
 | message DecommissionNodeRequest { | 
 |   // node uniquely identifies the node subject to this request. | 
 |   oneof node { | 
 |     // pubkey is the Ed25519 public key of this node, which can be used to | 
 |     // generate the node's ID. | 
 |     bytes pubkey = 1; | 
 |     // id is the human-readable identifier of the node, based on its public | 
 |     // key. | 
 |     string id = 4; | 
 |   } | 
 | } | 
 |  | 
 | // DecommissionNodeResponse is the response message of | 
 | // Management.DecommissionNode. It currently carries no fields. | 
 | message DecommissionNodeResponse { | 
 | } | 
 |  | 
 | // DeleteNodeRequest is the request message of Management.DeleteNode. Besides | 
 | // identifying the node to delete, it carries optional safety bypasses. | 
 | message DeleteNodeRequest { | 
 |   // node uniquely identifies the node subject to this request. | 
 |   oneof node { | 
 |     // pubkey is the Ed25519 public key of this node, which can be used to | 
 |     // generate the node's ID. | 
 |     bytes pubkey = 1; | 
 |     // id is the human-readable identifier of the node, based on its public | 
 |     // key. | 
 |     string id = 2; | 
 |   } | 
 |  | 
 |   // SafetyBypassHasRoles is an empty marker message: the bypass is requested | 
 |   // by setting safety_bypass_has_roles to an instance of it. | 
 |   message SafetyBypassHasRoles { | 
 |   } | 
 |   // If set, safety_bypass_has_roles allows the removal of nodes which still have | 
 |   // roles assigned. | 
 |   // | 
 |   // Danger: removing nodes which still have roles assigned might leave the | 
 |   // cluster in an inconsistent state. Unassigning roles from a node via | 
 |   // UpdateNodeRoles ensures consistency. | 
 |   // | 
 |   // It's also advised to never use this option in automated workflows, as this | 
 |   // prevents a runaway automation from removing nodes that are still used for | 
 |   // actual work. | 
 |   // | 
 |   // Nodes which broke down or otherwise became unreachable shouldn't need to | 
 |   // enable this option, as unassigning the role from a node does not require it | 
 |   // to be healthy. | 
 |   // | 
 |   // A short summary of how to deal with possible inconsistencies after removing | 
 |   // a node with roles still assigned: | 
 |   // | 
 |   // 1. KubernetesWorker: remove the node from the Kubernetes cluster via kubectl | 
 |   //    (kubectl delete node metropolis-xxx). | 
 |   // 2. KubernetesController: no cleanup should be necessary. | 
 |   // 3. ConsensusMember: | 
 |   //     a. the cluster still has quorum: remove the node from etcd. | 
 |   //        TODO(q3k): document this | 
 |   //     b. the cluster has no quorum: rebuild the cluster | 
 |   SafetyBypassHasRoles safety_bypass_has_roles = 3; | 
 |  | 
 |   // SafetyBypassNotDecommissioned is an empty marker message: the bypass is | 
 |   // requested by setting safety_bypass_not_decommissioned to an instance of it. | 
 |   message SafetyBypassNotDecommissioned { | 
 |   } | 
 |   // If set, safety_bypass_not_decommissioned will allow the removal of nodes | 
 |   // that haven't yet been decommissioned. | 
 |   // | 
 |   // Danger: removing nodes which haven't been decommissioned via | 
 |   // DecommissionNode can leave nodes attempting to reconnect to the cluster, | 
 |   // and does not fully clean up cryptographic material from the node. | 
 |   // | 
 |   // This option will need to be used when a node has broken down, as it's | 
 |   // impossible to move a node from UP to DECOMMISSIONED if that node is | 
 |   // unreachable. | 
 |   // | 
 |   // To clean up after using this option: | 
 |   // | 
 |   // 1. Make sure that the node does not boot back up. The cluster will prevent | 
 |   //    the node from rejoining the cluster, but the node will by itself | 
 |   //    continue to crash and reboot due to a rejection by the cluster. | 
 |   // 2. Zero out the node's ESP to remove any leftover cryptographic secrets. | 
 |   //    These secrets are safeguarded according to the cluster's | 
 |   //    StorageSecurityPolicy and NodeTPMUsage. Depending on the settings, | 
 |   //    cleaning up these secrets before letting other systems access the node | 
 |   //    might be critical to maintaining cluster security. | 
 |   SafetyBypassNotDecommissioned safety_bypass_not_decommissioned = 4; | 
 | } | 
 |  | 
 | // DeleteNodeResponse is the response message of Management.DeleteNode. It | 
 | // currently carries no fields. | 
 | message DeleteNodeResponse { | 
 | } | 
 |  | 
 | // NodeManagement runs on every node of the cluster and provides management | 
 | // and troubleshooting RPCs to operators. All requests must be authenticated. | 
 | service NodeManagement { | 
 |   // Logs returns historical and/or streaming logs for a given DN with given | 
 |   // filters from the system global LogTree. | 
 |   // | 
 |   // For more information about this API, see //metropolis/pkg/logtree. But, in | 
 |   // summary: | 
 |   //   - All logging is performed to a DN (distinguished name), which is a | 
 |   //     dot-delimited string like foo.bar.baz. | 
 |   //   - Log entries can be either raw (coming from unstructured logging from | 
 |   //     an external service, like a running process) or leveled (emitted by | 
 |   //     Metropolis code with a source line, timestamp, and severity). | 
 |   //   - The DNs form a tree of logging nodes - and when requesting logs, a | 
 |   //     given subtree of DNs can be requested, instead of just a given DN. | 
 |   //   - All supervised processes live at `root.<supervisor DN>`. For more | 
 |   //     example paths, see the console logs of a running Metropolis node, or | 
 |   //     request all logs (at DN ""). | 
 |   // | 
 |   rpc Logs(GetLogsRequest) returns (stream GetLogsResponse) { | 
 |     option (metropolis.proto.ext.authorization) = { | 
 |       need: PERMISSION_READ_NODE_LOGS | 
 |     }; | 
 |   } | 
 |   // UpdateNode updates the node operating system to a new version. | 
 |   // | 
 |   // Metropolis uses a side-by-side (A/B) update process. This method installs | 
 |   // the OS from the given bundle into the inactive slot, activates that slot | 
 |   // and then (optionally) reboots to activate it. | 
 |   rpc UpdateNode(UpdateNodeRequest) returns (UpdateNodeResponse) { | 
 |     option (metropolis.proto.ext.authorization) = { | 
 |       need: PERMISSION_UPDATE_NODE | 
 |     }; | 
 |   } | 
 | } | 
 |  | 
 | // GetLogsRequest is the request message of NodeManagement.Logs. | 
 | message GetLogsRequest { | 
 |   // DN from which to request logs. All supervised runnables live at `root.`, | 
 |   // the init code lives at `init.`. | 
 |   string dn = 1; | 
 |   // Filters to apply to returned data. | 
 |   repeated metropolis.proto.common.LogFilter filters = 2; | 
 |  | 
 |   // BacklogMode selects how much historic (backlog) data is returned. | 
 |   enum BacklogMode { | 
 |     // BACKLOG_INVALID is the zero (default) value of this enum. | 
 |     BACKLOG_INVALID = 0; | 
 |     // No historic data will be returned. | 
 |     BACKLOG_DISABLE = 1; | 
 |     // All available historic data will be returned. | 
 |     BACKLOG_ALL = 2; | 
 |     // At most backlog_count entries will be returned, if available. | 
 |     BACKLOG_COUNT = 3; | 
 |   } | 
 |   // backlog_mode is the requested backlog behaviour, see BacklogMode. | 
 |   BacklogMode backlog_mode = 3; | 
 |   // backlog_count is the maximum number of historic entries returned when | 
 |   // backlog_mode is BACKLOG_COUNT. | 
 |   // NOTE(review): presumably ignored for other backlog modes - confirm. | 
 |   int64 backlog_count = 4; | 
 |  | 
 |   // StreamMode selects whether new entries are streamed after the backlog. | 
 |   enum StreamMode { | 
 |     // STREAM_INVALID is the zero (default) value of this enum. | 
 |     STREAM_INVALID = 0; | 
 |     // No streaming entries, gRPC stream will be closed as soon as all backlog | 
 |     // data is served. | 
 |     STREAM_DISABLE = 1; | 
 |     // Entries will be streamed as early as available right after all backlog | 
 |     // data is served. | 
 |     STREAM_UNBUFFERED = 2; | 
 |   } | 
 |   // stream_mode is the requested streaming behaviour, see StreamMode. | 
 |   StreamMode stream_mode = 5; | 
 | } | 
 |  | 
 | // GetLogsResponse is a single message of the response stream of | 
 | // NodeManagement.Logs. | 
 | message GetLogsResponse { | 
 |   // Historical entries, as requested via backlog_mode in the request. They | 
 |   // will all be served before the first stream_entries are served (if any). | 
 |   repeated metropolis.proto.common.LogEntry backlog_entries = 1; | 
 |   // Entries streamed as they arrive. Currently no server-side buffering is | 
 |   // enabled, instead every line is served as early as it arrives. However, this | 
 |   // might change in the future, so this behaviour cannot be depended upon. | 
 |   repeated metropolis.proto.common.LogEntry stream_entries = 2; | 
 | } | 
 |  | 
 | // ActivationMode specifies how an updated image installed via | 
 | // NodeManagement.UpdateNode should be activated. | 
 | enum ActivationMode { | 
 |   // ACTIVATION_INVALID is the zero (default) value of this enum. | 
 |   ACTIVATION_INVALID = 0; | 
 |   // The new bundle is not activated immediately. It gets activated on the next | 
 |   // reboot/reset. | 
 |   ACTIVATION_NONE = 1; | 
 |   // The node is rebooted immediately to activate the new image. | 
 |   ACTIVATION_REBOOT = 2; | 
 |   // The node uses kexec to activate the new image immediately without fully | 
 |   // rebooting. | 
 |   ACTIVATION_KEXEC = 3; | 
 | } | 
 |  | 
 | // UpdateNodeRequest is the request message of NodeManagement.UpdateNode. | 
 | message UpdateNodeRequest { | 
 |   // An HTTPS URL to a Metropolis bundle containing the new OS to install. | 
 |   string bundle_url = 1; | 
 |  | 
 |   // Field number 2 must not be reused. | 
 |   // NOTE(review): presumably held by a since-removed field - confirm. | 
 |   reserved 2; | 
 |  | 
 |   // Specifies how the updated image should be activated. | 
 |   ActivationMode activation_mode = 3; | 
 | } | 
 |  | 
 | // UpdateNodeResponse is the response message of NodeManagement.UpdateNode. It | 
 | // currently carries no fields. | 
 | message UpdateNodeResponse {} | 