syntax = "proto3";
package metropolis.proto.api;
option go_package = "source.monogon.dev/metropolis/proto/api";

import "google/protobuf/duration.proto";
// NOTE(review): nothing in the visible part of this file references
// google.protobuf.Timestamp - confirm whether this import is still needed.
import "google/protobuf/timestamp.proto";

import "metropolis/proto/common/common.proto";
import "metropolis/proto/ext/authorization.proto";

// Management service available to Cluster Managers, allowing operational work
// to be performed on the cluster (eg. adding nodes, retrieving information
// about a running cluster, etc.).
service Management {
  // GetRegisterTicket retrieves the current RegisterTicket which is required
  // for new nodes to register into the cluster. Presenting this ticket on
  // registration does not automatically grant access to arbitrary node
  // registration. Instead, it is used to guard the API surface of the
  // Register RPC from potential denial of service attacks, and can be
  // regenerated at any time in case it leaks.
  rpc GetRegisterTicket(GetRegisterTicketRequest)
      returns (GetRegisterTicketResponse) {
    option (metropolis.proto.ext.authorization) = {
      need: PERMISSION_GET_REGISTER_TICKET
    };
  }

  // GetClusterInfo retrieves publicly available summary information about
  // this cluster, notably data required for nodes to register into a cluster
  // or join it (other than the Register Ticket, which is gated by an
  // additional permission).
  rpc GetClusterInfo(GetClusterInfoRequest) returns (GetClusterInfoResponse) {
    option (metropolis.proto.ext.authorization) = {
      need: PERMISSION_READ_CLUSTER_STATUS
    };
  }

  // GetNodes retrieves information about nodes in the cluster. It streams
  // one Node message per node matching GetNodesRequest.filter (every node if
  // the filter is empty), each carrying all available data about that node.
  rpc GetNodes(GetNodesRequest) returns (stream Node) {
    option (metropolis.proto.ext.authorization) = {
      need: PERMISSION_READ_CLUSTER_STATUS
    };
  }

  // ApproveNode progresses a node's registration process by changing its state
  // in the cluster from NEW to STANDBY, if not yet STANDBY. This is required
  // for the node to fully become part of the cluster (ie. have an UP state),
  // and is required to be called by a manager manually.
  //
  // Managers can find out what nodes require approval by performing
  // a GetNodes call and filtering for nodes in the NEW state. This call is
  // idempotent and can be executed multiple times, and is a no-op if the node
  // is already in the STANDBY or even UP states.
  //
  // In the future, approval process will be governed by cluster policy, but
  // currently any node can be approved by a manager, and the manager is
  // responsible for performing an out-of-band attestation of the node being
  // approved (eg. by verifying that the node that is being approved has the
  // same public key as what the registering node displays in its startup
  // logs).
  rpc ApproveNode(ApproveNodeRequest) returns (ApproveNodeResponse) {
    option (metropolis.proto.ext.authorization) = {
      need: PERMISSION_APPROVE_NODE
    };
  }

  // UpdateNodeRoles updates a single node's roles.
  rpc UpdateNodeRoles(UpdateNodeRolesRequest)
      returns (UpdateNodeRolesResponse) {
    option (metropolis.proto.ext.authorization) = {
      need: PERMISSION_UPDATE_NODE_ROLES
    };
  }
}

// GetRegisterTicketRequest is the (currently empty) request of
// Management.GetRegisterTicket.
message GetRegisterTicketRequest {
}

// GetRegisterTicketResponse is the response of Management.GetRegisterTicket.
message GetRegisterTicketResponse {
  // Opaque bytes that comprise the RegisterTicket.
  bytes ticket = 1;
}

// GetClusterInfoRequest is the (currently empty) request of
// Management.GetClusterInfo.
message GetClusterInfoRequest {
}

// GetClusterInfoResponse is the response of Management.GetClusterInfo.
message GetClusterInfoResponse {
  // cluster_directory contains information about individual nodes in the
  // cluster that can be used to dial the cluster's services.
  metropolis.proto.common.ClusterDirectory cluster_directory = 1;

  // ca_certificate is the x509 DER encoded CA certificate of the cluster.
  bytes ca_certificate = 2;

  // cluster_configuration carries the cluster's ClusterConfiguration message
  // from the metropolis.proto.common package.
  metropolis.proto.common.ClusterConfiguration cluster_configuration = 3;
}

// GetNodesRequest is the request of Management.GetNodes.
message GetNodesRequest {
  // filter is a CEL expression used to limit the count of GetNodes results.
  // Each processed node protobuf message is exposed to the filter as
  // "node" variable, while related state and health enum constants are
  // anchored in the root namespace, eg. NODE_STATE_UP, or HEARTBEAT_TIMEOUT.
  // A node is returned each time the expression is evaluated as true. If
  // empty, all nodes are returned.
  string filter = 1;
}

// Node in a Metropolis cluster, streamed by Management.GetNodes. For each node
// in the cluster, this message will be emitted and will contain information
// about that node.
//
// The fields contained are node fields that PERMISSION_READ_CLUSTER_STATUS
// allows access to, ie. 'non-private' fields, ones that might be internal to
// the cluster and possibly considered sensitive information about the
// infrastructure, but whose knowledge does not allow to escalate privileges
// within the cluster.
message Node {
  // Raw Ed25519 public key of this node, which can be used to generate
  // the node's ID. This is always set.
  bytes pubkey = 1;
  // Node ID calculated from pubkey, eg. 'metropolis-123456'.
  string id = 7;
  // State of the node from the point of view of the cluster. This is
  // always set.
  metropolis.proto.common.NodeState state = 2;
  // Last reported status by the Node, absent if a node hasn't yet reported
  // its status.
  metropolis.proto.common.NodeStatus status = 3;
  // Roles assigned by the cluster. This is always set.
  metropolis.proto.common.NodeRoles roles = 4;

  // Health describes node's health as seen from the cluster perspective.
  enum Health {
    // INVALID is the zero value of this enum, ie. what an unset health field
    // decodes to.
    INVALID = 0;
    // UNKNOWN is used whenever there were no heartbeats received from a
    // given node AND too little time has passed since last Curator leader
    // election to know whether the node is actually timing out. UNKNOWN
    // is also returned for nodes whose NodeState does not equal
    // NODE_STATE_UP.
    UNKNOWN = 1;
    // HEALTHY describes nodes that have sent a heartbeat recently.
    HEALTHY = 2;
    // HEARTBEAT_TIMEOUT describes nodes that have not sent a heartbeat in
    // the interval specified by curator.HeartbeatTimeout.
    HEARTBEAT_TIMEOUT = 3;
  }
  // health is this node's health, as described by the Health enum.
  Health health = 5;
  // time_since_heartbeat is the duration since the last of the node's
  // heartbeats was received. It is only valid with the health status of
  // either HEALTHY or HEARTBEAT_TIMEOUT.
  google.protobuf.Duration time_since_heartbeat = 6;

  // tpm_usage describes whether this node has a TPM 2.0 and whether it is
  // being actively used as part of its membership in the Metropolis cluster.
  //
  // Currently, the TPM 2.0 is only used to seal the local part of the disk
  // encryption key and the early join credentials of the node. Depending on
  // future cluster configuration settings, this might also indicate that the
  // node has actually passed high assurance hardware attestation against the
  // cluster.
  metropolis.proto.common.NodeTPMUsage tpm_usage = 8;
}

// ApproveNodeRequest is the request of Management.ApproveNode.
message ApproveNodeRequest {
  // Raw public key of the node being approved, has to correspond to a node
  // currently in the cluster.
  bytes pubkey = 1;
}

// ApproveNodeResponse is the (currently empty) response of
// Management.ApproveNode.
message ApproveNodeResponse {
}

// UpdateNodeRolesRequest updates roles of a single node, identified by either
// its pubkey or its id. All role fields are optional, and no change will
// result if they're either unset or if their value matches existing state.
message UpdateNodeRolesRequest {
  // node uniquely identifies the node subject to this request.
  oneof node {
    // pubkey is the Ed25519 public key of this node, which can be used to
    // generate the node's ID.
    bytes pubkey = 1;
    // id is the human-readable identifier of the node, based on its public
    // key.
    string id = 4;
  }

  // kubernetesWorker adjusts the appropriate role when set.
  optional bool kubernetesWorker = 2;
  // kubernetesController adjusts the appropriate role when set. Nodes
  // performing this role must also be consensus members.
  optional bool kubernetesController = 5;
  // consensusMember adjusts the appropriate role when set.
  optional bool consensusMember = 3;
}

// UpdateNodeRolesResponse is the (currently empty) response of
// Management.UpdateNodeRoles.
message UpdateNodeRolesResponse {
}

// NodeManagement runs on every node of the cluster and provides management
// and troubleshooting RPCs to operators. All requests must be authenticated.
service NodeManagement {
  // Logs returns historical and/or streaming logs for a given DN with given
  // filters from the system global LogTree.
  //
  // For more information about this API, see //metropolis/pkg/logtree. But, in
  // summary:
  //   - All logging is performed to a DN (distinguished name), which is a
  //     dot-delimited string like foo.bar.baz.
  //   - Log entries can be either raw (coming from unstructured logging from
  //     an external service, like a running process) or leveled (emitted by
  //     Metropolis code with a source line, timestamp, and severity).
  //   - The DNs form a tree of logging nodes - and when requesting logs, a
  //     given subtree of DNs can be requested, instead of just a given DN.
  //   - All supervised processes live at `root.<supervisor DN>`. For more
  //     example paths, see the console logs of a running Metropolis node, or
  //     request all logs (at DN "").
  //
  rpc Logs(GetLogsRequest) returns (stream GetLogsResponse) {
    option (metropolis.proto.ext.authorization) = {
      need: PERMISSION_READ_NODE_LOGS
    };
  }
  // UpdateNode updates the node operating system to a new version.
  //
  // Metropolis uses a side-by-side (A/B) update process. This method installs
  // the OS from the given bundle into the inactive slot, activates that slot
  // and then, depending on the requested ActivationMode, reboots or kexecs
  // into it, or leaves it to be picked up on the next reboot.
  rpc UpdateNode(UpdateNodeRequest) returns (UpdateNodeResponse) {
    option (metropolis.proto.ext.authorization) = {
      need: PERMISSION_UPDATE_NODE
    };
  }
}

// GetLogsRequest is the request of NodeManagement.Logs.
message GetLogsRequest {
  // DN from which to request logs. All supervised runnables live at `root.`,
  // the init code lives at `init.`.
  string dn = 1;
  // Filters to apply to returned data.
  repeated metropolis.proto.common.LogFilter filters = 2;

  // BacklogMode selects how much historic (already logged) data is returned.
  enum BacklogMode {
    // BACKLOG_INVALID is the zero value of this enum, ie. what an unset
    // backlog_mode field decodes to.
    BACKLOG_INVALID = 0;
    // No historic data will be returned.
    BACKLOG_DISABLE = 1;
    // All available historic data will be returned.
    BACKLOG_ALL = 2;
    // At most backlog_count entries will be returned, if available.
    BACKLOG_COUNT = 3;
  }
  // backlog_mode is the BacklogMode requested for this call.
  BacklogMode backlog_mode = 3;
  // backlog_count is the maximum number of historic entries returned when
  // backlog_mode is BACKLOG_COUNT.
  int64 backlog_count = 4;

  // StreamMode selects whether entries logged after the call started are
  // streamed to the caller.
  enum StreamMode {
    // STREAM_INVALID is the zero value of this enum, ie. what an unset
    // stream_mode field decodes to.
    STREAM_INVALID = 0;
    // No streaming entries, gRPC stream will be closed as soon as all backlog
    // data is served.
    STREAM_DISABLE = 1;
    // Entries will be streamed as early as available right after all backlog
    // data is served.
    STREAM_UNBUFFERED = 2;
  }
  // stream_mode is the StreamMode requested for this call.
  StreamMode stream_mode = 5;
}

// GetLogsResponse is a single message of the stream returned by
// NodeManagement.Logs.
message GetLogsResponse {
  // Entries from the requested backlog of historical entries (see
  // GetLogsRequest.backlog_mode). They will all be served before the first
  // stream_entries are served (if any).
  repeated metropolis.proto.common.LogEntry backlog_entries = 1;
  // Entries streamed as they arrive. Currently no server-side buffering is
  // enabled, instead every line is served as early as it arrives. However, this
  // might change in the future, so this behaviour cannot be depended upon.
  repeated metropolis.proto.common.LogEntry stream_entries = 2;
}

// ActivationMode selects how a newly installed bundle gets activated, see
// UpdateNodeRequest.activation_mode.
enum ActivationMode {
  // ACTIVATION_INVALID is the zero value of this enum, ie. what an unset
  // activation_mode field decodes to.
  ACTIVATION_INVALID = 0;
  // The new bundle is not activated immediately. It gets activated on the next
  // reboot/reset.
  ACTIVATION_NONE = 1;
  // The node is rebooted immediately to activate the new image.
  ACTIVATION_REBOOT = 2;
  // The node uses kexec to activate the new image immediately without fully
  // rebooting.
  ACTIVATION_KEXEC = 3;
}

// UpdateNodeRequest is the request of NodeManagement.UpdateNode.
message UpdateNodeRequest {
  // An HTTPS URL to a Metropolis bundle containing the new OS to install.
  string bundle_url = 1;

  // Field number 2 is reserved so that it cannot be (re)used.
  // NOTE(review): presumably a removed field - consider also reserving its
  // former name.
  reserved 2;

  // Specifies how the updated image should be activated.
  ActivationMode activation_mode = 3;
}

// UpdateNodeResponse is the (currently empty) response of
// NodeManagement.UpdateNode.
message UpdateNodeResponse {}