// Management and node management APIs of a Metropolis cluster.
syntax = "proto3";
package metropolis.proto.api;
option go_package = "source.monogon.dev/metropolis/proto/api";

import "google/protobuf/duration.proto";
import "google/protobuf/field_mask.proto";

import "osbase/logtree/proto/logtree.proto";
import "metropolis/proto/common/common.proto";
import "metropolis/proto/ext/authorization.proto";
| 11 | |
// Management service available to Cluster Managers, allowing operational work
// to be performed on the cluster (eg. adding nodes, retrieving information
// about a running cluster, etc.).
//
// Every RPC declares the permission it needs through the
// metropolis.proto.ext.authorization method option.
service Management {
  // GetRegisterTicket retrieves the current RegisterTicket which is required
  // for new nodes to register into the cluster. Presenting this ticket on
  // registration does not automatically grant access to arbitrary node
  // registration. Instead, it is used to guard the API surface of the
  // Register RPC from potential denial of service attacks, and can be
  // regenerated at any time in case it leaks.
  rpc GetRegisterTicket(GetRegisterTicketRequest) returns (GetRegisterTicketResponse) {
    option (metropolis.proto.ext.authorization) = {
      need: PERMISSION_GET_REGISTER_TICKET
    };
  }

  // GetClusterInfo retrieves publicly available summary information about
  // this cluster, notably data required for nodes to register into a cluster
  // or join it (other than the Register Ticket, which is gated by an
  // additional permission).
  rpc GetClusterInfo(GetClusterInfoRequest) returns (GetClusterInfoResponse) {
    option (metropolis.proto.ext.authorization) = {
      need: PERMISSION_READ_CLUSTER_STATUS
    };
  }

  // GetNodes retrieves information about nodes in the cluster, streaming one
  // Node message per node. The set of returned nodes can be limited with the
  // CEL filter in GetNodesRequest; if the filter is empty, all nodes are
  // returned.
  rpc GetNodes(GetNodesRequest) returns (stream Node) {
    option (metropolis.proto.ext.authorization) = {
      need: PERMISSION_READ_CLUSTER_STATUS
    };
  }

  // ApproveNode progresses a node's registration process by changing its state
  // in the cluster from NEW to STANDBY, if not yet STANDBY. This is required
  // for the node to fully become part of the cluster (ie. have an UP state),
  // and is required to be called by a manager manually.
  //
  // Managers can find out what nodes require approval by performing
  // a GetNodes call and filtering for nodes in the NEW state. This call is
  // idempotent and can be executed multiple times, and is a no-op if the node
  // is already in the STANDBY or even UP states.
  //
  // In the future, approval process will be governed by cluster policy, but
  // currently any node can be approved by a manager, and the manager is
  // responsible for performing an out-of-band attestation of the node being
  // approved (eg. by verifying that the node that is being approved has the
  // same public key as what the registering node displays in its startup
  // logs).
  rpc ApproveNode(ApproveNodeRequest) returns (ApproveNodeResponse) {
    option (metropolis.proto.ext.authorization) = {
      need: PERMISSION_APPROVE_NODE
    };
  }

  // UpdateNodeRoles updates a single node's roles.
  rpc UpdateNodeRoles(UpdateNodeRolesRequest) returns (UpdateNodeRolesResponse) {
    option (metropolis.proto.ext.authorization) = {
      need: PERMISSION_UPDATE_NODE_ROLES
    };
  }

  // DecommissionNode requests the decommissioning of a node. Decommissioning
  // takes a node from UP through the following states:
  //
  // 1. DECOMMISSION_REQUESTED
  //    The node will detect this state on the cluster and begin a cleanup
  //    process which consists of removing either key material or zeroing
  //    out the data partition, depending on cluster policy. It will report
  //    to the cluster that it has begun the process, which will take it to
  //    the next state:
  //
  // 2. DECOMMISSIONING
  //    The node will continue cleanup. After cleanup is successful, it will
  //    report back to the cluster which will take it to DECOMMISSIONED. The
  //    node then reboots, and never comes back.
  //
  // 3. DECOMMISSIONED
  //    The node can be removed with a subsequent DeleteNode call.
  //
  // TODO(q3k): implement this, possibly iron out the state machine involved.
  //
  // The node cannot have any roles assigned to it when it is being
  // decommissioned: none may be assigned when the decommissioning process is
  // requested, and none may be added to it while it is decommissioning.
  rpc DecommissionNode(DecommissionNodeRequest) returns (DecommissionNodeResponse) {
    option (metropolis.proto.ext.authorization) = {
      need: PERMISSION_DECOMMISSION_NODE
    };
  }

  // DeleteNode deletes a node from the cluster. By default the node must be in
  // the DECOMMISSIONED state and may not have any roles assigned. However,
  // some safety bypasses are available for nodes which have become unavailable
  // and thus cannot be decommissioned correctly - see the request
  // documentation for more details.
  rpc DeleteNode(DeleteNodeRequest) returns (DeleteNodeResponse) {
    option (metropolis.proto.ext.authorization) = {
      need: PERMISSION_DELETE_NODE
    };
  }

  // UpdateNodeLabels adds, updates or removes labels from a given node. The
  // given node must exist, but can be in any state.
  rpc UpdateNodeLabels(UpdateNodeLabelsRequest) returns (UpdateNodeLabelsResponse) {
    option (metropolis.proto.ext.authorization) = {
      need: PERMISSION_UPDATE_NODE_LABELS
    };
  }

  // ConfigureCluster changes the cluster configuration. Only the fields
  // selected by the request's update_mask are changed, and the resulting
  // configuration is returned. See ConfigureClusterRequest for details,
  // including how to guard against concurrent modification.
  rpc ConfigureCluster(ConfigureClusterRequest) returns (ConfigureClusterResponse) {
    option (metropolis.proto.ext.authorization) = {
      need: PERMISSION_CONFIGURE_CLUSTER
    };
  }
}
| 128 | |
// Request for Management.GetRegisterTicket. Currently carries no fields.
message GetRegisterTicketRequest {
}
| 131 | |
// Response of Management.GetRegisterTicket.
message GetRegisterTicketResponse {
  // Opaque bytes that comprise the RegisterTicket.
  bytes ticket = 1;
}
| Serge Bazanski | bc671d0 | 2021-10-05 17:53:32 +0200 | [diff] [blame] | 136 | |
// Request for Management.GetClusterInfo. Currently carries no fields.
message GetClusterInfoRequest {
}
| 139 | |
// Response of Management.GetClusterInfo.
message GetClusterInfoResponse {
  // cluster_directory contains information about individual nodes in the
  // cluster that can be used to dial the cluster's services.
  metropolis.proto.common.ClusterDirectory cluster_directory = 1;

  // ca_certificate is the x509 DER encoded CA certificate of the cluster.
  bytes ca_certificate = 2;

  // cluster_configuration is the configuration of this cluster.
  metropolis.proto.common.ClusterConfiguration cluster_configuration = 3;
}
| Serge Bazanski | 5611447 | 2021-10-11 14:47:54 +0200 | [diff] [blame] | 150 | |
// Request for Management.GetNodes.
message GetNodesRequest {
  // filter is a CEL expression used to limit the count of GetNodes results.
  // Each processed node protobuf message is exposed to the filter as
  // "node" variable, while related state and health enum constants are
  // anchored in the root namespace, eg. NODE_STATE_UP, or HEARTBEAT_TIMEOUT.
  // A node is returned each time the expression is evaluated as true. If
  // empty, all nodes are returned.
  string filter = 1;
}
| 160 | |
// Node in a Metropolis cluster, streamed by Management.GetNodes. For each node
// in the cluster, this message will be emitted and will contain information
// about that node.
//
// The fields contained are node fields that PERMISSION_READ_CLUSTER_STATUS
// allows access to, ie. 'non-private' fields, ones that might be internal to
// the cluster and possibly considered sensitive information about the
// infrastructure, but whose knowledge does not allow to escalate privileges
// within the cluster.
message Node {
  // Raw Ed25519 public key of this node, which can be used to generate
  // the node's ID. This is always set.
  bytes pubkey = 1;
  // Node ID calculated from pubkey, ie. 'metropolis-123456'.
  string id = 7;
  // State of the node from the point of view of the cluster. This is
  // always set.
  metropolis.proto.common.NodeState state = 2;
  // Last reported status by the Node, absent if a node hasn't yet reported
  // its status.
  metropolis.proto.common.NodeStatus status = 3;
  // Roles assigned by the cluster. This is always set.
  metropolis.proto.common.NodeRoles roles = 4;

  // Health describes node's health as seen from the cluster perspective.
  enum Health {
    // INVALID is the zero (default) value of this enum.
    INVALID = 0;
    // UNKNOWN is used whenever there were no heartbeats received from a
    // given node AND too little time has passed since last Curator leader
    // election to know whether the node is actually timing out. UNKNOWN
    // is also returned for nodes whose NodeState does not equal
    // NODE_STATE_UP.
    UNKNOWN = 1;
    // HEALTHY describes nodes that have sent a heartbeat recently.
    HEALTHY = 2;
    // HEARTBEAT_TIMEOUT describes nodes that have not sent a heartbeat in
    // the interval specified by curator.HeartbeatTimeout.
    HEARTBEAT_TIMEOUT = 3;
  }
  // health is this node's health as seen from the cluster perspective.
  Health health = 5;
  // time_since_heartbeat is the duration since the last of the node's
  // heartbeats was received. It is only valid with the health status of
  // either HEALTHY or HEARTBEAT_TIMEOUT.
  google.protobuf.Duration time_since_heartbeat = 6;

  // tpm_usage describes whether this node has a TPM 2.0 and whether it is
  // being actively used as part of its membership in the Metropolis cluster.
  //
  // Currently, the TPM 2.0 is only used to seal the local part of the disk
  // encryption key and the early join credentials of the node. Depending on
  // future cluster configuration settings, this might also indicate that the
  // node has actually passed high assurance hardware attestation against the
  // cluster.
  metropolis.proto.common.NodeTPMUsage tpm_usage = 8;

  // Labels attached to the node.
  metropolis.proto.common.NodeLabels labels = 9;
}
| Serge Bazanski | 1612d4b | 2021-11-12 13:54:15 +0100 | [diff] [blame] | 219 | |
// Request for Management.ApproveNode.
message ApproveNodeRequest {
  // Raw public key of the node being approved, has to correspond to a node
  // currently in the cluster.
  bytes pubkey = 1;
}
| 225 | |
// Response of Management.ApproveNode. Currently carries no fields.
message ApproveNodeResponse {
}
| Mateusz Zalega | bb2edbe | 2022-06-08 11:57:09 +0200 | [diff] [blame] | 228 | |
// UpdateNodeRolesRequest updates roles of a single node, identified either by
// its public key or by its ID. All role fields are optional, and no change
// will result if they're either unset or if their value matches existing
// state.
message UpdateNodeRolesRequest {
  // node uniquely identifies the node subject to this request.
  oneof node {
    // pubkey is the Ed25519 public key of this node, which can be used to
    // generate the node's ID.
    bytes pubkey = 1;
    // id is the human-readable identifier of the node, based on its public
    // key.
    string id = 4;
  }

  // kubernetesWorker adjusts the appropriate role when set.
  optional bool kubernetesWorker = 2;
  // kubernetesController adjusts the appropriate role when set. Nodes
  // performing this role must also be consensus members.
  optional bool kubernetesController = 5;
  // consensusMember adjusts the appropriate role when set.
  optional bool consensusMember = 3;
}
| 250 | |
// Response of Management.UpdateNodeRoles. Currently carries no fields.
message UpdateNodeRolesResponse {
}
| Serge Bazanski | b40c008 | 2023-03-29 14:28:04 +0200 | [diff] [blame] | 253 | |
// Request for Management.DecommissionNode. The node is identified either by
// its public key or by its ID.
message DecommissionNodeRequest {
  // node uniquely identifies the node subject to this request.
  oneof node {
    // pubkey is the Ed25519 public key of this node, which can be used to
    // generate the node's ID.
    bytes pubkey = 1;
    // id is the human-readable identifier of the node, based on its public
    // key.
    string id = 4;
  }
}
| 265 | |
// Response of Management.DecommissionNode. Currently carries no fields.
message DecommissionNodeResponse {
}
| 268 | |
// Request for Management.DeleteNode. The node is identified either by its
// public key or by its ID. The safety_bypass_* fields relax the default
// preconditions of the deletion.
message DeleteNodeRequest {
  // node uniquely identifies the node subject to this request.
  oneof node {
    // pubkey is the Ed25519 public key of this node, which can be used to
    // generate the node's ID.
    bytes pubkey = 1;
    // id is the human-readable identifier of the node, based on its public
    // key.
    string id = 2;
  }

  // SafetyBypassHasRoles is an empty marker message: setting it in
  // safety_bypass_has_roles is what enables the bypass.
  message SafetyBypassHasRoles {
  }
  // If set, safety_bypass_has_roles allows the removal of nodes which still
  // have roles assigned.
  //
  // Danger: removing nodes which still have roles assigned might leave the
  // cluster in an inconsistent state. Unassigning roles from a node via
  // UpdateNodeRoles ensures consistency.
  //
  // It's also advised to never use this option in automated workflows, as
  // avoiding it prevents a runaway automation from removing nodes that are
  // still used for actual work.
  //
  // Nodes which broke down or otherwise became unreachable shouldn't need to
  // enable this option, as unassigning the role from a node does not require
  // it to be healthy.
  //
  // A short summary of how to deal with possible inconsistencies after
  // removing a node with roles still assigned:
  //
  // 1. KubernetesWorker: remove the node from the Kubernetes cluster via
  //    kubectl (kubectl delete node metropolis-xxx).
  // 2. KubernetesController: no cleanup should be necessary.
  // 3. ConsensusMember:
  //    a. the cluster still has quorum: remove the node from etcd.
  //       TODO(q3k): document this
  //    b. the cluster has no quorum: rebuild the cluster
  SafetyBypassHasRoles safety_bypass_has_roles = 3;

  // SafetyBypassNotDecommissioned is an empty marker message: setting it in
  // safety_bypass_not_decommissioned is what enables the bypass.
  message SafetyBypassNotDecommissioned {
  }
  // If set, safety_bypass_not_decommissioned will allow to remove nodes that
  // haven't been yet decommissioned.
  //
  // Danger: removing nodes which haven't been decommissioned via
  // DecommissionNode can leave nodes attempting to reconnect to the cluster,
  // and does not fully clean up cryptographic material from the node.
  //
  // This option will need to be used when a node has broken down, as it's
  // impossible to move a node from UP to DECOMMISSIONED if that node is
  // unreachable.
  //
  // To clean up after using this option:
  //
  // 1. Make sure that the node does not boot back up. The cluster will prevent
  //    the node from rejoining the cluster, but the node will by itself
  //    continue to crash and reboot due to a rejection by the cluster.
  // 2. Zero out the node's ESP to remove any leftover cryptographic secrets.
  //    These secrets are safeguarded according to the cluster's
  //    StorageSecurityPolicy and NodeTPMUsage. Depending on the settings,
  //    cleaning up these secrets before letting other systems access the node
  //    might be critical to maintaining cluster security.
  SafetyBypassNotDecommissioned safety_bypass_not_decommissioned = 4;
}
| 334 | |
// Response of Management.DeleteNode. Currently carries no fields.
message DeleteNodeResponse {
}
| 337 | |
// Request for NodeManagement.Reboot. Selects how the node is rebooted or
// powered off, and what it should boot into next.
message RebootRequest {
  // Type is the mechanism used to reboot or power off the node.
  enum Type {
    // TYPE_INVALID is the zero (default) value of this enum.
    TYPE_INVALID = 0;
    // FIRMWARE performs a firmware-assisted (EFI, PSCI, ...) reboot and
    // signals the firmware to perform a thorough reset if possible. This
    // maximizes chances to clear hardware-related issues. The exact
    // implementation is up to firmware.
    FIRMWARE = 1;
    // KEXEC performs a KEXEC reboot without going through firmware at all.
    // This is the fastest reboot option, but does not fully reset most
    // hardware and has compatibility issues on certain hardware.
    KEXEC = 2;
    // POWER_OFF fully powers off the system. It can only be started again by
    // a physical power button, Wake On LAN if supported by the NIC or
    // an out-of-band management controller if available.
    POWER_OFF = 3;
  }
  // type selects the reboot or power-off mechanism to use.
  Type type = 1;

  // NextBoot is the boot target for the boot following this request.
  enum NextBoot {
    // START_NORMAL starts the system normally, respecting standard A/B slot
    // booting rules. Any staged but not activated updates will be activated
    // as with a normal reboot.
    START_NORMAL = 0;
    // START_ROLLBACK tries to boot into the currently inactive slot on reboot.
    START_ROLLBACK = 1;
    // START_FIRMWARE_UI tries to boot into the EFI firmware UI. Cannot be used
    // together with KEXEC as firmware is not involved there.
    START_FIRMWARE_UI = 2;
  }
  // next_boot can be used to select the boot slot to reboot into. This works
  // even for POWER_OFF, but there the next boot will need to be triggered
  // externally. START_FIRMWARE_UI cannot be used together with KEXEC.
  NextBoot next_boot = 2;
}
| 372 | |
// Response of NodeManagement.Reboot. Currently carries no fields.
message RebootResponse {
}
| 376 | |
// NodeManagement runs on every node of the cluster and provides management
// and troubleshooting RPCs to operators. All requests must be authenticated.
service NodeManagement {
  // Logs returns historical and/or streaming logs for a given DN with given
  // filters from the system global LogTree.
  //
  // For more information about this API, see //osbase/logtree. But, in
  // summary:
  //   - All logging is performed to a DN (distinguished name), which is a
  //     dot-delimited string like foo.bar.baz.
  //   - Log entries can be either raw (coming from unstructured logging from
  //     an external service, like a running process) or leveled (emitted by
  //     Metropolis code with a source line, timestamp, and severity).
  //   - The DNs form a tree of logging nodes - and when requesting logs, a
  //     given subtree of DNs can be requested, instead of just a given DN.
  //   - All supervised processes live at `root.<supervisor DN>`. For more
  //     example paths, see the console logs of a running Metropolis node, or
  //     request all logs (at DN "").
  rpc Logs(GetLogsRequest) returns (stream GetLogsResponse) {
    option (metropolis.proto.ext.authorization) = {
      need: PERMISSION_READ_NODE_LOGS
    };
  }

  // UpdateNode updates the node operating system to a new version.
  //
  // Metropolis uses a side-by-side (A/B) update process. This method installs
  // the OS from the given bundle into the inactive slot, activates that slot
  // and then (optionally) reboots to activate it.
  rpc UpdateNode(UpdateNodeRequest) returns (UpdateNodeResponse) {
    option (metropolis.proto.ext.authorization) = {
      need: PERMISSION_UPDATE_NODE
    };
  }

  // Reboot initiates a node reboot or power-off. It can also be used to roll
  // back to the inactive slot.
  rpc Reboot(RebootRequest) returns (RebootResponse) {
    option (metropolis.proto.ext.authorization) = {
      need: PERMISSION_NODE_POWER_MANAGEMENT
    };
  }
}
| 420 | |
// Request for NodeManagement.Logs. Selects the DN to read logs from, the
// filters to apply, and whether historic (backlog) and/or streamed entries
// are wanted.
message GetLogsRequest {
  // DN from which to request logs. All supervised runnables live at `root.`,
  // the init code lives at `init.`.
  string dn = 1;
  // Filters to apply to returned data.
  repeated metropolis.proto.common.LogFilter filters = 2;

  // BacklogMode selects how much historic data is returned.
  enum BacklogMode {
    // BACKLOG_INVALID is the zero (default) value of this enum.
    BACKLOG_INVALID = 0;
    // No historic data will be returned.
    BACKLOG_DISABLE = 1;
    // All available historic data will be returned.
    BACKLOG_ALL = 2;
    // At most backlog_count entries will be returned, if available.
    BACKLOG_COUNT = 3;
  }
  // backlog_mode selects how much historic data is returned.
  BacklogMode backlog_mode = 3;
  // backlog_count is the maximum number of historic entries returned when
  // backlog_mode is BACKLOG_COUNT.
  // NOTE(review): presumably ignored for other backlog modes - confirm
  // against the server implementation.
  int64 backlog_count = 4;

  // StreamMode selects whether new entries are streamed after the backlog.
  enum StreamMode {
    // STREAM_INVALID is the zero (default) value of this enum.
    STREAM_INVALID = 0;
    // No streaming entries, gRPC stream will be closed as soon as all backlog
    // data is served.
    STREAM_DISABLE = 1;
    // Entries will be streamed as early as available right after all backlog
    // data is served.
    STREAM_UNBUFFERED = 2;
  }
  // stream_mode selects whether new entries are streamed after the backlog.
  StreamMode stream_mode = 5;
}
| 451 | |
// Response message streamed by NodeManagement.Logs.
message GetLogsResponse {
  // Entries from the requested historical entries (selected via
  // GetLogsRequest.backlog_mode). They will all be served before the first
  // stream_entries are served (if any).
  repeated osbase.pkg.logtree.proto.LogEntry backlog_entries = 1;
  // Entries streamed as they arrive. Currently no server-side buffering is
  // enabled, instead every line is served as early as it arrives. However,
  // this might change in the future, so this behaviour cannot be depended
  // upon.
  repeated osbase.pkg.logtree.proto.LogEntry stream_entries = 2;
}
| 461 | |
// ActivationMode specifies how an updated image is activated after it has
// been installed (see UpdateNodeRequest.activation_mode).
enum ActivationMode {
  // ACTIVATION_INVALID is the zero (default) value of this enum.
  ACTIVATION_INVALID = 0;
  // The new bundle is not activated immediately. It gets activated on the next
  // reboot/reset.
  ACTIVATION_NONE = 1;
  // The node is rebooted immediately to activate the new image.
  ACTIVATION_REBOOT = 2;
  // The node uses kexec to activate the new image immediately without fully
  // rebooting.
  ACTIVATION_KEXEC = 3;
}
| 473 | |
// Request for NodeManagement.UpdateNode.
message UpdateNodeRequest {
  // An HTTPS URL to a Metropolis bundle containing the new OS to install.
  string bundle_url = 1;

  // Field number 2 is reserved and must not be reused.
  reserved 2;

  // Specifies how the updated image should be activated.
  ActivationMode activation_mode = 3;
}
| 483 | |
// Response of NodeManagement.UpdateNode. Currently carries no fields.
message UpdateNodeResponse {}
| 485 | |
// Request for Management.UpdateNodeLabels. The node is identified either by
// its public key or by its ID; labels are changed through the upsert and
// delete lists.
message UpdateNodeLabelsRequest {
  // node uniquely identifies the node subject to this request.
  oneof node {
    // pubkey is the Ed25519 public key of this node, which can be used to
    // generate the node's ID.
    bytes pubkey = 1;
    // id is the human-readable identifier of the node, based on its public
    // key.
    string id = 2;
  }

  // Pair is a single label, made up of a key and its value.
  message Pair {
    // Key of the label.
    string key = 1;
    // Value of the label.
    string value = 2;
  }
  // Labels to be added (created or updated by key).
  //
  // The given pairs must have unique, valid keys and valid values.
  repeated Pair upsert = 3;

  // Labels to be removed (by key).
  //
  // The given keys do not have to exist on the node, but cannot intersect with
  // keys given in the upsert list.
  repeated string delete = 4;
}
| 512 | |
// Response of Management.UpdateNodeLabels. Currently carries no fields.
message UpdateNodeLabelsResponse {
}
| 515 | |
// Request for Management.ConfigureCluster. Describes a partial update of the
// cluster configuration: the fields listed in update_mask are taken from
// new_config, optionally guarded by a comparison against base_config.
message ConfigureClusterRequest {
  // Base configuration to apply the change on. If set, the server will verify
  // that the fields in this message (referenced by update_mask) have the same
  // value as the current configuration. If there is a difference, an error
  // will be returned and the configuration change will be aborted.
  //
  // This field _should_ be set to prevent race conditions with other clients
  // attempting to mutate the configuration.
  common.ClusterConfiguration base_config = 1;

  // New configuration to set. Only fields referenced by update_mask will be
  // updated.
  common.ClusterConfiguration new_config = 2;

  // Fields that should be changed from the current state (and base config
  // state, if set) into the new config state.
  //
  // Currently, only the following fields can be mutated:
  //   1. kubernetes_config.node_labels_to_synchronize
  google.protobuf.FieldMask update_mask = 3;
}
| 537 | |
// Response of Management.ConfigureCluster.
message ConfigureClusterResponse {
  // Resulting config as set on the server, merged from the user's new_config.
  common.ClusterConfiguration resulting_config = 1;
}