Blame - metropolis/proto/api/management.proto - monogon

blob: a08f5cbf9538ea180ee85440fa814d88aadd90fe [file] [log] [blame]

Serge Bazanski	6bd4159	2021-08-23 13:18:37 +0200	[diff] [blame]	1	syntax = "proto3";
				2	package metropolis.proto.api;
				3	option go_package = "source.monogon.dev/metropolis/proto/api";
				4
Mateusz Zalega	944cb53	2022-06-20 16:54:17 +0200	[diff] [blame]	5	import "google/protobuf/duration.proto";
Serge Bazanski	b91938f	2023-03-29 14:31:22 +0200	[diff] [blame]	6	import "google/protobuf/timestamp.proto";
Mateusz Zalega	944cb53	2022-06-20 16:54:17 +0200	[diff] [blame]	7
Serge Bazanski	bc671d0	2021-10-05 17:53:32 +0200	[diff] [blame]	8	import "metropolis/proto/common/common.proto";
Serge Bazanski	9ffa1f9	2021-09-01 15:42:23 +0200	[diff] [blame]	9	import "metropolis/proto/ext/authorization.proto";
				10
Serge Bazanski	5611447	2021-10-11 14:47:54 +0200	[diff] [blame]	11	// Management service available to Cluster Managers, allowing operational work
				12	// to be performed on the cluster (eg. adding nodes, retrieving information
				13	// about a running cluster, etc.).
Serge Bazanski	6bd4159	2021-08-23 13:18:37 +0200	[diff] [blame]	14	service Management {
				15	// GetRegisterTicket retrieves the current RegisterTicket which is required
				16	// for new nodes to register into the cluster. Presenting this ticket on
				17	// registration does not automatically grant access to arbitrary node
				18	// registration. Instead, it is used to guard the API surface of the
				19	// Register RPC from potential denial of service attacks, and can be
				20	// regenerated at any time in case it leaks.
Serge Bazanski	9ffa1f9	2021-09-01 15:42:23 +0200	[diff] [blame]	21	rpc GetRegisterTicket(GetRegisterTicketRequest) returns (GetRegisterTicketResponse) {
				22	option (metropolis.proto.ext.authorization) = {
				23	need: PERMISSION_GET_REGISTER_TICKET
				24	};
				25	}
Serge Bazanski	5611447	2021-10-11 14:47:54 +0200	[diff] [blame]	26
Serge Bazanski	bc671d0	2021-10-05 17:53:32 +0200	[diff] [blame]	27	// GetClusterInfo retrieves publicly available summary information about
				28	// this cluster, notably data required for nodes to register into a cluster
				29	// or join it (other than the Register Ticket, which is gated by an
				30	// additional permission).
				31	rpc GetClusterInfo(GetClusterInfoRequest) returns (GetClusterInfoResponse) {
				32	option (metropolis.proto.ext.authorization) = {
				33	need: PERMISSION_READ_CLUSTER_STATUS
				34	};
				35	}
Serge Bazanski	5611447	2021-10-11 14:47:54 +0200	[diff] [blame]	36
				37	// GetNodes retrieves information about nodes in the cluster. Currently, it
				38	// returns all available data about each node matching GetNodesRequest.filter.
				39	rpc GetNodes(GetNodesRequest) returns (stream Node) {
				40	option (metropolis.proto.ext.authorization) = {
				41	need: PERMISSION_READ_CLUSTER_STATUS
				42	};
				43	}
Serge Bazanski	1612d4b	2021-11-12 13:54:15 +0100	[diff] [blame]	44
				45	// ApproveNode progresses a node's registration process by changing its state
				46	// in the cluster from NEW to STANDBY, if not yet STANDBY. This is required
				47	// for the node to fully become part of the cluster (ie. have an UP state),
				48	// and is required to be called by a manager manually.
				49	//
				50	// Managers can find out what nodes require approval by performing
				51	// a GetNodes call and filtering for nodes in the NEW state. This call is
				52	// idempotent and can be executed multiple times, and is a no-op if the node
				53	// is already in the STANDBY or even UP states.
				54	//
				55	// In the future, approval process will be governed by cluster policy, but
				56	// currently any node can be approved by a manager, and the manager is
				57	// responsible for performing an out-of-band attestation of the node being
				58	// approved (eg. by verifying that the node that is being approved has the
				59	// same public key as what the registering node displays in its startup
				60	// logs).
				61	rpc ApproveNode(ApproveNodeRequest) returns (ApproveNodeResponse) {
				62	option (metropolis.proto.ext.authorization) = {
				63	need: PERMISSION_APPROVE_NODE
				64	};
				65	}
Mateusz Zalega	bb2edbe	2022-06-08 11:57:09 +0200	[diff] [blame]	66
				67	// UpdateNodeRoles updates a single node's roles.
				68	rpc UpdateNodeRoles(UpdateNodeRolesRequest) returns (UpdateNodeRolesResponse) {
				69	option (metropolis.proto.ext.authorization) = {
				70	need: PERMISSION_UPDATE_NODE_ROLES
				71	};
				72	}
Serge Bazanski	8456ddf	2023-10-30 18:56:59 +0100	[diff] [blame]	73
				74	// Decommissioning a node takes it from UP, through
				75	//
				76	// 1. DECOMMISSION_REQUESTED
				77	// The node will detect this state on the cluster and begin a cleanup
				78	// process which consists of removing either key material or zeroing
				79	// out the data partition, depending on cluster policy. It will report
				80	// to the cluster that it has begun the process, which will take it to
				81	// the next state:
				82	//
				83	// 2. DECOMMISSIONING
				84	// The node will continue cleanup. After cleanup is successful, it will
				85	// report back to the cluster which will take it to DECOMMISSIONED. The
				86	// node then reboots, and never comes back.
				87	//
				88	// 3. DECOMMISSIONED
				89	// The node can be removed with a subsequent DeleteNode call.
				90	//
				91	// TODO(q3k): implement this, possibly iron out the state machine involved.
				92	//
				93	// The node cannot have any roles assigned to it when it is being
				94	// decommissioned: none may be assigned when the decommissioning process is
				95	// requested, and none may be added to it while it is decommissioning.
				96	rpc DecommissionNode(DecommissionNodeRequest) returns (DecommissionNodeResponse) {
				97	option (metropolis.proto.ext.authorization) = {
				98	need: PERMISSION_DECOMMISSION_NODE
				99	};
				100	}
				101
				102	// DeleteNode removes a node from the cluster. By default it must be in the
				103	// DECOMMISSIONED state and may not have any roles assigned. However, some
				104	// safety bypasses are available for nodes which have become unavailable and
				105	// thus cannot be decommissioned correctly - see the request documentation
				106	// for more details.
				107	rpc DeleteNode(DeleteNodeRequest) returns (DeleteNodeResponse) {
				108	option (metropolis.proto.ext.authorization) = {
				109	need: PERMISSION_DELETE_NODE
				110	};
				111	}
Serge Bazanski	6bd4159	2021-08-23 13:18:37 +0200	[diff] [blame]	112	}
				113
					// GetRegisterTicketRequest is the request message of
					// Management.GetRegisterTicket. It currently carries no fields.
				114	message GetRegisterTicketRequest {
				115	}
				116
					// GetRegisterTicketResponse is the response message of
					// Management.GetRegisterTicket.
				117	message GetRegisterTicketResponse {
				118	// Opaque bytes that comprise the RegisterTicket.
				119	bytes ticket = 1;
Serge Bazanski	2893e98	2021-09-09 13:06:16 +0200	[diff] [blame]	120	}
Serge Bazanski	bc671d0	2021-10-05 17:53:32 +0200	[diff] [blame]	121
					// GetClusterInfoRequest is the request message of
					// Management.GetClusterInfo. It currently carries no fields.
				122	message GetClusterInfoRequest {
				123	}
				124
					// GetClusterInfoResponse is the response message of
					// Management.GetClusterInfo.
				125	message GetClusterInfoResponse {
				126	// cluster_directory contains information about individual nodes in the
				127	// cluster that can be used to dial the cluster's services.
				128	metropolis.proto.common.ClusterDirectory cluster_directory = 1;
Serge Bazanski	2f58ac0	2021-10-05 11:47:20 +0200	[diff] [blame]	129
Serge Bazanski	fbd38e2	2021-10-08 14:41:16 +0200	[diff] [blame]	130	// ca_certificate is the x509 DER encoded CA certificate of the cluster.
				131	bytes ca_certificate = 2;
Serge Bazanski	5df62ba	2023-03-22 17:56:46 +0100	[diff] [blame]	132
					// cluster_configuration is the configuration of this cluster; see
					// metropolis.proto.common.ClusterConfiguration for its contents.
				133	metropolis.proto.common.ClusterConfiguration cluster_configuration = 3;
Serge Bazanski	bc671d0	2021-10-05 17:53:32 +0200	[diff] [blame]	134	}
Serge Bazanski	5611447	2021-10-11 14:47:54 +0200	[diff] [blame]	135
					// GetNodesRequest is the request message of Management.GetNodes.
				136	message GetNodesRequest {
Mateusz Zalega	955e46e	2022-05-27 18:00:50 +0200	[diff] [blame]	137	// filter is a CEL expression used to limit the count of GetNodes results.
				138	// Each processed node protobuf message is exposed to the filter as the
				139	// "node" variable, while related state and health enum constants are
				140	// anchored in the root namespace, eg. NODE_STATE_UP, or HEARTBEAT_TIMEOUT.
				141	// A node is returned each time the expression is evaluated as true. If
				142	// empty, all nodes are returned.
				143	string filter = 1;
Serge Bazanski	5611447	2021-10-11 14:47:54 +0200	[diff] [blame]	144	}
				145
				146	// Node in a Metropolis cluster, streamed by Management.GetNodes. For each node
				147	// in the cluster, this message will be emitted and will contain information
				148	// about that node.
				149	//
				150	// The fields contained are node fields that PERMISSION_READ_CLUSTER_STATUS
				151	// allows access to, ie. 'non-private' fields, ones that might be internal to
				152	// the cluster and possibly considered sensitive information about the
				153	// infrastructure, but whose knowledge does not allow escalating privileges
				154	// within the cluster.
				155	message Node {
				156	// Raw Ed25519 public key of this node, which can be used to generate
				157	// the node's ID. This is always set.
				158	bytes pubkey = 1;
Serge Bazanski	30fd154	2023-03-29 14:19:02 +0200	[diff] [blame]	159	// Node ID calculated from pubkey, ie. 'metropolis-123456'.
				160	string id = 7;
Serge Bazanski	5611447	2021-10-11 14:47:54 +0200	[diff] [blame]	161	// State of the node from the point of view of the cluster. This is
				162	// always set.
				163	metropolis.proto.common.NodeState state = 2;
				164	// Last reported status by the Node, absent if a node hasn't yet reported
				165	// its status.
				166	metropolis.proto.common.NodeStatus status = 3;
				167	// Roles assigned by the cluster. This is always set.
				168	metropolis.proto.common.NodeRoles roles = 4;
Serge Bazanski	1612d4b	2021-11-12 13:54:15 +0100	[diff] [blame]	169
Mateusz Zalega	32b1929	2022-05-17 13:26:55 +0200	[diff] [blame]	170	// Health describes node's health as seen from the cluster perspective.
				171	enum Health {
				172	INVALID = 0;
				173	// UNKNOWN is used whenever there were no heartbeats received from a
				174	// given node AND too little time has passed since last Curator leader
				175	// election to know whether the node is actually timing out. UNKNOWN
				176	// is also returned for nodes whose NodeState does not equal
				177	// NODE_STATE_UP.
				178	UNKNOWN = 1;
				179	// HEALTHY describes nodes that have sent a heartbeat recently.
				180	HEALTHY = 2;
				181	// HEARTBEAT_TIMEOUT describes nodes that have not sent a heartbeat in
				182	// the interval specified by curator.HeartbeatTimeout.
				183	HEARTBEAT_TIMEOUT = 3;
				184	}
				185	Health health = 5;
Mateusz Zalega	2175ec9	2022-06-13 09:29:09 +0200	[diff] [blame]	186	// time_since_heartbeat is the duration since the last of the node's
				187	// heartbeats was received. It is only valid with the health status of
				188	// either HEALTHY or HEARTBEAT_TIMEOUT.
Mateusz Zalega	944cb53	2022-06-20 16:54:17 +0200	[diff] [blame]	189	google.protobuf.Duration time_since_heartbeat = 6;
Serge Bazanski	e4a4ce1	2023-03-22 18:29:54 +0100	[diff] [blame]	190
				191	// tpm_usage describes whether this node has a TPM 2.0 and whether it is
				192	// being actively used as part of its membership in the Metropolis cluster.
				193	//
				194	// Currently, the TPM 2.0 is only used to seal the local part of the disk
				195	// encryption key and the early join credentials of the node. Depending on
				196	// future cluster configuration settings, this might also indicate that the
				197	// node has actually passed high assurance hardware attestation against the
				198	// cluster.
				199	metropolis.proto.common.NodeTPMUsage tpm_usage = 8;
Mateusz Zalega	32b1929	2022-05-17 13:26:55 +0200	[diff] [blame]	200	}
Serge Bazanski	1612d4b	2021-11-12 13:54:15 +0100	[diff] [blame]	201
					// ApproveNodeRequest is the request message of Management.ApproveNode.
				202	message ApproveNodeRequest {
				203	// Raw public key of the node being approved, has to correspond to a node
				204	// currently in the cluster.
				205	bytes pubkey = 1;
				206	}
				207
					// ApproveNodeResponse is the response message of Management.ApproveNode.
					// It currently carries no fields.
				208	message ApproveNodeResponse {
Mateusz Zalega	32b1929	2022-05-17 13:26:55 +0200	[diff] [blame]	209	}
Mateusz Zalega	bb2edbe	2022-06-08 11:57:09 +0200	[diff] [blame]	210
				211	// UpdateNodeRolesRequest updates roles of a single node identified by pubkey
				212	// or id. All role fields are optional, and no change will result if they're
				213	// either unset or if their value matches existing state.
				214	message UpdateNodeRolesRequest {
Mateusz Zalega	9c315f1	2022-08-11 16:31:22 +0200	[diff] [blame]	215	// node uniquely identifies the node subject to this request.
				216	oneof node {
				217	// pubkey is the Ed25519 public key of this node, which can be used to
				218	// generate the node's ID.
				219	bytes pubkey = 1;
				220	// id is the human-readable identifier of the node, based on its public
				221	// key.
				222	string id = 4;
				223	}
Mateusz Zalega	bb2edbe	2022-06-08 11:57:09 +0200	[diff] [blame]	224
Serge Bazanski	15f7f63	2023-03-14 17:17:20 +0100	[diff] [blame]	225	// kubernetesWorker adjusts the appropriate role when set.
Mateusz Zalega	bb2edbe	2022-06-08 11:57:09 +0200	[diff] [blame]	226	optional bool kubernetesWorker = 2;
Serge Bazanski	15f7f63	2023-03-14 17:17:20 +0100	[diff] [blame]	227	// kubernetesController adjusts the appropriate role when set. Nodes performing
				228	// this role must also be consensus members.
				229	optional bool kubernetesController = 5;
					// consensusMember adjusts the appropriate role when set.
Mateusz Zalega	bb2edbe	2022-06-08 11:57:09 +0200	[diff] [blame]	230	optional bool consensusMember = 3;
				231	}
				232
					// UpdateNodeRolesResponse is the response message of
					// Management.UpdateNodeRoles. It currently carries no fields.
				233	message UpdateNodeRolesResponse {
				234	}
Serge Bazanski	b40c008	2023-03-29 14:28:04 +0200	[diff] [blame]	235
					// DecommissionNodeRequest is the request message of
					// Management.DecommissionNode.
Serge Bazanski	8456ddf	2023-10-30 18:56:59 +0100	[diff] [blame]	236	message DecommissionNodeRequest {
				237	// node uniquely identifies the node subject to this request.
				238	oneof node {
				239	// pubkey is the Ed25519 public key of this node, which can be used to
				240	// generate the node's ID.
				241	bytes pubkey = 1;
				242	// id is the human-readable identifier of the node, based on its public
				243	// key.
				244	string id = 4;
				245	}
				246	}
				247
					// DecommissionNodeResponse is the response message of
					// Management.DecommissionNode. It currently carries no fields.
				248	message DecommissionNodeResponse {
				249	}
				250
					// DeleteNodeRequest is the request message of Management.DeleteNode.
				251	message DeleteNodeRequest {
				252	// node uniquely identifies the node subject to this request.
				253	oneof node {
				254	// pubkey is the Ed25519 public key of this node, which can be used to
				255	// generate the node's ID.
				256	bytes pubkey = 1;
				257	// id is the human-readable identifier of the node, based on its public
				258	// key.
				259	string id = 2;
				260	}
				261
				262	message SafetyBypassHasRoles {
				263	}
				264	// If set, safety_bypass_has_roles allows the removal of nodes which still have
				265	// roles assigned.
				266	//
				267	// Danger: removing nodes which still have roles assigned might leave the
				268	// cluster in an inconsistent state. Unassigning roles from a node via
				269	// UpdateNodeRoles ensures consistency.
				270	//
				271	// It's also advised to never use this option in automated workflows, so that
				272	// a runaway automation cannot remove nodes that are still used for actual
				273	// work.
				274	//
				275	// Nodes which broke down or otherwise became unreachable shouldn't need to
				276	// enable this option, as unassigning the role from a node does not require it
				277	// to be healthy.
				278	//
				279	// A short summary of how to deal with possible inconsistencies after removing
				280	// a node with roles still assigned:
				281	//
				282	// 1. KubernetesWorker: remove the node from the Kubernetes cluster via kubectl
				283	// (kubectl delete node metropolis-xxx).
				284	// 2. KubernetesController: no cleanup should be necessary.
				285	// 3. ConsensusMember:
				286	// a. the cluster still has quorum: remove the node from etcd.
				287	// TODO(q3k): document this
				288	// b. the cluster has no quorum: rebuild the cluster
				289	SafetyBypassHasRoles safety_bypass_has_roles = 3;
				290
				291	message SafetyBypassNotDecommissioned {
				292	}
				293	// If set, safety_bypass_not_decommissioned allows the removal of nodes that
				294	// haven't yet been decommissioned.
				295	//
				296	// Danger: removing nodes which haven't been decommissioned via
				297	// DecommissionNode can leave nodes attempting to reconnect to the cluster,
				298	// and does not fully clean up cryptographic material from the node.
				299	//
				300	// This option will need to be used when a node has broken down, as it's
				301	// impossible to move a node from UP to DECOMMISSIONED if that node is
				302	// unreachable.
				303	//
				304	// To clean up after using this option:
				305	//
				306	// 1. Make sure that the node does not boot back up. The cluster will prevent
				307	// the node from rejoining the cluster, but the node will by itself
				308	// continue to crash and reboot due to a rejection by the cluster.
				309	// 2. Zero out the node's ESP to remove any leftover cryptographic secrets.
				310	// These secrets are safeguarded according to the cluster's
				311	// StorageSecurityPolicy and NodeTPMUsage. Depending on the settings,
				312	// cleaning up these secrets before letting other systems access the node
				313	// might be critical to maintaining cluster security.
				314	SafetyBypassNotDecommissioned safety_bypass_not_decommissioned = 4;
				315	}
				316
					// DeleteNodeResponse is the response message of Management.DeleteNode.
					// It currently carries no fields.
				317	message DeleteNodeResponse {
				318	}
				319
Serge Bazanski	b40c008	2023-03-29 14:28:04 +0200	[diff] [blame]	320	// NodeManagement runs on every node of the cluster and provides management
				321	// and troubleshooting RPCs to operators. All requests must be authenticated.
				322	service NodeManagement {
Serge Bazanski	da11486	2023-03-29 17:46:42 +0200	[diff] [blame]	323	// Logs returns historical and/or streaming logs for a given DN with given
				324	// filters from the system global LogTree.
				325	//
				326	// For more information about this API, see //metropolis/pkg/logtree. But, in
				327	// summary:
				328	// - All logging is performed to a DN (distinguished name), which is a
				329	// dot-delimited string like foo.bar.baz.
				330	// - Log entries can be either raw (coming from unstructured logging from
				331	// an external service, like a running process) or leveled (emitted by
				332	// Metropolis code with a source line, timestamp, and severity).
				333	// - The DNs form a tree of logging nodes - and when requesting logs, a
				334	// given subtree of DNs can be requested, instead of just a given DN.
				335	// - All supervised processes live at `root.<supervisor DN>`. For more
				336	// example paths, see the console logs of a running Metropolis node, or
				337	// request all logs (at DN "").
				338	//
Serge Bazanski	b91938f	2023-03-29 14:31:22 +0200	[diff] [blame]	339	rpc Logs(GetLogsRequest) returns (stream GetLogsResponse) {
				340	option (metropolis.proto.ext.authorization) = {
				341	need: PERMISSION_READ_NODE_LOGS
				342	};
				343	}
Lorenz Brun	35fcf03	2023-06-29 04:15:58 +0200	[diff] [blame]	344	// UpdateNode updates the node operating system to a new version.
				345	//
				346	// Metropolis uses a side-by-side (A/B) update process. This method installs
				347	// the OS from the given bundle into the inactive slot, activates that slot
				348	// and then (optionally) reboots to activate it.
				349	rpc UpdateNode(UpdateNodeRequest) returns (UpdateNodeResponse) {
				350	option (metropolis.proto.ext.authorization) = {
				351	need: PERMISSION_UPDATE_NODE
				352	};
				353	}
Serge Bazanski	b91938f	2023-03-29 14:31:22 +0200	[diff] [blame]	354	}
				355
					// GetLogsRequest is the request message of NodeManagement.Logs.
Serge Bazanski	b91938f	2023-03-29 14:31:22 +0200	[diff] [blame]	356	message GetLogsRequest {
				357	// DN from which to request logs. All supervised runnables live at `root.`,
				358	// the init code lives at `init.`.
				359	string dn = 1;
				360	// Filters to apply to returned data.
Serge Bazanski	da11486	2023-03-29 17:46:42 +0200	[diff] [blame]	361	repeated metropolis.proto.common.LogFilter filters = 2;
Serge Bazanski	b91938f	2023-03-29 14:31:22 +0200	[diff] [blame]	362
				363	enum BacklogMode {
				364	BACKLOG_INVALID = 0;
				365	// No historic data will be returned.
				366	BACKLOG_DISABLE = 1;
				367	// All available historic data will be returned.
				368	BACKLOG_ALL = 2;
				369	// At most backlog_count entries will be returned, if available.
				370	BACKLOG_COUNT = 3;
				371	}
					// backlog_mode selects how much historic data is returned.
				372	BacklogMode backlog_mode = 3;
					// backlog_count is the maximum number of historic entries returned when
					// backlog_mode is BACKLOG_COUNT.
				373	int64 backlog_count = 4;
				374
				375	enum StreamMode {
				376	STREAM_INVALID = 0;
Serge Bazanski	da11486	2023-03-29 17:46:42 +0200	[diff] [blame]	377	// No streaming entries, gRPC stream will be closed as soon as all backlog
				378	// data is served.
Serge Bazanski	b91938f	2023-03-29 14:31:22 +0200	[diff] [blame]	379	STREAM_DISABLE = 1;
Serge Bazanski	da11486	2023-03-29 17:46:42 +0200	[diff] [blame]	380	// Entries will be streamed as early as available right after all backlog
				381	// data is served.
Serge Bazanski	b91938f	2023-03-29 14:31:22 +0200	[diff] [blame]	382	STREAM_UNBUFFERED = 2;
				383	}
					// stream_mode selects whether new entries are streamed once all backlog
					// data is served.
				384	StreamMode stream_mode = 5;
				385	}
				386
					// GetLogsResponse is the streamed response message of NodeManagement.Logs.
Serge Bazanski	b91938f	2023-03-29 14:31:22 +0200	[diff] [blame]	387	message GetLogsResponse {
Serge Bazanski	da11486	2023-03-29 17:46:42 +0200	[diff] [blame]	388	// Historical entries, as requested via GetLogsRequest.backlog_mode. They will
				389	// all be served before the first stream_entries are served (if any).
				390	repeated metropolis.proto.common.LogEntry backlog_entries = 1;
				391	// Entries streamed as they arrive. Currently no server-side buffering is
				392	// enabled, instead every line is served as early as it arrives. However, this
				393	// might change in the future, so this behaviour cannot be depended upon.
				394	repeated metropolis.proto.common.LogEntry stream_entries = 2;
Lorenz Brun	35fcf03	2023-06-29 04:15:58 +0200	[diff] [blame]	395	}
				396
					// ActivationMode specifies how an updated image should be activated; it is
					// used by UpdateNodeRequest.activation_mode.
Lorenz Brun	d14be0e	2023-07-31 16:46:14 +0200	[diff] [blame]	397	enum ActivationMode {
				398	ACTIVATION_INVALID = 0;
				399	// The new bundle is not activated immediately. It gets activated on the next
				400	// reboot/reset.
				401	ACTIVATION_NONE = 1;
				402	// The node is rebooted immediately to activate the new image.
				403	ACTIVATION_REBOOT = 2;
				404	// The node uses kexec to activate the new image immediately without fully
				405	// rebooting.
				406	ACTIVATION_KEXEC = 3;
				407	}
				408
					// UpdateNodeRequest is the request message of NodeManagement.UpdateNode.
Lorenz Brun	35fcf03	2023-06-29 04:15:58 +0200	[diff] [blame]	409	message UpdateNodeRequest {
				410	// An HTTPS URL to a Metropolis bundle containing the new OS to install.
				411	string bundle_url = 1;
				412
Lorenz Brun	d14be0e	2023-07-31 16:46:14 +0200	[diff] [blame]	413	reserved 2;
				414
				415	// Specifies how the updated image should be activated.
				416	ActivationMode activation_mode = 3;
Lorenz Brun	35fcf03	2023-06-29 04:15:58 +0200	[diff] [blame]	417	}
				418
					// UpdateNodeResponse is the response message of NodeManagement.UpdateNode.
					// It currently carries no fields.
				419	message UpdateNodeResponse {}