Blame - metropolis/proto/api/management.proto - monogon

blob: bdb4a037d1fff3dde5bc54988f6070a5cbedd4e5 [file] [log] [blame]

Serge Bazanski	6bd4159	2021-08-23 13:18:37 +0200	[diff] [blame]	1	syntax = "proto3";
				2	package metropolis.proto.api;
				3	option go_package = "source.monogon.dev/metropolis/proto/api";
				4
Mateusz Zalega	944cb53	2022-06-20 16:54:17 +0200	[diff] [blame]	5	import "google/protobuf/duration.proto";
				6
Serge Bazanski	bc671d0	2021-10-05 17:53:32 +0200	[diff] [blame]	7	import "metropolis/proto/common/common.proto";
Serge Bazanski	9ffa1f9	2021-09-01 15:42:23 +0200	[diff] [blame]	8	import "metropolis/proto/ext/authorization.proto";
				9
Serge Bazanski	5611447	2021-10-11 14:47:54 +0200	[diff] [blame]	10	// Management service available to Cluster Managers, allowing operational work
				11	// to be performed on the cluster (eg. adding nodes, retrieving information
				12	// about a running cluster, etc.).
Serge Bazanski	6bd4159	2021-08-23 13:18:37 +0200	[diff] [blame]	13	service Management {
				14	// GetRegisterTicket retrieves the current RegisterTicket which is required
				15	// for new nodes to register into the cluster. Presenting this ticket on
				16	// registration does not automatically grant access to arbitrary node
				17	// registration. Instead, it is used to guard the API surface of the
				18	// Register RPC from potential denial of service attacks, and can be
				19	// regenerated at any time in case it leaks.
Serge Bazanski	9ffa1f9	2021-09-01 15:42:23 +0200	[diff] [blame]	20	rpc GetRegisterTicket(GetRegisterTicketRequest) returns (GetRegisterTicketResponse) {
				21	option (metropolis.proto.ext.authorization) = {
				22	need: PERMISSION_GET_REGISTER_TICKET
				23	};
				24	}
Serge Bazanski	5611447	2021-10-11 14:47:54 +0200	[diff] [blame]	25
Serge Bazanski	bc671d0	2021-10-05 17:53:32 +0200	[diff] [blame]	26	// GetClusterInfo retrieves publicly available summary information about
				27	// this cluster, notably data required for nodes to register into a cluster
				28	// or join it (other than the Register Ticket, which is gated by an
				29	// additional permission).
				30	rpc GetClusterInfo(GetClusterInfoRequest) returns (GetClusterInfoResponse) {
				31	option (metropolis.proto.ext.authorization) = {
				32	need: PERMISSION_READ_CLUSTER_STATUS
				33	};
				34	}
Serge Bazanski	5611447	2021-10-11 14:47:54 +0200	[diff] [blame]	35
				36	// GetNodes retrieves information about nodes in the cluster. Currently,
				37	// it returns all available data about all nodes.
				38	rpc GetNodes(GetNodesRequest) returns (stream Node) {
				39	option (metropolis.proto.ext.authorization) = {
				40	need: PERMISSION_READ_CLUSTER_STATUS
				41	};
				42	}
Serge Bazanski	1612d4b	2021-11-12 13:54:15 +0100	[diff] [blame]	43
				44	// ApproveNode progresses a node's registration process by changing its state
				45	// in the cluster from NEW to STANDBY, if not yet STANDBY. This is required
				46	// for the node to fully become part of the cluster (ie. have an UP state),
				47	// and is required to be called by a manager manually.
				48	//
				49	// Managers can find out what nodes require approval by performing
				50	// a GetNodes call and filtering for nodes in the NEW state. This call is
				51	// idempotent and can be executed multiple times, and is a no-op if the node
				52	// is already in the STANDBY or even UP states.
				53	//
				54	// In the future, the approval process will be governed by cluster policy, but
				55	// currently any node can be approved by a manager, and the manager is
				56	// responsible for performing an out-of-band attestation of the node being
				57	// approved (eg. by verifying that the node that is being approved has the
				58	// same public key as what the registering node displays in its startup
				59	// logs).
				60	rpc ApproveNode(ApproveNodeRequest) returns (ApproveNodeResponse) {
				61	option (metropolis.proto.ext.authorization) = {
				62	need: PERMISSION_APPROVE_NODE
				63	};
				64	}
Mateusz Zalega	bb2edbe	2022-06-08 11:57:09 +0200	[diff] [blame]	65
				66	// UpdateNodeRoles updates a single node's roles.
				67	rpc UpdateNodeRoles(UpdateNodeRolesRequest) returns (UpdateNodeRolesResponse) {
				68	option (metropolis.proto.ext.authorization) = {
				69	need: PERMISSION_UPDATE_NODE_ROLES
				70	};
				71	}
Serge Bazanski	8456ddf	2023-10-30 18:56:59 +0100	[diff] [blame]	72
				73	// DecommissionNode begins decommissioning a node, taking it from UP through:
				74	//
				75	// 1. DECOMMISSION_REQUESTED
				76	// The node will detect this state on the cluster and begin a cleanup
				77	// process which consists of removing either key material or zeroing
				78	// out the data partition, depending on cluster policy. It will report
				79	// to the cluster that it has begun the process, which will take it to
				80	// the next state:
				81	//
				82	// 2. DECOMMISSIONING
				83	// The node will continue cleanup. After cleanup is successful, it will
				84	// report back to the cluster which will take it to DECOMMISSIONED. The
				85	// node then reboots, and never comes back.
				86	//
				87	// 3. DECOMMISSIONED
				88	// The node can be removed with a subsequent DeleteNode call.
				89	//
				90	// TODO(q3k): implement this, possibly iron out the state machine involved.
				91	//
				92	// The node cannot have any roles assigned to it when it is being
				93	// decommissioned: none may be assigned when the decommissioning process is
				94	// requested, and none may be added to it while it is decommissioning.
				95	rpc DecommissionNode(DecommissionNodeRequest) returns (DecommissionNodeResponse) {
				96	option (metropolis.proto.ext.authorization) = {
				97	need: PERMISSION_DECOMMISSION_NODE
				98	};
				99	}
				100
				101	// DeleteNode removes a node from the cluster. By default the node must be in
				102	// the DECOMMISSIONED state and may not have any roles assigned. However, some
				103	// safety bypasses are available for nodes which have become unavailable and
				104	// thus cannot be decommissioned correctly - see the request documentation
				105	// for more details.
				106	rpc DeleteNode(DeleteNodeRequest) returns (DeleteNodeResponse) {
				107	option (metropolis.proto.ext.authorization) = {
				108	need: PERMISSION_DELETE_NODE
				109	};
				110	}
Serge Bazanski	6bd4159	2021-08-23 13:18:37 +0200	[diff] [blame]	111	}
				112
					// GetRegisterTicketRequest is the (currently empty) request message of
					// Management.GetRegisterTicket.
				113	message GetRegisterTicketRequest {
				114	}
				115
					// GetRegisterTicketResponse is the response message of
					// Management.GetRegisterTicket.
				116	message GetRegisterTicketResponse {
				117	// Opaque bytes that comprise the RegisterTicket.
				118	bytes ticket = 1;
Serge Bazanski	2893e98	2021-09-09 13:06:16 +0200	[diff] [blame]	119	}
Serge Bazanski	bc671d0	2021-10-05 17:53:32 +0200	[diff] [blame]	120
					// GetClusterInfoRequest is the (currently empty) request message of
					// Management.GetClusterInfo.
				121	message GetClusterInfoRequest {
				122	}
				123
					// GetClusterInfoResponse is the response message of
					// Management.GetClusterInfo.
				124	message GetClusterInfoResponse {
				125	// cluster_directory contains information about individual nodes in the
				126	// cluster that can be used to dial the cluster's services.
				127	metropolis.proto.common.ClusterDirectory cluster_directory = 1;
Serge Bazanski	2f58ac0	2021-10-05 11:47:20 +0200	[diff] [blame]	128
Serge Bazanski	fbd38e2	2021-10-08 14:41:16 +0200	[diff] [blame]	129	// ca_certificate is the x509 DER encoded CA certificate of the cluster.
				130	bytes ca_certificate = 2;
Serge Bazanski	5df62ba	2023-03-22 17:56:46 +0100	[diff] [blame]	131
					// cluster_configuration is the ClusterConfiguration of this cluster (type
					// declared in package metropolis.proto.common).
				132	metropolis.proto.common.ClusterConfiguration cluster_configuration = 3;
Serge Bazanski	bc671d0	2021-10-05 17:53:32 +0200	[diff] [blame]	133	}
Serge Bazanski	5611447	2021-10-11 14:47:54 +0200	[diff] [blame]	134
					// GetNodesRequest is the request message of Management.GetNodes.
				135	message GetNodesRequest {
Mateusz Zalega	955e46e	2022-05-27 18:00:50 +0200	[diff] [blame]	136	// filter is a CEL expression used to limit the count of GetNodes results.
				137	// Each processed node protobuf message is exposed to the filter as the
				138	// "node" variable, while related state and health enum constants are
				139	// anchored in the root namespace, eg. NODE_STATE_UP, or HEARTBEAT_TIMEOUT.
				140	// A node is returned each time the expression evaluates to true. If
				141	// empty, all nodes are returned.
				142	string filter = 1;
Serge Bazanski	5611447	2021-10-11 14:47:54 +0200	[diff] [blame]	143	}
				144
				145	// Node in a Metropolis cluster, streamed by Management.GetNodes. For each node
				146	// in the cluster, this message will be emitted and will contain information
				147	// about that node.
				148	//
				149	// The fields contained are node fields that PERMISSION_READ_CLUSTER_STATUS
				150	// allows access to, ie. 'non-private' fields, ones that might be internal to
				151	// the cluster and possibly considered sensitive information about the
				152	// infrastructure, but whose knowledge does not allow escalating privileges
				153	// within the cluster.
				154	message Node {
				155	// Raw Ed25519 public key of this node, which can be used to generate
				156	// the node's ID. This is always set.
				157	bytes pubkey = 1;
Serge Bazanski	30fd154	2023-03-29 14:19:02 +0200	[diff] [blame]	158	// Node ID calculated from pubkey, ie. 'metropolis-123456'.
				159	string id = 7;
Serge Bazanski	5611447	2021-10-11 14:47:54 +0200	[diff] [blame]	160	// State of the node from the point of view of the cluster. This is
				161	// always set.
				162	metropolis.proto.common.NodeState state = 2;
				163	// Last reported status by the Node, absent if a node hasn't yet reported
				164	// its status.
				165	metropolis.proto.common.NodeStatus status = 3;
				166	// Roles assigned by the cluster. This is always set.
				167	metropolis.proto.common.NodeRoles roles = 4;
Serge Bazanski	1612d4b	2021-11-12 13:54:15 +0100	[diff] [blame]	168
Mateusz Zalega	32b1929	2022-05-17 13:26:55 +0200	[diff] [blame]	169	// Health describes node's health as seen from the cluster perspective.
				170	enum Health {
					// INVALID is the zero value of this enum.
				171	INVALID = 0;
				172	// UNKNOWN is used whenever there were no heartbeats received from a
				173	// given node AND too little time has passed since last Curator leader
				174	// election to know whether the node is actually timing out. UNKNOWN
				175	// is also returned for nodes whose NodeState does not equal
				176	// NODE_STATE_UP.
				177	UNKNOWN = 1;
				178	// HEALTHY describes nodes that have sent a heartbeat recently.
				179	HEALTHY = 2;
				180	// HEARTBEAT_TIMEOUT describes nodes that have not sent a heartbeat in
				181	// the interval specified by curator.HeartbeatTimeout.
				182	HEARTBEAT_TIMEOUT = 3;
				183	}
				184	Health health = 5;
Mateusz Zalega	2175ec9	2022-06-13 09:29:09 +0200	[diff] [blame]	185	// time_since_heartbeat is the duration since the last of the node's
				186	// heartbeats was received. It is only valid with the health status of
				187	// either HEALTHY or HEARTBEAT_TIMEOUT.
Mateusz Zalega	944cb53	2022-06-20 16:54:17 +0200	[diff] [blame]	188	google.protobuf.Duration time_since_heartbeat = 6;
Serge Bazanski	e4a4ce1	2023-03-22 18:29:54 +0100	[diff] [blame]	189
				190	// tpm_usage describes whether this node has a TPM 2.0 and whether it is
				191	// being actively used as part of its membership in the Metropolis cluster.
				192	//
				193	// Currently, the TPM 2.0 is only used to seal the local part of the disk
				194	// encryption key and the early join credentials of the node. Depending on
				195	// future cluster configuration settings, this might also indicate that the
				196	// node has actually passed high assurance hardware attestation against the
				197	// cluster.
				198	metropolis.proto.common.NodeTPMUsage tpm_usage = 8;
Mateusz Zalega	32b1929	2022-05-17 13:26:55 +0200	[diff] [blame]	199	}
Serge Bazanski	1612d4b	2021-11-12 13:54:15 +0100	[diff] [blame]	200
					// ApproveNodeRequest is the request message of Management.ApproveNode.
				201	message ApproveNodeRequest {
				202	// Raw public key of the node being approved, has to correspond to a node
				203	// currently in the cluster.
				204	bytes pubkey = 1;
				205	}
				206
					// ApproveNodeResponse is the (currently empty) response message of
					// Management.ApproveNode.
				207	message ApproveNodeResponse {
Mateusz Zalega	32b1929	2022-05-17 13:26:55 +0200	[diff] [blame]	208	}
Mateusz Zalega	bb2edbe	2022-06-08 11:57:09 +0200	[diff] [blame]	209
				210	// UpdateNodeRolesRequest updates roles of a single node matching pubkey. All
				211	// role fields are optional, and no change will result if they're either unset
				212	// or if their value matches existing state.
				213	message UpdateNodeRolesRequest {
Mateusz Zalega	9c315f1	2022-08-11 16:31:22 +0200	[diff] [blame]	214	// node uniquely identifies the node subject to this request.
				215	oneof node {
				216	// pubkey is the Ed25519 public key of this node, which can be used to
				217	// generate the node's ID.
				218	bytes pubkey = 1;
				219	// id is the human-readable identifier of the node, based on its public
				220	// key.
				221	string id = 4;
				222	}
Mateusz Zalega	bb2edbe	2022-06-08 11:57:09 +0200	[diff] [blame]	223
Serge Bazanski	15f7f63	2023-03-14 17:17:20 +0100	[diff] [blame]	224	// kubernetesWorker adjusts the appropriate role when set.
Mateusz Zalega	bb2edbe	2022-06-08 11:57:09 +0200	[diff] [blame]	225	optional bool kubernetesWorker = 2;
Serge Bazanski	15f7f63	2023-03-14 17:17:20 +0100	[diff] [blame]	226	// kubernetesController adjusts the appropriate role when set. Nodes performing
				227	// this role must also be consensus members.
				228	optional bool kubernetesController = 5;
					// consensusMember adjusts the appropriate role when set.
Mateusz Zalega	bb2edbe	2022-06-08 11:57:09 +0200	[diff] [blame]	229	optional bool consensusMember = 3;
				230	}
				231
					// UpdateNodeRolesResponse is the (currently empty) response message of
					// Management.UpdateNodeRoles.
				232	message UpdateNodeRolesResponse {
				233	}
Serge Bazanski	b40c008	2023-03-29 14:28:04 +0200	[diff] [blame]	234
					// DecommissionNodeRequest is the request message of
					// Management.DecommissionNode.
Serge Bazanski	8456ddf	2023-10-30 18:56:59 +0100	[diff] [blame]	235	message DecommissionNodeRequest {
				236	// node uniquely identifies the node subject to this request.
				237	oneof node {
				238	// pubkey is the Ed25519 public key of this node, which can be used to
				239	// generate the node's ID.
				240	bytes pubkey = 1;
				241	// id is the human-readable identifier of the node, based on its public
				242	// key.
				243	string id = 4;
				244	}
				245	}
				246
					// DecommissionNodeResponse is the (currently empty) response message of
					// Management.DecommissionNode.
				247	message DecommissionNodeResponse {
				248	}
				249
					// DeleteNodeRequest is the request message of Management.DeleteNode. It
					// selects the node to delete and carries the optional safety bypasses.
				250	message DeleteNodeRequest {
				251	// node uniquely identifies the node subject to this request.
				252	oneof node {
				253	// pubkey is the Ed25519 public key of this node, which can be used to
				254	// generate the node's ID.
				255	bytes pubkey = 1;
				256	// id is the human-readable identifier of the node, based on its public
				257	// key.
				258	string id = 2;
				259	}
				260
				261	message SafetyBypassHasRoles {
				262	}
				263	// If set, safety_bypass_has_roles allows the removal of nodes which still have
				264	// roles assigned.
				265	//
				266	// Danger: removing nodes which still have roles assigned might leave the
				267	// cluster in an inconsistent state. Unassigning roles from a node via
				268	// UpdateNodeRoles ensures consistency.
				269	//
				270	// It's also advised to never use this option in automated workflows, so that
				271	// a runaway automation cannot remove nodes that are still used for actual
				272	// work.
				273	//
				274	// Nodes which broke down or otherwise became unreachable shouldn't need to
				275	// enable this option, as unassigning the role from a node does not require it
				276	// to be healthy.
				277	//
				278	// A short summary of how to deal with possible inconsistencies after removing
				279	// a node with roles still assigned:
				280	//
				281	// 1. KubernetesWorker: remove the node from the Kubernetes cluster via kubectl
				282	// (kubectl delete node metropolis-xxx).
				283	// 2. KubernetesController: no cleanup should be necessary.
				284	// 3. ConsensusMember:
				285	// a. the cluster still has quorum: remove the node from etcd.
				286	// TODO(q3k): document this
				287	// b. the cluster has no quorum: rebuild the cluster
				288	SafetyBypassHasRoles safety_bypass_has_roles = 3;
				289
				290	message SafetyBypassNotDecommissioned {
				291	}
				292	// If set, safety_bypass_not_decommissioned allows the removal of nodes that
				293	// haven't been decommissioned yet.
				294	//
				295	// Danger: removing nodes which haven't been decommissioned via
				296	// DecommissionNode can leave nodes attempting to reconnect to the cluster,
				297	// and does not fully clean up cryptographic material from the node.
				298	//
				299	// This option will need to be used when a node has broken down, as it's
				300	// impossible to move a node from UP to DECOMMISSIONED if that node is
				301	// unreachable.
				302	//
				303	// To clean up after using this option:
				304	//
				305	// 1. Make sure that the node does not boot back up. The cluster will prevent
				306	// the node from rejoining the cluster, but the node will by itself
				307	// continue to crash and reboot due to a rejection by the cluster.
				308	// 2. Zero out the node's ESP to remove any leftover cryptographic secrets.
				309	// These secrets are safeguarded according to the cluster's
				310	// StorageSecurityPolicy and NodeTPMUsage. Depending on the settings,
				311	// cleaning up these secrets before letting other systems access the node
				312	// might be critical to maintaining cluster security.
				313	SafetyBypassNotDecommissioned safety_bypass_not_decommissioned = 4;
				314	}
				315
					// DeleteNodeResponse is the (currently empty) response message of
					// Management.DeleteNode.
				316	message DeleteNodeResponse {
				317	}
				318
Serge Bazanski	b40c008	2023-03-29 14:28:04 +0200	[diff] [blame]	319	// NodeManagement runs on every node of the cluster and provides management
				320	// and troubleshooting RPCs to operators. All requests must be authenticated.
				321	service NodeManagement {
Serge Bazanski	da11486	2023-03-29 17:46:42 +0200	[diff] [blame]	322	// Logs returns historical and/or streaming logs for a given DN with given
				323	// filters from the system global LogTree.
				324	//
				325	// For more information about this API, see //metropolis/pkg/logtree. But, in
				326	// summary:
				327	// - All logging is performed to a DN (distinguished name), which is a
				328	// dot-delimited string like foo.bar.baz.
				329	// - Log entries can be either raw (coming from unstructured logging from
				330	// an external service, like a running process) or leveled (emitted by
				331	// Metropolis code with a source line, timestamp, and severity).
				332	// - The DNs form a tree of logging nodes - and when requesting logs, a
				333	// given subtree of DNs can be requested, instead of just a given DN.
				334	// - All supervised processes live at `root.<supervisor DN>`. For more
				335	// example paths, see the console logs of a running Metropolis node, or
				336	// request all logs (at DN "").
				337	//
Serge Bazanski	b91938f	2023-03-29 14:31:22 +0200	[diff] [blame]	338	rpc Logs(GetLogsRequest) returns (stream GetLogsResponse) {
				339	option (metropolis.proto.ext.authorization) = {
				340	need: PERMISSION_READ_NODE_LOGS
				341	};
				342	}
Lorenz Brun	35fcf03	2023-06-29 04:15:58 +0200	[diff] [blame]	343	// UpdateNode updates the node operating system to a new version.
				344	//
				345	// Metropolis uses a side-by-side (A/B) update process. This method installs
				346	// the OS from the given bundle into the inactive slot, activates that slot
				347	// and then (optionally) reboots to activate it.
				348	rpc UpdateNode(UpdateNodeRequest) returns (UpdateNodeResponse) {
				349	option (metropolis.proto.ext.authorization) = {
				350	need: PERMISSION_UPDATE_NODE
				351	};
				352	}
Serge Bazanski	b91938f	2023-03-29 14:31:22 +0200	[diff] [blame]	353	}
				354
					// GetLogsRequest is the request message of NodeManagement.Logs.
Serge Bazanski	b91938f	2023-03-29 14:31:22 +0200	[diff] [blame]	355	message GetLogsRequest {
				356	// DN from which to request logs. All supervised runnables live at `root.`,
				357	// the init code lives at `init.`.
				358	string dn = 1;
				359	// Filters to apply to returned data.
Serge Bazanski	da11486	2023-03-29 17:46:42 +0200	[diff] [blame]	360	repeated metropolis.proto.common.LogFilter filters = 2;
Serge Bazanski	b91938f	2023-03-29 14:31:22 +0200	[diff] [blame]	361
					// BacklogMode selects how much historic data is returned.
				362	enum BacklogMode {
				363	BACKLOG_INVALID = 0;
				364	// No historic data will be returned.
				365	BACKLOG_DISABLE = 1;
				366	// All available historic data will be returned.
				367	BACKLOG_ALL = 2;
				368	// At most backlog_count entries will be returned, if available.
				369	BACKLOG_COUNT = 3;
				370	}
					// backlog_mode selects how much historic data is returned.
				371	BacklogMode backlog_mode = 3;
					// backlog_count is the maximum number of historic entries returned when
					// backlog_mode is BACKLOG_COUNT.
				372	int64 backlog_count = 4;
				373
					// StreamMode selects whether entries are streamed once all backlog data
					// has been served.
				374	enum StreamMode {
				375	STREAM_INVALID = 0;
Serge Bazanski	da11486	2023-03-29 17:46:42 +0200	[diff] [blame]	376	// No streaming entries, gRPC stream will be closed as soon as all backlog
				377	// data is served.
Serge Bazanski	b91938f	2023-03-29 14:31:22 +0200	[diff] [blame]	378	STREAM_DISABLE = 1;
Serge Bazanski	da11486	2023-03-29 17:46:42 +0200	[diff] [blame]	379	// Entries will be streamed as early as available right after all backlog
				380	// data is served.
Serge Bazanski	b91938f	2023-03-29 14:31:22 +0200	[diff] [blame]	381	STREAM_UNBUFFERED = 2;
				382	}
					// stream_mode selects whether entries are streamed after the backlog.
				383	StreamMode stream_mode = 5;
				384	}
				385
					// GetLogsResponse is the message streamed back by NodeManagement.Logs.
Serge Bazanski	b91938f	2023-03-29 14:31:22 +0200	[diff] [blame]	386	message GetLogsResponse {
Serge Bazanski	da11486	2023-03-29 17:46:42 +0200	[diff] [blame]	387	// Historical entries, as requested via GetLogsRequest.backlog_mode. They will
				388	// all be served before the first stream_entries are served (if any).
				389	repeated metropolis.proto.common.LogEntry backlog_entries = 1;
				390	// Entries streamed as they arrive. Currently no server-side buffering is
				391	// enabled, instead every line is served as early as it arrives. However, this
				392	// might change in the future, so this behaviour cannot be depended upon.
				393	repeated metropolis.proto.common.LogEntry stream_entries = 2;
Lorenz Brun	35fcf03	2023-06-29 04:15:58 +0200	[diff] [blame]	394	}
				395
					// ActivationMode specifies how an updated image should be activated; it is
					// used by UpdateNodeRequest.activation_mode.
Lorenz Brun	d14be0e	2023-07-31 16:46:14 +0200	[diff] [blame]	396	enum ActivationMode {
				397	ACTIVATION_INVALID = 0;
				398	// The new bundle is not activated immediately. It gets activated on the next
				399	// reboot/reset.
				400	ACTIVATION_NONE = 1;
				401	// The node is rebooted immediately to activate the new image.
				402	ACTIVATION_REBOOT = 2;
				403	// The node uses kexec to activate the new image immediately without fully
				404	// rebooting.
				405	ACTIVATION_KEXEC = 3;
				406	}
				407
					// UpdateNodeRequest is the request message of NodeManagement.UpdateNode.
Lorenz Brun	35fcf03	2023-06-29 04:15:58 +0200	[diff] [blame]	408	message UpdateNodeRequest {
				409	// An HTTPS URL to a Metropolis bundle containing the new OS to install.
				410	string bundle_url = 1;
				411
Lorenz Brun	d14be0e	2023-07-31 16:46:14 +0200	[diff] [blame]	412	reserved 2;
				413
				414	// Specifies how the updated image should be activated.
				415	ActivationMode activation_mode = 3;
Lorenz Brun	35fcf03	2023-06-29 04:15:58 +0200	[diff] [blame]	416	}
				417
					// UpdateNodeResponse is the (currently empty) response message of
					// NodeManagement.UpdateNode.
				418	message UpdateNodeResponse {}