syntax = "proto3";

package metropolis.proto.vm;

// VMSpec fully defines all information about a VM and is consumed by the VM
// hypervisor through a runtime environment variable.
message VMSpec {
  // Name field from Kubernetes VirtualMachine object.
  string name = 1;
  // Namespace of VM object
  string namespace = 2;

  enum StartMode {
    SM_UNKNOWN = 0;
    // Normal VM start
    SM_RUN = 1;
    // Initialize the disk of the new VM according to `initial_image` and
    // start the VM
    SM_PREPARE_IMAGE = 2;
    // Wait for an incoming migration and start the migrated VM
    SM_INCOMING_MIGRATION = 3;
  }
  StartMode mode = 3;
  // References initial data which is copied to the root block device before
  // starting the VM for the first time. Only used when starting with
  // SM_PREPARE_IMAGE.
  InitialImage initial_image = 4;
  // Set of IP addresses assigned to the VM. Populated from vmIPs in the
  // VirtualMachine object. Currently a maximum of one IP per IP protocol
  // version is supported.
  repeated string address = 5;
  // gRPC endpoint of the controller for this VM
  string controller_endpoint = 6;
  // Lease mode used for the VM. See LeaseMode for additional info.
  LeaseMode lease_mode = 7;
}
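
// An illustrative VMSpec in text proto form (all values are hypothetical,
// including the endpoint format):
//
//   name: "example-vm"
//   namespace: "default"
//   mode: SM_PREPARE_IMAGE
//   initial_image { url: "https://example.com/rootfs.img" }
//   address: "203.0.113.10"
//   controller_endpoint: "vm-controller.example.svc:443"
//   lease_mode: LM_STORAGE_LOCKING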

// InitialImage represents a source from which a new VM root block device can
// be instantiated.
message InitialImage {
  // A URL to an image file. Populated from initialImage.url in the
  // VirtualMachine object.
  string url = 1;
}

// LeaseMode represents the different modes in which VM run authorizations can
// be managed. The VM system has its own mechanism for authorizing a given pod
// to run a given VM because its distributed systems design requires different
// tradeoffs than Kubernetes makes. The core issue is that Kubernetes's design
// does not guarantee that the control plane always has an accurate view of
// running pods, especially when nodes fail or get partitioned; Kubernetes
// trades this for potentially better availability by keeping both sides of
// the partition running. Kubernetes is also prone to bugs that result in
// running pods no longer being accounted for (for example
// https://github.com/kubernetes/kubernetes/issues/80968) or duplicated. This
// can result in pods running which the controller cannot see, which in turn
// can result in more than one pod running a VM indefinitely.
// For stateful single-instance workloads like VMs this can cause control
// issues (VMs no longer converging to the configured state because the
// current pod is "out of control"), continued unavailability (because an
// uncontrollable pod holds a lock on the backing storage) or even data
// corruption (in case two VMs are concurrently writing to the same storage).
//
// To avoid these issues the VM system implements two different strategies for
// providing mutual exclusion itself: one for use exclusively with local
// storage-backed VMs and one tailored for VMs with distributed storage. They
// differ significantly in the tradeoffs they make and the guarantees they
// deliver, as documented below.
// Both strategies rely (at least in part) on asking the VM controller
// directly whether a pod should keep running its VM. The statement of the VM
// controller is called a "run authorization" in the context of the VM system.
// The exact format of this run authorization depends on the strategy in use.
enum LeaseMode {
  LM_UNKNOWN = 0;
  // In storage locking mode, mutual exclusion and thus run authorization is
  // provided through locks on the backing block storage system. Control plane
  // convergence is only best-effort; under certain K8s failure modes the VM
  // control plane might never converge. A hypervisor that's partitioned from
  // the control plane will continue to run its VM indefinitely and will not
  // fence itself off from storage or networking. This mode is appropriate for
  // local storage, as the full leases mode would introduce more disruption
  // than it prevents under these constraints. The run authorization for this
  // strategy is a simple STATUS_OK/STATUS_TERMINATE status value with no
  // explicit lease expiration, as VMs should not stop executing if the
  // control plane is unavailable. These authorizations are still useful as a
  // way to ensure, at least on a best-effort basis, that leaked or
  // out-of-control pods shut themselves down and that locks held by the wrong
  // pods are released.
  LM_STORAGE_LOCKING = 1;
  // In full leases mode all run authorizations come exclusively from the
  // controller and are passed as leases to all external systems (like storage
  // and network). A hypervisor that's partitioned from the control plane
  // will, after its lease expires, kill its VM and fence itself from network
  // and storage before terminating itself. This mode is appropriate for fully
  // distributed storage as it allows higher availability in that scenario.
  // The run authorization for this strategy is an expiring lease which also
  // needs to be passed together with any IO operation for proper fencing.
  // The hypervisor kills the VM if its lease expires.
  // Not currently implemented.
  LM_FULL_LEASES = 2;
}

// VMMetadata exposes VM metadata to the VM via the fw_cfg interface. It
// currently only contains the name of the VM and its network configuration.
// Exposed as vm.metropolis.monogon.dev/v1/metadata.pb to the VM.
message VMMetadata {
  // Name field from Kubernetes VirtualMachine object.
  string name = 1;
  NetworkConfig network_config = 2;
}

// PTPAddress contains the VM IP and the hypervisor IP for an IP
// point-to-point interface. Both IPs need to be of the same IP protocol
// version (v4 or v6).
// For example, on Linux this could be configured using
// `ip addr add $ip peer $peer_ip dev eth0` for the PtP connection and
// `ip route add default via $peer_ip` for the default route.
message PTPAddress {
  // IP address of the VM
  string ip = 1;
  // IP address of the hypervisor side, default gateway for the VM
  string peer_ip = 2;
}
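
// As an illustrative (hypothetical) example, the PTPAddress
//
//   ip: "203.0.113.10"
//   peer_ip: "203.0.113.1"
//
// would translate to `ip addr add 203.0.113.10 peer 203.0.113.1 dev eth0`
// and `ip route add default via 203.0.113.1` using the commands above.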

// NetworkConfig represents the network configuration the VM needs to apply
// in order to communicate via its network interface.
message NetworkConfig {
  // IPv4 addresses of the PtP link between the VM and the hypervisor, if any.
  PTPAddress v4 = 1;
  // IPv6 addresses of the PtP link between the VM and the hypervisor, if any.
  PTPAddress v6 = 2;
}
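
// A minimal illustrative NetworkConfig in text proto form (all addresses are
// hypothetical; either field may be absent if the VM has no address of that
// protocol version):
//
//   v4 { ip: "203.0.113.10" peer_ip: "203.0.113.1" }
//   v6 { ip: "2001:db8::10" peer_ip: "2001:db8::1" }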

// HypervisorID identifies a running instance of a hypervisor uniquely.
message HypervisorID {
  // vm_name is the name of the VM object.
  string vm_name = 1;
  // namespace is the K8s namespace of the VM object.
  string namespace = 2;
  // pod_name is the pod name in which the hypervisor is running.
  string pod_name = 3;
  // run_id is selected by the hypervisor at the start of the process to
  // uniquely identify that specific running process. A process which starts
  // later than other instances on the same node should have a higher run_id
  // so that the controller can tell which instance is newer. In practice this
  // should be derived from a precise timestamp like nanoseconds since the
  // UNIX epoch.
  uint64 run_id = 4;
}
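
// An illustrative HypervisorID in text proto form (values are hypothetical;
// run_id here is derived from nanoseconds since the UNIX epoch, as suggested
// above):
//
//   vm_name: "example-vm"
//   namespace: "default"
//   pod_name: "example-vm-0"
//   run_id: 1623154114000000000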

message RunLeaseRequest {
  HypervisorID us = 1;
}

message RunLeaseUpdate {
  enum Status {
    STATUS_UNKNOWN = 0;
    // The pod should keep running its VM
    STATUS_OK = 1;
    // The pod should terminate the VM immediately and exit
    STATUS_TERMINATE = 2;
  }
  Status status = 1;
}
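
// As an illustration, a hypervisor holding a RunLease stream might first
// receive an update with
//
//   status: STATUS_OK
//
// and later, once the controller decides it should no longer run the VM,
// one with
//
//   status: STATUS_TERMINATE
//
// after which it is expected to terminate the VM immediately and exit.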

message MigrationSwitchoverRequest {
  HypervisorID us = 1;
  HypervisorID them = 2;
}

message MigrationSwitchoverResponse {}

message EnsureMigrationTargetRequest {
  HypervisorID us = 1;
}

message EnsureMigrationTargetResponse {
  enum Action {
    ACTION_UNKNOWN = 0;
    ACTION_LIVE_MIGRATE = 1;
    ACTION_SOFT_SHUTDOWN = 2;
  }
  Action action = 1;
  // Endpoint of the new Pod exposing a metropolis.vm.Hypervisor service if
  // action == ACTION_LIVE_MIGRATE.
  string target_endpoint = 2;
}

// The VMController service is exposed by the controller for the hypervisors
// to interact with. It is responsible for (pseudo-)leases and migrations.
// A typical migration looks like this:
// 1. The currently running pod with the VM gets SIGTERM.
// 2. The source pod runs EnsureMigrationTarget to inform the controller of
//    its wish to migrate its VM away. The controller creates or reuses a
//    target pod to migrate to and returns its endpoint to the source pod.
// 3. The source pod runs Hypervisor.StartMigration on the target pod to
//    negotiate a channel to migrate over.
// 4. The source pod bulk-migrates the VM in a hypervisor-specific way.
// 5. After the bulk migration is done, the source pod stops executing the VM.
//    The target pod calls MigrationSwitchover on the controller with `us` set
//    to itself and `them` set to the `us` parameter in the
//    StartMigrationRequest it received in step 3.
// 6. The controller performs the compare-and-swap and returns either Ok or
//    PreconditionFailed depending on whether the authoritative pod has
//    changed in the meantime. If the MigrationSwitchover RPC succeeded, the
//    VM is now running on the target pod. If it doesn't succeed, the target
//    pod will retry this step for a set period of time and then exit.
// 7. After a set timeout, the source pod will regenerate its run_id and
//    attempt to call MigrationSwitchover with `them` set to its old identity
//    and `us` set to its new identity formed by updating its run_id. This
//    call is expected to fail with PreconditionFailed, which will cause the
//    source pod to shut itself down. If the call succeeds, the source pod
//    will start running the VM again.
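//
// As an illustration of step 5 (all values hypothetical), the target pod's
// MigrationSwitchoverRequest could look like this in text proto form:
//
//   us {
//     vm_name: "example-vm" namespace: "default"
//     pod_name: "example-vm-1" run_id: 1623154114000000000
//   }
//   them {
//     vm_name: "example-vm" namespace: "default"
//     pod_name: "example-vm-0" run_id: 1623153000000000000
//   }
//
// In step 7 the source pod would instead put its old identity (old run_id)
// in `them` and its regenerated identity (new run_id) in `us`.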
service VMController {
  // EnsureMigrationTarget returns either a request to soft-shutdown or a
  // reference to a pod to which the caller should connect to migrate the VM.
  // It waits for the pod to run and complete a gRPC health check, but clients
  // should still retry a connection a few times before giving up and calling
  // this endpoint again.
  rpc EnsureMigrationTarget(EnsureMigrationTargetRequest) returns (EnsureMigrationTargetResponse);
  // MigrationSwitchover attempts to atomically swap the authoritative Pod and
  // PVC from the one in `them` to the one in `us`. If this request succeeds,
  // the pod in `us` (the caller) is now authoritative for a given VM. If the
  // authoritative pod is not the one in `them`, this method will return
  // PreconditionFailed and do nothing.
  rpc MigrationSwitchover(MigrationSwitchoverRequest) returns (MigrationSwitchoverResponse);
  // RunLease requests a pseudo-lease (or a full lease in LeaseMode
  // LM_FULL_LEASES) and streams updates to the lease status or new leases (in
  // LM_FULL_LEASES). Clients should always attempt to keep one RunLease
  // connection open to ensure reliable control from the control plane.
  rpc RunLease(RunLeaseRequest) returns (stream RunLeaseUpdate);
}

// The OOBManagement service is exposed by each VM pod to perform OOB
// maintenance on the VM running inside of it.
service OOBManagement {
  // Reset resets the virtual CPU of the VM (essentially equivalent to a hard
  // reboot). This has no effect on the hypervisor itself.
  // TODO(lorenz): This API should have idempotency counters.
  rpc Reset(ResetRequest) returns (ResetResponse);
  // Console opens a bidirectional stream to the virtual serial port (for
  // debugging or OOB data transfer).
  // If multiple streams are open, data from the VM is broadcast to all
  // clients and data from all clients is sent to the VM. Ordering with
  // multiple clients connected is best-effort and cannot be relied upon.
  rpc Console(stream ConsoleIO) returns (stream ConsoleIO);
}

message ResetRequest {}
message ResetResponse {}

message ConsoleIO {
  bytes data = 1;
}

// The Hypervisor service is exposed by each VM pod for migrations.
service Hypervisor {
  // StartMigration is called by the source pod when it wants to initiate a
  // migration. It is used to negotiate parameters for migration and endpoints
  // for the bulk transfer. If no common migration protocol is found,
  // InvalidArgument is returned.
  rpc StartMigration(StartMigrationRequest) returns (StartMigrationResponse);
}

// MigrationProtocol represents a protocol and some protocol-specific metadata
// to allow for negotiating a connection using that protocol.
// For each migration protocol message, some fields will be set by the source
// as constraints (constraint_*), and some will be populated by the target if
// that migration protocol is picked (negotiated_*). The migration target will
// keep all constraint_* fields that it was aware of, so that the source can
// verify that all critical fields were considered by the target (thereby
// allowing different versions of source/target to communicate).
message MigrationProtocol {
  // Qemu represents the native QEMU migration protocol.
  message Qemu {
    // If set, the root block device is migrated together with the VM. If the
    // target doesn't have storage attached directly via QEMU (like RBD or
    // iSCSI) this needs to be set, otherwise this protocol cannot be picked
    // as the VM would lose its storage during the migration. The opposite is
    // allowed: it migrates a local-storage volume into QEMU-attached storage.
    bool constraint_with_blockmigration = 1;
    // Bulk endpoint on the migration target in QEMU native format
    string negotiated_endpoint = 2;
  }
  oneof kind { Qemu qemu_block = 1; }
}

message StartMigrationRequest {
  // List of migration protocols supported by the source pod
  repeated MigrationProtocol supported_migration_protocol = 1;

  // Hypervisor ID of the hypervisor making the request (i.e. the one
  // currently running the VM)
  HypervisorID us = 2;
}

message StartMigrationResponse {
  // Migration protocol chosen from supported_migration_protocol by the target
  // pod.
  MigrationProtocol migration_protocol = 1;
}
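
// As an illustrative exchange (all values hypothetical), a source pod
// offering the QEMU protocol with block migration could send a
// StartMigrationRequest containing
//
//   supported_migration_protocol {
//     qemu_block { constraint_with_blockmigration: true }
//   }
//
// and a target pod picking that protocol could answer with
//
//   migration_protocol {
//     qemu_block {
//       constraint_with_blockmigration: true
//       negotiated_endpoint: "tcp:203.0.113.20:4444"
//     }
//   }
//
// echoing the constraint it honored and filling in the bulk endpoint.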