syntax = "proto3";

package metropolis.proto.vm;

// VMSpec fully defines all information about a VM and is consumed by the VM
// hypervisor through a runtime environment variable.
message VMSpec {
  // Name field from Kubernetes VirtualMachine object.
  string name = 1;
  // Namespace of VM object
  string namespace = 2;

  enum StartMode {
    SM_UNKNOWN = 0;
    // Normal VM start
    SM_RUN = 1;
    // Initialize the disk of the new VM according to `initial_image` and
    // start the VM
    SM_PREPARE_IMAGE = 2;
    // Wait for an incoming migration and start the migrated VM
    SM_INCOMING_MIGRATION = 3;
  }
  StartMode mode = 3;
  // References initial data which is copied to the root block device before
  // starting the VM for the first time. Only used when starting with
  // SM_PREPARE_IMAGE.
  InitialImage initial_image = 4;
  // Set of IP addresses assigned to the VM. Populated from vmIPs in the
  // VirtualMachine object. Currently a maximum of one IP per IP protocol
  // version is supported.
  repeated string address = 5;
  // gRPC endpoint of the controller for this VM
  string controller_endpoint = 6;
  // Lease mode used for the VM. See LeaseMode for additional info.
  LeaseMode lease_mode = 7;
}
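
// An illustrative VMSpec in text proto form (all values are hypothetical,
// including the endpoint format):
//
//   name: "example-vm"
//   namespace: "default"
//   mode: SM_PREPARE_IMAGE
//   initial_image { url: "https://example.com/rootfs.img" }
//   address: "203.0.113.10"
//   controller_endpoint: "vm-controller.example.svc:443"
//   lease_mode: LM_STORAGE_LOCKING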

// InitialImage represents a source from which a new VM root block device can
// be instantiated.
message InitialImage {
  // A URL to an image file. Populated from initialImage.url in the
  // VirtualMachine object.
  string url = 1;
}

// LeaseMode represents the different modes in which VM run authorizations can
// be managed. The VM system has its own mechanism for authorizing a given pod
// to run a given VM because its distributed systems design requires different
// tradeoffs than Kubernetes makes. The core issue is that Kubernetes's design
// does not guarantee that the control plane always has an accurate view of
// running pods, especially when nodes fail or get partitioned; Kubernetes
// trades this for potentially better availability by keeping both sides of
// the partition running. Kubernetes is also prone to bugs that result in
// running pods no longer being accounted for (for example
// https://github.com/kubernetes/kubernetes/issues/80968) or duplicated. This
// can result in pods running which the controller cannot see, which in turn
// can result in more than one pod running a VM indefinitely.
// For stateful single-instance workloads like VMs this can cause control
// issues (VMs no longer converging to the configured state because the
// current pod is "out of control"), continued unavailability (because an
// uncontrollable pod holds a lock on the backing storage) or even data
// corruption (in case two VMs are concurrently writing to the same storage).
//
// To avoid these issues the VM system implements two different strategies for
// providing mutual exclusion itself: one for use exclusively with local
// storage-backed VMs and one tailored for VMs with distributed storage. They
// differ significantly in the tradeoffs they make and the guarantees they
// deliver, as documented below.
// Both strategies rely (at least in part) on asking the VM controller
// directly whether a pod should keep running its VM. The statement of the VM
// controller is called a "run authorization" in the context of the VM system.
// The exact format of this run authorization depends on the strategy in use.
enum LeaseMode {
  LM_UNKNOWN = 0;
  // In storage locking mode, mutual exclusion and thus run authorization is
  // provided through locks on the backing block storage system. Control plane
  // convergence is only best-effort; under certain K8s failure modes the VM
  // control plane might never converge. A hypervisor that's partitioned from
  // the control plane will continue to run its VM indefinitely and will not
  // fence itself off from storage or networking. This mode is appropriate for
  // local storage, as the full leases mode would introduce more disruption
  // than it prevents under these constraints. The run authorization for this
  // strategy is a simple STATUS_OK/STATUS_TERMINATE status value with no
  // explicit lease expiration, as VMs should not stop executing if the
  // control plane is unavailable. These authorizations are still useful as a
  // way to ensure, at least on a best-effort basis, that leaked or
  // out-of-control pods shut themselves down and that locks held by the wrong
  // pods are released.
  LM_STORAGE_LOCKING = 1;
  // In full leases mode all run authorizations come exclusively from the
  // controller and are passed as leases to all external systems (like storage
  // and network). A hypervisor that's partitioned from the control plane
  // will, after its lease expires, kill its VM and fence itself from network
  // and storage before terminating itself. This mode is appropriate for fully
  // distributed storage as it allows higher availability in that scenario.
  // The run authorization for this strategy is an expiring lease which also
  // needs to be passed together with any IO operation for proper fencing.
  // The hypervisor kills the VM if its lease expires.
  // Not currently implemented.
  LM_FULL_LEASES = 2;
}

// VMMetadata exposes VM metadata to the VM via the fw_cfg interface. It
// currently only contains the name of the VM and its network configuration.
// Exposed as vm.metropolis.monogon.dev/v1/metadata.pb to the VM.
message VMMetadata {
  // Name field from Kubernetes VirtualMachine object.
  string name = 1;
  NetworkConfig network_config = 2;
}

// PTPAddress contains the VM IP and the hypervisor IP for an IP
// point-to-point interface. Both IPs need to be of the same IP protocol
// version (v4 or v6).
// For example, on Linux this could be configured using
// `ip addr add $ip peer $peer_ip dev eth0` for the PtP connection and
// `ip route add default via $peer_ip` for the default route.
message PTPAddress {
  // IP address of the VM
  string ip = 1;
  // IP address of the hypervisor side, default gateway for the VM
  string peer_ip = 2;
}
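
// As an illustrative (hypothetical) example, the PTPAddress
//
//   ip: "203.0.113.10"
//   peer_ip: "203.0.113.1"
//
// would translate to `ip addr add 203.0.113.10 peer 203.0.113.1 dev eth0`
// and `ip route add default via 203.0.113.1` using the commands above.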

// NetworkConfig represents the network configuration the VM needs to apply
// in order to communicate via its network interface.
message NetworkConfig {
  // IPv4 addresses of the PtP link between the VM and the hypervisor, if any.
  PTPAddress v4 = 1;
  // IPv6 addresses of the PtP link between the VM and the hypervisor, if any.
  PTPAddress v6 = 2;
}
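
// A minimal illustrative NetworkConfig in text proto form (all addresses are
// hypothetical; either field may be absent if the VM has no address of that
// protocol version):
//
//   v4 { ip: "203.0.113.10" peer_ip: "203.0.113.1" }
//   v6 { ip: "2001:db8::10" peer_ip: "2001:db8::1" }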

// HypervisorID identifies a running instance of a hypervisor uniquely.
message HypervisorID {
  // vm_name is the name of the VM object.
  string vm_name = 1;
  // namespace is the K8s namespace of the VM object.
  string namespace = 2;
  // pod_name is the pod name in which the hypervisor is running.
  string pod_name = 3;
  // run_id is selected by the hypervisor at the start of the process to
  // uniquely identify that specific running process. A process which starts
  // later than other instances on the same node should have a higher run_id
  // so that the controller can tell which instance is newer. In practice this
  // should be derived from a precise timestamp like nanoseconds since the
  // UNIX epoch.
  uint64 run_id = 4;
}
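
// An illustrative HypervisorID in text proto form (values are hypothetical;
// run_id here is derived from nanoseconds since the UNIX epoch, as suggested
// above):
//
//   vm_name: "example-vm"
//   namespace: "default"
//   pod_name: "example-vm-0"
//   run_id: 1623154114000000000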

message RunLeaseRequest {
  HypervisorID us = 1;
}

message RunLeaseUpdate {
  enum Status {
    STATUS_UNKNOWN = 0;
    // The pod should keep running its VM
    STATUS_OK = 1;
    // The pod should terminate the VM immediately and exit
    STATUS_TERMINATE = 2;
  }
  Status status = 1;
}
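
// As an illustration, a hypervisor holding a RunLease stream might first
// receive an update with
//
//   status: STATUS_OK
//
// and later, once the controller decides it should no longer run the VM,
// one with
//
//   status: STATUS_TERMINATE
//
// after which it is expected to terminate the VM immediately and exit.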

message MigrationSwitchoverRequest {
  HypervisorID us = 1;
  HypervisorID them = 2;
}

message MigrationSwitchoverResponse {}

message EnsureMigrationTargetRequest {
  HypervisorID us = 1;
}

message EnsureMigrationTargetResponse {
  enum Action {
    ACTION_UNKNOWN = 0;
    ACTION_LIVE_MIGRATE = 1;
    ACTION_SOFT_SHUTDOWN = 2;
  }
  Action action = 1;
  // Endpoint of the new Pod exposing a metropolis.vm.Hypervisor service if
  // action == ACTION_LIVE_MIGRATE.
  string target_endpoint = 2;
}

// The VMController service is exposed by the controller for the hypervisors
// to interact with. It is responsible for (pseudo-)leases and migrations.
// A typical migration looks like this:
// 1. The currently running pod with the VM gets SIGTERM.
// 2. The source pod runs EnsureMigrationTarget to inform the controller of
//    its wish to migrate its VM away. The controller creates or reuses a
//    target pod to migrate to and returns its endpoint to the source pod.
// 3. The source pod runs Hypervisor.StartMigration on the target pod to
//    negotiate a channel to migrate over.
// 4. The source pod bulk-migrates the VM in a hypervisor-specific way.
// 5. After the bulk migration is done, the source pod stops executing the VM.
//    The target pod calls MigrationSwitchover on the controller with `us` set
//    to itself and `them` set to the `us` parameter in the
//    StartMigrationRequest it received in step 3.
// 6. The controller performs the compare-and-swap and returns either Ok or
//    PreconditionFailed depending on whether the authoritative pod has
//    changed in the meantime. If the MigrationSwitchover RPC succeeded, the
//    VM is now running on the target pod. If it doesn't succeed, the target
//    pod will retry this step for a set period of time and then exit.
// 7. After a set timeout, the source pod will regenerate its run_id and
//    attempt to call MigrationSwitchover with `them` set to its old identity
//    and `us` set to its new identity formed by updating its run_id. This
//    call is expected to fail with PreconditionFailed, which will cause the
//    source pod to shut itself down. If the call succeeds, the source pod
//    will start running the VM again.
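//
// As an illustration of step 5 (all values hypothetical), the target pod's
// MigrationSwitchoverRequest could look like this in text proto form:
//
//   us {
//     vm_name: "example-vm" namespace: "default"
//     pod_name: "example-vm-1" run_id: 1623154114000000000
//   }
//   them {
//     vm_name: "example-vm" namespace: "default"
//     pod_name: "example-vm-0" run_id: 1623153000000000000
//   }
//
// In step 7 the source pod would instead put its old identity (old run_id)
// in `them` and its regenerated identity (new run_id) in `us`.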
service VMController {
  // EnsureMigrationTarget returns either a request to soft-shutdown or a
  // reference to a pod to which the caller should connect to migrate the VM.
  // It waits for the pod to run and complete a gRPC health check, but clients
  // should still retry a connection a few times before giving up and calling
  // this endpoint again.
  rpc EnsureMigrationTarget(EnsureMigrationTargetRequest) returns (EnsureMigrationTargetResponse);
  // MigrationSwitchover attempts to atomically swap the authoritative Pod and
  // PVC from the one in `them` to the one in `us`. If this request succeeds,
  // the pod in `us` (the caller) is now authoritative for a given VM. If the
  // authoritative pod is not the one in `them`, this method will return
  // PreconditionFailed and do nothing.
  rpc MigrationSwitchover(MigrationSwitchoverRequest) returns (MigrationSwitchoverResponse);
  // RunLease requests a pseudo-lease (or a full lease in LeaseMode
  // LM_FULL_LEASES) and streams updates to the lease status or new leases (in
  // LM_FULL_LEASES). Clients should always attempt to keep one RunLease
  // connection open to ensure reliable control from the control plane.
  rpc RunLease(RunLeaseRequest) returns (stream RunLeaseUpdate);
}

// The OOBManagement service is exposed by each VM pod to perform OOB
// maintenance on the VM running inside of it.
service OOBManagement {
  // Reset resets the virtual CPU of the VM (essentially equivalent to a hard
  // reboot). This has no effect on the hypervisor itself.
  // TODO(lorenz): This API should have idempotency counters.
  rpc Reset(ResetRequest) returns (ResetResponse);
  // Console opens a bidirectional stream to the virtual serial port (for
  // debugging or OOB data transfer).
  // If multiple streams are open, data from the VM is broadcast to all
  // clients and data from all clients is sent to the VM. Ordering with
  // multiple clients connected is best-effort and cannot be relied upon.
  rpc Console(stream ConsoleIO) returns (stream ConsoleIO);
}

message ResetRequest {}
message ResetResponse {}

message ConsoleIO {
  bytes data = 1;
}

// The Hypervisor service is exposed by each VM pod for migrations.
service Hypervisor {
  // StartMigration is called by the source pod when it wants to initiate a
  // migration. It is used to negotiate parameters for migration and endpoints
  // for the bulk transfer. If no common migration protocol is found,
  // InvalidArgument is returned.
  rpc StartMigration(StartMigrationRequest) returns (StartMigrationResponse);
}

// MigrationProtocol represents a protocol and some protocol-specific metadata
// to allow for negotiating a connection using that protocol.
// For each migration protocol message, some fields will be set by the source
// as constraints (constraint_*), and some will be populated by the target if
// that migration protocol is picked (negotiated_*). The migration target will
// keep all constraint_* fields that it was aware of, so that the source can
// verify that all critical fields were considered by the target (thereby
// allowing different versions of source/target to communicate).
message MigrationProtocol {
  // Qemu represents the native QEMU migration protocol.
  message Qemu {
    // If set, the root block device is migrated together with the VM. If the
    // target doesn't have storage attached directly via QEMU (like RBD or
    // iSCSI) this needs to be set, otherwise this protocol cannot be picked
    // as the VM would lose its storage during the migration. The opposite is
    // allowed: it migrates a local-storage volume into QEMU-attached storage.
    bool constraint_with_blockmigration = 1;
    // Bulk endpoint on the migration target in QEMU native format
    string negotiated_endpoint = 2;
  }
  oneof kind { Qemu qemu_block = 1; }
}

message StartMigrationRequest {
  // List of migration protocols supported by the source pod
  repeated MigrationProtocol supported_migration_protocol = 1;

  // Hypervisor ID of the hypervisor making the request (i.e. the one
  // currently running the VM)
  HypervisorID us = 2;
}

message StartMigrationResponse {
  // Migration protocol chosen from supported_migration_protocol by the target
  // pod.
  MigrationProtocol migration_protocol = 1;
}
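
// As an illustrative exchange (all values hypothetical), a source pod
// offering the QEMU protocol with block migration could send a
// StartMigrationRequest containing
//
//   supported_migration_protocol {
//     qemu_block { constraint_with_blockmigration: true }
//   }
//
// and a target pod picking that protocol could answer with
//
//   migration_protocol {
//     qemu_block {
//       constraint_with_blockmigration: true
//       negotiated_endpoint: "tcp:203.0.113.20:4444"
//     }
//   }
//
// echoing the constraint it honored and filling in the bulk endpoint.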