syntax = "proto3";

package metropolis.proto.vm;

// VMSpec fully defines all information about a VM and is consumed by the VM
// hypervisor through a runtime environment variable.
message VMSpec {
  // Name field from Kubernetes VirtualMachine object.
  string name = 1;
  // Namespace of the VM object
  string namespace = 2;

  enum StartMode {
    SM_UNKNOWN = 0;
    // Normal VM start
    SM_RUN = 1;
    // Initialize the disk of the new VM according to `initial_image` and start
    // the VM
    SM_PREPARE_IMAGE = 2;
    // Wait for an incoming migration and start the migrated VM
    SM_INCOMING_MIGRATION = 3;
  }
  StartMode mode = 3;
  // Reference to initial data which is copied to the root block device before
  // starting the VM for the first time. Only used if starting with
  // SM_PREPARE_IMAGE.
  InitialImage initial_image = 4;
  // Set of IP addresses assigned to the VM. Populated from vmIPs in the
  // VirtualMachine object. Currently a maximum of one IP per IP protocol
  // version is supported.
  repeated string address = 5;
  // gRPC endpoint of the controller for this VM
  string controller_endpoint = 6;
  // Lease mode used for the VM. See LeaseMode for additional info.
  LeaseMode lease_mode = 7;
}
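
// A hypothetical VMSpec in proto text format, for illustration only. All
// values (names, URL, addresses, endpoint) are made up and not taken from a
// real deployment:
//
//   name: "web-1"
//   namespace: "default"
//   mode: SM_PREPARE_IMAGE
//   initial_image { url: "https://images.example.com/debian-12.img" }
//   address: "10.8.0.12"
//   address: "2001:db8::12"
//   controller_endpoint: "vm-controller.example.svc:443"
//   lease_mode: LM_STORAGE_LOCKING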

// InitialImage represents a source from which a new VM root block device can be
// instantiated.
message InitialImage {
  // A URL to an image file. Populated from initialImage.url in the
  // VirtualMachine object.
  string url = 1;
}

// LeaseMode represents the different modes in which VM run authorizations can
// be managed. The VM system has its own mechanism for authorizing a given pod
// to run a given VM because it requires different tradeoffs in its distributed
// systems design than Kubernetes. The core issue is that Kubernetes's design
// does not guarantee that the control plane always has an accurate view of
// running pods, especially when nodes fail or get partitioned; it trades this
// for potentially better availability by keeping both sides of the partition
// running. Kubernetes is also prone to bugs that result in running pods no
// longer being accounted for (for example
// https://github.com/kubernetes/kubernetes/issues/80968) or duplicated. This
// can result in pods running which the controller cannot see, which in turn
// can result in more than one running pod for a VM indefinitely.
// For stateful single-instance workloads like VMs this can cause control
// issues (VMs no longer converging to the configured state because the
// current pod is "out of control", continued unavailability because an
// uncontrollable pod holds a lock on the backing storage, or even data
// corruption if two VMs are concurrently writing to the same storage).
//
// To avoid these issues the VM system implements two different strategies for
// providing mutual exclusion itself: one for use exclusively with
// local storage-backed VMs and one tailored for VMs with distributed storage.
// They significantly differ in the tradeoffs they make and the guarantees they
// deliver, as documented below.
// Both strategies rely (at least in part) on asking the VM controller directly
// if a pod should keep running its VM. The statement of the VM controller
// is called a "run authorization" in the context of the VM system. The exact
// format of this run authorization depends on the strategy in use.
enum LeaseMode {
  LM_UNKNOWN = 0;
  // In storage locking mode, mutual exclusion and thus run authorization is
  // provided through locks on the backing block storage system. Control plane
  // convergence is only on a best-effort basis; under certain K8s failure
  // modes the VM control plane might never converge. A Hypervisor that's
  // partitioned from the control plane will continue to run its VM
  // indefinitely and will not fence itself off from storage or networking.
  // This mode is appropriate for local storage, as the full leases mode would
  // introduce more disruption than it prevents under these constraints. The
  // run authorization for this strategy is a simple STATUS_OK/STATUS_TERMINATE
  // status value with no explicit lease expiration, as VMs should not stop
  // executing if the control plane is unavailable. These authorizations are
  // still useful as a way to ensure, at least on a best-effort basis, that
  // leaked/out-of-control pods shut themselves down and locks held by the
  // wrong pods are released.
  LM_STORAGE_LOCKING = 1;
  // In full leases mode all run authorizations come exclusively from the
  // controller and are passed as leases to all external systems (like storage
  // and network). A Hypervisor that's partitioned from the control plane
  // will, after its lease expires, kill its VM and fence itself from network
  // and storage before terminating itself. This mode is appropriate for fully
  // distributed storage as it allows higher availability in that scenario.
  // The run authorization for this strategy is an expiring lease which also
  // needs to be passed together with any IO operation for proper fencing.
  // The hypervisor kills the VM if its lease expires.
  // Not implemented currently.
  LM_FULL_LEASES = 2;
}

// This is a structure exposing VM metadata to the VM via the fw_cfg interface.
// It currently only contains the name of the VM and its network configuration.
// Exposed as vm.metropolis.monogon.dev/v1/metadata.pb to the VM.
message VMMetadata {
  // Name field from Kubernetes VirtualMachine object.
  string name = 1;
  NetworkConfig network_config = 2;
}

// PTPAddress contains the VM IP and the hypervisor IP for an IP point-to-point
// interface. Both IPs need to be of the same IP protocol version (v4 or v6).
// For example, on Linux this could be configured using
// `ip addr add $ip peer $peer_ip dev eth0` for the PtP connection and
// `ip route add default via $peer_ip` for the default route.
message PTPAddress {
  // IP address of the VM
  string ip = 1;
  // IP address of the hypervisor side, default gateway for the VM
  string peer_ip = 2;
}

// NetworkConfig represents the network configuration the VM needs to apply in
// order to communicate via its network interface.
message NetworkConfig {
  // IPv4 addresses of the PtP link between the VM and the hypervisor, if any.
  PTPAddress v4 = 1;
  // IPv6 addresses of the PtP link between the VM and the hypervisor, if any.
  PTPAddress v6 = 2;
}
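
// A hypothetical NetworkConfig in proto text format (all addresses are
// illustrative only):
//
//   v4 { ip: "10.8.0.12" peer_ip: "10.8.0.1" }
//   v6 { ip: "2001:db8::12" peer_ip: "2001:db8::1" }
//
// With this configuration a Linux guest would, following the PTPAddress
// comment above, run roughly:
//   ip addr add 10.8.0.12 peer 10.8.0.1 dev eth0
//   ip route add default via 10.8.0.1
// plus the equivalent `ip -6` commands for the v6 PtP link.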

// HypervisorID identifies a running instance of a hypervisor uniquely.
message HypervisorID {
  // vm_name is the name of the VM object.
  string vm_name = 1;
  // namespace is the K8s namespace of the VM object.
  string namespace = 2;
  // pod_name is the pod name in which the hypervisor is running.
  string pod_name = 3;
  // run_id is selected by the hypervisor at the start of the process to
  // uniquely identify that specific running process. A process which starts
  // later than other instances on the same node should have a higher run_id so
  // that the controller can tell which one started later. In practice this
  // should be derived from a precise timestamp like nanoseconds since the UNIX
  // epoch.
  uint64 run_id = 4;
}
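
// A hypothetical HypervisorID in proto text format. The run_id is a made-up
// nanosecond UNIX timestamp, following the suggestion above; the other values
// are illustrative only:
//
//   vm_name: "web-1"
//   namespace: "default"
//   pod_name: "vm-web-1-x7k2p"
//   run_id: 1700000000123456789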

message RunLeaseRequest {
  HypervisorID us = 1;
}

message RunLeaseUpdate {
  enum Status {
    STATUS_UNKNOWN = 0;
    // The pod should keep running its VM
    STATUS_OK = 1;
    // The pod should terminate the VM immediately and exit
    STATUS_TERMINATE = 2;
  }
  Status status = 1;
}
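
// A hypothetical RunLease exchange in LM_STORAGE_LOCKING mode, shown as proto
// text-format messages. The hypervisor identifies itself once and then keeps
// the stream open for updates; all values are made up:
//
//   RunLeaseRequest:
//     us {
//       vm_name: "web-1" namespace: "default"
//       pod_name: "vm-web-1-x7k2p" run_id: 1700000000123456789
//     }
//   RunLeaseUpdate (streamed):  status: STATUS_OK
//   RunLeaseUpdate (streamed):  status: STATUS_TERMINATE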

message MigrationSwitchoverRequest {
  HypervisorID us = 1;
  HypervisorID them = 2;
}
message MigrationSwitchoverResponse {}

message EnsureMigrationTargetRequest {
  HypervisorID us = 1;
}

message EnsureMigrationTargetResponse {
  enum Action {
    ACTION_UNKNOWN = 0;
    ACTION_LIVE_MIGRATE = 1;
    ACTION_SOFT_SHUTDOWN = 2;
  }
  Action action = 1;
  // Endpoint of the new Pod exposing a metropolis.vm.Hypervisor service if
  // action == ACTION_LIVE_MIGRATE.
  string target_endpoint = 2;
}
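
// A hypothetical EnsureMigrationTargetResponse instructing the caller to live
// migrate (the endpoint is illustrative only):
//
//   action: ACTION_LIVE_MIGRATE
//   target_endpoint: "10.8.1.23:7655"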

// The VMController service is exposed by the controller for the hypervisors to
// interact with. It is responsible for (pseudo-)leases and migrations.
// A typical migration looks like this:
// 1. Currently running pod with VM gets SIGTERM.
// 2. Source pod runs EnsureMigrationTarget to inform the controller of its wish
//    to migrate its VM away. The controller creates or reuses a target pod to
//    migrate to and returns its endpoint to the source pod.
// 3. Source pod runs Hypervisor.StartMigration on the target pod to negotiate a
//    channel to migrate.
// 4. Source pod bulk-migrates the VM in a hypervisor-specific way.
// 5. After the bulk migration is done, the source pod stops executing the VM.
//    The target pod calls MigrationSwitchover on the controller with `us` set
//    to itself and `them` to the `us` parameter in the StartMigrationRequest it
//    received in step 3 (an illustrative request for this step is shown after
//    the service definition below).
// 6. The controller performs the Compare-and-Swap and returns either Ok or
//    PreconditionFailed depending on whether the authoritative pod has changed
//    in the meantime. If the MigrationSwitchover RPC succeeded, the VM is now
//    running on the target pod. If it doesn't succeed, the target pod will
//    retry this step for a set period of time and then exit.
// 7. After a set timeout, the source pod will regenerate its run_id and attempt
//    to call MigrationSwitchover with `them` set to its old identity and `us`
//    to its new identity formed by updating its run_id. This call is expected
//    to fail with PreconditionFailed, which will cause the source pod to shut
//    itself down. If the call succeeds, the source pod will start running the
//    VM again.
service VMController {
  // EnsureMigrationTarget returns either a request to soft-shutdown or a
  // reference to a pod to which the caller should connect to migrate the VM.
  // It waits for the pod to run and complete a gRPC health check, but clients
  // should still retry a connection a few times before giving up and calling
  // this endpoint again.
  rpc EnsureMigrationTarget(EnsureMigrationTargetRequest) returns (EnsureMigrationTargetResponse);
  // MigrationSwitchover attempts to atomically swap the authoritative Pod and
  // PVC from the one in `them` to the one in `us`. If this request succeeds,
  // the pod in `us` (the caller) is now authoritative for the given VM. If the
  // authoritative pod is not the one in `them`, this method will return
  // PreconditionFailed and do nothing.
  rpc MigrationSwitchover(MigrationSwitchoverRequest) returns (MigrationSwitchoverResponse);
  // RunLease requests a pseudo-lease (or a full lease in LeaseMode
  // LM_FULL_LEASES) and streams updates to the lease status or new leases (in
  // LM_FULL_LEASES). Clients should always attempt to keep one RunLease
  // connection open to ensure reliable control from the control plane.
  rpc RunLease(RunLeaseRequest) returns (stream RunLeaseUpdate);
}
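
// A hypothetical MigrationSwitchoverRequest as sent by a migration target in
// step 5 of the flow described above, in proto text format (all identities
// are made up): `us` is the target pod taking over, `them` is the source pod
// that was previously authoritative.
//
//   us {
//     vm_name: "web-1" namespace: "default"
//     pod_name: "vm-web-1-target-9fj3d" run_id: 1700000100987654321
//   }
//   them {
//     vm_name: "web-1" namespace: "default"
//     pod_name: "vm-web-1-x7k2p" run_id: 1700000000123456789
//   }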

// The OOBManagement service is exposed by each VM pod to perform OOB
// maintenance on the VM running inside of it.
service OOBManagement {
  // Reset resets the virtual CPU of the VM (essentially equivalent to a hard
  // reboot). This has no effect on the hypervisor itself.
  // TODO(lorenz): This API should have idempotency counters.
  rpc Reset(ResetRequest) returns (ResetResponse);
  // Console opens a bidirectional stream to the virtual serial port (for
  // debugging or OOB data transfer).
  // If multiple streams are open, data from the VM is broadcast to all clients
  // and data from all clients is sent to the VM. Ordering with multiple
  // clients connected is best-effort and cannot be relied upon.
  rpc Console(stream ConsoleIO) returns (stream ConsoleIO);
}

message ResetRequest {}
message ResetResponse {}

message ConsoleIO {
  bytes data = 1;
}

// The Hypervisor service is exposed by each VM pod for migrations.
service Hypervisor {
  // StartMigration is called by the source pod when it wants to initiate a
  // migration. It is used to negotiate parameters for migration and endpoints
  // for the bulk transfer. If no common migration protocol is found,
  // InvalidArgument is returned.
  rpc StartMigration(StartMigrationRequest) returns (StartMigrationResponse);
}

// MigrationProtocol represents a protocol and some protocol-specific metadata
// to allow for negotiating a connection using that protocol.
// For each migration protocol message, some fields will be set by the source
// as constraints (constraint_*), and some will be populated by the target if
// that migration protocol is picked (negotiated_*). The migration target will
// keep all constraint_* fields that it was aware of, so that the source can
// verify that all critical fields were considered by the target (thereby
// allowing different versions of source/target to communicate).
message MigrationProtocol {
  // Qemu represents the native QEMU migration protocol.
  message Qemu {
    // If set, the root block device is migrated together with the VM. If the
    // target doesn't have storage attached directly via QEMU (like RBD or
    // iSCSI), this needs to be set, otherwise this protocol cannot be picked
    // as the VM would lose its storage during the migration. The opposite is
    // allowed: it migrates a local-storage volume into QEMU-attached storage.
    bool constraint_with_blockmigration = 1;
    // Bulk endpoint on the migration target in QEMU native format
    string negotiated_endpoint = 2;
  }
  oneof kind { Qemu qemu_block = 1; }
}

message StartMigrationRequest {
  // List of migration protocols supported by the source pod
  repeated MigrationProtocol supported_migration_protocol = 1;

  // Hypervisor ID of the hypervisor making the request (i.e. the one currently
  // running the VM)
  HypervisorID us = 2;
}

message StartMigrationResponse {
  // Migration protocol chosen from supported_migration_protocol by the target
  // pod.
  MigrationProtocol migration_protocol = 1;
}
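
// A hypothetical StartMigration negotiation in proto text format, for
// illustration only. The source offers the QEMU protocol and requires block
// migration; the target picks it, keeps the constraint field it understood,
// and fills in the negotiated endpoint (all values are made up):
//
//   StartMigrationRequest:
//     supported_migration_protocol {
//       qemu_block { constraint_with_blockmigration: true }
//     }
//     us {
//       vm_name: "web-1" namespace: "default"
//       pod_name: "vm-web-1-x7k2p" run_id: 1700000000123456789
//     }
//
//   StartMigrationResponse:
//     migration_protocol {
//       qemu_block {
//         constraint_with_blockmigration: true
//         negotiated_endpoint: "tcp:10.8.1.23:4444"
//       }
//     }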