Lorenz Brun | fc5dbc6 | 2020-05-28 12:18:07 +0200 | [diff] [blame] | 1 | // Copyright 2020 The Monogon Project Authors. |
| 2 | // |
| 3 | // SPDX-License-Identifier: Apache-2.0 |
| 4 | // |
| 5 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | // you may not use this file except in compliance with the License. |
| 7 | // You may obtain a copy of the License at |
| 8 | // |
| 9 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | // |
| 11 | // Unless required by applicable law or agreed to in writing, software |
| 12 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | // See the License for the specific language governing permissions and |
| 15 | // limitations under the License. |
| 16 | |
| 17 | package launch |
| 18 | |
import (
	"bytes"
	"context"
	"crypto/rand"
	"errors"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"net"
	"os"
	"os/exec"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/golang/protobuf/proto"
	grpcretry "github.com/grpc-ecosystem/go-grpc-middleware/retry"
	"golang.org/x/sys/unix"
	"google.golang.org/grpc"

	"source.monogon.dev/metropolis/node"
	"source.monogon.dev/metropolis/pkg/freeport"
	apb "source.monogon.dev/metropolis/proto/api"
)
| 46 | |
// qemuValue holds the properties of a single QEMU command line option. Each key can carry any number of values; a
// key without any values is rendered as a bare flag.
type qemuValue map[string][]string

// toOption encodes structured data into a QEMU option. If name is non-empty it is emitted first. Keys are emitted
// in lexical order so that the result is deterministic; the values of a key keep the order they were given in.
// Example: "test", {"key1": {"val1"}, "key2": {"val2", "val3"}} returns "test,key1=val1,key2=val2,key2=val3"
func (value qemuValue) toOption(name string) string {
	// Go randomizes map iteration order, so collect and sort the keys to get a reproducible command line.
	keys := make([]string, 0, len(value))
	for key := range value {
		keys = append(keys, key)
	}
	sort.Strings(keys)

	var optionValues []string
	if name != "" {
		optionValues = append(optionValues, name)
	}
	for _, key := range keys {
		values := value[key]
		if len(values) == 0 {
			// A key without values is passed to QEMU as a bare flag.
			optionValues = append(optionValues, key)
			continue
		}
		for _, val := range values {
			optionValues = append(optionValues, fmt.Sprintf("%v=%v", key, val))
		}
	}
	return strings.Join(optionValues, ",")
}
| 66 | |
// copyFile copies the contents of the file at src into the file at dst. dst is created if it does not exist and
// truncated if it does. The error of closing dst is returned so that failures surfacing at close time are not lost.
func copyFile(src, dst string) error {
	source, err := os.Open(src)
	if err != nil {
		return err
	}
	defer source.Close()

	target, err := os.Create(dst)
	if err != nil {
		return err
	}
	// Safety net for the early return below; after the explicit Close at the end this is a no-op error.
	defer target.Close()

	if _, err := io.Copy(target, source); err != nil {
		return err
	}
	return target.Close()
}
| 86 | |
// PortMap represents where VM ports are mapped to on the host. It maps from the VM port number to the host port number.
type PortMap map[uint16]uint16

// toQemuForwards generates QEMU hostfwd values (https://qemu.weilnetz.de/doc/qemu-doc.html#:~:text=hostfwd=) for all
// mapped ports. The values are ordered by ascending VM port so that the generated command line is deterministic.
// It returns nil if no ports are mapped.
func (p PortMap) toQemuForwards() []string {
	if len(p) == 0 {
		return nil
	}
	// Go randomizes map iteration order, so sort the VM ports before rendering them.
	vmPorts := make([]uint16, 0, len(p))
	for vmPort := range p {
		vmPorts = append(vmPorts, vmPort)
	}
	sort.Slice(vmPorts, func(i, j int) bool { return vmPorts[i] < vmPorts[j] })

	hostfwdOptions := make([]string, 0, len(vmPorts))
	for _, vmPort := range vmPorts {
		hostfwdOptions = append(hostfwdOptions, fmt.Sprintf("tcp::%v-:%v", p[vmPort], vmPort))
	}
	return hostfwdOptions
}
| 99 | |
| 100 | // DialGRPC creates a gRPC client for a VM port that's forwarded/mapped to the host. The given port is automatically |
| 101 | // resolved to the host-mapped port. |
| 102 | func (p PortMap) DialGRPC(port uint16, opts ...grpc.DialOption) (*grpc.ClientConn, error) { |
| 103 | mappedPort, ok := p[port] |
| 104 | if !ok { |
| 105 | return nil, fmt.Errorf("cannot dial port: port %v is not mapped/forwarded", port) |
| 106 | } |
| 107 | grpcClient, err := grpc.Dial(fmt.Sprintf("localhost:%v", mappedPort), opts...) |
| 108 | if err != nil { |
| 109 | return nil, fmt.Errorf("failed to dial port %v: %w", port, err) |
| 110 | } |
| 111 | return grpcClient, nil |
| 112 | } |
| 113 | |
// Options contains all options that can be passed to Launch().
type Options struct {
	// Ports contains the port mapping where to expose the internal ports of the VM to the host. See IdentityPortMap()
	// and ConflictFreePortMap(). Ignored when ConnectToSocket is set.
	Ports PortMap

	// If set to true, reboots are honored. Otherwise all reboots exit the Launch() command. Metropolis nodes
	// generally restart on almost all errors, so unless you want to test reboot behavior this should be false.
	AllowReboot bool

	// By default the VM is connected to the Host via SLIRP. If ConnectToSocket is set, it is instead connected
	// to the given file descriptor/socket. If this is set, all port maps from the Ports option are ignored.
	// Intended for networking this instance together with others for running more complex network configurations.
	ConnectToSocket *os.File

	// SerialPort is an io.ReadWriter over which you can communicate with the serial port of the machine.
	// It can be set to an existing file descriptor (like os.Stdout/os.Stderr) or any Go structure implementing this
	// interface.
	// NOTE(review): Launch only assigns this to QEMU's stdout, so it looks like data can only be read from the
	// machine, not written to it - confirm whether input is meant to be supported.
	SerialPort io.ReadWriter

	// NodeParameters is passed into the VM and subsequently used for bootstrapping or registering into a cluster.
	// If nil, no parameters are handed to the VM.
	NodeParameters *apb.NodeParameters
}
| 136 | |
// NodePorts is the list of ports a fully operational Metropolis node listens on. It can be passed to
// IdentityPortMap() or ConflictFreePortMap() to build the Ports option of Launch().
var NodePorts = []uint16{node.ConsensusPort, node.NodeServicePort, node.MasterServicePort,
	node.ExternalServicePort, node.DebugServicePort, node.KubernetesAPIPort, node.DebuggerPort}
Lorenz Brun | fc5dbc6 | 2020-05-28 12:18:07 +0200 | [diff] [blame] | 140 | |
Lorenz Brun | ed0503c | 2020-07-28 17:21:25 +0200 | [diff] [blame] | 141 | // IdentityPortMap returns a port map where each given port is mapped onto itself on the host. This is mainly useful |
Serge Bazanski | 662b5b3 | 2020-12-21 13:49:00 +0100 | [diff] [blame] | 142 | // for development against Metropolis. The dbg command requires this mapping. |
Lorenz Brun | ed0503c | 2020-07-28 17:21:25 +0200 | [diff] [blame] | 143 | func IdentityPortMap(ports []uint16) PortMap { |
Lorenz Brun | fc5dbc6 | 2020-05-28 12:18:07 +0200 | [diff] [blame] | 144 | portMap := make(PortMap) |
Lorenz Brun | ed0503c | 2020-07-28 17:21:25 +0200 | [diff] [blame] | 145 | for _, port := range ports { |
Lorenz Brun | fc5dbc6 | 2020-05-28 12:18:07 +0200 | [diff] [blame] | 146 | portMap[port] = port |
| 147 | } |
| 148 | return portMap |
| 149 | } |
| 150 | |
Lorenz Brun | ed0503c | 2020-07-28 17:21:25 +0200 | [diff] [blame] | 151 | // ConflictFreePortMap returns a port map where each given port is mapped onto a random free port on the host. This is |
Serge Bazanski | 662b5b3 | 2020-12-21 13:49:00 +0100 | [diff] [blame] | 152 | // intended for automated testing where multiple instances of Metropolis nodes might be running. Please call this |
| 153 | // function for each Launch command separately and as close to it as possible since it cannot guarantee that the ports |
| 154 | // will remain free. |
Lorenz Brun | ed0503c | 2020-07-28 17:21:25 +0200 | [diff] [blame] | 155 | func ConflictFreePortMap(ports []uint16) (PortMap, error) { |
Lorenz Brun | fc5dbc6 | 2020-05-28 12:18:07 +0200 | [diff] [blame] | 156 | portMap := make(PortMap) |
Lorenz Brun | ed0503c | 2020-07-28 17:21:25 +0200 | [diff] [blame] | 157 | for _, port := range ports { |
Serge Bazanski | cb883e2 | 2020-07-06 17:47:55 +0200 | [diff] [blame] | 158 | mappedPort, listenCloser, err := freeport.AllocateTCPPort() |
Lorenz Brun | fc5dbc6 | 2020-05-28 12:18:07 +0200 | [diff] [blame] | 159 | if err != nil { |
| 160 | return portMap, fmt.Errorf("failed to get free host port: %w", err) |
| 161 | } |
| 162 | // Defer closing of the listening port until the function is done and all ports are allocated |
| 163 | defer listenCloser.Close() |
| 164 | portMap[port] = mappedPort |
| 165 | } |
| 166 | return portMap, nil |
| 167 | } |
| 168 | |
// generateRandomEthernetMAC returns a random EUI-48 Ethernet MAC address that is marked as a locally administered,
// individual (unicast) address. The randomness is read from crypto/rand.
func generateRandomEthernetMAC() (*net.HardwareAddr, error) {
	addr := make(net.HardwareAddr, 6)
	if _, err := rand.Read(addr); err != nil {
		return nil, fmt.Errorf("failed to read randomness for MAC: %v", err)
	}

	// Ref IEEE 802-2014 Section 8.2.2: in the first octet, set the U/L bit (locally administered) and clear the
	// I/G bit (individual address).
	addr[0] |= 0x02
	addr[0] &^= 0x01
	return &addr, nil
}
| 183 | |
Serge Bazanski | 662b5b3 | 2020-12-21 13:49:00 +0100 | [diff] [blame] | 184 | // Launch launches a Metropolis node instance with the given options. The instance runs mostly paravirtualized but |
| 185 | // with some emulated hardware similar to how a cloud provider might set up its VMs. The disk is fully writable but |
| 186 | // is run in snapshot mode meaning that changes are not kept beyond a single invocation. |
Lorenz Brun | fc5dbc6 | 2020-05-28 12:18:07 +0200 | [diff] [blame] | 187 | func Launch(ctx context.Context, options Options) error { |
| 188 | // Pin temp directory to /tmp until we can use abstract socket namespace in QEMU (next release after 5.0, |
| 189 | // https://github.com/qemu/qemu/commit/776b97d3605ed0fc94443048fdf988c7725e38a9). swtpm accepts already-open FDs |
| 190 | // so we can pass in an abstract socket namespace FD that we open and pass the name of it to QEMU. Not pinning this |
| 191 | // crashes both swtpm and qemu because we run into UNIX socket length limitations (for legacy reasons 108 chars). |
| 192 | tempDir, err := ioutil.TempDir("/tmp", "launch*") |
| 193 | if err != nil { |
Lorenz Brun | 3ff5af3 | 2020-06-24 16:34:11 +0200 | [diff] [blame] | 194 | return fmt.Errorf("failed to create temporary directory: %w", err) |
Lorenz Brun | fc5dbc6 | 2020-05-28 12:18:07 +0200 | [diff] [blame] | 195 | } |
| 196 | defer os.RemoveAll(tempDir) |
| 197 | |
| 198 | // Copy TPM state into a temporary directory since it's being modified by the emulator |
| 199 | tpmTargetDir := filepath.Join(tempDir, "tpm") |
Serge Bazanski | 77cb6c5 | 2020-12-19 00:09:22 +0100 | [diff] [blame] | 200 | tpmSrcDir := "metropolis/node/tpm" |
Lorenz Brun | fc5dbc6 | 2020-05-28 12:18:07 +0200 | [diff] [blame] | 201 | if err := os.Mkdir(tpmTargetDir, 0644); err != nil { |
Lorenz Brun | 3ff5af3 | 2020-06-24 16:34:11 +0200 | [diff] [blame] | 202 | return fmt.Errorf("failed to create TPM state directory: %w", err) |
Lorenz Brun | fc5dbc6 | 2020-05-28 12:18:07 +0200 | [diff] [blame] | 203 | } |
| 204 | tpmFiles, err := ioutil.ReadDir(tpmSrcDir) |
| 205 | if err != nil { |
Lorenz Brun | 3ff5af3 | 2020-06-24 16:34:11 +0200 | [diff] [blame] | 206 | return fmt.Errorf("failed to read TPM directory: %w", err) |
Lorenz Brun | fc5dbc6 | 2020-05-28 12:18:07 +0200 | [diff] [blame] | 207 | } |
| 208 | for _, file := range tpmFiles { |
| 209 | name := file.Name() |
| 210 | if err := copyFile(filepath.Join(tpmSrcDir, name), filepath.Join(tpmTargetDir, name)); err != nil { |
Lorenz Brun | 3ff5af3 | 2020-06-24 16:34:11 +0200 | [diff] [blame] | 211 | return fmt.Errorf("failed to copy TPM directory: %w", err) |
Lorenz Brun | fc5dbc6 | 2020-05-28 12:18:07 +0200 | [diff] [blame] | 212 | } |
| 213 | } |
| 214 | |
Lorenz Brun | 3ff5af3 | 2020-06-24 16:34:11 +0200 | [diff] [blame] | 215 | var qemuNetType string |
| 216 | var qemuNetConfig qemuValue |
| 217 | if options.ConnectToSocket != nil { |
| 218 | qemuNetType = "socket" |
| 219 | qemuNetConfig = qemuValue{ |
| 220 | "id": {"net0"}, |
| 221 | "fd": {"3"}, |
| 222 | } |
| 223 | } else { |
| 224 | qemuNetType = "user" |
| 225 | qemuNetConfig = qemuValue{ |
| 226 | "id": {"net0"}, |
| 227 | "net": {"10.42.0.0/24"}, |
| 228 | "dhcpstart": {"10.42.0.10"}, |
| 229 | "hostfwd": options.Ports.toQemuForwards(), |
| 230 | } |
Lorenz Brun | fc5dbc6 | 2020-05-28 12:18:07 +0200 | [diff] [blame] | 231 | } |
| 232 | |
| 233 | tpmSocketPath := filepath.Join(tempDir, "tpm-socket") |
| 234 | |
Lorenz Brun | 3ff5af3 | 2020-06-24 16:34:11 +0200 | [diff] [blame] | 235 | mac, err := generateRandomEthernetMAC() |
| 236 | if err != nil { |
| 237 | return err |
| 238 | } |
| 239 | |
Lorenz Brun | ca24cfa | 2020-08-18 13:49:37 +0200 | [diff] [blame] | 240 | qemuArgs := []string{"-machine", "q35", "-accel", "kvm", "-nographic", "-nodefaults", "-m", "4096", |
Lorenz Brun | fc5dbc6 | 2020-05-28 12:18:07 +0200 | [diff] [blame] | 241 | "-cpu", "host", "-smp", "sockets=1,cpus=1,cores=2,threads=2,maxcpus=4", |
| 242 | "-drive", "if=pflash,format=raw,readonly,file=external/edk2/OVMF_CODE.fd", |
| 243 | "-drive", "if=pflash,format=raw,snapshot=on,file=external/edk2/OVMF_VARS.fd", |
Serge Bazanski | 662b5b3 | 2020-12-21 13:49:00 +0100 | [diff] [blame] | 244 | "-drive", "if=virtio,format=raw,snapshot=on,cache=unsafe,file=metropolis/node/node.img", |
Lorenz Brun | 3ff5af3 | 2020-06-24 16:34:11 +0200 | [diff] [blame] | 245 | "-netdev", qemuNetConfig.toOption(qemuNetType), |
| 246 | "-device", "virtio-net-pci,netdev=net0,mac=" + mac.String(), |
Lorenz Brun | fc5dbc6 | 2020-05-28 12:18:07 +0200 | [diff] [blame] | 247 | "-chardev", "socket,id=chrtpm,path=" + tpmSocketPath, |
| 248 | "-tpmdev", "emulator,id=tpm0,chardev=chrtpm", |
| 249 | "-device", "tpm-tis,tpmdev=tpm0", |
| 250 | "-device", "virtio-rng-pci", |
| 251 | "-serial", "stdio"} |
| 252 | |
| 253 | if !options.AllowReboot { |
| 254 | qemuArgs = append(qemuArgs, "-no-reboot") |
| 255 | } |
| 256 | |
Serge Bazanski | 0ed2f96 | 2021-03-15 16:39:30 +0100 | [diff] [blame] | 257 | if options.NodeParameters != nil { |
| 258 | parametersPath := filepath.Join(tempDir, "parameters.pb") |
| 259 | parametersRaw, err := proto.Marshal(options.NodeParameters) |
Lorenz Brun | 3ff5af3 | 2020-06-24 16:34:11 +0200 | [diff] [blame] | 260 | if err != nil { |
Serge Bazanski | 0ed2f96 | 2021-03-15 16:39:30 +0100 | [diff] [blame] | 261 | return fmt.Errorf("failed to encode node paraeters: %w", err) |
Lorenz Brun | 3ff5af3 | 2020-06-24 16:34:11 +0200 | [diff] [blame] | 262 | } |
Serge Bazanski | 0ed2f96 | 2021-03-15 16:39:30 +0100 | [diff] [blame] | 263 | if err := ioutil.WriteFile(parametersPath, parametersRaw, 0644); err != nil { |
| 264 | return fmt.Errorf("failed to write node parameters: %w", err) |
Lorenz Brun | 3ff5af3 | 2020-06-24 16:34:11 +0200 | [diff] [blame] | 265 | } |
Serge Bazanski | 0ed2f96 | 2021-03-15 16:39:30 +0100 | [diff] [blame] | 266 | qemuArgs = append(qemuArgs, "-fw_cfg", "name=dev.monogon.metropolis/parameters.pb,file="+parametersPath) |
Lorenz Brun | 3ff5af3 | 2020-06-24 16:34:11 +0200 | [diff] [blame] | 267 | } |
| 268 | |
Leopold Schabel | a013ffa | 2020-06-03 15:09:32 +0200 | [diff] [blame] | 269 | // Start TPM emulator as a subprocess |
| 270 | tpmCtx, tpmCancel := context.WithCancel(ctx) |
| 271 | defer tpmCancel() |
Lorenz Brun | fc5dbc6 | 2020-05-28 12:18:07 +0200 | [diff] [blame] | 272 | |
Leopold Schabel | a013ffa | 2020-06-03 15:09:32 +0200 | [diff] [blame] | 273 | tpmEmuCmd := exec.CommandContext(tpmCtx, "swtpm", "socket", "--tpm2", "--tpmstate", "dir="+tpmTargetDir, "--ctrl", "type=unixio,path="+tpmSocketPath) |
Lorenz Brun | fc5dbc6 | 2020-05-28 12:18:07 +0200 | [diff] [blame] | 274 | tpmEmuCmd.Stderr = os.Stderr |
| 275 | tpmEmuCmd.Stdout = os.Stdout |
Leopold Schabel | a013ffa | 2020-06-03 15:09:32 +0200 | [diff] [blame] | 276 | |
| 277 | err = tpmEmuCmd.Start() |
| 278 | if err != nil { |
| 279 | return fmt.Errorf("failed to start TPM emulator: %w", err) |
| 280 | } |
| 281 | |
| 282 | // Start the main qemu binary |
| 283 | systemCmd := exec.CommandContext(ctx, "qemu-system-x86_64", qemuArgs...) |
Lorenz Brun | 3ff5af3 | 2020-06-24 16:34:11 +0200 | [diff] [blame] | 284 | if options.ConnectToSocket != nil { |
| 285 | systemCmd.ExtraFiles = []*os.File{options.ConnectToSocket} |
| 286 | } |
| 287 | |
| 288 | var stdErrBuf bytes.Buffer |
| 289 | systemCmd.Stderr = &stdErrBuf |
| 290 | systemCmd.Stdout = options.SerialPort |
Leopold Schabel | a013ffa | 2020-06-03 15:09:32 +0200 | [diff] [blame] | 291 | |
Lorenz Brun | fc5dbc6 | 2020-05-28 12:18:07 +0200 | [diff] [blame] | 292 | err = systemCmd.Run() |
Leopold Schabel | a013ffa | 2020-06-03 15:09:32 +0200 | [diff] [blame] | 293 | |
| 294 | // Stop TPM emulator and wait for it to exit to properly reap the child process |
| 295 | tpmCancel() |
| 296 | log.Print("Waiting for TPM emulator to exit") |
| 297 | // Wait returns a SIGKILL error because we just cancelled its context. |
| 298 | // We still need to call it to avoid creating zombies. |
| 299 | _ = tpmEmuCmd.Wait() |
| 300 | |
Lorenz Brun | 3ff5af3 | 2020-06-24 16:34:11 +0200 | [diff] [blame] | 301 | var exerr *exec.ExitError |
| 302 | if err != nil && errors.As(err, &exerr) { |
| 303 | status := exerr.ProcessState.Sys().(syscall.WaitStatus) |
| 304 | if status.Signaled() && status.Signal() == syscall.SIGKILL { |
| 305 | // Process was killed externally (most likely by our context being canceled). |
| 306 | // This is a normal exit for us, so return nil |
| 307 | return nil |
| 308 | } |
| 309 | exerr.Stderr = stdErrBuf.Bytes() |
| 310 | newErr := QEMUError(*exerr) |
| 311 | return &newErr |
| 312 | } |
| 313 | return err |
| 314 | } |
| 315 | |
| 316 | // NewSocketPair creates a new socket pair. By connecting both ends to different instances you can connect them |
| 317 | // with a virtual "network cable". The ends can be passed into the ConnectToSocket option. |
| 318 | func NewSocketPair() (*os.File, *os.File, error) { |
| 319 | fds, err := unix.Socketpair(unix.AF_UNIX, syscall.SOCK_STREAM, 0) |
| 320 | if err != nil { |
| 321 | return nil, nil, fmt.Errorf("failed to call socketpair: %w", err) |
| 322 | } |
| 323 | |
| 324 | fd1 := os.NewFile(uintptr(fds[0]), "network0") |
| 325 | fd2 := os.NewFile(uintptr(fds[1]), "network1") |
| 326 | return fd1, fd2, nil |
| 327 | } |
| 328 | |
// HostInterfaceMAC is the MAC address the host SLIRP network interface has if it is not disabled (see
// DisableHostNetworkInterface in MicroVMOptions). RunMicroVM assigns it to the network device backed by the
// user-mode network.
var HostInterfaceMAC = net.HardwareAddr{0x02, 0x72, 0x82, 0xbf, 0xc3, 0x56}
| 332 | |
// MicroVMOptions contains all options to start a MicroVM with RunMicroVM.
type MicroVMOptions struct {
	// Path to the ELF kernel binary
	KernelPath string

	// Path to the Initramfs
	InitramfsPath string

	// Cmdline contains additional kernel commandline options. They are appended to the options RunMicroVM always
	// sets.
	Cmdline string

	// SerialPort is a File(descriptor) over which you can communicate with the serial port of the machine
	// It can be set to an existing file descriptor (like os.Stdout/os.Stderr) or you can use NewSocketPair() to get one
	// end to talk to from Go.
	SerialPort *os.File

	// ExtraChardevs can be used similar to SerialPort, but can contain an arbitrary number of additional serial ports
	// NOTE(review): RunMicroVM configures the virtio-serial device with max_ports=16, which presumably bounds how
	// many of these can be used - confirm.
	ExtraChardevs []*os.File

	// ExtraNetworkInterfaces can contain an arbitrary number of file descriptors which are mapped into the VM as virtio
	// network interfaces. The first interface is always a SLIRP-backed interface for communicating with the host.
	ExtraNetworkInterfaces []*os.File

	// PortMap contains ports that are mapped to the host through the built-in SLIRP network interface.
	PortMap PortMap

	// DisableHostNetworkInterface disables the SLIRP-backed host network interface that is normally the first network
	// interface. If this is set PortMap is ignored. Mostly useful for speeding up QEMU's startup time for tests.
	DisableHostNetworkInterface bool
}
| 363 | |
| 364 | // RunMicroVM launches a tiny VM mostly intended for testing. Very quick to boot (<40ms). |
| 365 | func RunMicroVM(ctx context.Context, opts *MicroVMOptions) error { |
| 366 | // Generate options for all the file descriptors we'll be passing as virtio "serial ports" |
| 367 | var extraArgs []string |
| 368 | for idx, _ := range opts.ExtraChardevs { |
| 369 | idxStr := strconv.Itoa(idx) |
| 370 | id := "extra" + idxStr |
| 371 | // That this works is pretty much a hack, but upstream QEMU doesn't have a bidirectional chardev backend not |
| 372 | // based around files/sockets on the disk which are a giant pain to work with. |
| 373 | // We're using QEMU's fdset functionality to make FDs available as pseudo-files and then "ab"using the pipe |
| 374 | // backend's fallback functionality to get a single bidirectional chardev backend backed by a passed-down |
| 375 | // RDWR fd. |
| 376 | // Ref https://lists.gnu.org/archive/html/qemu-devel/2015-12/msg01256.html |
| 377 | addFdConf := qemuValue{ |
| 378 | "set": {idxStr}, |
| 379 | "fd": {strconv.Itoa(idx + 3)}, |
| 380 | } |
| 381 | chardevConf := qemuValue{ |
| 382 | "id": {id}, |
| 383 | "path": {"/dev/fdset/" + idxStr}, |
| 384 | } |
| 385 | deviceConf := qemuValue{ |
| 386 | "chardev": {id}, |
| 387 | } |
| 388 | extraArgs = append(extraArgs, "-add-fd", addFdConf.toOption(""), |
| 389 | "-chardev", chardevConf.toOption("pipe"), "-device", deviceConf.toOption("virtserialport")) |
| 390 | } |
| 391 | |
| 392 | for idx, _ := range opts.ExtraNetworkInterfaces { |
| 393 | id := fmt.Sprintf("net%v", idx) |
| 394 | netdevConf := qemuValue{ |
| 395 | "id": {id}, |
| 396 | "fd": {strconv.Itoa(idx + 3 + len(opts.ExtraChardevs))}, |
| 397 | } |
| 398 | extraArgs = append(extraArgs, "-netdev", netdevConf.toOption("socket"), "-device", "virtio-net-device,netdev="+id) |
| 399 | } |
| 400 | |
| 401 | // This sets up a minimum viable environment for our Linux kernel. |
| 402 | // It clears all standard QEMU configuration and sets up a MicroVM machine |
| 403 | // (https://github.com/qemu/qemu/blob/master/docs/microvm.rst) with all legacy emulation turned off. This means |
| 404 | // the only "hardware" the Linux kernel inside can communicate with is a single virtio-mmio region. Over that MMIO |
| 405 | // interface we run a paravirtualized RNG (since the kernel in there has nothing to gather that from and it |
| 406 | // delays booting), a single paravirtualized console and an arbitrary number of extra serial ports for talking to |
| 407 | // various things that might run inside. The kernel, initramfs and command line are mapped into VM memory at boot |
| 408 | // time and not loaded from any sort of disk. Booting and shutting off one of these VMs takes <100ms. |
| 409 | baseArgs := []string{"-nodefaults", "-no-user-config", "-nographic", "-no-reboot", |
| 410 | "-accel", "kvm", "-cpu", "host", |
| 411 | // Needed until QEMU updates their bundled qboot version (needs https://github.com/bonzini/qboot/pull/28) |
| 412 | "-bios", "external/com_github_bonzini_qboot/bios.bin", |
| 413 | "-M", "microvm,x-option-roms=off,pic=off,pit=off,rtc=off,isa-serial=off", |
| 414 | "-kernel", opts.KernelPath, |
| 415 | // We force using a triple-fault reboot strategy since otherwise the kernel first tries others (like ACPI) which |
| 416 | // are not available in this very restricted environment. Similarly we need to override the boot console since |
| 417 | // there's nothing on the ISA bus that the kernel could talk to. We also force quiet for performance reasons. |
| 418 | "-append", "reboot=t console=hvc0 quiet " + opts.Cmdline, |
| 419 | "-initrd", opts.InitramfsPath, |
| 420 | "-device", "virtio-rng-device,max-bytes=1024,period=1000", |
| 421 | "-device", "virtio-serial-device,max_ports=16", |
| 422 | "-chardev", "stdio,id=con0", "-device", "virtconsole,chardev=con0", |
| 423 | } |
| 424 | |
| 425 | if !opts.DisableHostNetworkInterface { |
| 426 | qemuNetType := "user" |
| 427 | qemuNetConfig := qemuValue{ |
| 428 | "id": {"usernet0"}, |
| 429 | "net": {"10.42.0.0/24"}, |
| 430 | "dhcpstart": {"10.42.0.10"}, |
| 431 | } |
| 432 | if opts.PortMap != nil { |
| 433 | qemuNetConfig["hostfwd"] = opts.PortMap.toQemuForwards() |
| 434 | } |
| 435 | |
| 436 | baseArgs = append(baseArgs, "-netdev", qemuNetConfig.toOption(qemuNetType), |
| 437 | "-device", "virtio-net-device,netdev=usernet0,mac="+HostInterfaceMAC.String()) |
| 438 | } |
| 439 | |
| 440 | var stdErrBuf bytes.Buffer |
| 441 | cmd := exec.CommandContext(ctx, "qemu-system-x86_64", append(baseArgs, extraArgs...)...) |
| 442 | cmd.Stdout = opts.SerialPort |
| 443 | cmd.Stderr = &stdErrBuf |
| 444 | |
| 445 | cmd.ExtraFiles = append(cmd.ExtraFiles, opts.ExtraChardevs...) |
| 446 | cmd.ExtraFiles = append(cmd.ExtraFiles, opts.ExtraNetworkInterfaces...) |
| 447 | |
| 448 | err := cmd.Run() |
| 449 | var exerr *exec.ExitError |
| 450 | if err != nil && errors.As(err, &exerr) { |
| 451 | exerr.Stderr = stdErrBuf.Bytes() |
| 452 | newErr := QEMUError(*exerr) |
| 453 | return &newErr |
| 454 | } |
| 455 | return err |
| 456 | } |
| 457 | |
// QEMUError is the error type returned when QEMU exits unsuccessfully. It has the same fields as exec.ExitError, but
// its message additionally contains the captured stderr output to aid debugging.
type QEMUError exec.ExitError

// Error returns the process state description followed by the contents of Stderr.
func (e *QEMUError) Error() string {
	state := e.String()
	stderr := string(e.Stderr)
	return fmt.Sprintf("%v: %v", state, stderr)
}
Lorenz Brun | ed0503c | 2020-07-28 17:21:25 +0200 | [diff] [blame] | 465 | |
// NanoswitchPorts contains all ports forwarded by Nanoswitch to the first VM. LaunchCluster maps these ports to
// free ports on the host.
var NanoswitchPorts = []uint16{
	node.ExternalServicePort,
	node.DebugServicePort,
	node.KubernetesAPIPort,
}
| 472 | |
// ClusterOptions contains all options for launching a Metropolis cluster with LaunchCluster.
type ClusterOptions struct {
	// The number of nodes this cluster should be started with initially. LaunchCluster rejects zero nodes and more
	// than two nodes, and currently also returns an error for two nodes since multinode is unimplemented.
	NumNodes int
}
| 478 | |
Serge Bazanski | 662b5b3 | 2020-12-21 13:49:00 +0100 | [diff] [blame] | 479 | // LaunchCluster launches a cluster of Metropolis node VMs together with a Nanoswitch instance to network them all together. |
Lorenz Brun | ed0503c | 2020-07-28 17:21:25 +0200 | [diff] [blame] | 480 | func LaunchCluster(ctx context.Context, opts ClusterOptions) (apb.NodeDebugServiceClient, PortMap, error) { |
| 481 | var switchPorts []*os.File |
| 482 | var vmPorts []*os.File |
| 483 | for i := 0; i < opts.NumNodes; i++ { |
| 484 | switchPort, vmPort, err := NewSocketPair() |
| 485 | if err != nil { |
| 486 | return nil, nil, fmt.Errorf("failed to get socketpair: %w", err) |
| 487 | } |
| 488 | switchPorts = append(switchPorts, switchPort) |
| 489 | vmPorts = append(vmPorts, vmPort) |
| 490 | } |
| 491 | |
| 492 | if opts.NumNodes == 0 { |
| 493 | return nil, nil, errors.New("refusing to start cluster with zero nodes") |
| 494 | } |
| 495 | |
| 496 | if opts.NumNodes > 2 { |
| 497 | return nil, nil, errors.New("launching more than 2 nodes is unsupported pending replacement of golden tickets") |
| 498 | } |
| 499 | |
| 500 | go func() { |
Serge Bazanski | 0ed2f96 | 2021-03-15 16:39:30 +0100 | [diff] [blame] | 501 | if err := Launch(ctx, Options{ |
| 502 | ConnectToSocket: vmPorts[0], |
| 503 | NodeParameters: &apb.NodeParameters{ |
| 504 | Cluster: &apb.NodeParameters_ClusterBootstrap_{ |
| 505 | ClusterBootstrap: &apb.NodeParameters_ClusterBootstrap{}, |
| 506 | }, |
| 507 | }, |
| 508 | }); err != nil { |
| 509 | |
Lorenz Brun | ed0503c | 2020-07-28 17:21:25 +0200 | [diff] [blame] | 510 | // Launch() only terminates when QEMU has terminated. At that point our function probably doesn't run anymore |
| 511 | // so we have no way of communicating the error back up, so let's just log it. Also a failure in launching |
| 512 | // VMs should be very visible by the unavailability of the clients we return. |
| 513 | log.Printf("Failed to launch vm0: %v", err) |
| 514 | } |
| 515 | }() |
| 516 | |
| 517 | portMap, err := ConflictFreePortMap(NanoswitchPorts) |
| 518 | if err != nil { |
| 519 | return nil, nil, fmt.Errorf("failed to allocate ephemeral ports: %w", err) |
| 520 | } |
| 521 | |
| 522 | go func() { |
| 523 | if err := RunMicroVM(ctx, &MicroVMOptions{ |
Serge Bazanski | f055a7f | 2021-04-13 16:22:33 +0200 | [diff] [blame^] | 524 | KernelPath: "metropolis/test/ktest/vmlinux", |
Serge Bazanski | 77cb6c5 | 2020-12-19 00:09:22 +0100 | [diff] [blame] | 525 | InitramfsPath: "metropolis/test/nanoswitch/initramfs.lz4", |
Lorenz Brun | ed0503c | 2020-07-28 17:21:25 +0200 | [diff] [blame] | 526 | ExtraNetworkInterfaces: switchPorts, |
| 527 | PortMap: portMap, |
| 528 | }); err != nil { |
| 529 | log.Printf("Failed to launch nanoswitch: %v", err) |
| 530 | } |
| 531 | }() |
| 532 | copts := []grpcretry.CallOption{ |
| 533 | grpcretry.WithBackoff(grpcretry.BackoffExponential(100 * time.Millisecond)), |
| 534 | } |
Serge Bazanski | 549b72b | 2021-01-07 14:54:19 +0100 | [diff] [blame] | 535 | conn, err := portMap.DialGRPC(node.DebugServicePort, grpc.WithInsecure(), |
Lorenz Brun | ed0503c | 2020-07-28 17:21:25 +0200 | [diff] [blame] | 536 | grpc.WithUnaryInterceptor(grpcretry.UnaryClientInterceptor(copts...))) |
| 537 | if err != nil { |
| 538 | return nil, nil, fmt.Errorf("failed to dial debug service: %w", err) |
| 539 | } |
Lorenz Brun | ed0503c | 2020-07-28 17:21:25 +0200 | [diff] [blame] | 540 | debug := apb.NewNodeDebugServiceClient(conn) |
| 541 | |
| 542 | if opts.NumNodes == 2 { |
Serge Bazanski | 0ed2f96 | 2021-03-15 16:39:30 +0100 | [diff] [blame] | 543 | return nil, nil, fmt.Errorf("multinode unimplemented") |
Lorenz Brun | ed0503c | 2020-07-28 17:21:25 +0200 | [diff] [blame] | 544 | } |
| 545 | |
| 546 | return debug, portMap, nil |
| 547 | } |