blob: 6e6891a86ce292e5f197522eac8005cbb4164ae7 [file] [log] [blame]
// Copyright 2020 The Monogon Project Authors.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
16
17package launch
18
import (
	"bytes"
	"context"
	"crypto/rand"
	"errors"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"net"
	"os"
	"os/exec"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/golang/protobuf/proto"
	grpcretry "github.com/grpc-ecosystem/go-grpc-middleware/retry"
	"golang.org/x/sys/unix"
	"google.golang.org/grpc"

	"source.monogon.dev/metropolis/node"
	"source.monogon.dev/metropolis/pkg/freeport"
	apb "source.monogon.dev/metropolis/proto/api"
)
46
// qemuValue holds the structured key/value pairs of a single QEMU command-line option. Every key maps to zero or
// more values; a key without any values is emitted as a bare flag.
type qemuValue map[string][]string

// toOption encodes structured data into a QEMU option. If name is non-empty it is emitted first. Keys follow in
// lexical order so that the generated option string is reproducible across invocations; the values of one key keep
// the order in which they were supplied.
// Example: "test", {"key1": {"val1"}, "key2": {"val2", "val3"}} returns "test,key1=val1,key2=val2,key2=val3"
func (value qemuValue) toOption(name string) string {
	keys := make([]string, 0, len(value))
	for key := range value {
		keys = append(keys, key)
	}
	// Go randomizes map iteration order, so sort the keys to get a stable result.
	sort.Strings(keys)

	var optionValues []string
	if name != "" {
		optionValues = append(optionValues, name)
	}
	for _, key := range keys {
		vals := value[key]
		if len(vals) == 0 {
			// A key without values is emitted on its own.
			optionValues = append(optionValues, key)
			continue
		}
		for _, val := range vals {
			optionValues = append(optionValues, fmt.Sprintf("%v=%v", key, val))
		}
	}
	return strings.Join(optionValues, ",")
}
66
// copyFile copies the contents of the file at src into the file at dst. dst is opened with os.Create. Any error
// from opening, copying or closing the destination is returned unmodified.
func copyFile(src, dst string) error {
	source, err := os.Open(src)
	if err != nil {
		return err
	}
	defer source.Close()

	target, err := os.Create(dst)
	if err != nil {
		return err
	}
	// Safety net for the early return below. On the success path the explicit Close at the end runs first and the
	// error of this second Close is discarded.
	defer target.Close()

	if _, err := io.Copy(target, source); err != nil {
		return err
	}
	// Close explicitly so that a failure while closing the destination reaches the caller.
	return target.Close()
}
86
// PortMap represents where VM ports are mapped to on the host. It maps from the VM port number to the host port number.
type PortMap map[uint16]uint16

// toQemuForwards generates QEMU hostfwd values (https://qemu.weilnetz.de/doc/qemu-doc.html#:~:text=hostfwd=) for all
// mapped ports. Each entry forwards a TCP host port to its VM port. Entries are ordered by ascending VM port so the
// result is reproducible across invocations. A nil or empty map yields a nil slice.
func (p PortMap) toQemuForwards() []string {
	vmPorts := make([]uint16, 0, len(p))
	for vmPort := range p {
		vmPorts = append(vmPorts, vmPort)
	}
	// Go randomizes map iteration order, so sort the VM ports to get a stable result.
	sort.Slice(vmPorts, func(i, j int) bool { return vmPorts[i] < vmPorts[j] })

	var hostfwdOptions []string
	for _, vmPort := range vmPorts {
		hostfwdOptions = append(hostfwdOptions, fmt.Sprintf("tcp::%v-:%v", p[vmPort], vmPort))
	}
	return hostfwdOptions
}
99
100// DialGRPC creates a gRPC client for a VM port that's forwarded/mapped to the host. The given port is automatically
101// resolved to the host-mapped port.
102func (p PortMap) DialGRPC(port uint16, opts ...grpc.DialOption) (*grpc.ClientConn, error) {
103 mappedPort, ok := p[port]
104 if !ok {
105 return nil, fmt.Errorf("cannot dial port: port %v is not mapped/forwarded", port)
106 }
107 grpcClient, err := grpc.Dial(fmt.Sprintf("localhost:%v", mappedPort), opts...)
108 if err != nil {
109 return nil, fmt.Errorf("failed to dial port %v: %w", port, err)
110 }
111 return grpcClient, nil
112}
113
114// Options contains all options that can be passed to Launch()
115type Options struct {
116 // Ports contains the port mapping where to expose the internal ports of the VM to the host. See IdentityPortMap()
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200117 // and ConflictFreePortMap(). Ignored when ConnectToSocket is set.
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200118 Ports PortMap
119
Serge Bazanski662b5b32020-12-21 13:49:00 +0100120 // If set to true, reboots are honored. Otherwise all reboots exit the Launch() command. Metropolis nodes
121 // generally restarts on almost all errors, so unless you want to test reboot behavior this should be false.
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200122 AllowReboot bool
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200123
Serge Bazanski662b5b32020-12-21 13:49:00 +0100124 // By default the VM is connected to the Host via SLIRP. If ConnectToSocket is set, it is instead connected
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200125 // to the given file descriptor/socket. If this is set, all port maps from the Ports option are ignored.
126 // Intended for networking this instance together with others for running more complex network configurations.
127 ConnectToSocket *os.File
128
Serge Bazanski686444e2020-12-21 14:21:14 +0100129 // SerialPort is a io.ReadWriter over which you can communicate with the serial port of the machine
130 // It can be set to an existing file descriptor (like os.Stdout/os.Stderr) or any Go structure implementing this interface.
131 SerialPort io.ReadWriter
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200132
133 // EnrolmentConfig is passed into the VM and subsequently used for bootstrapping if no enrolment config is built-in
Serge Bazanskiefdb6e92020-07-13 17:19:27 +0200134 EnrolmentConfig *apb.EnrolmentConfig
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200135}
136
Serge Bazanski662b5b32020-12-21 13:49:00 +0100137// NodePorts is the list of ports a fully operational Metropolis node listens on
Serge Bazanski549b72b2021-01-07 14:54:19 +0100138var NodePorts = []uint16{node.ConsensusPort, node.NodeServicePort, node.MasterServicePort,
139 node.ExternalServicePort, node.DebugServicePort, node.KubernetesAPIPort, node.DebuggerPort}
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200140
Lorenz Bruned0503c2020-07-28 17:21:25 +0200141// IdentityPortMap returns a port map where each given port is mapped onto itself on the host. This is mainly useful
Serge Bazanski662b5b32020-12-21 13:49:00 +0100142// for development against Metropolis. The dbg command requires this mapping.
Lorenz Bruned0503c2020-07-28 17:21:25 +0200143func IdentityPortMap(ports []uint16) PortMap {
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200144 portMap := make(PortMap)
Lorenz Bruned0503c2020-07-28 17:21:25 +0200145 for _, port := range ports {
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200146 portMap[port] = port
147 }
148 return portMap
149}
150
Lorenz Bruned0503c2020-07-28 17:21:25 +0200151// ConflictFreePortMap returns a port map where each given port is mapped onto a random free port on the host. This is
Serge Bazanski662b5b32020-12-21 13:49:00 +0100152// intended for automated testing where multiple instances of Metropolis nodes might be running. Please call this
153// function for each Launch command separately and as close to it as possible since it cannot guarantee that the ports
154// will remain free.
Lorenz Bruned0503c2020-07-28 17:21:25 +0200155func ConflictFreePortMap(ports []uint16) (PortMap, error) {
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200156 portMap := make(PortMap)
Lorenz Bruned0503c2020-07-28 17:21:25 +0200157 for _, port := range ports {
Serge Bazanskicb883e22020-07-06 17:47:55 +0200158 mappedPort, listenCloser, err := freeport.AllocateTCPPort()
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200159 if err != nil {
160 return portMap, fmt.Errorf("failed to get free host port: %w", err)
161 }
162 // Defer closing of the listening port until the function is done and all ports are allocated
163 defer listenCloser.Close()
164 portMap[port] = mappedPort
165 }
166 return portMap, nil
167}
168
// generateRandomEthernetMAC returns a random EUI-48 Ethernet MAC address that is marked as a locally administered,
// individual (unicast) address. The underlying error is wrapped so callers can inspect it with errors.Is/As.
func generateRandomEthernetMAC() (*net.HardwareAddr, error) {
	macBuf := make([]byte, 6)
	if _, err := rand.Read(macBuf); err != nil {
		return nil, fmt.Errorf("failed to read randomness for MAC: %w", err)
	}

	// Set U/L bit and clear I/G bit (locally administered individual MAC)
	// Ref IEEE 802-2014 Section 8.2.2
	macBuf[0] = (macBuf[0] | 2) & 0xfe
	mac := net.HardwareAddr(macBuf)
	return &mac, nil
}
183
Serge Bazanski662b5b32020-12-21 13:49:00 +0100184// Launch launches a Metropolis node instance with the given options. The instance runs mostly paravirtualized but
185// with some emulated hardware similar to how a cloud provider might set up its VMs. The disk is fully writable but
186// is run in snapshot mode meaning that changes are not kept beyond a single invocation.
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200187func Launch(ctx context.Context, options Options) error {
188 // Pin temp directory to /tmp until we can use abstract socket namespace in QEMU (next release after 5.0,
189 // https://github.com/qemu/qemu/commit/776b97d3605ed0fc94443048fdf988c7725e38a9). swtpm accepts already-open FDs
190 // so we can pass in an abstract socket namespace FD that we open and pass the name of it to QEMU. Not pinning this
191 // crashes both swtpm and qemu because we run into UNIX socket length limitations (for legacy reasons 108 chars).
192 tempDir, err := ioutil.TempDir("/tmp", "launch*")
193 if err != nil {
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200194 return fmt.Errorf("failed to create temporary directory: %w", err)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200195 }
196 defer os.RemoveAll(tempDir)
197
198 // Copy TPM state into a temporary directory since it's being modified by the emulator
199 tpmTargetDir := filepath.Join(tempDir, "tpm")
Serge Bazanski77cb6c52020-12-19 00:09:22 +0100200 tpmSrcDir := "metropolis/node/tpm"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200201 if err := os.Mkdir(tpmTargetDir, 0644); err != nil {
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200202 return fmt.Errorf("failed to create TPM state directory: %w", err)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200203 }
204 tpmFiles, err := ioutil.ReadDir(tpmSrcDir)
205 if err != nil {
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200206 return fmt.Errorf("failed to read TPM directory: %w", err)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200207 }
208 for _, file := range tpmFiles {
209 name := file.Name()
210 if err := copyFile(filepath.Join(tpmSrcDir, name), filepath.Join(tpmTargetDir, name)); err != nil {
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200211 return fmt.Errorf("failed to copy TPM directory: %w", err)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200212 }
213 }
214
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200215 var qemuNetType string
216 var qemuNetConfig qemuValue
217 if options.ConnectToSocket != nil {
218 qemuNetType = "socket"
219 qemuNetConfig = qemuValue{
220 "id": {"net0"},
221 "fd": {"3"},
222 }
223 } else {
224 qemuNetType = "user"
225 qemuNetConfig = qemuValue{
226 "id": {"net0"},
227 "net": {"10.42.0.0/24"},
228 "dhcpstart": {"10.42.0.10"},
229 "hostfwd": options.Ports.toQemuForwards(),
230 }
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200231 }
232
233 tpmSocketPath := filepath.Join(tempDir, "tpm-socket")
234
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200235 mac, err := generateRandomEthernetMAC()
236 if err != nil {
237 return err
238 }
239
Lorenz Brunca24cfa2020-08-18 13:49:37 +0200240 qemuArgs := []string{"-machine", "q35", "-accel", "kvm", "-nographic", "-nodefaults", "-m", "4096",
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200241 "-cpu", "host", "-smp", "sockets=1,cpus=1,cores=2,threads=2,maxcpus=4",
242 "-drive", "if=pflash,format=raw,readonly,file=external/edk2/OVMF_CODE.fd",
243 "-drive", "if=pflash,format=raw,snapshot=on,file=external/edk2/OVMF_VARS.fd",
Serge Bazanski662b5b32020-12-21 13:49:00 +0100244 "-drive", "if=virtio,format=raw,snapshot=on,cache=unsafe,file=metropolis/node/node.img",
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200245 "-netdev", qemuNetConfig.toOption(qemuNetType),
246 "-device", "virtio-net-pci,netdev=net0,mac=" + mac.String(),
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200247 "-chardev", "socket,id=chrtpm,path=" + tpmSocketPath,
248 "-tpmdev", "emulator,id=tpm0,chardev=chrtpm",
249 "-device", "tpm-tis,tpmdev=tpm0",
250 "-device", "virtio-rng-pci",
251 "-serial", "stdio"}
252
253 if !options.AllowReboot {
254 qemuArgs = append(qemuArgs, "-no-reboot")
255 }
256
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200257 if options.EnrolmentConfig != nil {
258 enrolmentConfigPath := filepath.Join(tempDir, "enrolment.pb")
259 enrolmentConfigRaw, err := proto.Marshal(options.EnrolmentConfig)
260 if err != nil {
261 return fmt.Errorf("failed to encode enrolment config: %w", err)
262 }
263 if err := ioutil.WriteFile(enrolmentConfigPath, enrolmentConfigRaw, 0644); err != nil {
264 return fmt.Errorf("failed to write enrolment config: %w", err)
265 }
Serge Bazanski662b5b32020-12-21 13:49:00 +0100266 qemuArgs = append(qemuArgs, "-fw_cfg", "name=dev.monogon.metropolis/enrolment.pb,file="+enrolmentConfigPath)
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200267 }
268
Leopold Schabela013ffa2020-06-03 15:09:32 +0200269 // Start TPM emulator as a subprocess
270 tpmCtx, tpmCancel := context.WithCancel(ctx)
271 defer tpmCancel()
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200272
Leopold Schabela013ffa2020-06-03 15:09:32 +0200273 tpmEmuCmd := exec.CommandContext(tpmCtx, "swtpm", "socket", "--tpm2", "--tpmstate", "dir="+tpmTargetDir, "--ctrl", "type=unixio,path="+tpmSocketPath)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200274 tpmEmuCmd.Stderr = os.Stderr
275 tpmEmuCmd.Stdout = os.Stdout
Leopold Schabela013ffa2020-06-03 15:09:32 +0200276
277 err = tpmEmuCmd.Start()
278 if err != nil {
279 return fmt.Errorf("failed to start TPM emulator: %w", err)
280 }
281
282 // Start the main qemu binary
283 systemCmd := exec.CommandContext(ctx, "qemu-system-x86_64", qemuArgs...)
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200284 if options.ConnectToSocket != nil {
285 systemCmd.ExtraFiles = []*os.File{options.ConnectToSocket}
286 }
287
288 var stdErrBuf bytes.Buffer
289 systemCmd.Stderr = &stdErrBuf
290 systemCmd.Stdout = options.SerialPort
Leopold Schabela013ffa2020-06-03 15:09:32 +0200291
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200292 err = systemCmd.Run()
Leopold Schabela013ffa2020-06-03 15:09:32 +0200293
294 // Stop TPM emulator and wait for it to exit to properly reap the child process
295 tpmCancel()
296 log.Print("Waiting for TPM emulator to exit")
297 // Wait returns a SIGKILL error because we just cancelled its context.
298 // We still need to call it to avoid creating zombies.
299 _ = tpmEmuCmd.Wait()
300
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200301 var exerr *exec.ExitError
302 if err != nil && errors.As(err, &exerr) {
303 status := exerr.ProcessState.Sys().(syscall.WaitStatus)
304 if status.Signaled() && status.Signal() == syscall.SIGKILL {
305 // Process was killed externally (most likely by our context being canceled).
306 // This is a normal exit for us, so return nil
307 return nil
308 }
309 exerr.Stderr = stdErrBuf.Bytes()
310 newErr := QEMUError(*exerr)
311 return &newErr
312 }
313 return err
314}
315
316// NewSocketPair creates a new socket pair. By connecting both ends to different instances you can connect them
317// with a virtual "network cable". The ends can be passed into the ConnectToSocket option.
318func NewSocketPair() (*os.File, *os.File, error) {
319 fds, err := unix.Socketpair(unix.AF_UNIX, syscall.SOCK_STREAM, 0)
320 if err != nil {
321 return nil, nil, fmt.Errorf("failed to call socketpair: %w", err)
322 }
323
324 fd1 := os.NewFile(uintptr(fds[0]), "network0")
325 fd2 := os.NewFile(uintptr(fds[1]), "network1")
326 return fd1, fd2, nil
327}
328
// HostInterfaceMAC is the fixed MAC address given to the SLIRP-backed host network interface of a MicroVM, unless
// that interface is disabled (see DisableHostNetworkInterface in MicroVMOptions).
var HostInterfaceMAC = net.HardwareAddr{
	0x02, 0x72, 0x82,
	0xbf, 0xc3, 0x56,
}
332
333// MicroVMOptions contains all options to start a MicroVM
334type MicroVMOptions struct {
335 // Path to the ELF kernel binary
336 KernelPath string
337
338 // Path to the Initramfs
339 InitramfsPath string
340
341 // Cmdline contains additional kernel commandline options
342 Cmdline string
343
344 // SerialPort is a File(descriptor) over which you can communicate with the serial port of the machine
345 // It can be set to an existing file descriptor (like os.Stdout/os.Stderr) or you can use NewSocketPair() to get one
346 // end to talk to from Go.
347 SerialPort *os.File
348
349 // ExtraChardevs can be used similar to SerialPort, but can contain an arbitrary number of additional serial ports
350 ExtraChardevs []*os.File
351
352 // ExtraNetworkInterfaces can contain an arbitrary number of file descriptors which are mapped into the VM as virtio
353 // network interfaces. The first interface is always a SLIRP-backed interface for communicating with the host.
354 ExtraNetworkInterfaces []*os.File
355
356 // PortMap contains ports that are mapped to the host through the built-in SLIRP network interface.
357 PortMap PortMap
358
359 // DisableHostNetworkInterface disables the SLIRP-backed host network interface that is normally the first network
360 // interface. If this is set PortMap is ignored. Mostly useful for speeding up QEMU's startup time for tests.
361 DisableHostNetworkInterface bool
362}
363
364// RunMicroVM launches a tiny VM mostly intended for testing. Very quick to boot (<40ms).
365func RunMicroVM(ctx context.Context, opts *MicroVMOptions) error {
366 // Generate options for all the file descriptors we'll be passing as virtio "serial ports"
367 var extraArgs []string
368 for idx, _ := range opts.ExtraChardevs {
369 idxStr := strconv.Itoa(idx)
370 id := "extra" + idxStr
371 // That this works is pretty much a hack, but upstream QEMU doesn't have a bidirectional chardev backend not
372 // based around files/sockets on the disk which are a giant pain to work with.
373 // We're using QEMU's fdset functionality to make FDs available as pseudo-files and then "ab"using the pipe
374 // backend's fallback functionality to get a single bidirectional chardev backend backed by a passed-down
375 // RDWR fd.
376 // Ref https://lists.gnu.org/archive/html/qemu-devel/2015-12/msg01256.html
377 addFdConf := qemuValue{
378 "set": {idxStr},
379 "fd": {strconv.Itoa(idx + 3)},
380 }
381 chardevConf := qemuValue{
382 "id": {id},
383 "path": {"/dev/fdset/" + idxStr},
384 }
385 deviceConf := qemuValue{
386 "chardev": {id},
387 }
388 extraArgs = append(extraArgs, "-add-fd", addFdConf.toOption(""),
389 "-chardev", chardevConf.toOption("pipe"), "-device", deviceConf.toOption("virtserialport"))
390 }
391
392 for idx, _ := range opts.ExtraNetworkInterfaces {
393 id := fmt.Sprintf("net%v", idx)
394 netdevConf := qemuValue{
395 "id": {id},
396 "fd": {strconv.Itoa(idx + 3 + len(opts.ExtraChardevs))},
397 }
398 extraArgs = append(extraArgs, "-netdev", netdevConf.toOption("socket"), "-device", "virtio-net-device,netdev="+id)
399 }
400
401 // This sets up a minimum viable environment for our Linux kernel.
402 // It clears all standard QEMU configuration and sets up a MicroVM machine
403 // (https://github.com/qemu/qemu/blob/master/docs/microvm.rst) with all legacy emulation turned off. This means
404 // the only "hardware" the Linux kernel inside can communicate with is a single virtio-mmio region. Over that MMIO
405 // interface we run a paravirtualized RNG (since the kernel in there has nothing to gather that from and it
406 // delays booting), a single paravirtualized console and an arbitrary number of extra serial ports for talking to
407 // various things that might run inside. The kernel, initramfs and command line are mapped into VM memory at boot
408 // time and not loaded from any sort of disk. Booting and shutting off one of these VMs takes <100ms.
409 baseArgs := []string{"-nodefaults", "-no-user-config", "-nographic", "-no-reboot",
410 "-accel", "kvm", "-cpu", "host",
411 // Needed until QEMU updates their bundled qboot version (needs https://github.com/bonzini/qboot/pull/28)
412 "-bios", "external/com_github_bonzini_qboot/bios.bin",
413 "-M", "microvm,x-option-roms=off,pic=off,pit=off,rtc=off,isa-serial=off",
414 "-kernel", opts.KernelPath,
415 // We force using a triple-fault reboot strategy since otherwise the kernel first tries others (like ACPI) which
416 // are not available in this very restricted environment. Similarly we need to override the boot console since
417 // there's nothing on the ISA bus that the kernel could talk to. We also force quiet for performance reasons.
418 "-append", "reboot=t console=hvc0 quiet " + opts.Cmdline,
419 "-initrd", opts.InitramfsPath,
420 "-device", "virtio-rng-device,max-bytes=1024,period=1000",
421 "-device", "virtio-serial-device,max_ports=16",
422 "-chardev", "stdio,id=con0", "-device", "virtconsole,chardev=con0",
423 }
424
425 if !opts.DisableHostNetworkInterface {
426 qemuNetType := "user"
427 qemuNetConfig := qemuValue{
428 "id": {"usernet0"},
429 "net": {"10.42.0.0/24"},
430 "dhcpstart": {"10.42.0.10"},
431 }
432 if opts.PortMap != nil {
433 qemuNetConfig["hostfwd"] = opts.PortMap.toQemuForwards()
434 }
435
436 baseArgs = append(baseArgs, "-netdev", qemuNetConfig.toOption(qemuNetType),
437 "-device", "virtio-net-device,netdev=usernet0,mac="+HostInterfaceMAC.String())
438 }
439
440 var stdErrBuf bytes.Buffer
441 cmd := exec.CommandContext(ctx, "qemu-system-x86_64", append(baseArgs, extraArgs...)...)
442 cmd.Stdout = opts.SerialPort
443 cmd.Stderr = &stdErrBuf
444
445 cmd.ExtraFiles = append(cmd.ExtraFiles, opts.ExtraChardevs...)
446 cmd.ExtraFiles = append(cmd.ExtraFiles, opts.ExtraNetworkInterfaces...)
447
448 err := cmd.Run()
449 var exerr *exec.ExitError
450 if err != nil && errors.As(err, &exerr) {
451 exerr.Stderr = stdErrBuf.Bytes()
452 newErr := QEMUError(*exerr)
453 return &newErr
454 }
455 return err
456}
457
// QEMUError is the error type returned when QEMU exits unsuccessfully. It has the same fields as exec.ExitError,
// but its message additionally includes the captured stderr output for debugging.
type QEMUError exec.ExitError

// Error returns the process state description followed by a colon and the contents of Stderr.
func (e *QEMUError) Error() string {
	// String is promoted from the embedded *os.ProcessState.
	state := e.String()
	return state + ": " + string(e.Stderr)
}
Lorenz Bruned0503c2020-07-28 17:21:25 +0200465
466// NanoswitchPorts contains all ports forwarded by Nanoswitch to the first VM
467var NanoswitchPorts = []uint16{
Serge Bazanski549b72b2021-01-07 14:54:19 +0100468 node.ExternalServicePort,
469 node.DebugServicePort,
470 node.KubernetesAPIPort,
Lorenz Bruned0503c2020-07-28 17:21:25 +0200471}
472
// ClusterOptions contains all options for launching a Metropolis cluster
type ClusterOptions struct {
	// NumNodes is the number of nodes the cluster is initially started with. LaunchCluster accepts only 1 or 2.
	NumNodes int
}
478
Serge Bazanski662b5b32020-12-21 13:49:00 +0100479// LaunchCluster launches a cluster of Metropolis node VMs together with a Nanoswitch instance to network them all together.
Lorenz Bruned0503c2020-07-28 17:21:25 +0200480func LaunchCluster(ctx context.Context, opts ClusterOptions) (apb.NodeDebugServiceClient, PortMap, error) {
481 var switchPorts []*os.File
482 var vmPorts []*os.File
483 for i := 0; i < opts.NumNodes; i++ {
484 switchPort, vmPort, err := NewSocketPair()
485 if err != nil {
486 return nil, nil, fmt.Errorf("failed to get socketpair: %w", err)
487 }
488 switchPorts = append(switchPorts, switchPort)
489 vmPorts = append(vmPorts, vmPort)
490 }
491
492 if opts.NumNodes == 0 {
493 return nil, nil, errors.New("refusing to start cluster with zero nodes")
494 }
495
496 if opts.NumNodes > 2 {
497 return nil, nil, errors.New("launching more than 2 nodes is unsupported pending replacement of golden tickets")
498 }
499
500 go func() {
501 if err := Launch(ctx, Options{ConnectToSocket: vmPorts[0]}); err != nil {
502 // Launch() only terminates when QEMU has terminated. At that point our function probably doesn't run anymore
503 // so we have no way of communicating the error back up, so let's just log it. Also a failure in launching
504 // VMs should be very visible by the unavailability of the clients we return.
505 log.Printf("Failed to launch vm0: %v", err)
506 }
507 }()
508
509 portMap, err := ConflictFreePortMap(NanoswitchPorts)
510 if err != nil {
511 return nil, nil, fmt.Errorf("failed to allocate ephemeral ports: %w", err)
512 }
513
514 go func() {
515 if err := RunMicroVM(ctx, &MicroVMOptions{
Serge Bazanski77cb6c52020-12-19 00:09:22 +0100516 KernelPath: "metropolis/test/ktest/linux-testing.elf",
517 InitramfsPath: "metropolis/test/nanoswitch/initramfs.lz4",
Lorenz Bruned0503c2020-07-28 17:21:25 +0200518 ExtraNetworkInterfaces: switchPorts,
519 PortMap: portMap,
520 }); err != nil {
521 log.Printf("Failed to launch nanoswitch: %v", err)
522 }
523 }()
524 copts := []grpcretry.CallOption{
525 grpcretry.WithBackoff(grpcretry.BackoffExponential(100 * time.Millisecond)),
526 }
Serge Bazanski549b72b2021-01-07 14:54:19 +0100527 conn, err := portMap.DialGRPC(node.DebugServicePort, grpc.WithInsecure(),
Lorenz Bruned0503c2020-07-28 17:21:25 +0200528 grpc.WithUnaryInterceptor(grpcretry.UnaryClientInterceptor(copts...)))
529 if err != nil {
530 return nil, nil, fmt.Errorf("failed to dial debug service: %w", err)
531 }
Lorenz Bruned0503c2020-07-28 17:21:25 +0200532 debug := apb.NewNodeDebugServiceClient(conn)
533
534 if opts.NumNodes == 2 {
535 res, err := debug.GetGoldenTicket(ctx, &apb.GetGoldenTicketRequest{
536 // HACK: this is assigned by DHCP, and we assume that everything goes well.
537 ExternalIp: "10.1.0.3",
538 }, grpcretry.WithMax(10))
539 if err != nil {
540 return nil, nil, fmt.Errorf("failed to get golden ticket: %w", err)
541 }
542
543 ec := &apb.EnrolmentConfig{
544 GoldenTicket: res.Ticket,
545 }
546
547 go func() {
548 if err := Launch(ctx, Options{ConnectToSocket: vmPorts[1], EnrolmentConfig: ec}); err != nil {
549 log.Printf("Failed to launch vm1: %v", err)
550 }
551 }()
552 }
553
554 return debug, portMap, nil
555}