blob: 0d137922677fc9cc994f5a442c8000701ac32fea [file] [log] [blame]
Lorenz Brunfc5dbc62020-05-28 12:18:07 +02001// Copyright 2020 The Monogon Project Authors.
2//
3// SPDX-License-Identifier: Apache-2.0
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
17package launch
18
19import (
Lorenz Brun3ff5af32020-06-24 16:34:11 +020020 "bytes"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020021 "context"
Lorenz Brun3ff5af32020-06-24 16:34:11 +020022 "crypto/rand"
23 "errors"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020024 "fmt"
25 "io"
26 "io/ioutil"
Leopold Schabela013ffa2020-06-03 15:09:32 +020027 "log"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020028 "net"
29 "os"
30 "os/exec"
31 "path/filepath"
Lorenz Brun3ff5af32020-06-24 16:34:11 +020032 "strconv"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020033 "strings"
Lorenz Brun3ff5af32020-06-24 16:34:11 +020034 "syscall"
Lorenz Bruned0503c2020-07-28 17:21:25 +020035 "time"
36
Lorenz Brun3ff5af32020-06-24 16:34:11 +020037 "github.com/golang/protobuf/proto"
Serge Bazanski77cb6c52020-12-19 00:09:22 +010038 grpcretry "github.com/grpc-ecosystem/go-grpc-middleware/retry"
Lorenz Brun3ff5af32020-06-24 16:34:11 +020039 "golang.org/x/sys/unix"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020040 "google.golang.org/grpc"
41
Serge Bazanski31370b02021-01-07 16:31:14 +010042 "source.monogon.dev/metropolis/node"
43 "source.monogon.dev/metropolis/pkg/freeport"
44 apb "source.monogon.dev/metropolis/proto/api"
Lorenz Brunfc5dbc62020-05-28 12:18:07 +020045)
46
// qemuValue is a set of QEMU suboptions: every key maps to the list of values
// it should be emitted with.
type qemuValue map[string][]string

// toOption renders the receiver as one comma-separated QEMU option string. A
// non-empty name becomes the first element. A key without values is emitted
// bare, a key with several values is repeated once per value. Example: "test",
// {"key1": {"val1"}, "key2": {"val2", "val3"}} can return
// "test,key1=val1,key2=val2,key2=val3". Keys are visited in Go map iteration
// order, so their relative order in the result is not fixed.
func (value qemuValue) toOption(name string) string {
	var parts []string
	if name != "" {
		parts = append(parts, name)
	}
	for key, vals := range value {
		if len(vals) == 0 {
			// Value-less key: emit just the key itself.
			parts = append(parts, key)
			continue
		}
		for _, v := range vals {
			parts = append(parts, fmt.Sprintf("%v=%v", key, v))
		}
	}
	return strings.Join(parts, ",")
}
66
// copyFile copies the contents of the file at src into the file at dst. dst is
// created if it does not exist and truncated if it does. Errors from opening,
// creating and copying are wrapped with the step that failed; an error from
// the final close of dst is returned as-is.
func copyFile(src, dst string) error {
	source, err := os.Open(src)
	if err != nil {
		return fmt.Errorf("when opening source: %w", err)
	}
	defer source.Close()

	dest, err := os.Create(dst)
	if err != nil {
		return fmt.Errorf("when creating destination: %w", err)
	}
	// Safety net for the error return below. On success the explicit Close at
	// the end runs first and decides the result.
	defer dest.Close()

	if _, err := io.Copy(dest, source); err != nil {
		return fmt.Errorf("when copying file: %w", err)
	}
	return dest.Close()
}
86
// PortMap describes how ports of a VM are exposed on the host: the key is the
// port number inside the VM, the value is the host port it is mapped to.
type PortMap map[uint16]uint16

// toQemuForwards builds one QEMU hostfwd value
// (https://qemu.weilnetz.de/doc/qemu-doc.html#:~:text=hostfwd=) per mapped
// port, each forwarding TCP from the host port to the VM port. A PortMap
// without entries yields a nil slice.
func (p PortMap) toQemuForwards() []string {
	var forwards []string
	for guest, host := range p {
		rule := fmt.Sprintf("tcp::%v-:%v", host, guest)
		forwards = append(forwards, rule)
	}
	return forwards
}
100
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200101// DialGRPC creates a gRPC client for a VM port that's forwarded/mapped to the
102// host. The given port is automatically resolved to the host-mapped port.
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200103func (p PortMap) DialGRPC(port uint16, opts ...grpc.DialOption) (*grpc.ClientConn, error) {
104 mappedPort, ok := p[port]
105 if !ok {
106 return nil, fmt.Errorf("cannot dial port: port %v is not mapped/forwarded", port)
107 }
108 grpcClient, err := grpc.Dial(fmt.Sprintf("localhost:%v", mappedPort), opts...)
109 if err != nil {
110 return nil, fmt.Errorf("failed to dial port %v: %w", port, err)
111 }
112 return grpcClient, nil
113}
114
115// Options contains all options that can be passed to Launch()
116type Options struct {
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200117 // Ports contains the port mapping where to expose the internal ports of the VM to
118 // the host. See IdentityPortMap() and ConflictFreePortMap(). Ignored when
119 // ConnectToSocket is set.
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200120 Ports PortMap
121
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200122 // If set to true, reboots are honored. Otherwise all reboots exit the Launch()
123 // command. Metropolis nodes generally restarts on almost all errors, so unless you
124 // want to test reboot behavior this should be false.
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200125 AllowReboot bool
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200126
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200127 // By default the VM is connected to the Host via SLIRP. If ConnectToSocket is set,
128 // it is instead connected to the given file descriptor/socket. If this is set, all
129 // port maps from the Ports option are ignored. Intended for networking this
130 // instance together with others for running more complex network configurations.
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200131 ConnectToSocket *os.File
132
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200133 // SerialPort is a io.ReadWriter over which you can communicate with the serial
134 // port of the machine It can be set to an existing file descriptor (like
135 // os.Stdout/os.Stderr) or any Go structure implementing this interface.
Serge Bazanski686444e2020-12-21 14:21:14 +0100136 SerialPort io.ReadWriter
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200137
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200138 // NodeParameters is passed into the VM and subsequently used for bootstrapping or
139 // registering into a cluster.
Serge Bazanski0ed2f962021-03-15 16:39:30 +0100140 NodeParameters *apb.NodeParameters
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200141}
142
Serge Bazanski662b5b32020-12-21 13:49:00 +0100143// NodePorts is the list of ports a fully operational Metropolis node listens on
Serge Bazanski5b2ae552021-08-17 13:00:14 +0200144var NodePorts = []uint16{node.ConsensusPort, node.CuratorServicePort, node.MasterServicePort,
Serge Bazanski549b72b2021-01-07 14:54:19 +0100145 node.ExternalServicePort, node.DebugServicePort, node.KubernetesAPIPort, node.DebuggerPort}
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200146
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200147// IdentityPortMap returns a port map where each given port is mapped onto itself
148// on the host. This is mainly useful for development against Metropolis. The dbg
149// command requires this mapping.
Lorenz Bruned0503c2020-07-28 17:21:25 +0200150func IdentityPortMap(ports []uint16) PortMap {
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200151 portMap := make(PortMap)
Lorenz Bruned0503c2020-07-28 17:21:25 +0200152 for _, port := range ports {
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200153 portMap[port] = port
154 }
155 return portMap
156}
157
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200158// ConflictFreePortMap returns a port map where each given port is mapped onto a
159// random free port on the host. This is intended for automated testing where
160// multiple instances of Metropolis nodes might be running. Please call this
161// function for each Launch command separately and as close to it as possible since
162// it cannot guarantee that the ports will remain free.
Lorenz Bruned0503c2020-07-28 17:21:25 +0200163func ConflictFreePortMap(ports []uint16) (PortMap, error) {
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200164 portMap := make(PortMap)
Lorenz Bruned0503c2020-07-28 17:21:25 +0200165 for _, port := range ports {
Serge Bazanskicb883e22020-07-06 17:47:55 +0200166 mappedPort, listenCloser, err := freeport.AllocateTCPPort()
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200167 if err != nil {
168 return portMap, fmt.Errorf("failed to get free host port: %w", err)
169 }
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200170 // Defer closing of the listening port until the function is done and all ports are
171 // allocated
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200172 defer listenCloser.Close()
173 portMap[port] = mappedPort
174 }
175 return portMap, nil
176}
177
// generateRandomEthernetMAC returns a random EUI-48 Ethernet MAC address that
// is marked as a locally administered, individual address. It only fails if
// reading from the system's randomness source fails; that error is wrapped so
// callers can still inspect it.
func generateRandomEthernetMAC() (*net.HardwareAddr, error) {
	macBuf := make([]byte, 6)
	if _, err := rand.Read(macBuf); err != nil {
		return nil, fmt.Errorf("failed to read randomness for MAC: %w", err)
	}

	// Set U/L bit and clear I/G bit (locally administered individual MAC)
	// Ref IEEE 802-2014 Section 8.2.2
	macBuf[0] = (macBuf[0] | 2) & 0xfe
	mac := net.HardwareAddr(macBuf)
	return &mac, nil
}
192
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200193// Launch launches a Metropolis node instance with the given options. The instance
194// runs mostly paravirtualized but with some emulated hardware similar to how a
195// cloud provider might set up its VMs. The disk is fully writable but is run in
196// snapshot mode meaning that changes are not kept beyond a single invocation.
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200197func Launch(ctx context.Context, options Options) error {
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200198 // Pin temp directory to /tmp until we can use abstract socket namespace in QEMU
199 // (next release after 5.0,
200 // https://github.com/qemu/qemu/commit/776b97d3605ed0fc94443048fdf988c7725e38a9).
201 // swtpm accepts already-open FDs so we can pass in an abstract socket namespace FD
202 // that we open and pass the name of it to QEMU. Not pinning this crashes both
203 // swtpm and qemu because we run into UNIX socket length limitations (for legacy
204 // reasons 108 chars).
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200205 tempDir, err := ioutil.TempDir("/tmp", "launch*")
206 if err != nil {
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200207 return fmt.Errorf("failed to create temporary directory: %w", err)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200208 }
209 defer os.RemoveAll(tempDir)
210
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200211 // Copy TPM state into a temporary directory since it's being modified by the
212 // emulator
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200213 tpmTargetDir := filepath.Join(tempDir, "tpm")
Serge Bazanski77cb6c52020-12-19 00:09:22 +0100214 tpmSrcDir := "metropolis/node/tpm"
Serge Bazanskibe57a032021-05-11 13:41:52 +0200215 if err := os.Mkdir(tpmTargetDir, 0755); err != nil {
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200216 return fmt.Errorf("failed to create TPM state directory: %w", err)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200217 }
218 tpmFiles, err := ioutil.ReadDir(tpmSrcDir)
219 if err != nil {
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200220 return fmt.Errorf("failed to read TPM directory: %w", err)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200221 }
222 for _, file := range tpmFiles {
223 name := file.Name()
Serge Bazanskibe57a032021-05-11 13:41:52 +0200224 src := filepath.Join(tpmSrcDir, name)
225 target := filepath.Join(tpmTargetDir, name)
226 if err := copyFile(src, target); err != nil {
227 return fmt.Errorf("failed to copy TPM directory: file %q to %q: %w", src, target, err)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200228 }
229 }
230
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200231 var qemuNetType string
232 var qemuNetConfig qemuValue
233 if options.ConnectToSocket != nil {
234 qemuNetType = "socket"
235 qemuNetConfig = qemuValue{
236 "id": {"net0"},
237 "fd": {"3"},
238 }
239 } else {
240 qemuNetType = "user"
241 qemuNetConfig = qemuValue{
242 "id": {"net0"},
243 "net": {"10.42.0.0/24"},
244 "dhcpstart": {"10.42.0.10"},
245 "hostfwd": options.Ports.toQemuForwards(),
246 }
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200247 }
248
249 tpmSocketPath := filepath.Join(tempDir, "tpm-socket")
250
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200251 mac, err := generateRandomEthernetMAC()
252 if err != nil {
253 return err
254 }
255
Lorenz Brunca24cfa2020-08-18 13:49:37 +0200256 qemuArgs := []string{"-machine", "q35", "-accel", "kvm", "-nographic", "-nodefaults", "-m", "4096",
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200257 "-cpu", "host", "-smp", "sockets=1,cpus=1,cores=2,threads=2,maxcpus=4",
258 "-drive", "if=pflash,format=raw,readonly,file=external/edk2/OVMF_CODE.fd",
259 "-drive", "if=pflash,format=raw,snapshot=on,file=external/edk2/OVMF_VARS.fd",
Serge Bazanski662b5b32020-12-21 13:49:00 +0100260 "-drive", "if=virtio,format=raw,snapshot=on,cache=unsafe,file=metropolis/node/node.img",
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200261 "-netdev", qemuNetConfig.toOption(qemuNetType),
262 "-device", "virtio-net-pci,netdev=net0,mac=" + mac.String(),
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200263 "-chardev", "socket,id=chrtpm,path=" + tpmSocketPath,
264 "-tpmdev", "emulator,id=tpm0,chardev=chrtpm",
265 "-device", "tpm-tis,tpmdev=tpm0",
266 "-device", "virtio-rng-pci",
267 "-serial", "stdio"}
268
269 if !options.AllowReboot {
270 qemuArgs = append(qemuArgs, "-no-reboot")
271 }
272
Serge Bazanski0ed2f962021-03-15 16:39:30 +0100273 if options.NodeParameters != nil {
274 parametersPath := filepath.Join(tempDir, "parameters.pb")
275 parametersRaw, err := proto.Marshal(options.NodeParameters)
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200276 if err != nil {
Serge Bazanski0ed2f962021-03-15 16:39:30 +0100277 return fmt.Errorf("failed to encode node paraeters: %w", err)
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200278 }
Serge Bazanski0ed2f962021-03-15 16:39:30 +0100279 if err := ioutil.WriteFile(parametersPath, parametersRaw, 0644); err != nil {
280 return fmt.Errorf("failed to write node parameters: %w", err)
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200281 }
Serge Bazanski0ed2f962021-03-15 16:39:30 +0100282 qemuArgs = append(qemuArgs, "-fw_cfg", "name=dev.monogon.metropolis/parameters.pb,file="+parametersPath)
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200283 }
284
Leopold Schabela013ffa2020-06-03 15:09:32 +0200285 // Start TPM emulator as a subprocess
286 tpmCtx, tpmCancel := context.WithCancel(ctx)
287 defer tpmCancel()
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200288
Leopold Schabela013ffa2020-06-03 15:09:32 +0200289 tpmEmuCmd := exec.CommandContext(tpmCtx, "swtpm", "socket", "--tpm2", "--tpmstate", "dir="+tpmTargetDir, "--ctrl", "type=unixio,path="+tpmSocketPath)
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200290 tpmEmuCmd.Stderr = os.Stderr
291 tpmEmuCmd.Stdout = os.Stdout
Leopold Schabela013ffa2020-06-03 15:09:32 +0200292
293 err = tpmEmuCmd.Start()
294 if err != nil {
295 return fmt.Errorf("failed to start TPM emulator: %w", err)
296 }
297
298 // Start the main qemu binary
299 systemCmd := exec.CommandContext(ctx, "qemu-system-x86_64", qemuArgs...)
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200300 if options.ConnectToSocket != nil {
301 systemCmd.ExtraFiles = []*os.File{options.ConnectToSocket}
302 }
303
304 var stdErrBuf bytes.Buffer
305 systemCmd.Stderr = &stdErrBuf
306 systemCmd.Stdout = options.SerialPort
Leopold Schabela013ffa2020-06-03 15:09:32 +0200307
Lorenz Brunfc5dbc62020-05-28 12:18:07 +0200308 err = systemCmd.Run()
Leopold Schabela013ffa2020-06-03 15:09:32 +0200309
310 // Stop TPM emulator and wait for it to exit to properly reap the child process
311 tpmCancel()
312 log.Print("Waiting for TPM emulator to exit")
313 // Wait returns a SIGKILL error because we just cancelled its context.
314 // We still need to call it to avoid creating zombies.
315 _ = tpmEmuCmd.Wait()
316
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200317 var exerr *exec.ExitError
318 if err != nil && errors.As(err, &exerr) {
319 status := exerr.ProcessState.Sys().(syscall.WaitStatus)
320 if status.Signaled() && status.Signal() == syscall.SIGKILL {
321 // Process was killed externally (most likely by our context being canceled).
322 // This is a normal exit for us, so return nil
323 return nil
324 }
325 exerr.Stderr = stdErrBuf.Bytes()
326 newErr := QEMUError(*exerr)
327 return &newErr
328 }
329 return err
330}
331
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200332// NewSocketPair creates a new socket pair. By connecting both ends to different
333// instances you can connect them with a virtual "network cable". The ends can be
334// passed into the ConnectToSocket option.
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200335func NewSocketPair() (*os.File, *os.File, error) {
336 fds, err := unix.Socketpair(unix.AF_UNIX, syscall.SOCK_STREAM, 0)
337 if err != nil {
338 return nil, nil, fmt.Errorf("failed to call socketpair: %w", err)
339 }
340
341 fd1 := os.NewFile(uintptr(fds[0]), "network0")
342 fd2 := os.NewFile(uintptr(fds[1]), "network1")
343 return fd1, fd2, nil
344}
345
// HostInterfaceMAC is the MAC address given to the SLIRP-backed host network
// interface of a MicroVM. That interface only exists when it has not been
// disabled (see DisableHostNetworkInterface in MicroVMOptions).
var HostInterfaceMAC = net.HardwareAddr{
	0x02, 0x72, 0x82,
	0xbf, 0xc3, 0x56,
}
349
350// MicroVMOptions contains all options to start a MicroVM
351type MicroVMOptions struct {
352 // Path to the ELF kernel binary
353 KernelPath string
354
355 // Path to the Initramfs
356 InitramfsPath string
357
358 // Cmdline contains additional kernel commandline options
359 Cmdline string
360
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200361 // SerialPort is a File(descriptor) over which you can communicate with the serial
362 // port of the machine It can be set to an existing file descriptor (like
363 // os.Stdout/os.Stderr) or you can use NewSocketPair() to get one end to talk to
364 // from Go.
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200365 SerialPort *os.File
366
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200367 // ExtraChardevs can be used similar to SerialPort, but can contain an arbitrary
368 // number of additional serial ports
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200369 ExtraChardevs []*os.File
370
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200371 // ExtraNetworkInterfaces can contain an arbitrary number of file descriptors which
372 // are mapped into the VM as virtio network interfaces. The first interface is
373 // always a SLIRP-backed interface for communicating with the host.
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200374 ExtraNetworkInterfaces []*os.File
375
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200376 // PortMap contains ports that are mapped to the host through the built-in SLIRP
377 // network interface.
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200378 PortMap PortMap
379
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200380 // DisableHostNetworkInterface disables the SLIRP-backed host network interface
381 // that is normally the first network interface. If this is set PortMap is ignored.
382 // Mostly useful for speeding up QEMU's startup time for tests.
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200383 DisableHostNetworkInterface bool
384}
385
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200386// RunMicroVM launches a tiny VM mostly intended for testing. Very quick to boot
387// (<40ms).
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200388func RunMicroVM(ctx context.Context, opts *MicroVMOptions) error {
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200389 // Generate options for all the file descriptors we'll be passing as virtio "serial
390 // ports"
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200391 var extraArgs []string
392 for idx, _ := range opts.ExtraChardevs {
393 idxStr := strconv.Itoa(idx)
394 id := "extra" + idxStr
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200395 // That this works is pretty much a hack, but upstream QEMU doesn't have a
396 // bidirectional chardev backend not based around files/sockets on the disk which
397 // are a giant pain to work with. We're using QEMU's fdset functionality to make
398 // FDs available as pseudo-files and then "ab"using the pipe backend's fallback
399 // functionality to get a single bidirectional chardev backend backed by a passed-
400 // down RDWR fd. Ref https://lists.gnu.org/archive/html/qemu-devel/2015-
401 // 12/msg01256.html
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200402 addFdConf := qemuValue{
403 "set": {idxStr},
404 "fd": {strconv.Itoa(idx + 3)},
405 }
406 chardevConf := qemuValue{
407 "id": {id},
408 "path": {"/dev/fdset/" + idxStr},
409 }
410 deviceConf := qemuValue{
411 "chardev": {id},
412 }
413 extraArgs = append(extraArgs, "-add-fd", addFdConf.toOption(""),
414 "-chardev", chardevConf.toOption("pipe"), "-device", deviceConf.toOption("virtserialport"))
415 }
416
417 for idx, _ := range opts.ExtraNetworkInterfaces {
418 id := fmt.Sprintf("net%v", idx)
419 netdevConf := qemuValue{
420 "id": {id},
421 "fd": {strconv.Itoa(idx + 3 + len(opts.ExtraChardevs))},
422 }
423 extraArgs = append(extraArgs, "-netdev", netdevConf.toOption("socket"), "-device", "virtio-net-device,netdev="+id)
424 }
425
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200426 // This sets up a minimum viable environment for our Linux kernel. It clears all
427 // standard QEMU configuration and sets up a MicroVM machine
428 // (https://github.com/qemu/qemu/blob/master/docs/microvm.rst) with all legacy
429 // emulation turned off. This means the only "hardware" the Linux kernel inside can
430 // communicate with is a single virtio-mmio region. Over that MMIO interface we run
431 // a paravirtualized RNG (since the kernel in there has nothing to gather that from
432 // and it delays booting), a single paravirtualized console and an arbitrary number
433 // of extra serial ports for talking to various things that might run inside. The
434 // kernel, initramfs and command line are mapped into VM memory at boot time and
435 // not loaded from any sort of disk. Booting and shutting off one of these VMs
436 // takes <100ms.
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200437 baseArgs := []string{"-nodefaults", "-no-user-config", "-nographic", "-no-reboot",
438 "-accel", "kvm", "-cpu", "host",
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200439 // Needed until QEMU updates their bundled qboot version (needs
440 // https://github.com/bonzini/qboot/pull/28)
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200441 "-bios", "external/com_github_bonzini_qboot/bios.bin",
442 "-M", "microvm,x-option-roms=off,pic=off,pit=off,rtc=off,isa-serial=off",
443 "-kernel", opts.KernelPath,
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200444 // We force using a triple-fault reboot strategy since otherwise the kernel first
445 // tries others (like ACPI) which are not available in this very restricted
446 // environment. Similarly we need to override the boot console since there's
447 // nothing on the ISA bus that the kernel could talk to. We also force quiet for
448 // performance reasons.
Lorenz Brun3ff5af32020-06-24 16:34:11 +0200449 "-append", "reboot=t console=hvc0 quiet " + opts.Cmdline,
450 "-initrd", opts.InitramfsPath,
451 "-device", "virtio-rng-device,max-bytes=1024,period=1000",
452 "-device", "virtio-serial-device,max_ports=16",
453 "-chardev", "stdio,id=con0", "-device", "virtconsole,chardev=con0",
454 }
455
456 if !opts.DisableHostNetworkInterface {
457 qemuNetType := "user"
458 qemuNetConfig := qemuValue{
459 "id": {"usernet0"},
460 "net": {"10.42.0.0/24"},
461 "dhcpstart": {"10.42.0.10"},
462 }
463 if opts.PortMap != nil {
464 qemuNetConfig["hostfwd"] = opts.PortMap.toQemuForwards()
465 }
466
467 baseArgs = append(baseArgs, "-netdev", qemuNetConfig.toOption(qemuNetType),
468 "-device", "virtio-net-device,netdev=usernet0,mac="+HostInterfaceMAC.String())
469 }
470
471 var stdErrBuf bytes.Buffer
472 cmd := exec.CommandContext(ctx, "qemu-system-x86_64", append(baseArgs, extraArgs...)...)
473 cmd.Stdout = opts.SerialPort
474 cmd.Stderr = &stdErrBuf
475
476 cmd.ExtraFiles = append(cmd.ExtraFiles, opts.ExtraChardevs...)
477 cmd.ExtraFiles = append(cmd.ExtraFiles, opts.ExtraNetworkInterfaces...)
478
479 err := cmd.Run()
480 var exerr *exec.ExitError
481 if err != nil && errors.As(err, &exerr) {
482 exerr.Stderr = stdErrBuf.Bytes()
483 newErr := QEMUError(*exerr)
484 return &newErr
485 }
486 return err
487}
488
// QEMUError is the error type used when QEMU fails. It has the same fields as
// exec.ExitError, but its message additionally contains stderr for debugging.
type QEMUError exec.ExitError

// Error returns the description of the process state followed by the contents
// of Stderr.
func (e *QEMUError) Error() string {
	state := e.ProcessState.String()
	stderr := string(e.Stderr)
	return fmt.Sprintf("%v: %v", state, stderr)
}
Lorenz Bruned0503c2020-07-28 17:21:25 +0200496
497// NanoswitchPorts contains all ports forwarded by Nanoswitch to the first VM
498var NanoswitchPorts = []uint16{
Serge Bazanski549b72b2021-01-07 14:54:19 +0100499 node.ExternalServicePort,
500 node.DebugServicePort,
501 node.KubernetesAPIPort,
Lorenz Bruned0503c2020-07-28 17:21:25 +0200502}
503
// ClusterOptions holds the options a Metropolis cluster is launched with by
// LaunchCluster.
type ClusterOptions struct {
	// NumNodes is the number of nodes the cluster is initially started with.
	NumNodes int
}
509
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200510// LaunchCluster launches a cluster of Metropolis node VMs together with a
511// Nanoswitch instance to network them all together.
Lorenz Bruned0503c2020-07-28 17:21:25 +0200512func LaunchCluster(ctx context.Context, opts ClusterOptions) (apb.NodeDebugServiceClient, PortMap, error) {
513 var switchPorts []*os.File
514 var vmPorts []*os.File
515 for i := 0; i < opts.NumNodes; i++ {
516 switchPort, vmPort, err := NewSocketPair()
517 if err != nil {
518 return nil, nil, fmt.Errorf("failed to get socketpair: %w", err)
519 }
520 switchPorts = append(switchPorts, switchPort)
521 vmPorts = append(vmPorts, vmPort)
522 }
523
524 if opts.NumNodes == 0 {
525 return nil, nil, errors.New("refusing to start cluster with zero nodes")
526 }
527
528 if opts.NumNodes > 2 {
529 return nil, nil, errors.New("launching more than 2 nodes is unsupported pending replacement of golden tickets")
530 }
531
532 go func() {
Serge Bazanski0ed2f962021-03-15 16:39:30 +0100533 if err := Launch(ctx, Options{
534 ConnectToSocket: vmPorts[0],
535 NodeParameters: &apb.NodeParameters{
536 Cluster: &apb.NodeParameters_ClusterBootstrap_{
537 ClusterBootstrap: &apb.NodeParameters_ClusterBootstrap{},
538 },
539 },
540 }); err != nil {
541
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200542 // Launch() only terminates when QEMU has terminated. At that point our function
543 // probably doesn't run anymore so we have no way of communicating the error back
544 // up, so let's just log it. Also a failure in launching VMs should be very visible
545 // by the unavailability of the clients we return.
Lorenz Bruned0503c2020-07-28 17:21:25 +0200546 log.Printf("Failed to launch vm0: %v", err)
547 }
548 }()
549
550 portMap, err := ConflictFreePortMap(NanoswitchPorts)
551 if err != nil {
552 return nil, nil, fmt.Errorf("failed to allocate ephemeral ports: %w", err)
553 }
554
555 go func() {
556 if err := RunMicroVM(ctx, &MicroVMOptions{
Serge Bazanskif055a7f2021-04-13 16:22:33 +0200557 KernelPath: "metropolis/test/ktest/vmlinux",
Serge Bazanski77cb6c52020-12-19 00:09:22 +0100558 InitramfsPath: "metropolis/test/nanoswitch/initramfs.lz4",
Lorenz Bruned0503c2020-07-28 17:21:25 +0200559 ExtraNetworkInterfaces: switchPorts,
560 PortMap: portMap,
561 }); err != nil {
562 log.Printf("Failed to launch nanoswitch: %v", err)
563 }
564 }()
565 copts := []grpcretry.CallOption{
566 grpcretry.WithBackoff(grpcretry.BackoffExponential(100 * time.Millisecond)),
567 }
Serge Bazanski549b72b2021-01-07 14:54:19 +0100568 conn, err := portMap.DialGRPC(node.DebugServicePort, grpc.WithInsecure(),
Lorenz Bruned0503c2020-07-28 17:21:25 +0200569 grpc.WithUnaryInterceptor(grpcretry.UnaryClientInterceptor(copts...)))
570 if err != nil {
571 return nil, nil, fmt.Errorf("failed to dial debug service: %w", err)
572 }
Lorenz Bruned0503c2020-07-28 17:21:25 +0200573 debug := apb.NewNodeDebugServiceClient(conn)
574
575 if opts.NumNodes == 2 {
Serge Bazanski0ed2f962021-03-15 16:39:30 +0100576 return nil, nil, fmt.Errorf("multinode unimplemented")
Lorenz Bruned0503c2020-07-28 17:21:25 +0200577 }
578
579 return debug, portMap, nil
580}