blob: 60cf424e6dd0e84eed3dcd0e575060278af633b4 [file] [log] [blame]
// Copyright The Monogon Project Authors.
// SPDX-License-Identifier: Apache-2.0

// cluster builds on the launch package and implements launching Metropolis
// nodes and clusters in a virtualized environment using qemu. It's kept in a
// separate package as it depends on a Metropolis node image, which might not be
// required for some use of the launch library.
Tim Windelschmidt9f21f532024-05-07 15:14:20 +02008package launch
Serge Bazanski66e58952021-10-05 17:06:56 +02009
10import (
11 "bytes"
12 "context"
Serge Bazanski1f8cad72023-03-20 16:58:10 +010013 "crypto/ed25519"
Serge Bazanski66e58952021-10-05 17:06:56 +020014 "crypto/rand"
15 "crypto/tls"
Serge Bazanski54e212a2023-06-14 13:45:11 +020016 "crypto/x509"
Serge Bazanskia0bc6d32023-06-28 18:57:40 +020017 "encoding/pem"
Serge Bazanski66e58952021-10-05 17:06:56 +020018 "errors"
19 "fmt"
20 "io"
Serge Bazanski66e58952021-10-05 17:06:56 +020021 "net"
Lorenz Brun150f24a2023-07-13 20:11:06 +020022 "net/http"
Serge Bazanski66e58952021-10-05 17:06:56 +020023 "os"
24 "os/exec"
Leopoldacfad5b2023-01-15 14:05:25 +010025 "path"
Serge Bazanski66e58952021-10-05 17:06:56 +020026 "path/filepath"
Serge Bazanski53458ba2024-06-18 09:56:46 +000027 "strconv"
Serge Bazanski630fb5c2023-04-06 10:50:24 +020028 "strings"
Serge Bazanski66e58952021-10-05 17:06:56 +020029 "syscall"
30 "time"
31
32 "github.com/cenkalti/backoff/v4"
Serge Bazanski66e58952021-10-05 17:06:56 +020033 "go.uber.org/multierr"
Serge Bazanskibe742842022-04-04 13:18:50 +020034 "golang.org/x/net/proxy"
Lorenz Brun87bbf7e2024-03-18 18:22:25 +010035 "golang.org/x/sys/unix"
Serge Bazanski66e58952021-10-05 17:06:56 +020036 "google.golang.org/grpc"
Serge Bazanski636032e2022-01-26 14:21:33 +010037 "google.golang.org/grpc/codes"
38 "google.golang.org/grpc/status"
Serge Bazanski66e58952021-10-05 17:06:56 +020039 "google.golang.org/protobuf/proto"
Serge Bazanskia0bc6d32023-06-28 18:57:40 +020040 "k8s.io/client-go/kubernetes"
41 "k8s.io/client-go/rest"
Jan Schärd1a8b642024-12-03 17:40:41 +010042 "k8s.io/utils/ptr"
Serge Bazanski66e58952021-10-05 17:06:56 +020043
Jan Schär0f8ce4c2025-09-04 13:27:50 +020044 "source.monogon.dev/metropolis/node/allocs"
Serge Bazanski37cfcc12024-03-21 11:59:07 +010045 ipb "source.monogon.dev/metropolis/node/core/curator/proto/api"
Tim Windelschmidtbe25a3b2023-07-19 16:31:56 +020046 apb "source.monogon.dev/metropolis/proto/api"
47 cpb "source.monogon.dev/metropolis/proto/common"
48
Serge Bazanskica8d9512024-09-12 14:20:57 +020049 "source.monogon.dev/go/logging"
Serge Bazanski1f8cad72023-03-20 16:58:10 +010050 metroctl "source.monogon.dev/metropolis/cli/metroctl/core"
Jan Schäre19d2792025-06-23 12:37:58 +000051 "source.monogon.dev/metropolis/installer/install"
Serge Bazanski66e58952021-10-05 17:06:56 +020052 "source.monogon.dev/metropolis/node"
53 "source.monogon.dev/metropolis/node/core/rpc"
Serge Bazanski5bb8a332022-06-23 17:41:33 +020054 "source.monogon.dev/metropolis/node/core/rpc/resolver"
Jan Schär3b0c8dd2025-06-23 10:32:07 +000055 "source.monogon.dev/osbase/blockdev"
Jan Schär3b0c8dd2025-06-23 10:32:07 +000056 "source.monogon.dev/osbase/oci"
Jan Schäre19d2792025-06-23 12:37:58 +000057 "source.monogon.dev/osbase/oci/osimage"
Jan Schär9d2f3c62025-04-14 11:17:22 +000058 "source.monogon.dev/osbase/oci/registry"
Jan Schär3b0c8dd2025-06-23 10:32:07 +000059 "source.monogon.dev/osbase/structfs"
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +010060 "source.monogon.dev/osbase/test/qemu"
Serge Bazanski66e58952021-10-05 17:06:56 +020061)
62
const (
	// NodeNumberKey is the key of the node label used to carry a node's
	// numerical index in the test system.
	NodeNumberKey string = "test-node-number"
)
68
Leopold20a036e2023-01-15 00:17:19 +010069// NodeOptions contains all options that can be passed to Launch()
Serge Bazanski66e58952021-10-05 17:06:56 +020070type NodeOptions struct {
Leopoldaf5086b2023-01-15 14:12:42 +010071 // Name is a human-readable identifier to be used in debug output.
72 Name string
73
Jan Schära9b060b2024-08-07 10:42:29 +020074 // CPUs is the number of virtual CPUs of the VM.
75 CPUs int
76
77 // ThreadsPerCPU is the number of threads per CPU. This is multiplied by
78 // CPUs to get the total number of threads.
79 ThreadsPerCPU int
80
81 // MemoryMiB is the RAM size in MiB of the VM.
82 MemoryMiB int
83
Jan Schär07003572024-08-26 10:42:16 +020084 // DiskBytes contains the size of the root disk in bytes or zero if the
85 // unmodified image size is used.
86 DiskBytes uint64
87
Serge Bazanski66e58952021-10-05 17:06:56 +020088 // Ports contains the port mapping where to expose the internal ports of the VM to
89 // the host. See IdentityPortMap() and ConflictFreePortMap(). Ignored when
90 // ConnectToSocket is set.
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +010091 Ports qemu.PortMap
Serge Bazanski66e58952021-10-05 17:06:56 +020092
Leopold20a036e2023-01-15 00:17:19 +010093 // If set to true, reboots are honored. Otherwise, all reboots exit the Launch()
94 // command. Metropolis nodes generally restart on almost all errors, so unless you
Serge Bazanski66e58952021-10-05 17:06:56 +020095 // want to test reboot behavior this should be false.
96 AllowReboot bool
97
Leopold20a036e2023-01-15 00:17:19 +010098 // By default, the VM is connected to the Host via SLIRP. If ConnectToSocket is
99 // set, it is instead connected to the given file descriptor/socket. If this is
100 // set, all port maps from the Ports option are ignored. Intended for networking
101 // this instance together with others for running more complex network
102 // configurations.
Serge Bazanski66e58952021-10-05 17:06:56 +0200103 ConnectToSocket *os.File
104
Leopoldacfad5b2023-01-15 14:05:25 +0100105 // When PcapDump is set, all traffic is dumped to a pcap file in the
106 // runtime directory (e.g. "net0.pcap" for the first interface).
107 PcapDump bool
108
Leopold20a036e2023-01-15 00:17:19 +0100109 // SerialPort is an io.ReadWriter over which you can communicate with the serial
110 // port of the machine. It can be set to an existing file descriptor (like
Serge Bazanski66e58952021-10-05 17:06:56 +0200111 // os.Stdout/os.Stderr) or any Go structure implementing this interface.
112 SerialPort io.ReadWriter
113
114 // NodeParameters is passed into the VM and subsequently used for bootstrapping or
115 // registering into a cluster.
116 NodeParameters *apb.NodeParameters
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200117
118 // Mac is the node's MAC address.
119 Mac *net.HardwareAddr
120
121 // Runtime keeps the node's QEMU runtime state.
122 Runtime *NodeRuntime
Serge Bazanski62e6f0b2024-09-03 12:18:56 +0200123
124 // RunVNC starts a VNC socket for troubleshooting/testing console code. Note:
125 // this will not work in tests, as those use a built-in qemu which does not
126 // implement a VGA device.
127 RunVNC bool
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200128}
129
Leopold20a036e2023-01-15 00:17:19 +0100130// NodeRuntime keeps the node's QEMU runtime options.
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200131type NodeRuntime struct {
132 // ld points at the node's launch directory storing data such as storage
133 // images, firmware variables or the TPM state.
134 ld string
135 // sd points at the node's socket directory.
136 sd string
137
138 // ctxT is the context QEMU will execute in.
139 ctxT context.Context
140 // CtxC is the QEMU context's cancellation function.
141 CtxC context.CancelFunc
Serge Bazanski66e58952021-10-05 17:06:56 +0200142}
143
144// NodePorts is the list of ports a fully operational Metropolis node listens on
Jan Schär0f8ce4c2025-09-04 13:27:50 +0200145var NodePorts = []allocs.Port{
146 allocs.PortConsensus,
Serge Bazanski66e58952021-10-05 17:06:56 +0200147
Jan Schär0f8ce4c2025-09-04 13:27:50 +0200148 allocs.PortCuratorService,
149 allocs.PortDebugService,
Serge Bazanski66e58952021-10-05 17:06:56 +0200150
Jan Schär0f8ce4c2025-09-04 13:27:50 +0200151 allocs.PortKubernetesAPI,
152 allocs.PortKubernetesAPIWrapped,
153 allocs.PortCuratorService,
154 allocs.PortDebugger,
155 allocs.PortMetrics,
Serge Bazanski66e58952021-10-05 17:06:56 +0200156}
157
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200158// setupRuntime creates the node's QEMU runtime directory, together with all
159// files required to preserve its state, a level below the chosen path ld. The
160// node's socket directory is similarily created a level below sd. It may
161// return an I/O error.
Jan Schär07003572024-08-26 10:42:16 +0200162func setupRuntime(ld, sd string, diskBytes uint64) (*NodeRuntime, error) {
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200163 // Create a temporary directory to keep all the runtime files.
164 stdp, err := os.MkdirTemp(ld, "node_state*")
165 if err != nil {
166 return nil, fmt.Errorf("failed to create the state directory: %w", err)
167 }
168
Jan Schär3b0c8dd2025-06-23 10:32:07 +0000169 // Initialize the node's storage.
Jan Schär2963b682025-07-17 17:03:44 +0200170 ociImage, err := oci.AsImage(oci.ReadLayout(xNodeImagePath))
Jan Schär07003572024-08-26 10:42:16 +0200171 if err != nil {
Jan Schär3b0c8dd2025-06-23 10:32:07 +0000172 return nil, fmt.Errorf("failed to read OS image: %w", err)
Jan Schär07003572024-08-26 10:42:16 +0200173 }
Jan Schäre19d2792025-06-23 12:37:58 +0000174 osImage, err := osimage.Read(ociImage)
Jan Schär3b0c8dd2025-06-23 10:32:07 +0000175 if err != nil {
176 return nil, fmt.Errorf("failed to read OS image: %w", err)
177 }
Jan Schär07003572024-08-26 10:42:16 +0200178
Jan Schär3b0c8dd2025-06-23 10:32:07 +0000179 abloader, err := structfs.OSPathBlob(xAbloaderPath)
180 if err != nil {
181 return nil, fmt.Errorf("cannot open abloader: %w", err)
182 }
183
184 di := filepath.Join(stdp, "image.img")
185 logf("Cluster: generating node image: %s -> %s", xNodeImagePath, di)
186
187 df, err := blockdev.CreateFile(di, 512, int64(diskBytes/512))
Serge Bazanskidd5b03c2024-05-16 18:07:06 +0200188 if err != nil {
189 return nil, fmt.Errorf("while opening image for writing: %w", err)
190 }
191 defer df.Close()
Jan Schär3b0c8dd2025-06-23 10:32:07 +0000192
Jan Schäre19d2792025-06-23 12:37:58 +0000193 installParams := &install.Params{
194 PartitionSize: install.PartitionSizeInfo{
Jan Schär3b0c8dd2025-06-23 10:32:07 +0000195 ESP: 128,
196 System: 1024,
197 Data: 128,
198 },
Jan Schärdaf9e952025-06-23 13:28:16 +0000199 OSImage: osImage,
200 UnverifiedPayloads: true,
201 ABLoader: abloader,
202 Output: df,
Jan Schär3b0c8dd2025-06-23 10:32:07 +0000203 }
Jan Schäre19d2792025-06-23 12:37:58 +0000204
205 if _, err := install.Write(installParams); err != nil {
Jan Schär3b0c8dd2025-06-23 10:32:07 +0000206 return nil, fmt.Errorf("while creating node image: %w", err)
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200207 }
208
209 // Initialize the OVMF firmware variables file.
Tim Windelschmidt82e6af72024-07-23 00:05:42 +0000210 dv := filepath.Join(stdp, filepath.Base(xOvmfVarsPath))
211 if err := copyFile(xOvmfVarsPath, dv); err != nil {
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200212 return nil, fmt.Errorf("while copying firmware variables: %w", err)
213 }
214
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200215 // Create the socket directory.
216 sotdp, err := os.MkdirTemp(sd, "node_sock*")
217 if err != nil {
218 return nil, fmt.Errorf("failed to create the socket directory: %w", err)
219 }
220
221 return &NodeRuntime{
222 ld: stdp,
223 sd: sotdp,
224 }, nil
225}
226
Serge Bazanski5bb8a332022-06-23 17:41:33 +0200227// CuratorClient returns an authenticated owner connection to a Curator
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200228// instance within Cluster c, or nil together with an error.
Serge Bazanski5bb8a332022-06-23 17:41:33 +0200229func (c *Cluster) CuratorClient() (*grpc.ClientConn, error) {
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200230 if c.authClient == nil {
Serge Bazanski8535cb52023-03-29 14:15:08 +0200231 authCreds := rpc.NewAuthenticatedCredentials(c.Owner, rpc.WantInsecure())
Serge Bazanskica8d9512024-09-12 14:20:57 +0200232 r := resolver.New(c.ctxT, resolver.WithLogger(logging.NewFunctionBackend(func(severity logging.Severity, msg string) {
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100233 logf("Cluster: client resolver: %s: %s", severity, msg)
Serge Bazanskica8d9512024-09-12 14:20:57 +0200234 })))
Tim Windelschmidt9bd9bd42025-02-14 17:08:52 +0100235 for _, n := range c.Nodes {
236 r.AddEndpoint(resolver.NodeAtAddressWithDefaultPort(n.ManagementAddress))
Serge Bazanski5bb8a332022-06-23 17:41:33 +0200237 }
Tim Windelschmidt9bd9bd42025-02-14 17:08:52 +0100238 authClient, err := grpc.NewClient(resolver.MetropolisControlAddress,
Serge Bazanski5bb8a332022-06-23 17:41:33 +0200239 grpc.WithTransportCredentials(authCreds),
240 grpc.WithResolvers(r),
241 grpc.WithContextDialer(c.DialNode),
242 )
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200243 if err != nil {
Tim Windelschmidt9bd9bd42025-02-14 17:08:52 +0100244 return nil, fmt.Errorf("creating client with owner credentials failed: %w", err)
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200245 }
246 c.authClient = authClient
247 }
248 return c.authClient, nil
249}
250
Tim Windelschmidt2137bd62025-08-25 14:03:12 +0200251// NodeClient returns an authenticated owner connection to the specified
252// node within Cluster c, or nil together with an error.
253func (c *Cluster) NodeClient(n *NodeInCluster) (*grpc.ClientConn, error) {
254 authCreds := rpc.NewAuthenticatedCredentials(c.Owner, rpc.WantRemoteCluster(c.CACertificate), rpc.WantRemoteNode(n.ID))
255 endpoint := net.JoinHostPort(n.ManagementAddress, allocs.PortNodeManagement.PortString())
256 authClient, err := grpc.NewClient(endpoint,
257 grpc.WithTransportCredentials(authCreds),
258 grpc.WithContextDialer(c.DialNode),
259 )
260 if err != nil {
261 return nil, fmt.Errorf("creating client with owner credentials failed: %w", err)
262 }
263 return authClient, nil
264}
265
Serge Bazanski66e58952021-10-05 17:06:56 +0200266// LaunchNode launches a single Metropolis node instance with the given options.
267// The instance runs mostly paravirtualized but with some emulated hardware
268// similar to how a cloud provider might set up its VMs. The disk is fully
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200269// writable, and the changes are kept across reboots and shutdowns. ld and sd
270// point to the launch directory and the socket directory, holding the nodes'
271// state files (storage, tpm state, firmware state), and UNIX socket files
272// (swtpm <-> QEMU interplay) respectively. The directories must exist before
273// LaunchNode is called. LaunchNode will update options.Runtime and options.Mac
274// if either are not initialized.
Serge Bazanski2b6dc312024-06-04 17:44:55 +0200275func LaunchNode(ctx context.Context, ld, sd string, tpmFactory *TPMFactory, options *NodeOptions, doneC chan error) error {
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200276 // TODO(mateusz@monogon.tech) try using QEMU's abstract socket namespace instead
277 // of /tmp (requires QEMU version >5.0).
Serge Bazanski66e58952021-10-05 17:06:56 +0200278 // https://github.com/qemu/qemu/commit/776b97d3605ed0fc94443048fdf988c7725e38a9).
279 // swtpm accepts already-open FDs so we can pass in an abstract socket namespace FD
280 // that we open and pass the name of it to QEMU. Not pinning this crashes both
281 // swtpm and qemu because we run into UNIX socket length limitations (for legacy
282 // reasons 108 chars).
Serge Bazanski66e58952021-10-05 17:06:56 +0200283
Jan Schära9b060b2024-08-07 10:42:29 +0200284 if options.CPUs == 0 {
285 options.CPUs = 1
286 }
287 if options.ThreadsPerCPU == 0 {
288 options.ThreadsPerCPU = 1
289 }
290 if options.MemoryMiB == 0 {
291 options.MemoryMiB = 2048
292 }
Jan Schär3b0c8dd2025-06-23 10:32:07 +0000293 if options.DiskBytes == 0 {
294 options.DiskBytes = 5 * 1024 * 1024 * 1024
295 }
Jan Schära9b060b2024-08-07 10:42:29 +0200296
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200297 // If it's the node's first start, set up its runtime directories.
298 if options.Runtime == nil {
Jan Schär07003572024-08-26 10:42:16 +0200299 r, err := setupRuntime(ld, sd, options.DiskBytes)
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200300 if err != nil {
301 return fmt.Errorf("while setting up node runtime: %w", err)
Serge Bazanski66e58952021-10-05 17:06:56 +0200302 }
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200303 options.Runtime = r
Serge Bazanski66e58952021-10-05 17:06:56 +0200304 }
305
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200306 // Replace the node's context with a new one.
307 r := options.Runtime
308 if r.CtxC != nil {
309 r.CtxC()
310 }
311 r.ctxT, r.CtxC = context.WithCancel(ctx)
312
Serge Bazanski66e58952021-10-05 17:06:56 +0200313 var qemuNetType string
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100314 var qemuNetConfig qemu.QemuValue
Serge Bazanski66e58952021-10-05 17:06:56 +0200315 if options.ConnectToSocket != nil {
316 qemuNetType = "socket"
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100317 qemuNetConfig = qemu.QemuValue{
Serge Bazanski66e58952021-10-05 17:06:56 +0200318 "id": {"net0"},
319 "fd": {"3"},
320 }
321 } else {
322 qemuNetType = "user"
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100323 qemuNetConfig = qemu.QemuValue{
Serge Bazanski66e58952021-10-05 17:06:56 +0200324 "id": {"net0"},
325 "net": {"10.42.0.0/24"},
326 "dhcpstart": {"10.42.0.10"},
327 "hostfwd": options.Ports.ToQemuForwards(),
328 }
329 }
330
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200331 // Generate the node's MAC address if it isn't already set in NodeOptions.
332 if options.Mac == nil {
333 mac, err := generateRandomEthernetMAC()
334 if err != nil {
335 return err
336 }
337 options.Mac = mac
Serge Bazanski66e58952021-10-05 17:06:56 +0200338 }
339
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200340 tpmSocketPath := filepath.Join(r.sd, "tpm-socket")
Tim Windelschmidt12240f92025-04-28 14:59:33 +0200341 fwVarPath := filepath.Join(r.ld, "VARS.fd")
Jan Schär3b0c8dd2025-06-23 10:32:07 +0000342 storagePath := filepath.Join(r.ld, "image.img")
Lorenz Brun150f24a2023-07-13 20:11:06 +0200343 qemuArgs := []string{
Jan Schära9b060b2024-08-07 10:42:29 +0200344 "-machine", "q35",
345 "-accel", "kvm",
Serge Bazanski62e6f0b2024-09-03 12:18:56 +0200346 "-display", "none",
Jan Schära9b060b2024-08-07 10:42:29 +0200347 "-nodefaults",
348 "-cpu", "host",
349 "-m", fmt.Sprintf("%dM", options.MemoryMiB),
350 "-smp", fmt.Sprintf("cores=%d,threads=%d", options.CPUs, options.ThreadsPerCPU),
Tim Windelschmidt82e6af72024-07-23 00:05:42 +0000351 "-drive", "if=pflash,format=raw,readonly=on,file=" + xOvmfCodePath,
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200352 "-drive", "if=pflash,format=raw,file=" + fwVarPath,
Jan Schär3b0c8dd2025-06-23 10:32:07 +0000353 "-drive", "if=virtio,format=raw,cache=unsafe,file=" + storagePath,
Serge Bazanski66e58952021-10-05 17:06:56 +0200354 "-netdev", qemuNetConfig.ToOption(qemuNetType),
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200355 "-device", "virtio-net-pci,netdev=net0,mac=" + options.Mac.String(),
Serge Bazanski66e58952021-10-05 17:06:56 +0200356 "-chardev", "socket,id=chrtpm,path=" + tpmSocketPath,
357 "-tpmdev", "emulator,id=tpm0,chardev=chrtpm",
358 "-device", "tpm-tis,tpmdev=tpm0",
359 "-device", "virtio-rng-pci",
Lorenz Brun150f24a2023-07-13 20:11:06 +0200360 "-serial", "stdio",
361 }
Serge Bazanski62e6f0b2024-09-03 12:18:56 +0200362 if options.RunVNC {
363 vncSocketPath := filepath.Join(r.sd, "vnc-socket")
364 qemuArgs = append(qemuArgs,
365 "-vnc", "unix:"+vncSocketPath,
366 "-device", "virtio-vga",
367 )
368 }
Serge Bazanski66e58952021-10-05 17:06:56 +0200369
370 if !options.AllowReboot {
371 qemuArgs = append(qemuArgs, "-no-reboot")
372 }
373
374 if options.NodeParameters != nil {
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200375 parametersPath := filepath.Join(r.ld, "parameters.pb")
Serge Bazanski66e58952021-10-05 17:06:56 +0200376 parametersRaw, err := proto.Marshal(options.NodeParameters)
377 if err != nil {
378 return fmt.Errorf("failed to encode node paraeters: %w", err)
379 }
Lorenz Brun150f24a2023-07-13 20:11:06 +0200380 if err := os.WriteFile(parametersPath, parametersRaw, 0o644); err != nil {
Serge Bazanski66e58952021-10-05 17:06:56 +0200381 return fmt.Errorf("failed to write node parameters: %w", err)
382 }
383 qemuArgs = append(qemuArgs, "-fw_cfg", "name=dev.monogon.metropolis/parameters.pb,file="+parametersPath)
384 }
385
Leopoldacfad5b2023-01-15 14:05:25 +0100386 if options.PcapDump {
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100387 qemuNetDump := qemu.QemuValue{
Tim Windelschmidta7a82f32024-04-11 01:40:25 +0200388 "id": {"net0"},
389 "netdev": {"net0"},
390 "file": {filepath.Join(r.ld, "net0.pcap")},
Leopoldacfad5b2023-01-15 14:05:25 +0100391 }
392 qemuArgs = append(qemuArgs, "-object", qemuNetDump.ToOption("filter-dump"))
393 }
394
Serge Bazanski2b6dc312024-06-04 17:44:55 +0200395 // Manufacture TPM if needed.
396 tpmd := filepath.Join(r.ld, "tpm")
Tim Windelschmidt82e6af72024-07-23 00:05:42 +0000397 err := tpmFactory.Manufacture(ctx, tpmd, &TPMPlatform{
Serge Bazanski2b6dc312024-06-04 17:44:55 +0200398 Manufacturer: "Monogon",
399 Version: "1.0",
400 Model: "TestCluster",
401 })
402 if err != nil {
403 return fmt.Errorf("could not manufacture TPM: %w", err)
404 }
405
Serge Bazanski66e58952021-10-05 17:06:56 +0200406 // Start TPM emulator as a subprocess
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200407 tpmCtx, tpmCancel := context.WithCancel(options.Runtime.ctxT)
Serge Bazanski66e58952021-10-05 17:06:56 +0200408
Tim Windelschmidt82e6af72024-07-23 00:05:42 +0000409 tpmEmuCmd := exec.CommandContext(tpmCtx, xSwtpmPath, "socket", "--tpm2", "--tpmstate", "dir="+tpmd, "--ctrl", "type=unixio,path="+tpmSocketPath)
Serge Bazanskib07c57a2024-06-04 14:33:27 +0000410 // Silence warnings from unsafe libtpms build (uses non-constant-time
411 // cryptographic operations).
412 tpmEmuCmd.Env = append(tpmEmuCmd.Env, "MONOGON_LIBTPMS_ACKNOWLEDGE_UNSAFE=yes")
Serge Bazanski66e58952021-10-05 17:06:56 +0200413 tpmEmuCmd.Stderr = os.Stderr
414 tpmEmuCmd.Stdout = os.Stdout
415
Tim Windelschmidt244b5672024-02-06 10:18:56 +0100416 err = tpmEmuCmd.Start()
Serge Bazanski66e58952021-10-05 17:06:56 +0200417 if err != nil {
Serge Bazanskiee8c81b2024-04-03 11:59:38 +0200418 tpmCancel()
Serge Bazanski66e58952021-10-05 17:06:56 +0200419 return fmt.Errorf("failed to start TPM emulator: %w", err)
420 }
421
Mateusz Zalegae90f4a12022-05-25 18:24:01 +0200422 // Wait for the socket to be created by the TPM emulator before launching
423 // QEMU.
424 for {
425 _, err := os.Stat(tpmSocketPath)
426 if err == nil {
427 break
428 }
Tim Windelschmidta7a82f32024-04-11 01:40:25 +0200429 if !os.IsNotExist(err) {
Serge Bazanskiee8c81b2024-04-03 11:59:38 +0200430 tpmCancel()
Mateusz Zalegae90f4a12022-05-25 18:24:01 +0200431 return fmt.Errorf("while stat-ing TPM socket path: %w", err)
432 }
433 if err := tpmCtx.Err(); err != nil {
Serge Bazanskiee8c81b2024-04-03 11:59:38 +0200434 tpmCancel()
Mateusz Zalegae90f4a12022-05-25 18:24:01 +0200435 return fmt.Errorf("while waiting for the TPM socket: %w", err)
436 }
437 time.Sleep(time.Millisecond * 100)
438 }
439
Serge Bazanski66e58952021-10-05 17:06:56 +0200440 // Start the main qemu binary
Tim Windelschmidt8f1efe92025-04-01 01:28:43 +0200441 systemCmd := exec.CommandContext(options.Runtime.ctxT, xQEMUPath, qemuArgs...)
Serge Bazanski66e58952021-10-05 17:06:56 +0200442 if options.ConnectToSocket != nil {
443 systemCmd.ExtraFiles = []*os.File{options.ConnectToSocket}
444 }
445
446 var stdErrBuf bytes.Buffer
447 systemCmd.Stderr = &stdErrBuf
448 systemCmd.Stdout = options.SerialPort
449
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100450 qemu.PrettyPrintQemuArgs(options.Name, systemCmd.Args)
Leopoldaf5086b2023-01-15 14:12:42 +0100451
Serge Bazanskiee8c81b2024-04-03 11:59:38 +0200452 go func() {
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100453 logf("Node: Starting...")
Serge Bazanskiee8c81b2024-04-03 11:59:38 +0200454 err = systemCmd.Run()
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100455 logf("Node: Returned: %v", err)
Serge Bazanski66e58952021-10-05 17:06:56 +0200456
Serge Bazanskiee8c81b2024-04-03 11:59:38 +0200457 // Stop TPM emulator and wait for it to exit to properly reap the child process
458 tpmCancel()
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100459 logf("Node: Waiting for TPM emulator to exit")
Serge Bazanskiee8c81b2024-04-03 11:59:38 +0200460 // Wait returns a SIGKILL error because we just cancelled its context.
461 // We still need to call it to avoid creating zombies.
462 errTpm := tpmEmuCmd.Wait()
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100463 logf("Node: TPM emulator done: %v", errTpm)
Serge Bazanski66e58952021-10-05 17:06:56 +0200464
Serge Bazanskiee8c81b2024-04-03 11:59:38 +0200465 var exerr *exec.ExitError
466 if err != nil && errors.As(err, &exerr) {
467 status := exerr.ProcessState.Sys().(syscall.WaitStatus)
468 if status.Signaled() && status.Signal() == syscall.SIGKILL {
469 // Process was killed externally (most likely by our context being canceled).
470 // This is a normal exit for us, so return nil
471 doneC <- nil
472 return
473 }
474 exerr.Stderr = stdErrBuf.Bytes()
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100475 newErr := qemu.QEMUError(*exerr)
476 logf("Node: %q", stdErrBuf.String())
Serge Bazanskiee8c81b2024-04-03 11:59:38 +0200477 doneC <- &newErr
478 return
Serge Bazanski66e58952021-10-05 17:06:56 +0200479 }
Serge Bazanskiee8c81b2024-04-03 11:59:38 +0200480 doneC <- err
481 }()
482 return nil
Serge Bazanski66e58952021-10-05 17:06:56 +0200483}
484
485func copyFile(src, dst string) error {
486 in, err := os.Open(src)
487 if err != nil {
488 return fmt.Errorf("when opening source: %w", err)
489 }
490 defer in.Close()
491
492 out, err := os.Create(dst)
493 if err != nil {
494 return fmt.Errorf("when creating destination: %w", err)
495 }
496 defer out.Close()
497
Lorenz Brun87bbf7e2024-03-18 18:22:25 +0100498 endPos, err := in.Seek(0, io.SeekEnd)
Serge Bazanski66e58952021-10-05 17:06:56 +0200499 if err != nil {
Lorenz Brun87bbf7e2024-03-18 18:22:25 +0100500 return fmt.Errorf("when getting source end: %w", err)
Serge Bazanski66e58952021-10-05 17:06:56 +0200501 }
Lorenz Brun87bbf7e2024-03-18 18:22:25 +0100502
503 // Copy the file while preserving its sparseness. The image files are very
504 // sparse (less than 10% allocated), so this is a lot faster.
505 var lastHoleStart int64
506 for {
507 dataStart, err := in.Seek(lastHoleStart, unix.SEEK_DATA)
508 if err != nil {
509 return fmt.Errorf("when seeking to next data block: %w", err)
510 }
511 holeStart, err := in.Seek(dataStart, unix.SEEK_HOLE)
512 if err != nil {
513 return fmt.Errorf("when seeking to next hole: %w", err)
514 }
515 lastHoleStart = holeStart
516 if _, err := in.Seek(dataStart, io.SeekStart); err != nil {
517 return fmt.Errorf("when seeking to current data block: %w", err)
518 }
519 if _, err := out.Seek(dataStart, io.SeekStart); err != nil {
520 return fmt.Errorf("when seeking output to next data block: %w", err)
521 }
522 if _, err := io.CopyN(out, in, holeStart-dataStart); err != nil {
523 return fmt.Errorf("when copying file: %w", err)
524 }
525 if endPos == holeStart {
526 // The next hole is at the end of the file, we're done here.
527 break
528 }
529 }
530
Serge Bazanski66e58952021-10-05 17:06:56 +0200531 return out.Close()
532}
533
Serge Bazanskie78a0892021-10-07 17:03:49 +0200534// getNodes wraps around Management.GetNodes to return a list of nodes in a
535// cluster.
536func getNodes(ctx context.Context, mgmt apb.ManagementClient) ([]*apb.Node, error) {
Serge Bazanskie78a0892021-10-07 17:03:49 +0200537 var res []*apb.Node
Serge Bazanski636032e2022-01-26 14:21:33 +0100538 bo := backoff.WithContext(backoff.NewExponentialBackOff(), ctx)
Serge Bazanski075465c2021-11-16 15:38:49 +0100539 err := backoff.Retry(func() error {
540 res = nil
541 srvN, err := mgmt.GetNodes(ctx, &apb.GetNodesRequest{})
Serge Bazanskie78a0892021-10-07 17:03:49 +0200542 if err != nil {
Serge Bazanski075465c2021-11-16 15:38:49 +0100543 return fmt.Errorf("GetNodes: %w", err)
Serge Bazanskie78a0892021-10-07 17:03:49 +0200544 }
Serge Bazanski075465c2021-11-16 15:38:49 +0100545 for {
546 node, err := srvN.Recv()
547 if err == io.EOF {
548 break
549 }
550 if err != nil {
551 return fmt.Errorf("GetNodes.Recv: %w", err)
552 }
553 res = append(res, node)
554 }
555 return nil
556 }, bo)
557 if err != nil {
558 return nil, err
Serge Bazanskie78a0892021-10-07 17:03:49 +0200559 }
560 return res, nil
561}
562
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200563// getNode wraps Management.GetNodes. It returns node information matching
564// given node ID.
565func getNode(ctx context.Context, mgmt apb.ManagementClient, id string) (*apb.Node, error) {
566 nodes, err := getNodes(ctx, mgmt)
567 if err != nil {
568 return nil, fmt.Errorf("could not get nodes: %w", err)
569 }
570 for _, n := range nodes {
Jan Schär39d9c242024-09-24 13:49:55 +0200571 if n.Id == id {
572 return n, nil
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200573 }
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200574 }
Tim Windelschmidt73e98822024-04-18 23:13:49 +0200575 return nil, fmt.Errorf("no such node")
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200576}
577
// generateRandomEthernetMAC returns a random EUI-48 Ethernet MAC address
// marked as locally administered and individual (unicast). It fails only if
// reading from the system's random source fails.
func generateRandomEthernetMAC() (*net.HardwareAddr, error) {
	macBuf := make([]byte, 6)
	if _, err := rand.Read(macBuf); err != nil {
		return nil, fmt.Errorf("failed to read randomness for MAC: %w", err)
	}

	// Set U/L bit and clear I/G bit (locally administered individual MAC)
	// Ref IEEE 802-2014 Section 8.2.2
	macBuf[0] |= 0x02
	macBuf[0] &^= 0x01

	mac := net.HardwareAddr(macBuf)
	return &mac, nil
}
592
// SOCKSPort is the port of the SOCKS proxy to the switch network. It is
// listed in ClusterPorts alongside the ports forwarded to the first node.
const SOCKSPort uint16 = 1080
Serge Bazanski66e58952021-10-05 17:06:56 +0200594
Serge Bazanskibe742842022-04-04 13:18:50 +0200595// ClusterPorts contains all ports handled by Nanoswitch.
596var ClusterPorts = []uint16{
597 // Forwarded to the first node.
Jan Schär0f8ce4c2025-09-04 13:27:50 +0200598 uint16(allocs.PortCuratorService),
599 uint16(allocs.PortDebugService),
600 uint16(allocs.PortKubernetesAPI),
601 uint16(allocs.PortKubernetesAPIWrapped),
Serge Bazanskibe742842022-04-04 13:18:50 +0200602
603 // SOCKS proxy to the switch network
604 SOCKSPort,
Serge Bazanski66e58952021-10-05 17:06:56 +0200605}
606
607// ClusterOptions contains all options for launching a Metropolis cluster.
608type ClusterOptions struct {
609 // The number of nodes this cluster should be started with.
610 NumNodes int
Serge Bazanskid09c58f2023-03-17 00:25:08 +0100611
Jan Schära9b060b2024-08-07 10:42:29 +0200612 // Node are default options of all nodes.
613 Node NodeOptions
614
Serge Bazanskid09c58f2023-03-17 00:25:08 +0100615 // If true, node logs will be saved to individual files instead of being printed
616 // out to stderr. The path of these files will be still printed to stdout.
617 //
618 // The files will be located within the launch directory inside TEST_TMPDIR (or
619 // the default tempdir location, if not set).
620 NodeLogsToFiles bool
Serge Bazanskia0bc6d32023-06-28 18:57:40 +0200621
622 // LeaveNodesNew, if set, will leave all non-bootstrap nodes in NEW, without
623 // bootstrapping them. The nodes' address information in Cluster.Nodes will be
624 // incomplete.
625 LeaveNodesNew bool
Lorenz Brun150f24a2023-07-13 20:11:06 +0200626
627 // Optional local registry which will be made available to the cluster to
628 // pull images from. This is a more efficient alternative to preseeding all
629 // images used for testing.
Jan Schär9d2f3c62025-04-14 11:17:22 +0000630 LocalRegistry *registry.Server
Serge Bazanskie564f172024-04-03 12:06:06 +0200631
632 // InitialClusterConfiguration will be passed to the first node when creating the
633 // cluster, and defines some basic properties of the cluster. If not specified,
634 // the cluster will default to defaults as defined in
635 // metropolis.proto.api.NodeParameters.
636 InitialClusterConfiguration *cpb.ClusterConfiguration
Serge Bazanski66e58952021-10-05 17:06:56 +0200637}
638
639// Cluster is the running Metropolis cluster launched using the LaunchCluster
640// function.
641type Cluster struct {
Serge Bazanski66e58952021-10-05 17:06:56 +0200642 // Owner is the TLS Certificate of the owner of the test cluster. This can be
643 // used to authenticate further clients to the running cluster.
644 Owner tls.Certificate
645 // Ports is the PortMap used to access the first nodes' services (defined in
Serge Bazanskibe742842022-04-04 13:18:50 +0200646 // ClusterPorts) and the SOCKS proxy (at SOCKSPort).
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100647 Ports qemu.PortMap
Serge Bazanski66e58952021-10-05 17:06:56 +0200648
Serge Bazanskibe742842022-04-04 13:18:50 +0200649 // Nodes is a map from Node ID to its runtime information.
650 Nodes map[string]*NodeInCluster
651 // NodeIDs is a list of node IDs that are backing this cluster, in order of
652 // creation.
653 NodeIDs []string
654
Serge Bazanski54e212a2023-06-14 13:45:11 +0200655 // CACertificate is the cluster's CA certificate.
656 CACertificate *x509.Certificate
657
Serge Bazanski66e58952021-10-05 17:06:56 +0200658 // nodesDone is a list of channels populated with the return codes from all the
659 // nodes' qemu instances. It's used by Close to ensure all nodes have
Leopold20a036e2023-01-15 00:17:19 +0100660 // successfully been stopped.
Serge Bazanski66e58952021-10-05 17:06:56 +0200661 nodesDone []chan error
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200662 // nodeOpts are the cluster member nodes' mutable launch options, kept here
663 // to facilitate reboots.
664 nodeOpts []NodeOptions
665 // launchDir points at the directory keeping the nodes' state, such as storage
666 // images, firmware variable files, TPM state.
667 launchDir string
668 // socketDir points at the directory keeping UNIX socket files, such as these
669 // used to facilitate communication between QEMU and swtpm. It's different
670 // from launchDir, and anchored nearer the file system root, due to the
671 // socket path length limitation imposed by the kernel.
Serge Bazanski1f8cad72023-03-20 16:58:10 +0100672 socketDir string
673 metroctlDir string
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200674
Lorenz Brun276a7462023-07-12 21:28:54 +0200675 // SOCKSDialer is used by DialNode to establish connections to nodes via the
Serge Bazanskibe742842022-04-04 13:18:50 +0200676 // SOCKS server ran by nanoswitch.
Lorenz Brun276a7462023-07-12 21:28:54 +0200677 SOCKSDialer proxy.Dialer
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200678
679 // authClient is a cached authenticated owner connection to a Curator
680 // instance within the cluster.
681 authClient *grpc.ClientConn
682
683 // ctxT is the context individual node contexts are created from.
684 ctxT context.Context
685 // ctxC is used by Close to cancel the context under which the nodes are
686 // running.
687 ctxC context.CancelFunc
Serge Bazanski2b6dc312024-06-04 17:44:55 +0200688
689 tpmFactory *TPMFactory
Serge Bazanskibe742842022-04-04 13:18:50 +0200690}
691
// NodeInCluster represents information about a node that's part of a Cluster.
type NodeInCluster struct {
	// ID of the node, which can be used to dial this node's services via
	// NewNodeClient.
	ID string
	// Pubkey is the node's public key as reported by the cluster when listing
	// nodes. It is left unset in the entry built for the first node by
	// firstConnection.
	Pubkey []byte
	// Address of the node on the network run by nanoswitch. Not reachable from
	// the host unless dialed via NewNodeClient or via the nanoswitch SOCKS
	// proxy (reachable on Cluster.Ports[SOCKSPort]).
	ManagementAddress string
}
703
704// firstConnection performs the initial owner credential escrow with a newly
705// started nanoswitch-backed cluster over SOCKS. It expects the first node to be
706// running at 10.1.0.2, which is always the case with the current nanoswitch
707// implementation.
708//
Leopold20a036e2023-01-15 00:17:19 +0100709// It returns the newly escrowed credentials as well as the first node's
Serge Bazanskibe742842022-04-04 13:18:50 +0200710// information as NodeInCluster.
711func firstConnection(ctx context.Context, socksDialer proxy.Dialer) (*tls.Certificate, *NodeInCluster, error) {
712 // Dial external service.
Jan Schär0f8ce4c2025-09-04 13:27:50 +0200713 remote := fmt.Sprintf("10.1.0.2:%s", allocs.PortCuratorService.PortString())
Serge Bazanski0c280152024-02-05 14:33:19 +0100714 initCreds, err := rpc.NewEphemeralCredentials(InsecurePrivateKey, rpc.WantInsecure())
Serge Bazanskibe742842022-04-04 13:18:50 +0200715 if err != nil {
716 return nil, nil, fmt.Errorf("NewEphemeralCredentials: %w", err)
717 }
718 initDialer := func(_ context.Context, addr string) (net.Conn, error) {
719 return socksDialer.Dial("tcp", addr)
720 }
Tim Windelschmidt9bd9bd42025-02-14 17:08:52 +0100721 initClient, err := grpc.NewClient(remote, grpc.WithContextDialer(initDialer), grpc.WithTransportCredentials(initCreds))
Serge Bazanskibe742842022-04-04 13:18:50 +0200722 if err != nil {
Tim Windelschmidt9bd9bd42025-02-14 17:08:52 +0100723 return nil, nil, fmt.Errorf("creating client with ephemeral credentials failed: %w", err)
Serge Bazanskibe742842022-04-04 13:18:50 +0200724 }
725 defer initClient.Close()
726
727 // Retrieve owner certificate - this can take a while because the node is still
728 // coming up, so do it in a backoff loop.
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100729 logf("Cluster: retrieving owner certificate (this can take a few seconds while the first node boots)...")
Serge Bazanskibe742842022-04-04 13:18:50 +0200730 aaa := apb.NewAAAClient(initClient)
731 var cert *tls.Certificate
732 err = backoff.Retry(func() error {
733 cert, err = rpc.RetrieveOwnerCertificate(ctx, aaa, InsecurePrivateKey)
734 if st, ok := status.FromError(err); ok {
735 if st.Code() == codes.Unavailable {
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100736 logf("Cluster: cluster UNAVAILABLE: %v", st.Message())
Serge Bazanskibe742842022-04-04 13:18:50 +0200737 return err
738 }
739 }
740 return backoff.Permanent(err)
Serge Bazanski62e6f0b2024-09-03 12:18:56 +0200741 }, backoff.WithContext(backoff.NewExponentialBackOff(backoff.WithMaxElapsedTime(time.Minute)), ctx))
Serge Bazanskibe742842022-04-04 13:18:50 +0200742 if err != nil {
Serge Bazanski5bb8a332022-06-23 17:41:33 +0200743 return nil, nil, fmt.Errorf("couldn't retrieve owner certificate: %w", err)
Serge Bazanskibe742842022-04-04 13:18:50 +0200744 }
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100745 logf("Cluster: retrieved owner certificate.")
Serge Bazanskibe742842022-04-04 13:18:50 +0200746
747 // Now connect authenticated and get the node ID.
Serge Bazanski8535cb52023-03-29 14:15:08 +0200748 creds := rpc.NewAuthenticatedCredentials(*cert, rpc.WantInsecure())
Tim Windelschmidt9bd9bd42025-02-14 17:08:52 +0100749 authClient, err := grpc.NewClient(remote, grpc.WithContextDialer(initDialer), grpc.WithTransportCredentials(creds))
Serge Bazanskibe742842022-04-04 13:18:50 +0200750 if err != nil {
Tim Windelschmidt9bd9bd42025-02-14 17:08:52 +0100751 return nil, nil, fmt.Errorf("creating client with owner credentials failed: %w", err)
Serge Bazanskibe742842022-04-04 13:18:50 +0200752 }
753 defer authClient.Close()
754 mgmt := apb.NewManagementClient(authClient)
755
756 var node *NodeInCluster
757 err = backoff.Retry(func() error {
758 nodes, err := getNodes(ctx, mgmt)
759 if err != nil {
760 return fmt.Errorf("retrieving nodes failed: %w", err)
761 }
762 if len(nodes) != 1 {
763 return fmt.Errorf("expected one node, got %d", len(nodes))
764 }
765 n := nodes[0]
766 if n.Status == nil || n.Status.ExternalAddress == "" {
767 return fmt.Errorf("node has no status and/or address")
768 }
769 node = &NodeInCluster{
Jan Schär39d9c242024-09-24 13:49:55 +0200770 ID: n.Id,
Serge Bazanskibe742842022-04-04 13:18:50 +0200771 ManagementAddress: n.Status.ExternalAddress,
772 }
773 return nil
774 }, backoff.WithContext(backoff.NewExponentialBackOff(), ctx))
775 if err != nil {
776 return nil, nil, err
777 }
778
779 return cert, node, nil
Serge Bazanski66e58952021-10-05 17:06:56 +0200780}
781
// NewSerialFileLogger opens the file at path p for use as a serial port log
// sink, creating it with mode 0600 if it does not exist yet.
//
// The file is opened write-only and without O_TRUNC or O_APPEND: writes start
// at the beginning of the file and any pre-existing content past the newly
// written data is left in place. Reads on the returned value are not expected
// to succeed, as the underlying file is not opened for reading.
//
// On failure a nil io.ReadWriter and the error from os.OpenFile are returned.
func NewSerialFileLogger(p string) (io.ReadWriter, error) {
	f, err := os.OpenFile(p, os.O_WRONLY|os.O_CREATE, 0o600)
	if err != nil {
		// Return an untyped nil so that callers comparing the interface
		// against nil see it as such.
		return nil, err
	}
	return f, nil
}
789
Serge Bazanski66e58952021-10-05 17:06:56 +0200790// LaunchCluster launches a cluster of Metropolis node VMs together with a
791// Nanoswitch instance to network them all together.
792//
793// The given context will be used to run all qemu instances in the cluster, and
794// canceling the context or calling Close() will terminate them.
795func LaunchCluster(ctx context.Context, opts ClusterOptions) (*Cluster, error) {
Serge Bazanskie78a0892021-10-07 17:03:49 +0200796 if opts.NumNodes <= 0 {
Serge Bazanski66e58952021-10-05 17:06:56 +0200797 return nil, errors.New("refusing to start cluster with zero nodes")
798 }
Serge Bazanski66e58952021-10-05 17:06:56 +0200799
Jan Schära9b060b2024-08-07 10:42:29 +0200800 // Prepare the node options. These will be kept as part of Cluster.
801 // nodeOpts[].Runtime will be initialized by LaunchNode during the first
802 // launch. The runtime information can be later used to restart a node.
803 // The 0th node will be initialized first. The rest will follow after it
804 // had bootstrapped the cluster.
805 nodeOpts := make([]NodeOptions, opts.NumNodes)
806 for i := range opts.NumNodes {
807 nodeOpts[i] = opts.Node
808 nodeOpts[i].Name = fmt.Sprintf("node%d", i)
809 nodeOpts[i].SerialPort = newPrefixedStdio(i)
Tim Windelschmidtbd57c1d2025-07-16 15:57:55 +0200810 nodeOpts[i].RunVNC = true
Jan Schära9b060b2024-08-07 10:42:29 +0200811 }
812 nodeOpts[0].NodeParameters = &apb.NodeParameters{
813 Cluster: &apb.NodeParameters_ClusterBootstrap_{
814 ClusterBootstrap: &apb.NodeParameters_ClusterBootstrap{
815 OwnerPublicKey: InsecurePublicKey,
816 InitialClusterConfiguration: opts.InitialClusterConfiguration,
817 Labels: &cpb.NodeLabels{
818 Pairs: []*cpb.NodeLabels_Pair{
Serge Bazanski20498dd2024-09-30 17:07:08 +0000819 {Key: NodeNumberKey, Value: "0"},
Jan Schära9b060b2024-08-07 10:42:29 +0200820 },
821 },
822 },
823 },
824 }
825 nodeOpts[0].PcapDump = true
826
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200827 // Create the launch directory.
Serge Bazanski1f8cad72023-03-20 16:58:10 +0100828 ld, err := os.MkdirTemp(os.Getenv("TEST_TMPDIR"), "cluster-*")
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200829 if err != nil {
830 return nil, fmt.Errorf("failed to create the launch directory: %w", err)
831 }
Tim Windelschmidt1fe3f942025-04-01 14:22:11 +0200832
833 nodeLogDir := ld
834 if os.Getenv("TEST_UNDECLARED_OUTPUTS_DIR") != "" {
835 nodeLogDir = os.Getenv("TEST_UNDECLARED_OUTPUTS_DIR")
836 }
837
Serge Bazanski1f8cad72023-03-20 16:58:10 +0100838 // Create the metroctl config directory. We keep it in /tmp because in some
839 // scenarios it's end-user visible and we want it short.
840 md, err := os.MkdirTemp("/tmp", "metroctl-*")
841 if err != nil {
842 return nil, fmt.Errorf("failed to create the metroctl directory: %w", err)
843 }
844
845 // Create the socket directory. We keep it in /tmp because of socket path limits.
846 sd, err := os.MkdirTemp("/tmp", "cluster-*")
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200847 if err != nil {
848 return nil, fmt.Errorf("failed to create the socket directory: %w", err)
849 }
Serge Bazanski66e58952021-10-05 17:06:56 +0200850
Serge Bazanski2b6dc312024-06-04 17:44:55 +0200851 // Set up TPM factory.
852 tpmf, err := NewTPMFactory(filepath.Join(ld, "tpm"))
853 if err != nil {
854 return nil, fmt.Errorf("failed to create TPM factory: %w", err)
855 }
856
Serge Bazanski66e58952021-10-05 17:06:56 +0200857 // Prepare links between nodes and nanoswitch.
858 var switchPorts []*os.File
Jan Schära9b060b2024-08-07 10:42:29 +0200859 for i := range opts.NumNodes {
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100860 switchPort, vmPort, err := qemu.NewSocketPair()
Serge Bazanski66e58952021-10-05 17:06:56 +0200861 if err != nil {
Serge Bazanski66e58952021-10-05 17:06:56 +0200862 return nil, fmt.Errorf("failed to get socketpair: %w", err)
863 }
864 switchPorts = append(switchPorts, switchPort)
Jan Schära9b060b2024-08-07 10:42:29 +0200865 nodeOpts[i].ConnectToSocket = vmPort
Serge Bazanski66e58952021-10-05 17:06:56 +0200866 }
867
Serge Bazanskie78a0892021-10-07 17:03:49 +0200868 // Make a list of channels that will be populated by all running node qemu
869 // processes.
Serge Bazanski66e58952021-10-05 17:06:56 +0200870 done := make([]chan error, opts.NumNodes)
Lorenz Brun150f24a2023-07-13 20:11:06 +0200871 for i := range done {
Serge Bazanski66e58952021-10-05 17:06:56 +0200872 done[i] = make(chan error, 1)
873 }
874
Serge Bazanskid09c58f2023-03-17 00:25:08 +0100875 if opts.NodeLogsToFiles {
Jan Schära9b060b2024-08-07 10:42:29 +0200876 for i := range opts.NumNodes {
Lorenz Brun4beaf4f2025-01-14 16:10:55 +0100877 path := path.Join(nodeLogDir, fmt.Sprintf("node-%d.txt", i))
Jan Schära9b060b2024-08-07 10:42:29 +0200878 port, err := NewSerialFileLogger(path)
879 if err != nil {
880 return nil, fmt.Errorf("could not open log file for node %d: %w", i, err)
881 }
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100882 logf("Node %d logs at %s", i, path)
Jan Schära9b060b2024-08-07 10:42:29 +0200883 nodeOpts[i].SerialPort = port
Serge Bazanskid09c58f2023-03-17 00:25:08 +0100884 }
Serge Bazanskid09c58f2023-03-17 00:25:08 +0100885 }
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200886
887 // Start the first node.
888 ctxT, ctxC := context.WithCancel(ctx)
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100889 logf("Cluster: Starting node %d...", 0)
Serge Bazanski2b6dc312024-06-04 17:44:55 +0200890 if err := LaunchNode(ctxT, ld, sd, tpmf, &nodeOpts[0], done[0]); err != nil {
Serge Bazanskiee8c81b2024-04-03 11:59:38 +0200891 ctxC()
892 return nil, fmt.Errorf("failed to launch first node: %w", err)
893 }
Serge Bazanski66e58952021-10-05 17:06:56 +0200894
Lorenz Brun150f24a2023-07-13 20:11:06 +0200895 localRegistryAddr := net.TCPAddr{
896 IP: net.IPv4(10, 42, 0, 82),
897 Port: 5000,
898 }
899
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100900 var guestSvcMap qemu.GuestServiceMap
Lorenz Brun150f24a2023-07-13 20:11:06 +0200901 if opts.LocalRegistry != nil {
902 l, err := net.ListenTCP("tcp", &net.TCPAddr{IP: net.IPv4(127, 0, 0, 1)})
903 if err != nil {
904 ctxC()
905 return nil, fmt.Errorf("failed to create TCP listener for local registry: %w", err)
906 }
907 s := http.Server{
908 Handler: opts.LocalRegistry,
909 }
910 go s.Serve(l)
911 go func() {
912 <-ctxT.Done()
913 s.Close()
914 }()
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100915 guestSvcMap = qemu.GuestServiceMap{
Lorenz Brun150f24a2023-07-13 20:11:06 +0200916 &localRegistryAddr: *l.Addr().(*net.TCPAddr),
917 }
918 }
919
Serge Bazanskie78a0892021-10-07 17:03:49 +0200920 // Launch nanoswitch.
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100921 portMap, err := qemu.ConflictFreePortMap(ClusterPorts)
Serge Bazanski66e58952021-10-05 17:06:56 +0200922 if err != nil {
923 ctxC()
924 return nil, fmt.Errorf("failed to allocate ephemeral ports: %w", err)
925 }
926
927 go func() {
Serge Bazanskid09c58f2023-03-17 00:25:08 +0100928 var serialPort io.ReadWriter
Tim Windelschmidta5b00bd2024-12-09 22:52:31 +0100929 var err error
Serge Bazanskid09c58f2023-03-17 00:25:08 +0100930 if opts.NodeLogsToFiles {
Tim Windelschmidt1fe3f942025-04-01 14:22:11 +0200931
932 loggerPath := path.Join(nodeLogDir, "nanoswitch.txt")
Tim Windelschmidta5b00bd2024-12-09 22:52:31 +0100933 serialPort, err = NewSerialFileLogger(loggerPath)
Serge Bazanskid09c58f2023-03-17 00:25:08 +0100934 if err != nil {
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100935 logf("Could not open log file for nanoswitch: %v", err)
936 os.Exit(1)
Serge Bazanskid09c58f2023-03-17 00:25:08 +0100937 }
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100938 logf("Nanoswitch logs at %s", loggerPath)
Serge Bazanskid09c58f2023-03-17 00:25:08 +0100939 } else {
940 serialPort = newPrefixedStdio(99)
941 }
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100942 if err := qemu.RunMicroVM(ctxT, &qemu.MicroVMOptions{
Serge Bazanskid09c58f2023-03-17 00:25:08 +0100943 Name: "nanoswitch",
Tim Windelschmidt82e6af72024-07-23 00:05:42 +0000944 KernelPath: xKernelPath,
945 InitramfsPath: xInitramfsPath,
Serge Bazanski66e58952021-10-05 17:06:56 +0200946 ExtraNetworkInterfaces: switchPorts,
947 PortMap: portMap,
Lorenz Brun150f24a2023-07-13 20:11:06 +0200948 GuestServiceMap: guestSvcMap,
Serge Bazanskid09c58f2023-03-17 00:25:08 +0100949 SerialPort: serialPort,
Tim Windelschmidt1fe3f942025-04-01 14:22:11 +0200950 PcapDump: path.Join(nodeLogDir, "nanoswitch.pcap"),
Serge Bazanski66e58952021-10-05 17:06:56 +0200951 }); err != nil {
952 if !errors.Is(err, ctxT.Err()) {
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100953 logf("Failed to launch nanoswitch: %v", err)
954 os.Exit(1)
Serge Bazanski66e58952021-10-05 17:06:56 +0200955 }
956 }
957 }()
958
Serge Bazanskibe742842022-04-04 13:18:50 +0200959 // Build SOCKS dialer.
960 socksRemote := fmt.Sprintf("localhost:%v", portMap[SOCKSPort])
961 socksDialer, err := proxy.SOCKS5("tcp", socksRemote, nil, proxy.Direct)
Serge Bazanski66e58952021-10-05 17:06:56 +0200962 if err != nil {
963 ctxC()
Serge Bazanskibe742842022-04-04 13:18:50 +0200964 return nil, fmt.Errorf("failed to build SOCKS dialer: %w", err)
Serge Bazanski66e58952021-10-05 17:06:56 +0200965 }
966
Serge Bazanskibe742842022-04-04 13:18:50 +0200967 // Retrieve owner credentials and first node.
968 cert, firstNode, err := firstConnection(ctxT, socksDialer)
Serge Bazanski66e58952021-10-05 17:06:56 +0200969 if err != nil {
970 ctxC()
971 return nil, err
972 }
Serge Bazanski66e58952021-10-05 17:06:56 +0200973
Serge Bazanski1f8cad72023-03-20 16:58:10 +0100974 // Write credentials to the metroctl directory.
975 if err := metroctl.WriteOwnerKey(md, cert.PrivateKey.(ed25519.PrivateKey)); err != nil {
976 ctxC()
977 return nil, fmt.Errorf("could not write owner key: %w", err)
978 }
979 if err := metroctl.WriteOwnerCertificate(md, cert.Certificate[0]); err != nil {
980 ctxC()
981 return nil, fmt.Errorf("could not write owner certificate: %w", err)
982 }
983
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +0100984 logf("Cluster: Node %d is %s", 0, firstNode.ID)
Serge Bazanski53458ba2024-06-18 09:56:46 +0000985
986 // Set up a partially initialized cluster instance, to be filled in the
Mateusz Zalega0246f5e2022-04-22 17:29:04 +0200987 // later steps.
Serge Bazanskibe742842022-04-04 13:18:50 +0200988 cluster := &Cluster{
989 Owner: *cert,
990 Ports: portMap,
991 Nodes: map[string]*NodeInCluster{
992 firstNode.ID: firstNode,
993 },
994 NodeIDs: []string{
995 firstNode.ID,
996 },
997
Serge Bazanski1f8cad72023-03-20 16:58:10 +0100998 nodesDone: done,
999 nodeOpts: nodeOpts,
1000 launchDir: ld,
1001 socketDir: sd,
1002 metroctlDir: md,
Mateusz Zalega0246f5e2022-04-22 17:29:04 +02001003
Lorenz Brun276a7462023-07-12 21:28:54 +02001004 SOCKSDialer: socksDialer,
Serge Bazanskibe742842022-04-04 13:18:50 +02001005
Mateusz Zalega0246f5e2022-04-22 17:29:04 +02001006 ctxT: ctxT,
Serge Bazanskibe742842022-04-04 13:18:50 +02001007 ctxC: ctxC,
Serge Bazanski2b6dc312024-06-04 17:44:55 +02001008
1009 tpmFactory: tpmf,
Serge Bazanskibe742842022-04-04 13:18:50 +02001010 }
1011
1012 // Now start the rest of the nodes and register them into the cluster.
1013
Mateusz Zalega0246f5e2022-04-22 17:29:04 +02001014 // Get an authenticated owner client within the cluster.
Serge Bazanski5bb8a332022-06-23 17:41:33 +02001015 curC, err := cluster.CuratorClient()
Serge Bazanski66e58952021-10-05 17:06:56 +02001016 if err != nil {
1017 ctxC()
Serge Bazanski5bb8a332022-06-23 17:41:33 +02001018 return nil, fmt.Errorf("CuratorClient: %w", err)
Serge Bazanski66e58952021-10-05 17:06:56 +02001019 }
Mateusz Zalega0246f5e2022-04-22 17:29:04 +02001020 mgmt := apb.NewManagementClient(curC)
Serge Bazanskie78a0892021-10-07 17:03:49 +02001021
1022 // Retrieve register ticket to register further nodes.
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001023 logf("Cluster: retrieving register ticket...")
Serge Bazanskie78a0892021-10-07 17:03:49 +02001024 resT, err := mgmt.GetRegisterTicket(ctx, &apb.GetRegisterTicketRequest{})
1025 if err != nil {
1026 ctxC()
1027 return nil, fmt.Errorf("GetRegisterTicket: %w", err)
1028 }
1029 ticket := resT.Ticket
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001030 logf("Cluster: retrieved register ticket (%d bytes).", len(ticket))
Serge Bazanskie78a0892021-10-07 17:03:49 +02001031
1032 // Retrieve cluster info (for directory and ca public key) to register further
1033 // nodes.
1034 resI, err := mgmt.GetClusterInfo(ctx, &apb.GetClusterInfoRequest{})
1035 if err != nil {
1036 ctxC()
1037 return nil, fmt.Errorf("GetClusterInfo: %w", err)
1038 }
Serge Bazanski54e212a2023-06-14 13:45:11 +02001039 caCert, err := x509.ParseCertificate(resI.CaCertificate)
1040 if err != nil {
1041 ctxC()
1042 return nil, fmt.Errorf("ParseCertificate: %w", err)
1043 }
1044 cluster.CACertificate = caCert
Serge Bazanskie78a0892021-10-07 17:03:49 +02001045
Mateusz Zalega0246f5e2022-04-22 17:29:04 +02001046 // Use the retrieved information to configure the rest of the node options.
1047 for i := 1; i < opts.NumNodes; i++ {
Jan Schära9b060b2024-08-07 10:42:29 +02001048 nodeOpts[i].NodeParameters = &apb.NodeParameters{
1049 Cluster: &apb.NodeParameters_ClusterRegister_{
1050 ClusterRegister: &apb.NodeParameters_ClusterRegister{
1051 RegisterTicket: ticket,
1052 ClusterDirectory: resI.ClusterDirectory,
1053 CaCertificate: resI.CaCertificate,
1054 Labels: &cpb.NodeLabels{
1055 Pairs: []*cpb.NodeLabels_Pair{
Serge Bazanski20498dd2024-09-30 17:07:08 +00001056 {Key: NodeNumberKey, Value: fmt.Sprintf("%d", i)},
Serge Bazanski30e30b32024-05-22 14:11:56 +02001057 },
Mateusz Zalega0246f5e2022-04-22 17:29:04 +02001058 },
1059 },
1060 },
Serge Bazanskid09c58f2023-03-17 00:25:08 +01001061 }
Mateusz Zalega0246f5e2022-04-22 17:29:04 +02001062 }
1063
1064 // Now run the rest of the nodes.
Serge Bazanskie78a0892021-10-07 17:03:49 +02001065 for i := 1; i < opts.NumNodes; i++ {
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001066 logf("Cluster: Starting node %d...", i)
Serge Bazanski2b6dc312024-06-04 17:44:55 +02001067 err := LaunchNode(ctxT, ld, sd, tpmf, &nodeOpts[i], done[i])
Serge Bazanskiee8c81b2024-04-03 11:59:38 +02001068 if err != nil {
Jan Schär0b927652024-07-31 18:08:50 +02001069 return nil, fmt.Errorf("failed to launch node %d: %w", i, err)
Serge Bazanskiee8c81b2024-04-03 11:59:38 +02001070 }
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001071 }
Serge Bazanskie78a0892021-10-07 17:03:49 +02001072
Serge Bazanski53458ba2024-06-18 09:56:46 +00001073 // Wait for nodes to appear as NEW, populate a map from node number (index into
Jan Schära9b060b2024-08-07 10:42:29 +02001074 // nodeOpts, etc.) to Metropolis Node ID.
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001075 seenNodes := make(map[string]bool)
Serge Bazanski53458ba2024-06-18 09:56:46 +00001076 nodeNumberToID := make(map[int]string)
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001077 logf("Cluster: waiting for nodes to appear as NEW...")
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001078 for i := 1; i < opts.NumNodes; i++ {
Serge Bazanskie78a0892021-10-07 17:03:49 +02001079 for {
1080 nodes, err := getNodes(ctx, mgmt)
1081 if err != nil {
1082 ctxC()
1083 return nil, fmt.Errorf("could not get nodes: %w", err)
1084 }
1085 for _, n := range nodes {
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001086 if n.State != cpb.NodeState_NODE_STATE_NEW {
1087 continue
Serge Bazanskie78a0892021-10-07 17:03:49 +02001088 }
Serge Bazanski87d9c592024-03-20 12:35:11 +01001089 if seenNodes[n.Id] {
1090 continue
1091 }
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001092 seenNodes[n.Id] = true
1093 cluster.Nodes[n.Id] = &NodeInCluster{
1094 ID: n.Id,
1095 Pubkey: n.Pubkey,
1096 }
Serge Bazanski53458ba2024-06-18 09:56:46 +00001097
Serge Bazanski20498dd2024-09-30 17:07:08 +00001098 num, err := strconv.Atoi(node.GetNodeLabel(n.Labels, NodeNumberKey))
Serge Bazanski53458ba2024-06-18 09:56:46 +00001099 if err != nil {
1100 return nil, fmt.Errorf("node %s has undecodable number label: %w", n.Id, err)
1101 }
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001102 logf("Cluster: Node %d is %s", num, n.Id)
Serge Bazanski53458ba2024-06-18 09:56:46 +00001103 nodeNumberToID[num] = n.Id
Serge Bazanskie78a0892021-10-07 17:03:49 +02001104 }
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001105
1106 if len(seenNodes) == opts.NumNodes-1 {
Serge Bazanskie78a0892021-10-07 17:03:49 +02001107 break
1108 }
1109 time.Sleep(1 * time.Second)
1110 }
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001111 }
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001112 logf("Found all expected nodes")
Serge Bazanskie78a0892021-10-07 17:03:49 +02001113
Serge Bazanski53458ba2024-06-18 09:56:46 +00001114 // Build the rest of NodeIDs from map.
1115 for i := 1; i < opts.NumNodes; i++ {
1116 cluster.NodeIDs = append(cluster.NodeIDs, nodeNumberToID[i])
1117 }
1118
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001119 approvedNodes := make(map[string]bool)
1120 upNodes := make(map[string]bool)
1121 if !opts.LeaveNodesNew {
Serge Bazanskie78a0892021-10-07 17:03:49 +02001122 for {
1123 nodes, err := getNodes(ctx, mgmt)
1124 if err != nil {
1125 ctxC()
1126 return nil, fmt.Errorf("could not get nodes: %w", err)
1127 }
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001128 for _, node := range nodes {
1129 if !seenNodes[node.Id] {
1130 // Skip nodes that weren't NEW in the previous step.
Serge Bazanskie78a0892021-10-07 17:03:49 +02001131 continue
1132 }
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001133
1134 if node.State == cpb.NodeState_NODE_STATE_UP && node.Status != nil && node.Status.ExternalAddress != "" {
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001135 logf("Cluster: node %s is up", node.Id)
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001136 upNodes[node.Id] = true
1137 cluster.Nodes[node.Id].ManagementAddress = node.Status.ExternalAddress
Serge Bazanskie78a0892021-10-07 17:03:49 +02001138 }
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001139 if upNodes[node.Id] {
1140 continue
Serge Bazanskibe742842022-04-04 13:18:50 +02001141 }
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001142
1143 if !approvedNodes[node.Id] {
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001144 logf("Cluster: approving node %s", node.Id)
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001145 _, err := mgmt.ApproveNode(ctx, &apb.ApproveNodeRequest{
1146 Pubkey: node.Pubkey,
1147 })
1148 if err != nil {
1149 ctxC()
1150 return nil, fmt.Errorf("ApproveNode(%s): %w", node.Id, err)
1151 }
1152 approvedNodes[node.Id] = true
Serge Bazanskibe742842022-04-04 13:18:50 +02001153 }
Serge Bazanskie78a0892021-10-07 17:03:49 +02001154 }
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001155
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001156 logf("Cluster: want %d up nodes, have %d", opts.NumNodes, len(upNodes)+1)
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001157 if len(upNodes) == opts.NumNodes-1 {
Serge Bazanskie78a0892021-10-07 17:03:49 +02001158 break
1159 }
Serge Bazanskibe742842022-04-04 13:18:50 +02001160 time.Sleep(time.Second)
Serge Bazanskie78a0892021-10-07 17:03:49 +02001161 }
Serge Bazanskie78a0892021-10-07 17:03:49 +02001162 }
Serge Bazanski66e58952021-10-05 17:06:56 +02001163
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001164 logf("Cluster: all nodes up:")
Jan Schär0b927652024-07-31 18:08:50 +02001165 for i, nodeID := range cluster.NodeIDs {
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001166 logf("Cluster: %d. %s at %s", i, nodeID, cluster.Nodes[nodeID].ManagementAddress)
Serge Bazanskibe742842022-04-04 13:18:50 +02001167 }
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001168 logf("Cluster: starting tests...")
Serge Bazanski66e58952021-10-05 17:06:56 +02001169
Serge Bazanskibe742842022-04-04 13:18:50 +02001170 return cluster, nil
Serge Bazanski66e58952021-10-05 17:06:56 +02001171}
1172
Mateusz Zalega0246f5e2022-04-22 17:29:04 +02001173// RebootNode reboots the cluster member node matching the given index, and
1174// waits for it to rejoin the cluster. It will use the given context ctx to run
1175// cluster API requests, whereas the resulting QEMU process will be created
1176// using the cluster's context c.ctxT. The nodes are indexed starting at 0.
1177func (c *Cluster) RebootNode(ctx context.Context, idx int) error {
1178 if idx < 0 || idx >= len(c.NodeIDs) {
Serge Bazanskiee8c81b2024-04-03 11:59:38 +02001179 return fmt.Errorf("index out of bounds")
1180 }
1181 if c.nodeOpts[idx].Runtime == nil {
1182 return fmt.Errorf("node not running")
Mateusz Zalega0246f5e2022-04-22 17:29:04 +02001183 }
1184 id := c.NodeIDs[idx]
1185
1186 // Get an authenticated owner client within the cluster.
Serge Bazanski5bb8a332022-06-23 17:41:33 +02001187 curC, err := c.CuratorClient()
Mateusz Zalega0246f5e2022-04-22 17:29:04 +02001188 if err != nil {
1189 return err
1190 }
1191 mgmt := apb.NewManagementClient(curC)
1192
Tim Windelschmidt2137bd62025-08-25 14:03:12 +02001193 nodeC, err := c.NodeClient(c.Nodes[id])
1194 if err != nil {
1195 return err
1196 }
1197
1198 // Shut down the Node. QEMU will exit once the node powers it off.
1199 nmgmt := apb.NewNodeManagementClient(nodeC)
1200 _, err = nmgmt.Reboot(ctx, &apb.RebootRequest{
1201 Type: apb.RebootRequest_TYPE_POWER_OFF,
1202 NextBoot: apb.RebootRequest_NEXT_BOOT_START_NORMAL,
1203 })
1204 if err != nil {
1205 return err
1206 }
1207
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001208 logf("Cluster: waiting for node %d (%s) to stop.", idx, id)
Mateusz Zalega0246f5e2022-04-22 17:29:04 +02001209 err = <-c.nodesDone[idx]
1210 if err != nil {
1211 return fmt.Errorf("while restarting node: %w", err)
1212 }
1213
1214 // Start QEMU again.
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001215 logf("Cluster: restarting node %d (%s).", idx, id)
Serge Bazanski2b6dc312024-06-04 17:44:55 +02001216 if err := LaunchNode(c.ctxT, c.launchDir, c.socketDir, c.tpmFactory, &c.nodeOpts[idx], c.nodesDone[idx]); err != nil {
Serge Bazanskiee8c81b2024-04-03 11:59:38 +02001217 return fmt.Errorf("failed to launch node %d: %w", idx, err)
1218 }
Mateusz Zalega0246f5e2022-04-22 17:29:04 +02001219
Serge Bazanskibc969572024-03-21 11:56:13 +01001220 start := time.Now()
1221
1222 // Poll Management.GetNodes until the node is healthy.
Mateusz Zalega0246f5e2022-04-22 17:29:04 +02001223 for {
1224 cs, err := getNode(ctx, mgmt, id)
1225 if err != nil {
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001226 logf("Cluster: node get error: %v", err)
Mateusz Zalega0246f5e2022-04-22 17:29:04 +02001227 return err
1228 }
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001229 logf("Cluster: node health: %+v", cs.Health)
Serge Bazanskibc969572024-03-21 11:56:13 +01001230
1231 lhb := time.Now().Add(-cs.TimeSinceHeartbeat.AsDuration())
Tim Windelschmidta10d0cb2025-01-13 14:44:15 +01001232 if lhb.After(start) && cs.Health == apb.Node_HEALTH_HEALTHY {
Mateusz Zalega0246f5e2022-04-22 17:29:04 +02001233 break
1234 }
1235 time.Sleep(time.Second)
1236 }
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001237 logf("Cluster: node %d (%s) has rejoined the cluster.", idx, id)
Mateusz Zalega0246f5e2022-04-22 17:29:04 +02001238 return nil
1239}
1240
Serge Bazanski500f6e02024-04-03 12:06:40 +02001241// ShutdownNode performs an ungraceful shutdown (i.e. power off) of the node
1242// given by idx. If the node is already shut down, this is a no-op.
1243func (c *Cluster) ShutdownNode(idx int) error {
1244 if idx < 0 || idx >= len(c.NodeIDs) {
1245 return fmt.Errorf("index out of bounds")
1246 }
1247 // Return if node is already stopped.
1248 select {
1249 case <-c.nodeOpts[idx].Runtime.ctxT.Done():
1250 return nil
1251 default:
1252 }
1253 id := c.NodeIDs[idx]
1254
1255 // Cancel the node's context. This will shut down QEMU.
1256 c.nodeOpts[idx].Runtime.CtxC()
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001257 logf("Cluster: waiting for node %d (%s) to stop.", idx, id)
Serge Bazanski500f6e02024-04-03 12:06:40 +02001258 err := <-c.nodesDone[idx]
1259 if err != nil {
1260 return fmt.Errorf("while shutting down node: %w", err)
1261 }
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001262 logf("Cluster: node %d (%s) stopped.", idx, id)
Serge Bazanski500f6e02024-04-03 12:06:40 +02001263 return nil
1264}
1265
1266// StartNode performs a power on of the node given by idx. If the node is already
1267// running, this is a no-op.
1268func (c *Cluster) StartNode(idx int) error {
1269 if idx < 0 || idx >= len(c.NodeIDs) {
1270 return fmt.Errorf("index out of bounds")
1271 }
1272 id := c.NodeIDs[idx]
1273 // Return if node is already running.
1274 select {
1275 case <-c.nodeOpts[idx].Runtime.ctxT.Done():
1276 default:
1277 return nil
1278 }
1279
1280 // Start QEMU again.
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001281 logf("Cluster: starting node %d (%s).", idx, id)
Serge Bazanski2b6dc312024-06-04 17:44:55 +02001282 if err := LaunchNode(c.ctxT, c.launchDir, c.socketDir, c.tpmFactory, &c.nodeOpts[idx], c.nodesDone[idx]); err != nil {
Serge Bazanski500f6e02024-04-03 12:06:40 +02001283 return fmt.Errorf("failed to launch node %d: %w", idx, err)
1284 }
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001285 logf("Cluster: node %d (%s) started.", idx, id)
Serge Bazanski500f6e02024-04-03 12:06:40 +02001286 return nil
1287}
1288
Mateusz Zalega0246f5e2022-04-22 17:29:04 +02001289// Close cancels the running clusters' context and waits for all virtualized
Serge Bazanski66e58952021-10-05 17:06:56 +02001290// nodes to stop. It returns an error if stopping the nodes failed, or one of
1291// the nodes failed to fully start in the first place.
1292func (c *Cluster) Close() error {
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001293 logf("Cluster: stopping...")
Mateusz Zalega0246f5e2022-04-22 17:29:04 +02001294 if c.authClient != nil {
1295 c.authClient.Close()
1296 }
Serge Bazanski66e58952021-10-05 17:06:56 +02001297 c.ctxC()
1298
Leopold20a036e2023-01-15 00:17:19 +01001299 var errs []error
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001300 logf("Cluster: waiting for nodes to exit...")
Serge Bazanski66e58952021-10-05 17:06:56 +02001301 for _, c := range c.nodesDone {
1302 err := <-c
1303 if err != nil {
Leopold20a036e2023-01-15 00:17:19 +01001304 errs = append(errs, err)
Serge Bazanski66e58952021-10-05 17:06:56 +02001305 }
1306 }
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001307 logf("Cluster: removing nodes' state files (%s) and sockets (%s).", c.launchDir, c.socketDir)
Mateusz Zalega0246f5e2022-04-22 17:29:04 +02001308 os.RemoveAll(c.launchDir)
1309 os.RemoveAll(c.socketDir)
Serge Bazanski1f8cad72023-03-20 16:58:10 +01001310 os.RemoveAll(c.metroctlDir)
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001311 logf("Cluster: done")
Leopold20a036e2023-01-15 00:17:19 +01001312 return multierr.Combine(errs...)
Serge Bazanski66e58952021-10-05 17:06:56 +02001313}
Serge Bazanskibe742842022-04-04 13:18:50 +02001314
1315// DialNode is a grpc.WithContextDialer compatible dialer which dials nodes by
1316// their ID. This is performed by connecting to the cluster nanoswitch via its
1317// SOCKS proxy, and using the cluster node list for name resolution.
1318//
1319// For example:
1320//
Tim Windelschmidt9bd9bd42025-02-14 17:08:52 +01001321// grpc.NewClient("passthrough:///metropolis-deadbeef:1234", grpc.WithContextDialer(c.DialNode))
Serge Bazanskibe742842022-04-04 13:18:50 +02001322func (c *Cluster) DialNode(_ context.Context, addr string) (net.Conn, error) {
1323 host, port, err := net.SplitHostPort(addr)
1324 if err != nil {
1325 return nil, fmt.Errorf("invalid host:port: %w", err)
1326 }
1327 // Already an IP address?
1328 if net.ParseIP(host) != nil {
Lorenz Brun276a7462023-07-12 21:28:54 +02001329 return c.SOCKSDialer.Dial("tcp", addr)
Serge Bazanskibe742842022-04-04 13:18:50 +02001330 }
1331
1332 // Otherwise, expect a node name.
1333 node, ok := c.Nodes[host]
1334 if !ok {
1335 return nil, fmt.Errorf("unknown node %q", host)
1336 }
1337 addr = net.JoinHostPort(node.ManagementAddress, port)
Lorenz Brun276a7462023-07-12 21:28:54 +02001338 return c.SOCKSDialer.Dial("tcp", addr)
Serge Bazanskibe742842022-04-04 13:18:50 +02001339}
Serge Bazanski1f8cad72023-03-20 16:58:10 +01001340
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001341// GetKubeClientSet gets a Kubernetes client set accessing the Metropolis
1342// Kubernetes authenticating proxy using the cluster owner identity.
1343// It currently has access to everything (i.e. the cluster-admin role)
1344// via the owner-admin binding.
Lorenz Brun8f1254d2025-01-28 14:10:05 +01001345func (c *Cluster) GetKubeClientSet() (kubernetes.Interface, *rest.Config, error) {
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001346 pkcs8Key, err := x509.MarshalPKCS8PrivateKey(c.Owner.PrivateKey)
1347 if err != nil {
1348 // We explicitly pass an Ed25519 private key in, so this can't happen
1349 panic(err)
1350 }
1351
Jan Schär0f8ce4c2025-09-04 13:27:50 +02001352 host := net.JoinHostPort(c.NodeIDs[0], allocs.PortKubernetesAPIWrapped.PortString())
Lorenz Brun150f24a2023-07-13 20:11:06 +02001353 clientConfig := rest.Config{
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001354 Host: host,
1355 TLSClientConfig: rest.TLSClientConfig{
1356 // TODO(q3k): use CA certificate
1357 Insecure: true,
1358 ServerName: "kubernetes.default.svc",
1359 CertData: pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: c.Owner.Certificate[0]}),
1360 KeyData: pem.EncodeToMemory(&pem.Block{Type: "PRIVATE KEY", Bytes: pkcs8Key}),
1361 },
1362 Dial: func(ctx context.Context, network, address string) (net.Conn, error) {
1363 return c.DialNode(ctx, address)
1364 },
1365 }
Lorenz Brun8f1254d2025-01-28 14:10:05 +01001366 clientSet, err := kubernetes.NewForConfig(&clientConfig)
1367 if err != nil {
1368 return nil, nil, err
1369 }
1370 return clientSet, &clientConfig, nil
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001371}
1372
Serge Bazanski1f8cad72023-03-20 16:58:10 +01001373// KubernetesControllerNodeAddresses returns the list of IP addresses of nodes
1374// which are currently Kubernetes controllers, ie. run an apiserver. This list
1375// might be empty if no node is currently configured with the
1376// 'KubernetesController' node.
1377func (c *Cluster) KubernetesControllerNodeAddresses(ctx context.Context) ([]string, error) {
1378 curC, err := c.CuratorClient()
1379 if err != nil {
1380 return nil, err
1381 }
1382 mgmt := apb.NewManagementClient(curC)
1383 srv, err := mgmt.GetNodes(ctx, &apb.GetNodesRequest{
1384 Filter: "has(node.roles.kubernetes_controller)",
1385 })
1386 if err != nil {
1387 return nil, err
1388 }
1389 defer srv.CloseSend()
1390 var res []string
1391 for {
1392 n, err := srv.Recv()
1393 if err == io.EOF {
1394 break
1395 }
1396 if err != nil {
1397 return nil, err
1398 }
1399 if n.Status == nil || n.Status.ExternalAddress == "" {
1400 continue
1401 }
1402 res = append(res, n.Status.ExternalAddress)
1403 }
1404 return res, nil
1405}
Serge Bazanski630fb5c2023-04-06 10:50:24 +02001406
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001407// AllNodesHealthy returns nil if all the nodes in the cluster are seemingly
1408// healthy.
Serge Bazanski630fb5c2023-04-06 10:50:24 +02001409func (c *Cluster) AllNodesHealthy(ctx context.Context) error {
1410 // Get an authenticated owner client within the cluster.
1411 curC, err := c.CuratorClient()
1412 if err != nil {
1413 return err
1414 }
1415 mgmt := apb.NewManagementClient(curC)
1416 nodes, err := getNodes(ctx, mgmt)
1417 if err != nil {
1418 return err
1419 }
1420
1421 var unhealthy []string
1422 for _, node := range nodes {
Tim Windelschmidta10d0cb2025-01-13 14:44:15 +01001423 if node.Health == apb.Node_HEALTH_HEALTHY {
Serge Bazanski630fb5c2023-04-06 10:50:24 +02001424 continue
1425 }
1426 unhealthy = append(unhealthy, node.Id)
1427 }
1428 if len(unhealthy) == 0 {
1429 return nil
1430 }
1431 return fmt.Errorf("nodes unhealthy: %s", strings.Join(unhealthy, ", "))
1432}
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001433
1434// ApproveNode approves a node by ID, waiting for it to become UP.
1435func (c *Cluster) ApproveNode(ctx context.Context, id string) error {
1436 curC, err := c.CuratorClient()
1437 if err != nil {
1438 return err
1439 }
1440 mgmt := apb.NewManagementClient(curC)
1441
1442 _, err = mgmt.ApproveNode(ctx, &apb.ApproveNodeRequest{
1443 Pubkey: c.Nodes[id].Pubkey,
1444 })
1445 if err != nil {
1446 return fmt.Errorf("ApproveNode: %w", err)
1447 }
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001448 logf("Cluster: %s: approved, waiting for UP", id)
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001449 for {
1450 nodes, err := mgmt.GetNodes(ctx, &apb.GetNodesRequest{})
1451 if err != nil {
1452 return fmt.Errorf("GetNodes: %w", err)
1453 }
1454 found := false
1455 for {
1456 node, err := nodes.Recv()
1457 if errors.Is(err, io.EOF) {
1458 break
1459 }
1460 if err != nil {
1461 return fmt.Errorf("Nodes.Recv: %w", err)
1462 }
1463 if node.Id != id {
1464 continue
1465 }
1466 if node.State != cpb.NodeState_NODE_STATE_UP {
1467 continue
1468 }
1469 found = true
1470 break
1471 }
1472 nodes.CloseSend()
1473
1474 if found {
1475 break
1476 }
1477 time.Sleep(time.Second)
1478 }
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001479 logf("Cluster: %s: UP", id)
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001480 return nil
1481}
1482
1483// MakeKubernetesWorker adds the KubernetesWorker role to a node by ID.
1484func (c *Cluster) MakeKubernetesWorker(ctx context.Context, id string) error {
1485 curC, err := c.CuratorClient()
1486 if err != nil {
1487 return err
1488 }
1489 mgmt := apb.NewManagementClient(curC)
1490
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001491 logf("Cluster: %s: adding KubernetesWorker", id)
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001492 _, err = mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
1493 Node: &apb.UpdateNodeRolesRequest_Id{
1494 Id: id,
1495 },
Jan Schärd1a8b642024-12-03 17:40:41 +01001496 KubernetesWorker: ptr.To(true),
Serge Bazanskia0bc6d32023-06-28 18:57:40 +02001497 })
1498 return err
1499}
Serge Bazanski37cfcc12024-03-21 11:59:07 +01001500
Jan Schära9b060b2024-08-07 10:42:29 +02001501// MakeKubernetesController adds the KubernetesController role to a node by ID.
1502func (c *Cluster) MakeKubernetesController(ctx context.Context, id string) error {
1503 curC, err := c.CuratorClient()
1504 if err != nil {
1505 return err
1506 }
1507 mgmt := apb.NewManagementClient(curC)
1508
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001509 logf("Cluster: %s: adding KubernetesController", id)
Jan Schära9b060b2024-08-07 10:42:29 +02001510 _, err = mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
1511 Node: &apb.UpdateNodeRolesRequest_Id{
1512 Id: id,
1513 },
Jan Schärd1a8b642024-12-03 17:40:41 +01001514 KubernetesController: ptr.To(true),
Jan Schära9b060b2024-08-07 10:42:29 +02001515 })
1516 return err
1517}
1518
Serge Bazanski37cfcc12024-03-21 11:59:07 +01001519// MakeConsensusMember adds the ConsensusMember role to a node by ID.
1520func (c *Cluster) MakeConsensusMember(ctx context.Context, id string) error {
1521 curC, err := c.CuratorClient()
1522 if err != nil {
1523 return err
1524 }
1525 mgmt := apb.NewManagementClient(curC)
1526 cur := ipb.NewCuratorClient(curC)
1527
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001528 logf("Cluster: %s: adding ConsensusMember", id)
Serge Bazanski37cfcc12024-03-21 11:59:07 +01001529 bo := backoff.NewExponentialBackOff()
1530 bo.MaxElapsedTime = 10 * time.Second
1531
1532 backoff.Retry(func() error {
1533 _, err = mgmt.UpdateNodeRoles(ctx, &apb.UpdateNodeRolesRequest{
1534 Node: &apb.UpdateNodeRolesRequest_Id{
1535 Id: id,
1536 },
Jan Schärd1a8b642024-12-03 17:40:41 +01001537 ConsensusMember: ptr.To(true),
Serge Bazanski37cfcc12024-03-21 11:59:07 +01001538 })
1539 if err != nil {
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001540 logf("Cluster: %s: UpdateNodeRoles failed: %v", id, err)
Serge Bazanski37cfcc12024-03-21 11:59:07 +01001541 }
1542 return err
1543 }, backoff.WithContext(bo, ctx))
1544 if err != nil {
1545 return err
1546 }
1547
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001548 logf("Cluster: %s: waiting for learner/full members...", id)
Serge Bazanski37cfcc12024-03-21 11:59:07 +01001549
1550 learner := false
1551 for {
1552 res, err := cur.GetConsensusStatus(ctx, &ipb.GetConsensusStatusRequest{})
1553 if err != nil {
1554 return fmt.Errorf("GetConsensusStatus: %w", err)
1555 }
1556 for _, member := range res.EtcdMember {
1557 if member.Id != id {
1558 continue
1559 }
1560 switch member.Status {
1561 case ipb.GetConsensusStatusResponse_EtcdMember_STATUS_LEARNER:
1562 if !learner {
1563 learner = true
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001564 logf("Cluster: %s: became a learner, waiting for full member...", id)
Serge Bazanski37cfcc12024-03-21 11:59:07 +01001565 }
1566 case ipb.GetConsensusStatusResponse_EtcdMember_STATUS_FULL:
Tim Windelschmidtd0cdb572025-03-27 17:18:39 +01001567 logf("Cluster: %s: became a full member", id)
Serge Bazanski37cfcc12024-03-21 11:59:07 +01001568 return nil
1569 }
1570 }
1571 time.Sleep(100 * time.Millisecond)
1572 }
1573}