m/test/e2e: use concise-style logging
Formatting our test logs like LogEntry.ConciseString() output makes them
significantly more readable.
Change-Id: I0b1eab6a5a837bb2001f3b32779c23df2feaa381
Reviewed-on: https://review.monogon.dev/c/monogon/+/1362
Reviewed-by: Leopold Schabel <leo@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/test/e2e/BUILD.bazel b/metropolis/test/e2e/BUILD.bazel
index 2d227e1..8853554 100644
--- a/metropolis/test/e2e/BUILD.bazel
+++ b/metropolis/test/e2e/BUILD.bazel
@@ -35,6 +35,7 @@
"//metropolis/node/core/identity",
"//metropolis/node/core/rpc",
"//metropolis/proto/api",
+ "//metropolis/test/launch",
"//metropolis/test/launch/cluster",
"//metropolis/test/util",
"@io_k8s_api//core/v1:core",
diff --git a/metropolis/test/e2e/main_test.go b/metropolis/test/e2e/main_test.go
index 7ee63d0..c3f6a3c 100644
--- a/metropolis/test/e2e/main_test.go
+++ b/metropolis/test/e2e/main_test.go
@@ -21,7 +21,6 @@
"errors"
"fmt"
"io"
- "log"
"net"
"net/http"
_ "net/http"
@@ -41,6 +40,7 @@
"source.monogon.dev/metropolis/node/core/identity"
"source.monogon.dev/metropolis/node/core/rpc"
apb "source.monogon.dev/metropolis/proto/api"
+ "source.monogon.dev/metropolis/test/launch"
"source.monogon.dev/metropolis/test/launch/cluster"
"source.monogon.dev/metropolis/test/util"
)
@@ -70,12 +70,12 @@
pprofListen, err := net.ListenTCP("tcp", addr)
if err != nil {
- log.Fatalf("Failed to listen on pprof port: %s", pprofListen.Addr())
+ launch.Fatal("Failed to listen on pprof port: %v", err)
}
- log.Printf("E2E: pprof server listening on %s", pprofListen.Addr())
+ launch.Log("E2E: pprof server listening on %s", pprofListen.Addr())
go func() {
- log.Printf("E2E: pprof server returned an error: %v", http.Serve(pprofListen, nil))
+ launch.Log("E2E: pprof server returned an error: %v", http.Serve(pprofListen, nil))
pprofListen.Close()
}()
@@ -98,7 +98,7 @@
}
}()
- log.Printf("E2E: Cluster running, starting tests...")
+ launch.Log("E2E: Cluster running, starting tests...")
// Dial first node's curator.
creds := rpc.NewAuthenticatedCredentials(cluster.Owner, nil)
diff --git a/metropolis/test/launch/BUILD.bazel b/metropolis/test/launch/BUILD.bazel
index ff476c6..91c3b3d 100644
--- a/metropolis/test/launch/BUILD.bazel
+++ b/metropolis/test/launch/BUILD.bazel
@@ -2,7 +2,10 @@
go_library(
name = "launch",
- srcs = ["launch.go"],
+ srcs = [
+ "launch.go",
+ "log.go",
+ ],
importpath = "source.monogon.dev/metropolis/test/launch",
visibility = ["//metropolis:__subpackages__"],
deps = [
diff --git a/metropolis/test/launch/cluster/cluster.go b/metropolis/test/launch/cluster/cluster.go
index bd71f06..a0ad82f 100644
--- a/metropolis/test/launch/cluster/cluster.go
+++ b/metropolis/test/launch/cluster/cluster.go
@@ -12,7 +12,6 @@
"errors"
"fmt"
"io"
- "log"
"net"
"os"
"os/exec"
@@ -125,7 +124,7 @@
return nil, fmt.Errorf("while resolving a path: %w", err)
}
di := filepath.Join(stdp, filepath.Base(si))
- log.Printf("Cluster: copying the node image: %s -> %s", si, di)
+ launch.Log("Cluster: copying node image: %s -> %s", si, di)
if err := copyFile(si, di); err != nil {
return nil, fmt.Errorf("while copying the node image: %w", err)
}
@@ -183,7 +182,7 @@
if c.authClient == nil {
authCreds := rpc.NewAuthenticatedCredentials(c.Owner, nil)
r := resolver.New(c.ctxT, resolver.WithLogger(func(f string, args ...interface{}) {
- log.Printf("Cluster: client resolver: %s", fmt.Sprintf(f, args...))
+ launch.Log("Cluster: client resolver: %s", fmt.Sprintf(f, args...))
}))
for _, n := range c.NodeIDs {
ep, err := resolver.NodeWithDefaultPort(n)
@@ -357,11 +356,11 @@
// Stop TPM emulator and wait for it to exit to properly reap the child process
tpmCancel()
- log.Print("Node: Waiting for TPM emulator to exit")
+ launch.Log("Node: Waiting for TPM emulator to exit")
// Wait returns a SIGKILL error because we just cancelled its context.
// We still need to call it to avoid creating zombies.
_ = tpmEmuCmd.Wait()
- log.Print("Node: TPM emulator done")
+ launch.Log("Node: TPM emulator done")
var exerr *exec.ExitError
if err != nil && errors.As(err, &exerr) {
@@ -561,14 +560,14 @@
// Retrieve owner certificate - this can take a while because the node is still
// coming up, so do it in a backoff loop.
- log.Printf("Cluster: retrieving owner certificate (this can take a few seconds while the first node boots)...")
+ launch.Log("Cluster: retrieving owner certificate (this can take a few seconds while the first node boots)...")
aaa := apb.NewAAAClient(initClient)
var cert *tls.Certificate
err = backoff.Retry(func() error {
cert, err = rpc.RetrieveOwnerCertificate(ctx, aaa, InsecurePrivateKey)
if st, ok := status.FromError(err); ok {
if st.Code() == codes.Unavailable {
- log.Printf("Cluster: cluster UNAVAILABLE: %v", st.Message())
+ launch.Log("Cluster: cluster UNAVAILABLE: %v", st.Message())
return err
}
}
@@ -577,7 +576,7 @@
if err != nil {
return nil, nil, fmt.Errorf("couldn't retrieve owner certificate: %w", err)
}
- log.Printf("Cluster: retrieved owner certificate.")
+ launch.Log("Cluster: retrieved owner certificate.")
// Now connect authenticated and get the node ID.
creds := rpc.NewAuthenticatedCredentials(*cert, nil)
@@ -676,11 +675,11 @@
// Start the first node.
ctxT, ctxC := context.WithCancel(ctx)
- log.Printf("Cluster: Starting node %d...", 1)
+ launch.Log("Cluster: Starting node %d...", 1)
go func() {
err := LaunchNode(ctxT, ld, sd, &nodeOpts[0])
if err != nil {
- log.Printf("Node %d finished with an error: %v", 1, err)
+ launch.Log("Node %d finished with an error: %v", 1, err)
}
done[0] <- err
}()
@@ -703,7 +702,7 @@
PcapDump: path.Join(ld, "nanoswitch.pcap"),
}); err != nil {
if !errors.Is(err, ctxT.Err()) {
- log.Fatalf("Failed to launch nanoswitch: %v", err)
+ launch.Fatal("Failed to launch nanoswitch: %v", err)
}
}
}()
@@ -757,14 +756,14 @@
mgmt := apb.NewManagementClient(curC)
// Retrieve register ticket to register further nodes.
- log.Printf("Cluster: retrieving register ticket...")
+ launch.Log("Cluster: retrieving register ticket...")
resT, err := mgmt.GetRegisterTicket(ctx, &apb.GetRegisterTicketRequest{})
if err != nil {
ctxC()
return nil, fmt.Errorf("GetRegisterTicket: %w", err)
}
ticket := resT.Ticket
- log.Printf("Cluster: retrieved register ticket (%d bytes).", len(ticket))
+ launch.Log("Cluster: retrieved register ticket (%d bytes).", len(ticket))
// Retrieve cluster info (for directory and ca public key) to register further
// nodes.
@@ -796,17 +795,17 @@
//
// TODO(q3k): parallelize this
for i := 1; i < opts.NumNodes; i++ {
- log.Printf("Cluster: Starting node %d...", i+1)
+ launch.Log("Cluster: Starting node %d...", i+1)
go func(i int) {
err := LaunchNode(ctxT, ld, sd, &nodeOpts[i])
if err != nil {
- log.Printf("Node %d finished with an error: %v", i, err)
+ launch.Log("Node %d finished with an error: %v", i, err)
}
done[i] <- err
}(i)
var newNode *apb.Node
- log.Printf("Cluster: waiting for node %d to appear as NEW...", i)
+ launch.Log("Cluster: waiting for node %d to appear as NEW...", i)
for {
nodes, err := getNodes(ctx, mgmt)
if err != nil {
@@ -825,9 +824,9 @@
time.Sleep(1 * time.Second)
}
id := identity.NodeID(newNode.Pubkey)
- log.Printf("Cluster: node %d is %s", i, id)
+ launch.Log("Cluster: node %d is %s", i, id)
- log.Printf("Cluster: approving node %d", i)
+ launch.Log("Cluster: approving node %d", i)
_, err := mgmt.ApproveNode(ctx, &apb.ApproveNodeRequest{
Pubkey: newNode.Pubkey,
})
@@ -835,7 +834,7 @@
ctxC()
return nil, fmt.Errorf("ApproveNode(%s): %w", id, err)
}
- log.Printf("Cluster: node %d approved, waiting for it to appear as UP and with a network address...", i)
+ launch.Log("Cluster: node %d approved, waiting for it to appear as UP and with a network address...", i)
for {
nodes, err := getNodes(ctx, mgmt)
if err != nil {
@@ -866,12 +865,12 @@
}
time.Sleep(time.Second)
}
- log.Printf("Cluster: node %d (%s) UP!", i, id)
+ launch.Log("Cluster: node %d (%s) UP!", i, id)
}
- log.Printf("Cluster: all nodes up:")
+ launch.Log("Cluster: all nodes up:")
for _, node := range cluster.Nodes {
- log.Printf("Cluster: - %s at %s", node.ID, node.ManagementAddress)
+ launch.Log("Cluster: - %s at %s", node.ID, node.ManagementAddress)
}
return cluster, nil
@@ -914,18 +913,18 @@
// Cancel the node's context. This will shut down QEMU.
c.nodeOpts[idx].Runtime.CtxC()
- log.Printf("Cluster: waiting for node %d (%s) to stop.", idx, id)
+ launch.Log("Cluster: waiting for node %d (%s) to stop.", idx, id)
err = <-c.nodesDone[idx]
if err != nil {
return fmt.Errorf("while restarting node: %w", err)
}
// Start QEMU again.
- log.Printf("Cluster: restarting node %d (%s).", idx, id)
+ launch.Log("Cluster: restarting node %d (%s).", idx, id)
go func(n int) {
err := LaunchNode(c.ctxT, c.launchDir, c.socketDir, &c.nodeOpts[n])
if err != nil {
- log.Printf("Node %d finished with an error: %v", n, err)
+ launch.Log("Node %d finished with an error: %v", n, err)
}
c.nodesDone[n] <- err
}(idx)
@@ -934,10 +933,10 @@
for {
cs, err := getNode(ctx, mgmt, id)
if err != nil {
- log.Printf("Cluster: node get error: %v", err)
+ launch.Log("Cluster: node get error: %v", err)
return err
}
- log.Printf("Cluster: node status: %+v", cs)
+ launch.Log("Cluster: node status: %+v", cs)
if cs.Status == nil {
continue
}
@@ -946,7 +945,7 @@
}
time.Sleep(time.Second)
}
- log.Printf("Cluster: node %d (%s) has rejoined the cluster.", idx, id)
+ launch.Log("Cluster: node %d (%s) has rejoined the cluster.", idx, id)
return nil
}
@@ -954,24 +953,24 @@
// nodes to stop. It returns an error if stopping the nodes failed, or one of
// the nodes failed to fully start in the first place.
func (c *Cluster) Close() error {
- log.Printf("Cluster: stopping...")
+ launch.Log("Cluster: stopping...")
if c.authClient != nil {
c.authClient.Close()
}
c.ctxC()
var errs []error
- log.Printf("Cluster: waiting for nodes to exit...")
+ launch.Log("Cluster: waiting for nodes to exit...")
for _, c := range c.nodesDone {
err := <-c
if err != nil {
errs = append(errs, err)
}
}
- log.Printf("Cluster: removing nodes' state files.")
+ launch.Log("Cluster: removing nodes' state files.")
os.RemoveAll(c.launchDir)
os.RemoveAll(c.socketDir)
- log.Printf("Cluster: done")
+ launch.Log("Cluster: done")
return multierr.Combine(errs...)
}
@@ -981,8 +980,7 @@
//
// For example:
//
-// grpc.Dial("metropolis-deadbeef:1234", grpc.WithContextDialer(c.DialNode))
-//
+// grpc.Dial("metropolis-deadbeef:1234", grpc.WithContextDialer(c.DialNode))
func (c *Cluster) DialNode(_ context.Context, addr string) (net.Conn, error) {
host, port, err := net.SplitHostPort(addr)
if err != nil {
diff --git a/metropolis/test/launch/cluster/prefixed_stdio.go b/metropolis/test/launch/cluster/prefixed_stdio.go
index 059c3cb..3ea3e18 100644
--- a/metropolis/test/launch/cluster/prefixed_stdio.go
+++ b/metropolis/test/launch/cluster/prefixed_stdio.go
@@ -22,7 +22,8 @@
func newPrefixedStdio(num int) prefixedStdio {
return prefixedStdio{
logbuffer.NewLineBuffer(1024, func(l *logbuffer.Line) {
- s := strings.TrimSpace(l.String())
+ s := strings.TrimRight(l.String(), " \t\n\r")
+ s = strings.TrimLeft(s, "\n\r\t")
// TODO(q3k): don't just skip lines containing escape sequences, strip the
// sequences out. Or stop parsing qemu logs and instead dial log endpoint in
// spawned nodes.
diff --git a/metropolis/test/launch/launch.go b/metropolis/test/launch/launch.go
index e8df009..9760f54 100644
--- a/metropolis/test/launch/launch.go
+++ b/metropolis/test/launch/launch.go
@@ -23,7 +23,6 @@
"errors"
"fmt"
"io"
- "log"
"net"
"os"
"os/exec"
@@ -67,7 +66,7 @@
argsFmt += " "
}
}
- log.Printf("Running %s:\n %s\n", name, argsFmt)
+ Log("Running %s:\n %s\n", name, argsFmt)
}
// PortMap represents where VM ports are mapped to on the host. It maps from the VM
diff --git a/metropolis/test/launch/log.go b/metropolis/test/launch/log.go
new file mode 100644
index 0000000..2637e24
--- /dev/null
+++ b/metropolis/test/launch/log.go
@@ -0,0 +1,28 @@
+package launch
+
+import (
+ "fmt"
+ "os"
+ "strings"
+)
+
+// Log formats and prints a message in a style compatible with
+// LogEntry.ConciseString as used in the Metropolis console log, making
+func Log(f string, args ...any) {
+ formatted := fmt.Sprintf(f, args...)
+ for i, line := range strings.Split(formatted, "\n") {
+ if len(line) == 0 {
+ continue
+ }
+ if i == 0 {
+ fmt.Printf("TT| %20s ! %s\n", "test launch", line)
+ } else {
+ fmt.Printf("TT| %20s | %s\n", "", line)
+ }
+ }
+}
+
+func Fatal(f string, args ...any) {
+ Log(f, args...)
+ os.Exit(1)
+}
diff --git a/metropolis/test/util/BUILD.bazel b/metropolis/test/util/BUILD.bazel
index 7caf53b..16ed382 100644
--- a/metropolis/test/util/BUILD.bazel
+++ b/metropolis/test/util/BUILD.bazel
@@ -5,4 +5,5 @@
srcs = ["runners.go"],
importpath = "source.monogon.dev/metropolis/test/util",
visibility = ["//metropolis:__subpackages__"],
+ deps = ["//metropolis/test/launch"],
)
diff --git a/metropolis/test/util/runners.go b/metropolis/test/util/runners.go
index 47bf59f..a2b1663 100644
--- a/metropolis/test/util/runners.go
+++ b/metropolis/test/util/runners.go
@@ -7,21 +7,25 @@
"errors"
"testing"
"time"
+
+ "source.monogon.dev/metropolis/test/launch"
)
// TestEventual creates a new subtest looping the given function until it
// either doesn't return an error anymore or the timeout is exceeded. The last
// returned non-context-related error is being used as the test error.
func TestEventual(t *testing.T, name string, ctx context.Context, timeout time.Duration, f func(context.Context) error) {
+ start := time.Now()
ctx, cancel := context.WithTimeout(ctx, timeout)
t.Helper()
+ launch.Log("Test: %s: starting...", name)
t.Run(name, func(t *testing.T) {
defer cancel()
var lastErr = errors.New("test didn't run to completion at least once")
- t.Parallel()
for {
err := f(ctx)
if err == nil {
+ launch.Log("Test: %s: okay after %.1f seconds", name, time.Since(start).Seconds())
return
}
if err == ctx.Err() {