m/test/e2e: use concise-style logging

Formatting our test harness output to look like LogEntry.ConciseString()
makes the test logs significantly more readable.
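
For illustration, a message logged through the new launch.Log helper
renders roughly like this (component name right-aligned in a 20-column
field; continuation lines of multi-line messages use "|" instead of "!"):

  TT|          test launch ! Cluster: retrieved owner certificate.
  TT|                      | (continuation of a multi-line message)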

Change-Id: I0b1eab6a5a837bb2001f3b32779c23df2feaa381
Reviewed-on: https://review.monogon.dev/c/monogon/+/1362
Reviewed-by: Leopold Schabel <leo@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/test/launch/BUILD.bazel b/metropolis/test/launch/BUILD.bazel
index ff476c6..91c3b3d 100644
--- a/metropolis/test/launch/BUILD.bazel
+++ b/metropolis/test/launch/BUILD.bazel
@@ -2,7 +2,10 @@
 
 go_library(
     name = "launch",
-    srcs = ["launch.go"],
+    srcs = [
+        "launch.go",
+        "log.go",
+    ],
     importpath = "source.monogon.dev/metropolis/test/launch",
     visibility = ["//metropolis:__subpackages__"],
     deps = [
diff --git a/metropolis/test/launch/cluster/cluster.go b/metropolis/test/launch/cluster/cluster.go
index bd71f06..a0ad82f 100644
--- a/metropolis/test/launch/cluster/cluster.go
+++ b/metropolis/test/launch/cluster/cluster.go
@@ -12,7 +12,6 @@
 	"errors"
 	"fmt"
 	"io"
-	"log"
 	"net"
 	"os"
 	"os/exec"
@@ -125,7 +124,7 @@
 		return nil, fmt.Errorf("while resolving a path: %w", err)
 	}
 	di := filepath.Join(stdp, filepath.Base(si))
-	log.Printf("Cluster: copying the node image: %s -> %s", si, di)
+	launch.Log("Cluster: copying node image: %s -> %s", si, di)
 	if err := copyFile(si, di); err != nil {
 		return nil, fmt.Errorf("while copying the node image: %w", err)
 	}
@@ -183,7 +182,7 @@
 	if c.authClient == nil {
 		authCreds := rpc.NewAuthenticatedCredentials(c.Owner, nil)
 		r := resolver.New(c.ctxT, resolver.WithLogger(func(f string, args ...interface{}) {
-			log.Printf("Cluster: client resolver: %s", fmt.Sprintf(f, args...))
+			launch.Log("Cluster: client resolver: %s", fmt.Sprintf(f, args...))
 		}))
 		for _, n := range c.NodeIDs {
 			ep, err := resolver.NodeWithDefaultPort(n)
@@ -357,11 +356,11 @@
 
 	// Stop TPM emulator and wait for it to exit to properly reap the child process
 	tpmCancel()
-	log.Print("Node: Waiting for TPM emulator to exit")
+	launch.Log("Node: Waiting for TPM emulator to exit")
 	// Wait returns a SIGKILL error because we just cancelled its context.
 	// We still need to call it to avoid creating zombies.
 	_ = tpmEmuCmd.Wait()
-	log.Print("Node: TPM emulator done")
+	launch.Log("Node: TPM emulator done")
 
 	var exerr *exec.ExitError
 	if err != nil && errors.As(err, &exerr) {
@@ -561,14 +560,14 @@
 
 	// Retrieve owner certificate - this can take a while because the node is still
 	// coming up, so do it in a backoff loop.
-	log.Printf("Cluster: retrieving owner certificate (this can take a few seconds while the first node boots)...")
+	launch.Log("Cluster: retrieving owner certificate (this can take a few seconds while the first node boots)...")
 	aaa := apb.NewAAAClient(initClient)
 	var cert *tls.Certificate
 	err = backoff.Retry(func() error {
 		cert, err = rpc.RetrieveOwnerCertificate(ctx, aaa, InsecurePrivateKey)
 		if st, ok := status.FromError(err); ok {
 			if st.Code() == codes.Unavailable {
-				log.Printf("Cluster: cluster UNAVAILABLE: %v", st.Message())
+				launch.Log("Cluster: cluster UNAVAILABLE: %v", st.Message())
 				return err
 			}
 		}
@@ -577,7 +576,7 @@
 	if err != nil {
 		return nil, nil, fmt.Errorf("couldn't retrieve owner certificate: %w", err)
 	}
-	log.Printf("Cluster: retrieved owner certificate.")
+	launch.Log("Cluster: retrieved owner certificate.")
 
 	// Now connect authenticated and get the node ID.
 	creds := rpc.NewAuthenticatedCredentials(*cert, nil)
@@ -676,11 +675,11 @@
 
 	// Start the first node.
 	ctxT, ctxC := context.WithCancel(ctx)
-	log.Printf("Cluster: Starting node %d...", 1)
+	launch.Log("Cluster: Starting node %d...", 1)
 	go func() {
 		err := LaunchNode(ctxT, ld, sd, &nodeOpts[0])
 		if err != nil {
-			log.Printf("Node %d finished with an error: %v", 1, err)
+			launch.Log("Node %d finished with an error: %v", 1, err)
 		}
 		done[0] <- err
 	}()
@@ -703,7 +702,7 @@
 			PcapDump:               path.Join(ld, "nanoswitch.pcap"),
 		}); err != nil {
 			if !errors.Is(err, ctxT.Err()) {
-				log.Fatalf("Failed to launch nanoswitch: %v", err)
+				launch.Fatal("Failed to launch nanoswitch: %v", err)
 			}
 		}
 	}()
@@ -757,14 +756,14 @@
 	mgmt := apb.NewManagementClient(curC)
 
 	// Retrieve register ticket to register further nodes.
-	log.Printf("Cluster: retrieving register ticket...")
+	launch.Log("Cluster: retrieving register ticket...")
 	resT, err := mgmt.GetRegisterTicket(ctx, &apb.GetRegisterTicketRequest{})
 	if err != nil {
 		ctxC()
 		return nil, fmt.Errorf("GetRegisterTicket: %w", err)
 	}
 	ticket := resT.Ticket
-	log.Printf("Cluster: retrieved register ticket (%d bytes).", len(ticket))
+	launch.Log("Cluster: retrieved register ticket (%d bytes).", len(ticket))
 
 	// Retrieve cluster info (for directory and ca public key) to register further
 	// nodes.
@@ -796,17 +795,17 @@
 	//
 	// TODO(q3k): parallelize this
 	for i := 1; i < opts.NumNodes; i++ {
-		log.Printf("Cluster: Starting node %d...", i+1)
+		launch.Log("Cluster: Starting node %d...", i+1)
 		go func(i int) {
 			err := LaunchNode(ctxT, ld, sd, &nodeOpts[i])
 			if err != nil {
-				log.Printf("Node %d finished with an error: %v", i, err)
+				launch.Log("Node %d finished with an error: %v", i, err)
 			}
 			done[i] <- err
 		}(i)
 		var newNode *apb.Node
 
-		log.Printf("Cluster: waiting for node %d to appear as NEW...", i)
+		launch.Log("Cluster: waiting for node %d to appear as NEW...", i)
 		for {
 			nodes, err := getNodes(ctx, mgmt)
 			if err != nil {
@@ -825,9 +824,9 @@
 			time.Sleep(1 * time.Second)
 		}
 		id := identity.NodeID(newNode.Pubkey)
-		log.Printf("Cluster: node %d is %s", i, id)
+		launch.Log("Cluster: node %d is %s", i, id)
 
-		log.Printf("Cluster: approving node %d", i)
+		launch.Log("Cluster: approving node %d", i)
 		_, err := mgmt.ApproveNode(ctx, &apb.ApproveNodeRequest{
 			Pubkey: newNode.Pubkey,
 		})
@@ -835,7 +834,7 @@
 			ctxC()
 			return nil, fmt.Errorf("ApproveNode(%s): %w", id, err)
 		}
-		log.Printf("Cluster: node %d approved, waiting for it to appear as UP and with a network address...", i)
+		launch.Log("Cluster: node %d approved, waiting for it to appear as UP and with a network address...", i)
 		for {
 			nodes, err := getNodes(ctx, mgmt)
 			if err != nil {
@@ -866,12 +865,12 @@
 			}
 			time.Sleep(time.Second)
 		}
-		log.Printf("Cluster: node %d (%s) UP!", i, id)
+		launch.Log("Cluster: node %d (%s) UP!", i, id)
 	}
 
-	log.Printf("Cluster: all nodes up:")
+	launch.Log("Cluster: all nodes up:")
 	for _, node := range cluster.Nodes {
-		log.Printf("Cluster:  - %s at %s", node.ID, node.ManagementAddress)
+		launch.Log("Cluster:  - %s at %s", node.ID, node.ManagementAddress)
 	}
 
 	return cluster, nil
@@ -914,18 +913,18 @@
 
 	// Cancel the node's context. This will shut down QEMU.
 	c.nodeOpts[idx].Runtime.CtxC()
-	log.Printf("Cluster: waiting for node %d (%s) to stop.", idx, id)
+	launch.Log("Cluster: waiting for node %d (%s) to stop.", idx, id)
 	err = <-c.nodesDone[idx]
 	if err != nil {
 		return fmt.Errorf("while restarting node: %w", err)
 	}
 
 	// Start QEMU again.
-	log.Printf("Cluster: restarting node %d (%s).", idx, id)
+	launch.Log("Cluster: restarting node %d (%s).", idx, id)
 	go func(n int) {
 		err := LaunchNode(c.ctxT, c.launchDir, c.socketDir, &c.nodeOpts[n])
 		if err != nil {
-			log.Printf("Node %d finished with an error: %v", n, err)
+			launch.Log("Node %d finished with an error: %v", n, err)
 		}
 		c.nodesDone[n] <- err
 	}(idx)
@@ -934,10 +933,10 @@
 	for {
 		cs, err := getNode(ctx, mgmt, id)
 		if err != nil {
-			log.Printf("Cluster: node get error: %v", err)
+			launch.Log("Cluster: node get error: %v", err)
 			return err
 		}
-		log.Printf("Cluster: node status: %+v", cs)
+		launch.Log("Cluster: node status: %+v", cs)
 		if cs.Status == nil {
 			continue
 		}
@@ -946,7 +945,7 @@
 		}
 		time.Sleep(time.Second)
 	}
-	log.Printf("Cluster: node %d (%s) has rejoined the cluster.", idx, id)
+	launch.Log("Cluster: node %d (%s) has rejoined the cluster.", idx, id)
 	return nil
 }
 
@@ -954,24 +953,24 @@
 // nodes to stop. It returns an error if stopping the nodes failed, or one of
 // the nodes failed to fully start in the first place.
 func (c *Cluster) Close() error {
-	log.Printf("Cluster: stopping...")
+	launch.Log("Cluster: stopping...")
 	if c.authClient != nil {
 		c.authClient.Close()
 	}
 	c.ctxC()
 
 	var errs []error
-	log.Printf("Cluster: waiting for nodes to exit...")
+	launch.Log("Cluster: waiting for nodes to exit...")
 	for _, c := range c.nodesDone {
 		err := <-c
 		if err != nil {
 			errs = append(errs, err)
 		}
 	}
-	log.Printf("Cluster: removing nodes' state files.")
+	launch.Log("Cluster: removing nodes' state files.")
 	os.RemoveAll(c.launchDir)
 	os.RemoveAll(c.socketDir)
-	log.Printf("Cluster: done")
+	launch.Log("Cluster: done")
 	return multierr.Combine(errs...)
 }
 
@@ -981,8 +980,7 @@
 //
 // For example:
 //
-//   grpc.Dial("metropolis-deadbeef:1234", grpc.WithContextDialer(c.DialNode))
-//
+//	grpc.Dial("metropolis-deadbeef:1234", grpc.WithContextDialer(c.DialNode))
 func (c *Cluster) DialNode(_ context.Context, addr string) (net.Conn, error) {
 	host, port, err := net.SplitHostPort(addr)
 	if err != nil {
diff --git a/metropolis/test/launch/cluster/prefixed_stdio.go b/metropolis/test/launch/cluster/prefixed_stdio.go
index 059c3cb..3ea3e18 100644
--- a/metropolis/test/launch/cluster/prefixed_stdio.go
+++ b/metropolis/test/launch/cluster/prefixed_stdio.go
@@ -22,7 +22,9 @@
 func newPrefixedStdio(num int) prefixedStdio {
 	return prefixedStdio{
 		logbuffer.NewLineBuffer(1024, func(l *logbuffer.Line) {
-			s := strings.TrimSpace(l.String())
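+			// Unlike TrimSpace, keep leading spaces so indentation is preserved.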
+			s := strings.TrimRight(l.String(), " \t\n\r")
+			s = strings.TrimLeft(s, "\n\r\t")
 			// TODO(q3k): don't just skip lines containing escape sequences, strip the
 			// sequences out. Or stop parsing qemu logs and instead dial log endpoint in
 			// spawned nodes.
diff --git a/metropolis/test/launch/launch.go b/metropolis/test/launch/launch.go
index e8df009..9760f54 100644
--- a/metropolis/test/launch/launch.go
+++ b/metropolis/test/launch/launch.go
@@ -23,7 +23,6 @@
 	"errors"
 	"fmt"
 	"io"
-	"log"
 	"net"
 	"os"
 	"os/exec"
@@ -67,7 +66,7 @@
 			argsFmt += " "
 		}
 	}
-	log.Printf("Running %s:\n  %s\n", name, argsFmt)
+	Log("Running %s:\n  %s\n", name, argsFmt)
 }
 
 // PortMap represents where VM ports are mapped to on the host. It maps from the VM
diff --git a/metropolis/test/launch/log.go b/metropolis/test/launch/log.go
new file mode 100644
index 0000000..2637e24
--- /dev/null
+++ b/metropolis/test/launch/log.go
@@ -0,0 +1,31 @@
+package launch
+
+import (
+	"fmt"
+	"os"
+	"strings"
+)
+
+// Log emits a message formatted to match the LogEntry.ConciseString output
+// used in the Metropolis console log, making unified test logs more readable.
+func Log(f string, args ...any) {
+	formatted := fmt.Sprintf(f, args...)
+	for i, line := range strings.Split(formatted, "\n") {
+		if len(line) == 0 {
+			continue
+		}
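+		// First line: component name right-aligned in a 20-column field.
+		// Continuation lines: empty name column, with "|" instead of "!".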
+		if i == 0 {
+			fmt.Printf("TT| %20s ! %s\n", "test launch", line)
+		} else {
+			fmt.Printf("TT| %20s | %s\n", "", line)
+		}
+	}
+}
+
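+// Fatal logs a message like Log, then exits the process with status 1.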
+func Fatal(f string, args ...any) {
+	Log(f, args...)
+	os.Exit(1)
+}