m/test/e2e: deflake

Now that nodes don't heartbeat before they have critical ESP data
persisted, we can simply make sure they heartbeat before any disruptive
reboot and that will remove our biggest source of flakiness in E2E
tests.

Change-Id: I9d4483015341157af6b27c8bd98f5df64da229d2
Reviewed-on: https://review.monogon.dev/c/monogon/+/1499
Tested-by: Jenkins CI
Reviewed-by: Leopold Schabel <leo@monogon.tech>
diff --git a/metropolis/test/launch/cluster/cluster.go b/metropolis/test/launch/cluster/cluster.go
index c433780..0efd08b 100644
--- a/metropolis/test/launch/cluster/cluster.go
+++ b/metropolis/test/launch/cluster/cluster.go
@@ -18,6 +18,7 @@
 	"os/exec"
 	"path"
 	"path/filepath"
+	"strings"
 	"syscall"
 	"time"
 
@@ -1099,3 +1100,28 @@
 	}
 	return res, nil
 }
+
+// AllNodesHealthy returns nil when every node returned by getNodes reports
+// Node_HEALTHY, and otherwise an error naming the IDs of the unhealthy nodes.
+func (c *Cluster) AllNodesHealthy(ctx context.Context) error {
+	// Get an authenticated owner client within the cluster.
+	curC, err := c.CuratorClient()
+	if err != nil {
+		return fmt.Errorf("getting curator client: %w", err)
+	}
+	nodes, err := getNodes(ctx, apb.NewManagementClient(curC))
+	if err != nil {
+		return fmt.Errorf("listing nodes: %w", err)
+	}
+
+	var unhealthy []string
+	for _, node := range nodes {
+		if node.Health != apb.Node_HEALTHY {
+			unhealthy = append(unhealthy, node.Id)
+		}
+	}
+	if len(unhealthy) > 0 {
+		return fmt.Errorf("nodes unhealthy: %s", strings.Join(unhealthy, ", "))
+	}
+	return nil
+}