m/test/e2e: deflake

Now that nodes don't heartbeat before they have critical ESP data persisted, we can simply make sure they heartbeat before any disruptive reboot and that will remove our biggest source of flakiness in E2E tests.

Change-Id: I9d4483015341157af6b27c8bd98f5df64da229d2
Reviewed-on: https://review.monogon.dev/c/monogon/+/1499
Tested-by: Jenkins CI
Reviewed-by: Leopold Schabel <leo@monogon.tech>

commit: 630fb5c5349b13330b7de6f8300b495b801db061 [log] [tgz]
author: Serge Bazanski <serge@monogon.tech> Thu Apr 06 10:50:24 2023 +0200
committer: Serge Bazanski <serge@monogon.tech> Thu Apr 06 09:55:51 2023 +0000
tree: 9d946ba0dd34a6ba0567f4f7120174797e29a8fe
parent: 1fb2b10801eb4ea56a1e00f174923ec83f039623 [diff] [blame]
diff --git a/metropolis/test/launch/cluster/cluster.go b/metropolis/test/launch/cluster/cluster.go
index c433780..0efd08b 100644
--- a/metropolis/test/launch/cluster/cluster.go
+++ b/metropolis/test/launch/cluster/cluster.go

@@ -18,6 +18,7 @@
 	"os/exec"
 	"path"
 	"path/filepath"
+	"strings"
 	"syscall"
 	"time"
 
@@ -1099,3 +1100,28 @@
 	}
 	return res, nil
 }
+
+func (c *Cluster) AllNodesHealthy(ctx context.Context) error {
+	// Get an authenticated owner client within the cluster.
+	curC, err := c.CuratorClient()
+	if err != nil {
+		return err
+	}
+	mgmt := apb.NewManagementClient(curC)
+	nodes, err := getNodes(ctx, mgmt)
+	if err != nil {
+		return err
+	}
+
+	var unhealthy []string
+	for _, node := range nodes {
+		if node.Health == apb.Node_HEALTHY {
+			continue
+		}
+		unhealthy = append(unhealthy, node.Id)
+	}
+	if len(unhealthy) == 0 {
+		return nil
+	}
+	return fmt.Errorf("nodes unhealthy: %s", strings.Join(unhealthy, ", "))
+}
commit	630fb5c5349b13330b7de6f8300b495b801db061	[log] [tgz]
author	Serge Bazanski <serge@monogon.tech>	Thu Apr 06 10:50:24 2023 +0200
committer	Serge Bazanski <serge@monogon.tech>	Thu Apr 06 09:55:51 2023 +0000
tree	9d946ba0dd34a6ba0567f4f7120174797e29a8fe
parent	1fb2b10801eb4ea56a1e00f174923ec83f039623 [diff] [blame]