m/test/launch: fail ROC on non-UNAVAILABLE errors

This makes the RetrieveOwnerCertificate retry loop fail fast in tests if
some non-transient (i.e. non-UNAVAILABLE) error is encountered. I hit
this while developing something around the codebase and it took me way
too long to figure out why the e2e test was stalling.

This really calls for a pass over all retry loops to make sure we don't
get stuck like this. Perhaps we should formalize this, too.

Change-Id: I048f5ac79802330f789e67ba316bc38f04d83331
Reviewed-on: https://review.monogon.dev/c/monogon/+/531
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/metropolis/test/launch/cluster/cluster.go b/metropolis/test/launch/cluster/cluster.go
index bd3a76a..8cc0a48 100644
--- a/metropolis/test/launch/cluster/cluster.go
+++ b/metropolis/test/launch/cluster/cluster.go
@@ -24,6 +24,8 @@
 	grpcretry "github.com/grpc-ecosystem/go-grpc-middleware/retry"
 	"go.uber.org/multierr"
 	"google.golang.org/grpc"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
 	"google.golang.org/protobuf/proto"
 
 	"source.monogon.dev/metropolis/node"
@@ -240,7 +242,7 @@
 // cluster.
 func getNodes(ctx context.Context, mgmt apb.ManagementClient) ([]*apb.Node, error) {
 	var res []*apb.Node
-	bo := backoff.NewExponentialBackOff()
+	bo := backoff.WithContext(backoff.NewExponentialBackOff(), ctx)
 	err := backoff.Retry(func() error {
 		res = nil
 		srvN, err := mgmt.GetNodes(ctx, &apb.GetNodesRequest{})
@@ -416,7 +418,12 @@
 	var cert *tls.Certificate
 	err = backoff.Retry(func() error {
 		cert, err = rpc.RetrieveOwnerCertificate(ctxT, aaa, InsecurePrivateKey)
-		return err
+		if st, ok := status.FromError(err); ok {
+			if st.Code() == codes.Unavailable {
+				return err
+			}
+		}
+		return backoff.Permanent(err)
 	}, backoff.WithContext(backoff.NewExponentialBackOff(), ctxT))
 	if err != nil {
 		ctxC()