m/test/launch: fail ROC on non-UNAVAILABLE errors
This makes RetrieveOwnerCertificate fail fast in tests if a non-transient
(i.e. non-UNAVAILABLE) error is encountered. I hit this while developing
something around the codebase and it took me way too long to figure out
why the e2e test was stalling.
This really calls for a pass over all retry loops to make sure we don't
get stuck like this. Perhaps we should formalize this, too.
Change-Id: I048f5ac79802330f789e67ba316bc38f04d83331
Reviewed-on: https://review.monogon.dev/c/monogon/+/531
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/metropolis/test/launch/cluster/BUILD.bazel b/metropolis/test/launch/cluster/BUILD.bazel
index abcdc07..8e829df 100644
--- a/metropolis/test/launch/cluster/BUILD.bazel
+++ b/metropolis/test/launch/cluster/BUILD.bazel
@@ -28,6 +28,8 @@
"@com_github_cenkalti_backoff_v4//:go_default_library",
"@com_github_grpc_ecosystem_go_grpc_middleware//retry:go_default_library",
"@org_golang_google_grpc//:go_default_library",
+ "@org_golang_google_grpc//codes:go_default_library",
+ "@org_golang_google_grpc//status:go_default_library",
"@org_golang_google_protobuf//proto:go_default_library",
"@org_uber_go_multierr//:go_default_library",
],
diff --git a/metropolis/test/launch/cluster/cluster.go b/metropolis/test/launch/cluster/cluster.go
index bd3a76a..8cc0a48 100644
--- a/metropolis/test/launch/cluster/cluster.go
+++ b/metropolis/test/launch/cluster/cluster.go
@@ -24,6 +24,8 @@
grpcretry "github.com/grpc-ecosystem/go-grpc-middleware/retry"
"go.uber.org/multierr"
"google.golang.org/grpc"
+ "google.golang.org/grpc/codes"
+ "google.golang.org/grpc/status"
"google.golang.org/protobuf/proto"
"source.monogon.dev/metropolis/node"
@@ -240,7 +242,7 @@
// cluster.
func getNodes(ctx context.Context, mgmt apb.ManagementClient) ([]*apb.Node, error) {
var res []*apb.Node
- bo := backoff.NewExponentialBackOff()
+ bo := backoff.WithContext(backoff.NewExponentialBackOff(), ctx)
err := backoff.Retry(func() error {
res = nil
srvN, err := mgmt.GetNodes(ctx, &apb.GetNodesRequest{})
@@ -416,7 +418,12 @@
var cert *tls.Certificate
err = backoff.Retry(func() error {
cert, err = rpc.RetrieveOwnerCertificate(ctxT, aaa, InsecurePrivateKey)
- return err
+ if st, ok := status.FromError(err); ok {
+ if st.Code() == codes.Unavailable {
+ return err
+ }
+ }
+ return backoff.Permanent(err)
}, backoff.WithContext(backoff.NewExponentialBackOff(), ctxT))
if err != nil {
ctxC()