m/n/core: retry node joining call indefinitely

Previously, a failed JoinNode call was not retried, which caused nodes to get stuck if anything on the network side was not working perfectly. Additionally, the call races the network runnable itself, making such a failure even more likely.

Bug: 128
Change-Id: I8c6847d6fb22a4527ca58def02cd5e994bd3dfff
Reviewed-on: https://review.monogon.dev/c/monogon/+/777
Tested-by: Jenkins CI
Reviewed-by: Mateusz Zalega <mateusz@monogon.tech>

commit: 83e8b6c897aaabb4230ae73a28bba0ed0aca039c [log] [tgz]
author: Lorenz Brun <lorenz@monogon.tech> Mon Jun 20 17:26:10 2022 +0000
committer: Lorenz Brun <lorenz@monogon.tech> Tue Jun 21 11:18:07 2022 +0000
tree: 03d2ff42dee689b5c735ce97a5cd13821c389c29
parent: 100e22fac40295424b76fcae5a05eddf0f25d345 [diff]
diff --git a/metropolis/node/core/cluster/cluster_join.go b/metropolis/node/core/cluster/cluster_join.go
index 0cb68bb..3349fb7 100644
--- a/metropolis/node/core/cluster/cluster_join.go
+++ b/metropolis/node/core/cluster/cluster_join.go

@@ -6,6 +6,7 @@
 	"crypto/x509"
 	"encoding/hex"
 	"fmt"
+	"time"
 
 	"google.golang.org/grpc"
 
@@ -53,9 +54,14 @@
 
 	// Join the cluster and use the newly obtained CUK to mount the data
 	// partition.
-	jr, err := cur.JoinNode(ctx, &ipb.JoinNodeRequest{})
-	if err != nil {
-		return fmt.Errorf("join call failed: %w", err)
+	var jr *ipb.JoinNodeResponse
+	for {
+		jr, err = cur.JoinNode(ctx, &ipb.JoinNodeRequest{})
+		if err == nil {
+			break
+		}
+		supervisor.Logger(ctx).Warningf("JoinNode call failed, retrying: %v", err)
+		time.Sleep(time.Second)
 	}
 	if err := m.storageRoot.Data.MountExisting(sc, jr.ClusterUnlockKey); err != nil {
 		return fmt.Errorf("while mounting Data: %w", err)
commit	83e8b6c897aaabb4230ae73a28bba0ed0aca039c	[log] [tgz]
author	Lorenz Brun <lorenz@monogon.tech>	Mon Jun 20 17:26:10 2022 +0000
committer	Lorenz Brun <lorenz@monogon.tech>	Tue Jun 21 11:18:07 2022 +0000
tree	03d2ff42dee689b5c735ce97a5cd13821c389c29
parent	100e22fac40295424b76fcae5a05eddf0f25d345 [diff]