m/n/core: retry node joining call indefinitely

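Previously, a single failed JoinNode call made the join procedure return
an error instead of retrying. This caused nodes to get stuck if anything
on the network side was not working perfectly. Additionally, the call
races the network runnable itself, making such failures even more likely.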

Bug: 128
Change-Id: I8c6847d6fb22a4527ca58def02cd5e994bd3dfff
Reviewed-on: https://review.monogon.dev/c/monogon/+/777
Tested-by: Jenkins CI
Reviewed-by: Mateusz Zalega <mateusz@monogon.tech>
diff --git a/metropolis/node/core/cluster/cluster_join.go b/metropolis/node/core/cluster/cluster_join.go
index 0cb68bb..3349fb7 100644
--- a/metropolis/node/core/cluster/cluster_join.go
+++ b/metropolis/node/core/cluster/cluster_join.go
@@ -6,6 +6,7 @@
 	"crypto/x509"
 	"encoding/hex"
 	"fmt"
+	"time"
 
 	"google.golang.org/grpc"
 
@@ -53,9 +54,14 @@
 
 	// Join the cluster and use the newly obtained CUK to mount the data
 	// partition.
-	jr, err := cur.JoinNode(ctx, &ipb.JoinNodeRequest{})
-	if err != nil {
-		return fmt.Errorf("join call failed: %w", err)
+	var jr *ipb.JoinNodeResponse
+	for {
+		jr, err = cur.JoinNode(ctx, &ipb.JoinNodeRequest{})
+		if err == nil {
+			break
+		}
+		supervisor.Logger(ctx).Warningf("JoinNode call failed, retrying: %v", err)
+		time.Sleep(time.Second)
 	}
 	if err := m.storageRoot.Data.MountExisting(sc, jr.ClusterUnlockKey); err != nil {
 		return fmt.Errorf("while mounting Data: %w", err)