m/n/kubernetes: run reconciler before starting more services

This makes sure we have successfully run the reconciler at least once
before attempting to run anything other than the apiserver. It saves us
from a whole bunch of services complaining about not (yet) having the
right permissions to access the cluster.

Change-Id: I605eae9d6bbcc16a9dcb971caa26ee56a06e5d5b
Reviewed-on: https://review.monogon.dev/c/monogon/+/1358
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/node/kubernetes/reconciler/reconciler.go b/metropolis/node/kubernetes/reconciler/reconciler.go
index 1828060..dfbb42d 100644
--- a/metropolis/node/kubernetes/reconciler/reconciler.go
+++ b/metropolis/node/kubernetes/reconciler/reconciler.go
@@ -126,24 +126,30 @@
 	}
 }
 
-func Run(clientSet kubernetes.Interface) supervisor.Runnable {
+func ReconcileAll(ctx context.Context, clientSet kubernetes.Interface) error {
+	resources := allResources(clientSet)
+	for name, resource := range resources {
+		err := reconcile(ctx, resource)
+		if err != nil {
+			return fmt.Errorf("resource %s: %w", name, err)
+		}
+	}
+	return nil
+}
+
+func Maintain(clientSet kubernetes.Interface) supervisor.Runnable {
 	return func(ctx context.Context) error {
 		log := supervisor.Logger(ctx)
-		resources := allResources(clientSet)
-		t := time.NewTicker(10 * time.Second)
-		reconcileAll := func() {
-			for name, resource := range resources {
-				if err := reconcile(ctx, resource); err != nil {
-					log.Warningf("Failed to reconcile built-in resources %s: %v", name, err)
-				}
-			}
-		}
 		supervisor.Signal(ctx, supervisor.SignalHealthy)
-		reconcileAll()
+		t := time.NewTicker(10 * time.Second)
+		defer t.Stop()
 		for {
 			select {
 			case <-t.C:
-				reconcileAll()
+				err := ReconcileAll(ctx, clientSet)
+				if err != nil {
+					log.Warning(err)
+				}
 			case <-ctx.Done():
 				return nil
 			}
diff --git a/metropolis/node/kubernetes/service.go b/metropolis/node/kubernetes/service.go
index ff0f55c..e989507 100644
--- a/metropolis/node/kubernetes/service.go
+++ b/metropolis/node/kubernetes/service.go
@@ -96,6 +96,7 @@
 	// Sub-runnable which starts all parts of Kubernetes that depend on the
 	// machine's external IP address. If it changes, the runnable will exit.
 	// TODO(q3k): test this
+	startKubelet := make(chan struct{})
 	supervisor.Run(ctx, "networked", func(ctx context.Context) error {
 		networkWatch := s.c.Network.Watch()
 		defer networkWatch.Close()
@@ -130,7 +131,10 @@
 
 		err := supervisor.RunGroup(ctx, map[string]supervisor.Runnable{
 			"apiserver": apiserver.Run,
-			"kubelet":   kubelet.Run,
+			"kubelet": func(ctx context.Context) error {
+				<-startKubelet
+				return kubelet.Run(ctx)
+			},
 		})
 		if err != nil {
 			return fmt.Errorf("when starting apiserver/kubelet: %w", err)
@@ -147,6 +151,25 @@
 		return fmt.Errorf("network configuration changed (%s -> %s)", address.String(), status.ExternalAddress.String())
 	})
 
+	// Before we start anything else, make sure reconciliation passes at least once.
+	// This makes the initial startup of a cluster much cleaner as we don't end up
+	// starting the scheduler/controller-manager/etc just to get them to immediately
+	// fail and back off with 'unauthorized'.
+	startLogging := time.Now().Add(2 * time.Second)
+	supervisor.Logger(ctx).Infof("Performing initial resource reconciliation...")
+	for {
+		err := reconciler.ReconcileAll(ctx, clientSet)
+		if err == nil {
+			supervisor.Logger(ctx).Infof("Initial resource reconciliation succeeded.")
+			close(startKubelet)
+			break
+		}
+		if time.Now().After(startLogging) {
+			supervisor.Logger(ctx).Errorf("Still couldn't do initial reconciliation: %v", err)
+		}
+		time.Sleep(100 * time.Millisecond)
+	}
+
 	csiPlugin := csiPluginServer{
 		KubeletDirectory: &s.c.Root.Data.Kubernetes.Kubelet,
 		VolumesDirectory: &s.c.Root.Data.Volumes,
@@ -187,7 +210,7 @@
 	}{
 		{"controller-manager", runControllerManager(*controllerManagerConfig)},
 		{"scheduler", runScheduler(*schedulerConfig)},
-		{"reconciler", reconciler.Run(clientSet)},
+		{"reconciler", reconciler.Maintain(clientSet)},
 		{"csi-plugin", csiPlugin.Run},
 		{"csi-provisioner", csiProvisioner.Run},
 		{"clusternet", clusternet.Run},