m/n/kubernetes: run reconciler before starting more services
This makes sure we successfully ran the reconciler at least once before
attempting to run anything more than the apiserver. It saves us from a
whole bunch of services complaining about not (yet) having the right
permissions to access the cluster.
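
Concretely, runnables that need cluster access block on a channel that is
only closed once the first reconciliation pass succeeds. A minimal,
self-contained sketch of the gating pattern (a standalone demo with
hypothetical stand-ins; the real wiring is in service.go below):

    package main

    import (
        "errors"
        "fmt"
        "time"
    )

    func main() {
        startKubelet := make(chan struct{})
        done := make(chan struct{})

        // Gated runnable: blocks until the gate is opened.
        go func() {
            <-startKubelet
            fmt.Println("kubelet starting")
            close(done)
        }()

        // Stand-in for reconciler.ReconcileAll: fails twice, then passes.
        attempts := 0
        reconcileAll := func() error {
            if attempts++; attempts < 3 {
                return errors.New("unauthorized")
            }
            return nil
        }

        // Retry until the first pass succeeds, then open the gate.
        for reconcileAll() != nil {
            time.Sleep(100 * time.Millisecond)
        }
        close(startKubelet)
        <-done
    }
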
Change-Id: I605eae9d6bbcc16a9dcb971caa26ee56a06e5d5b
Reviewed-on: https://review.monogon.dev/c/monogon/+/1358
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/node/kubernetes/reconciler/reconciler.go b/metropolis/node/kubernetes/reconciler/reconciler.go
index 1828060..dfbb42d 100644
--- a/metropolis/node/kubernetes/reconciler/reconciler.go
+++ b/metropolis/node/kubernetes/reconciler/reconciler.go
@@ -126,24 +126,30 @@
}
}
-func Run(clientSet kubernetes.Interface) supervisor.Runnable {
+func ReconcileAll(ctx context.Context, clientSet kubernetes.Interface) error {
+ resources := allResources(clientSet)
+ for name, resource := range resources {
+ err := reconcile(ctx, resource)
+ if err != nil {
+ return fmt.Errorf("resource %s: %w", name, err)
+ }
+ }
+ return nil
+}
+
+func Maintain(clientSet kubernetes.Interface) supervisor.Runnable {
return func(ctx context.Context) error {
log := supervisor.Logger(ctx)
- resources := allResources(clientSet)
- t := time.NewTicker(10 * time.Second)
- reconcileAll := func() {
- for name, resource := range resources {
- if err := reconcile(ctx, resource); err != nil {
- log.Warningf("Failed to reconcile built-in resources %s: %v", name, err)
- }
- }
- }
supervisor.Signal(ctx, supervisor.SignalHealthy)
- reconcileAll()
+ t := time.NewTicker(10 * time.Second)
+ defer t.Stop()
for {
select {
case <-t.C:
- reconcileAll()
+ err := ReconcileAll(ctx, clientSet)
+ if err != nil {
+ log.Warning(err)
+ }
case <-ctx.Done():
return nil
}
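
Exposing reconciliation as a plain function also makes it easy to exercise
in isolation, for example against client-go's fake clientset. A sketch of
such a test (the fake package usage and the import path are assumptions,
not part of this change):

    package reconciler_test

    import (
        "context"
        "testing"

        "k8s.io/client-go/kubernetes/fake"

        "source.monogon.dev/metropolis/node/kubernetes/reconciler"
    )

    // A single pass against an empty fake apiserver should create every
    // built-in resource without error.
    func TestReconcileAllOnce(t *testing.T) {
        clientSet := fake.NewSimpleClientset()
        if err := reconciler.ReconcileAll(context.Background(), clientSet); err != nil {
            t.Fatalf("ReconcileAll: %v", err)
        }
    }
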
diff --git a/metropolis/node/kubernetes/service.go b/metropolis/node/kubernetes/service.go
index ff0f55c..e989507 100644
--- a/metropolis/node/kubernetes/service.go
+++ b/metropolis/node/kubernetes/service.go
@@ -96,6 +96,7 @@
// Sub-runnable which starts all parts of Kubernetes that depend on the
// machine's external IP address. If it changes, the runnable will exit.
// TODO(q3k): test this
+ startKubelet := make(chan struct{})
supervisor.Run(ctx, "networked", func(ctx context.Context) error {
networkWatch := s.c.Network.Watch()
defer networkWatch.Close()
@@ -130,7 +131,10 @@
err := supervisor.RunGroup(ctx, map[string]supervisor.Runnable{
"apiserver": apiserver.Run,
- "kubelet": kubelet.Run,
+ "kubelet": func(ctx context.Context) error {
+ <-startKubelet
+ return kubelet.Run(ctx)
+ },
})
if err != nil {
return fmt.Errorf("when starting apiserver/kubelet: %w", err)
@@ -147,6 +151,25 @@
return fmt.Errorf("network configuration changed (%s -> %s)", address.String(), status.ExternalAddress.String())
})
+ // Before we start anything else, make sure reconciliation passes at least once.
+ // This makes the initial startup of a cluster much cleaner as we don't end up
+ // starting the scheduler/controller-manager/etc just to get them to immediately
+ // fail and back off with 'unauthorized'.
+ startLogging := time.Now().Add(2 * time.Second)
+ supervisor.Logger(ctx).Infof("Performing initial resource reconciliation...")
+ for {
+ err := reconciler.ReconcileAll(ctx, clientSet)
+ if err == nil {
+ supervisor.Logger(ctx).Infof("Initial resource reconciliation succeeded.")
+ close(startKubelet)
+ break
+ }
+ if time.Now().After(startLogging) {
+ supervisor.Logger(ctx).Errorf("Still couldn't do initial reconciliation: %v", err)
+ }
+ time.Sleep(100 * time.Millisecond)
+ }
+
csiPlugin := csiPluginServer{
KubeletDirectory: &s.c.Root.Data.Kubernetes.Kubelet,
VolumesDirectory: &s.c.Root.Data.Volumes,
@@ -187,7 +210,7 @@
}{
{"controller-manager", runControllerManager(*controllerManagerConfig)},
{"scheduler", runScheduler(*schedulerConfig)},
- {"reconciler", reconciler.Run(clientSet)},
+ {"reconciler", reconciler.Maintain(clientSet)},
{"csi-plugin", csiPlugin.Run},
{"csi-provisioner", csiProvisioner.Run},
{"clusternet", clusternet.Run},