m/n/core/net/hostsfile: do not stomp over cluster directory
If we join a cluster after reboot, we already have a cluster directory
on the ESP. We should not write over it with an empty one, but instead
wait until we've received a recent copy of it from the cluster.
Fixes https://github.com/monogon-dev/monogon/issues/228
Change-Id: Ibbfa23009eaa9feb99a332ac0c5e17dd89aea7bf
Reviewed-on: https://review.monogon.dev/c/monogon/+/1846
Tested-by: Jenkins CI
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/metropolis/node/core/network/hostsfile/hostsfile.go b/metropolis/node/core/network/hostsfile/hostsfile.go
index 631f7bf..942726d 100644
--- a/metropolis/node/core/network/hostsfile/hostsfile.go
+++ b/metropolis/node/core/network/hostsfile/hostsfile.go
@@ -131,7 +131,10 @@
 }
 
 func (s *Service) Run(ctx context.Context) error {
-	s.ClusterDirectorySaved.Set(false)
+	// Let other components know whether a cluster directory has already been
+	// persisted.
+	exists, _ := s.ESP.Metropolis.ClusterDirectory.Exists()
+	s.ClusterDirectorySaved.Set(exists)
 
 	localC := make(chan *network.Status)
 	s.clusterC = make(chan nodeMap)
@@ -159,6 +162,12 @@
 	}
 
 	supervisor.Signal(ctx, supervisor.SignalHealthy)
+
+	// Keep track of whether we have received data from the cluster, and only persist
+	// the cluster directory once we have. Otherwise we risk overwriting an existing
+	// cluster directory with an empty one or one containing only this node.
+	haveRemoteData := false
+
 	// Update nodeMap in a loop, issuing writes/updates when any change occurred.
 	for {
 		changed := false
@@ -209,6 +218,7 @@
 				nodes[id] = info
 				changed = true
 			}
+			haveRemoteData = true
 		}
 
 		if !changed {
@@ -226,17 +236,19 @@
 		}
 
 		// Update this node's ClusterDirectory.
-		supervisor.Logger(ctx).Info("Updating ClusterDirectory.")
-		cd := nodes.clusterDirectory(ctx)
-		cdirRaw, err := proto.Marshal(cd)
-		if err != nil {
-			return fmt.Errorf("couldn't marshal ClusterDirectory: %w", err)
+		if haveRemoteData {
+			supervisor.Logger(ctx).Info("Updating ClusterDirectory.")
+			cd := nodes.clusterDirectory(ctx)
+			cdirRaw, err := proto.Marshal(cd)
+			if err != nil {
+				return fmt.Errorf("couldn't marshal ClusterDirectory: %w", err)
+			}
+			if err = s.ESP.Metropolis.ClusterDirectory.Write(cdirRaw, 0644); err != nil {
+				return err
+			}
+			unix.Sync()
+			s.ClusterDirectorySaved.Set(true)
 		}
-		if err = s.ESP.Metropolis.ClusterDirectory.Write(cdirRaw, 0644); err != nil {
-			return err
-		}
-		unix.Sync()
-		s.ClusterDirectorySaved.Set(true)
 	}
 }