m/n/core/net/hostsfile: do not stomp over cluster directory
If we rejoin a cluster after a reboot, we already have a cluster directory
on the ESP. We should not overwrite it with an empty one (or one that only
contains this node), but instead wait until we've received a recent copy of
it from the cluster.
Fixes https://github.com/monogon-dev/monogon/issues/228
Change-Id: Ibbfa23009eaa9feb99a332ac0c5e17dd89aea7bf
Reviewed-on: https://review.monogon.dev/c/monogon/+/1846
Tested-by: Jenkins CI
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/metropolis/node/core/network/hostsfile/hostsfile.go b/metropolis/node/core/network/hostsfile/hostsfile.go
index 631f7bf..942726d 100644
--- a/metropolis/node/core/network/hostsfile/hostsfile.go
+++ b/metropolis/node/core/network/hostsfile/hostsfile.go
@@ -131,7 +131,10 @@
}
func (s *Service) Run(ctx context.Context) error {
- s.ClusterDirectorySaved.Set(false)
+ // Let other components know whether a cluster directory has already been
+ // persisted.
+ exists, _ := s.ESP.Metropolis.ClusterDirectory.Exists()
+ s.ClusterDirectorySaved.Set(exists)
localC := make(chan *network.Status)
s.clusterC = make(chan nodeMap)
@@ -159,6 +162,12 @@
}
supervisor.Signal(ctx, supervisor.SignalHealthy)
+
+ // Keep note of whether we have received data from the cluster, and only persist
+ // the cluster directory once we have. Otherwise we risk overwriting an already
+ // existing cluster directory with an empty one or one just containing this node.
+ haveRemoteData := false
+
// Update nodeMap in a loop, issuing writes/updates when any change occurred.
for {
changed := false
@@ -209,6 +218,7 @@
nodes[id] = info
changed = true
}
+ haveRemoteData = true
}
if !changed {
@@ -226,17 +236,19 @@
}
// Update this node's ClusterDirectory.
- supervisor.Logger(ctx).Info("Updating ClusterDirectory.")
- cd := nodes.clusterDirectory(ctx)
- cdirRaw, err := proto.Marshal(cd)
- if err != nil {
- return fmt.Errorf("couldn't marshal ClusterDirectory: %w", err)
+ if haveRemoteData {
+ supervisor.Logger(ctx).Info("Updating ClusterDirectory.")
+ cd := nodes.clusterDirectory(ctx)
+ cdirRaw, err := proto.Marshal(cd)
+ if err != nil {
+ return fmt.Errorf("couldn't marshal ClusterDirectory: %w", err)
+ }
+ if err = s.ESP.Metropolis.ClusterDirectory.Write(cdirRaw, 0644); err != nil {
+ return err
+ }
+ unix.Sync()
+ s.ClusterDirectorySaved.Set(true)
}
- if err = s.ESP.Metropolis.ClusterDirectory.Write(cdirRaw, 0644); err != nil {
- return err
- }
- unix.Sync()
- s.ClusterDirectorySaved.Set(true)
}
}