m/n/core/n/hostsfile: persist and load node names to/from Cluster Directory

This change persists node IDs alongside their addresses to the ESP
ClusterDirectory, and loads them up on startup to pre-populate
/etc/hosts.

This is important to bring up clusters from a full cold shutdown, as
these name/address mappings are needed by etcd to connect to other
nodes (etcd connects to other members over metropolis node names, and
expects to be able to resolve them to actual IP addresses).

This didn't affect rolling restarts as other nodes would connect to a
newly started up node. But if all nodes are down, this is needed to
actually run.

Change-Id: Ifa944bb231909983af2fcb9418a2769e7af65509
Reviewed-on: https://review.monogon.dev/c/monogon/+/2989
Tested-by: Jenkins CI
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/metropolis/node/core/network/hostsfile/hostsfile.go b/metropolis/node/core/network/hostsfile/hostsfile.go
index 132f71c..c09ea29 100644
--- a/metropolis/node/core/network/hostsfile/hostsfile.go
+++ b/metropolis/node/core/network/hostsfile/hostsfile.go
@@ -27,12 +27,13 @@
 	"google.golang.org/grpc"
 	"google.golang.org/protobuf/proto"
 
-	ipb "source.monogon.dev/metropolis/node/core/curator/proto/api"
 	"source.monogon.dev/metropolis/node/core/curator/watcher"
 	"source.monogon.dev/metropolis/node/core/localstorage"
 	"source.monogon.dev/metropolis/node/core/network"
 	"source.monogon.dev/metropolis/pkg/event"
 	"source.monogon.dev/metropolis/pkg/supervisor"
+
+	ipb "source.monogon.dev/metropolis/node/core/curator/proto/api"
 	cpb "source.monogon.dev/metropolis/proto/common"
 )
 
@@ -122,7 +123,7 @@
 // is empty, an empty ClusterDirectory is returned.
 func (m nodeMap) clusterDirectory(ctx context.Context) *cpb.ClusterDirectory {
 	var directory cpb.ClusterDirectory
-	for _, ni := range m {
+	for nid, ni := range m {
 		if !ni.controlPlane {
 			continue
 		}
@@ -131,6 +132,7 @@
 			{Host: ni.address},
 		}
 		node := &cpb.ClusterDirectory_Node{
+			Id:        nid,
 			Addresses: addresses,
 		}
 		directory.Nodes = append(directory.Nodes, node)
@@ -144,6 +146,33 @@
 	exists, _ := s.ESP.Metropolis.ClusterDirectory.Exists()
 	s.ClusterDirectorySaved.Set(exists)
 
+	nodes := make(nodeMap)
+	if exists {
+		supervisor.Logger(ctx).Infof("Saved cluster directory present, restoring host data...")
+		cd, err := s.ESP.Metropolis.ClusterDirectory.Unmarshal()
+		if err != nil {
+			supervisor.Logger(ctx).Errorf("Could not unmarshal saved cluster directory: %v", err)
+		} else {
+			for i, node := range cd.Nodes {
+				if len(node.Id) == 0 {
+					supervisor.Logger(ctx).Warningf("Node %d in cluster directory has no ID, skipping...", i)
+					continue
+				}
+				if len(node.Addresses) == 0 {
+					supervisor.Logger(ctx).Warningf("Node %d (%s) in cluster directory has no addresses, skipping...", i, node.Id)
+					continue
+				}
+				nodes[node.Id] = nodeInfo{
+					address:      node.Addresses[0].Host,
+					local:        false,
+					controlPlane: true,
+				}
+			}
+		}
+	} else {
+		supervisor.Logger(ctx).Infof("Saved cluster directory absent, not restoring any host data.")
+	}
+
 	localC := make(chan *network.Status)
 	s.clusterC = make(chan nodeMap)
 
@@ -162,9 +191,9 @@
 	if err := unix.Sethostname([]byte(s.NodeID)); err != nil {
 		return fmt.Errorf("failed to set runtime hostname: %w", err)
 	}
-	// Immediately write an /etc/hosts just containing localhost, even if we don't
-	// yet have a network address.
-	nodes := make(nodeMap)
+
+	// Immediately write an /etc/hosts just containing localhost and persisted
+	// cluster directory nodes, even if we don't yet have a network address.
 	if err := s.Ephemeral.Hosts.Write(nodes.hosts(ctx), 0644); err != nil {
 		return fmt.Errorf("failed to write %s: %w", s.Ephemeral.Hosts.FullPath(), err)
 	}