m/node/core: run hostsfile from roleserver, provide feedback on cluster directory
Withholding the node's heartbeat and status report until it has saved
a cluster directory to the ESP is a quick and dirty way to make sure
we don't mark a node as HEALTHY before it has performed the bare
minimum of setup needed to be rebootable.
This is important in our E2E tests to reduce flakiness.
In the future we should expose a dedicated node status field or a
general 'sync' API, but this will do for now.
Change-Id: Ibad9e91f01abeacdfe4400ef7cb36ca17f68ba0a
Reviewed-on: https://review.monogon.dev/c/monogon/+/1498
Tested-by: Jenkins CI
Reviewed-by: Leopold Schabel <leo@monogon.tech>
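
The mechanism described above boils down to a one-shot flag: something
sets it once the cluster directory has been written to the ESP, and the
status/heartbeat path blocks on it before reporting anything. Below is
a minimal, self-contained Go sketch of that gating pattern. It uses a
plain channel instead of the event/memory Value seen in the diff, and
all names (savedFlag, newSavedFlag, etc.) are illustrative stand-ins,
not Metropolis APIs.

    package main

    import (
        "context"
        "fmt"
        "sync"
    )

    // savedFlag is a one-shot boolean: Set closes the channel exactly
    // once, Wait blocks until Set has been called or the context is
    // cancelled.
    type savedFlag struct {
        once sync.Once
        ch   chan struct{}
    }

    func newSavedFlag() *savedFlag {
        return &savedFlag{ch: make(chan struct{})}
    }

    func (f *savedFlag) Set() { f.once.Do(func() { close(f.ch) }) }

    func (f *savedFlag) Wait(ctx context.Context) error {
        select {
        case <-f.ch:
            return nil
        case <-ctx.Done():
            return ctx.Err()
        }
    }

    func main() {
        ctx := context.Background()
        flag := newSavedFlag()

        var wg sync.WaitGroup
        wg.Add(1)
        go func() {
            defer wg.Done()
            // Stand-in for the status push worker: report nothing
            // until the cluster directory has been persisted.
            if err := flag.Wait(ctx); err != nil {
                return
            }
            fmt.Println("cluster directory saved; pushing node status")
        }()

        // Stand-in for the code path that persists the cluster
        // directory and then flips the flag (cf. the diff's
        // clusterDirectorySaved.Set(true)).
        flag.Set()
        wg.Wait()
    }
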
diff --git a/metropolis/node/core/roleserve/roleserve.go b/metropolis/node/core/roleserve/roleserve.go
index 86b5b5a..a665044 100644
--- a/metropolis/node/core/roleserve/roleserve.go
+++ b/metropolis/node/core/roleserve/roleserve.go
@@ -80,11 +80,12 @@
type Service struct {
Config
- ClusterMembership memory.Value[*ClusterMembership]
- KubernetesStatus memory.Value[*KubernetesStatus]
- bootstrapData memory.Value[*bootstrapData]
- localRoles memory.Value[*cpb.NodeRoles]
- podNetwork memory.Value[*clusternet.Prefixes]
+ ClusterMembership memory.Value[*ClusterMembership]
+ KubernetesStatus memory.Value[*KubernetesStatus]
+ bootstrapData memory.Value[*bootstrapData]
+ localRoles memory.Value[*cpb.NodeRoles]
+ podNetwork memory.Value[*clusternet.Prefixes]
+ clusterDirectorySaved memory.Value[bool]
controlPlane *workerControlPlane
statusPush *workerStatusPush
@@ -93,6 +94,7 @@
rolefetch *workerRoleFetch
nodeMgmt *workerNodeMgmt
clusternet *workerClusternet
+ hostsfile *workerHostsfile
}
// New creates a Role Server service from a Config.
@@ -112,7 +114,8 @@
s.statusPush = &workerStatusPush{
network: s.Network,
- clusterMembership: &s.ClusterMembership,
+ clusterMembership: &s.ClusterMembership,
+ clusterDirectorySaved: &s.clusterDirectorySaved,
}
s.heartbeat = &workerHeartbeat{
@@ -142,6 +145,7 @@
clusterMembership: &s.ClusterMembership,
logTree: s.LogTree,
}
+
s.clusternet = &workerClusternet{
storageRoot: s.StorageRoot,
@@ -149,6 +153,13 @@
podNetwork: &s.podNetwork,
}
+ s.hostsfile = &workerHostsfile{
+ storageRoot: s.StorageRoot,
+ network: s.Network,
+ clusterMembership: &s.ClusterMembership,
+ clusterDirectorySaved: &s.clusterDirectorySaved,
+ }
+
return s
}
@@ -197,6 +208,7 @@
pubkey: credentials.PublicKey(),
resolver: s.Resolver,
})
+ s.clusterDirectorySaved.Set(true)
}
// Run the Role Server service, which uses intermediary workload launchers to
@@ -209,6 +221,7 @@
supervisor.Run(ctx, "rolefetch", s.rolefetch.run)
supervisor.Run(ctx, "nodemgmt", s.nodeMgmt.run)
supervisor.Run(ctx, "clusternet", s.clusternet.run)
+ supervisor.Run(ctx, "hostsfile", s.hostsfile.run)
supervisor.Signal(ctx, supervisor.SignalHealthy)
<-ctx.Done()
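
For context on the other new worker: workerHostsfile presumably keeps
the node's hosts file in sync with cluster membership (and, given the
plumbing above, participates in persisting the cluster directory). As a
rough illustration, here is a self-contained sketch of what such a
worker's inner loop amounts to: consume membership updates and
re-render a hosts file. The node struct, the update channel, and
renderHosts are hypothetical stand-ins for the real ClusterMembership
plumbing; the real worker would persist the file under its storage
root rather than print it.

    package main

    import (
        "context"
        "fmt"
        "sort"
        "strings"
    )

    // node is a hypothetical stand-in for a cluster member as exposed
    // by ClusterMembership.
    type node struct {
        ID      string
        Address string
    }

    // renderHosts builds hosts(5)-style content from a set of nodes,
    // sorted by ID for deterministic output.
    func renderHosts(nodes []node) string {
        sorted := append([]node(nil), nodes...)
        sort.Slice(sorted, func(i, j int) bool {
            return sorted[i].ID < sorted[j].ID
        })
        var b strings.Builder
        for _, n := range sorted {
            fmt.Fprintf(&b, "%s\t%s\n", n.Address, n.ID)
        }
        return b.String()
    }

    // runHostsfile consumes membership updates until the channel is
    // closed or the context is cancelled, printing each rendered file
    // where the real worker would write it out.
    func runHostsfile(ctx context.Context, updates <-chan []node) {
        for {
            select {
            case <-ctx.Done():
                return
            case nodes, ok := <-updates:
                if !ok {
                    return
                }
                fmt.Print(renderHosts(nodes))
            }
        }
    }

    func main() {
        updates := make(chan []node, 2)
        updates <- []node{
            {ID: "metropolis-abcdef", Address: "10.0.0.2"},
            {ID: "metropolis-123456", Address: "10.0.0.3"},
        }
        close(updates)
        runHostsfile(context.Background(), updates)
    }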