metropolis: add boot IDs to status
This allows for precisely determining if a kernel restart has occurred.
Useful for making tests more accurate and relying less on sleeps.
Closes: #357
Change-Id: Ic215b5db841b29b3a3c622333a05be6c35cc6ded
Reviewed-on: https://review.monogon.dev/c/monogon/+/3477
Tested-by: Jenkins CI
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
diff --git a/metropolis/node/core/roleserve/BUILD.bazel b/metropolis/node/core/roleserve/BUILD.bazel
index f5e140a..1fb4719 100644
--- a/metropolis/node/core/roleserve/BUILD.bazel
+++ b/metropolis/node/core/roleserve/BUILD.bazel
@@ -45,6 +45,7 @@
"//osbase/net/dns",
"//osbase/pki",
"//osbase/supervisor",
+ "@com_github_google_uuid//:uuid",
"@org_golang_google_grpc//:grpc",
"@org_golang_google_protobuf//encoding/prototext",
"@org_golang_google_protobuf//proto",
diff --git a/metropolis/node/core/roleserve/worker_statuspush.go b/metropolis/node/core/roleserve/worker_statuspush.go
index c6987c3..2fd7b33 100644
--- a/metropolis/node/core/roleserve/worker_statuspush.go
+++ b/metropolis/node/core/roleserve/worker_statuspush.go
@@ -1,9 +1,12 @@
package roleserve
import (
+ "bytes"
"context"
"fmt"
+ "os"
+ "github.com/google/uuid"
"google.golang.org/protobuf/encoding/prototext"
common "source.monogon.dev/metropolis/node"
@@ -39,12 +42,29 @@
curatorConnection chan *CuratorConnection
}
+// getBootID reads the kernel boot ID from /proc/sys/kernel/random/boot_id and
+// returns it as the 16 raw bytes of the parsed UUID. If the file cannot be read
+// or its content does not parse as a UUID, the failure is logged and nil is
+// returned, so callers must treat the boot ID as optional.
+//
+// It is defined as a var to make it overridable from tests.
+var getBootID = func(ctx context.Context) []byte {
+	bootIDRaw, err := os.ReadFile("/proc/sys/kernel/random/boot_id")
+	if err != nil {
+		supervisor.Logger(ctx).Errorf("Reading boot_id failed, not available: %v", err)
+		return nil
+	}
+	bootID, err := uuid.ParseBytes(bytes.TrimSpace(bootIDRaw))
+	if err != nil {
+		// %q prints the raw file content as a readable quoted string; %v on a
+		// []byte would render a list of decimal byte values instead.
+		supervisor.Logger(ctx).Errorf("Parsing boot_id value %q failed, not available: %v", bootIDRaw, err)
+		return nil
+	}
+	return bootID[:]
+}
+
// workerStatusPushLoop runs the main loop acting on data received from
// workerStatusPushChannels.
func workerStatusPushLoop(ctx context.Context, chans *workerStatusPushChannels) error {
status := cpb.NodeStatus{
Version: version.Version,
+ BootId: getBootID(ctx),
}
+
var cur ipb.CuratorClient
var nodeID string
diff --git a/metropolis/node/core/roleserve/worker_statuspush_test.go b/metropolis/node/core/roleserve/worker_statuspush_test.go
index 760d97a..2e6024f 100644
--- a/metropolis/node/core/roleserve/worker_statuspush_test.go
+++ b/metropolis/node/core/roleserve/worker_statuspush_test.go
@@ -63,9 +63,26 @@
}
}
+// TestGetBootID checks that getBootID returns a 16-byte boot ID which is not
+// all zeroes.
+//
+// NOTE(review): this calls whatever getBootID currently points to, so it
+// presumably relies on running before any test that overrides the variable
+// without restoring it - confirm test ordering.
+func TestGetBootID(t *testing.T) {
+	bootID := getBootID(context.Background())
+	// Stop on a wrong length: a nil or empty ID would otherwise fall through
+	// the loop below and additionally be reported as all-zeroes.
+	if len(bootID) != 16 {
+		t.Fatalf("Bad boot ID: %x", bootID)
+	}
+	for _, b := range bootID {
+		if b != 0 {
+			return
+		}
+	}
+	t.Error("Boot ID is all-zeroes")
+}
+
// TestWorkerStatusPush ensures that the status push worker main loop behaves as
// expected. It does not exercise the 'map' runnables.
func TestWorkerStatusPush(t *testing.T) {
+ // Override getBootID to make it deterministic
+ getBootID = func(ctx context.Context) []byte {
+ return []byte{1, 2, 3}
+ }
chans := workerStatusPushChannels{
address: make(chan string),
localControlPlane: make(chan *localControlPlane),
@@ -116,6 +133,7 @@
{NodeId: nodeID, Status: &cpb.NodeStatus{
ExternalAddress: "192.0.2.10",
Version: mversion.Version,
+ BootId: []byte{1, 2, 3},
}},
})
@@ -126,10 +144,12 @@
{NodeId: nodeID, Status: &cpb.NodeStatus{
ExternalAddress: "192.0.2.10",
Version: mversion.Version,
+ BootId: []byte{1, 2, 3},
}},
{NodeId: nodeID, Status: &cpb.NodeStatus{
ExternalAddress: "192.0.2.11",
Version: mversion.Version,
+ BootId: []byte{1, 2, 3},
}},
})
@@ -146,10 +166,12 @@
{NodeId: nodeID, Status: &cpb.NodeStatus{
ExternalAddress: "192.0.2.10",
Version: mversion.Version,
+ BootId: []byte{1, 2, 3},
}},
{NodeId: nodeID, Status: &cpb.NodeStatus{
ExternalAddress: "192.0.2.11",
Version: mversion.Version,
+ BootId: []byte{1, 2, 3},
}},
{NodeId: nodeID, Status: &cpb.NodeStatus{
ExternalAddress: "192.0.2.11",
@@ -157,10 +179,12 @@
Port: int32(common.CuratorServicePort),
},
Version: mversion.Version,
+ BootId: []byte{1, 2, 3},
}},
{NodeId: nodeID, Status: &cpb.NodeStatus{
ExternalAddress: "192.0.2.11",
Version: mversion.Version,
+ BootId: []byte{1, 2, 3},
}},
})
}
diff --git a/metropolis/proto/common/common.proto b/metropolis/proto/common/common.proto
index 2f23e08..d922a56 100644
--- a/metropolis/proto/common/common.proto
+++ b/metropolis/proto/common/common.proto
@@ -182,6 +182,9 @@
google.protobuf.Timestamp timestamp = 2;
// version is the Metropolis version that this node is running.
version.spec.Version version = 4;
+ // boot_id is a random value chosen for each kernel start.
+ // If this value changes, a new kernel instance is running on the node.
+ // It may be empty if the node was unable to determine its boot ID.
+ bytes boot_id = 5;
}
// The Cluster Directory is information about the network addressing of nodes