metropolis: add boot IDs to status

This allows for precisely determining whether a kernel restart has occurred,
which is useful for making tests more accurate and less reliant on sleeps.

Closes: #357
Change-Id: Ic215b5db841b29b3a3c622333a05be6c35cc6ded
Reviewed-on: https://review.monogon.dev/c/monogon/+/3477
Tested-by: Jenkins CI
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
diff --git a/metropolis/node/core/roleserve/BUILD.bazel b/metropolis/node/core/roleserve/BUILD.bazel
index f5e140a..1fb4719 100644
--- a/metropolis/node/core/roleserve/BUILD.bazel
+++ b/metropolis/node/core/roleserve/BUILD.bazel
@@ -45,6 +45,7 @@
         "//osbase/net/dns",
         "//osbase/pki",
         "//osbase/supervisor",
+        "@com_github_google_uuid//:uuid",
         "@org_golang_google_grpc//:grpc",
         "@org_golang_google_protobuf//encoding/prototext",
         "@org_golang_google_protobuf//proto",
diff --git a/metropolis/node/core/roleserve/worker_statuspush.go b/metropolis/node/core/roleserve/worker_statuspush.go
index c6987c3..2fd7b33 100644
--- a/metropolis/node/core/roleserve/worker_statuspush.go
+++ b/metropolis/node/core/roleserve/worker_statuspush.go
@@ -1,9 +1,12 @@
 package roleserve
 
 import (
+	"bytes"
 	"context"
 	"fmt"
+	"os"
 
+	"github.com/google/uuid"
 	"google.golang.org/protobuf/encoding/prototext"
 
 	common "source.monogon.dev/metropolis/node"
@@ -39,12 +42,29 @@
 	curatorConnection chan *CuratorConnection
 }
 
+// getBootID reads the kernel boot ID (nil on failure); a var so tests can stub it.
+var getBootID = func(ctx context.Context) []byte {
+	bootIDRaw, err := os.ReadFile("/proc/sys/kernel/random/boot_id")
+	if err != nil {
+		supervisor.Logger(ctx).Errorf("Reading boot_id failed, not available: %v", err)
+		return nil // failure is only logged; the caller gets no boot ID
+	}
+	bootID, err := uuid.ParseBytes(bytes.TrimSpace(bootIDRaw)) // strip surrounding whitespace before parsing
+	if err != nil {
+		supervisor.Logger(ctx).Errorf("Parsing boot_id value %q failed, not available: %v", bootIDRaw, err)
+		return nil
+	}
+	return bootID[:] // raw bytes of the parsed UUID
+}
+
 // workerStatusPushLoop runs the main loop acting on data received from
 // workerStatusPushChannels.
 func workerStatusPushLoop(ctx context.Context, chans *workerStatusPushChannels) error {
 	status := cpb.NodeStatus{
 		Version: version.Version,
+		BootId:  getBootID(ctx),
 	}
+
 	var cur ipb.CuratorClient
 	var nodeID string
 
diff --git a/metropolis/node/core/roleserve/worker_statuspush_test.go b/metropolis/node/core/roleserve/worker_statuspush_test.go
index 760d97a..2e6024f 100644
--- a/metropolis/node/core/roleserve/worker_statuspush_test.go
+++ b/metropolis/node/core/roleserve/worker_statuspush_test.go
@@ -63,9 +63,26 @@
 	}
 }
 
+func TestGetBootID(t *testing.T) { // exercises the real getBootID of the test host
+	bootID := getBootID(context.Background())
+	if len(bootID) != 16 {
+		t.Fatalf("Bad boot ID: %x", bootID) // stop here: the all-zeroes check below would misreport a nil ID
+	}
+	for _, b := range bootID {
+		if b != 0 {
+			return // any non-zero byte means the ID is not all-zeroes
+		}
+	}
+	t.Error("Boot ID is all-zeroes")
+}
+
 // TestWorkerStatusPush ensures that the status push worker main loop behaves as
 // expected. It does not exercise the 'map' runnables.
 func TestWorkerStatusPush(t *testing.T) {
+	// Override getBootID to make it deterministic
+	getBootID = func(ctx context.Context) []byte {
+		return []byte{1, 2, 3}
+	}
 	chans := workerStatusPushChannels{
 		address:           make(chan string),
 		localControlPlane: make(chan *localControlPlane),
@@ -116,6 +133,7 @@
 		{NodeId: nodeID, Status: &cpb.NodeStatus{
 			ExternalAddress: "192.0.2.10",
 			Version:         mversion.Version,
+			BootId:          []byte{1, 2, 3},
 		}},
 	})
 
@@ -126,10 +144,12 @@
 		{NodeId: nodeID, Status: &cpb.NodeStatus{
 			ExternalAddress: "192.0.2.10",
 			Version:         mversion.Version,
+			BootId:          []byte{1, 2, 3},
 		}},
 		{NodeId: nodeID, Status: &cpb.NodeStatus{
 			ExternalAddress: "192.0.2.11",
 			Version:         mversion.Version,
+			BootId:          []byte{1, 2, 3},
 		}},
 	})
 
@@ -146,10 +166,12 @@
 		{NodeId: nodeID, Status: &cpb.NodeStatus{
 			ExternalAddress: "192.0.2.10",
 			Version:         mversion.Version,
+			BootId:          []byte{1, 2, 3},
 		}},
 		{NodeId: nodeID, Status: &cpb.NodeStatus{
 			ExternalAddress: "192.0.2.11",
 			Version:         mversion.Version,
+			BootId:          []byte{1, 2, 3},
 		}},
 		{NodeId: nodeID, Status: &cpb.NodeStatus{
 			ExternalAddress: "192.0.2.11",
@@ -157,10 +179,12 @@
 				Port: int32(common.CuratorServicePort),
 			},
 			Version: mversion.Version,
+			BootId:  []byte{1, 2, 3},
 		}},
 		{NodeId: nodeID, Status: &cpb.NodeStatus{
 			ExternalAddress: "192.0.2.11",
 			Version:         mversion.Version,
+			BootId:          []byte{1, 2, 3},
 		}},
 	})
 }