m/n/c/curator: add ID field to nodes in etcd
This makes the ID independent of the public key for nodes stored in
etcd. This is needed to eventually allow node key rotation.
We could just extract the ID from the key without adding an ID field.
But the consistency check between key and value has already caught a bug
once, so it seems worth keeping.
Change-Id: I7ba5904d37d54e93ad6dc7d4b6f0cfac19bc730d
Reviewed-on: https://review.monogon.dev/c/monogon/+/3475
Tested-by: Jenkins CI
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/metropolis/node/core/curator/BUILD.bazel b/metropolis/node/core/curator/BUILD.bazel
index eb6dbc8..1361c95 100644
--- a/metropolis/node/core/curator/BUILD.bazel
+++ b/metropolis/node/core/curator/BUILD.bazel
@@ -43,6 +43,7 @@
"@com_github_google_cel_go//checker/decls:go_default_library",
"@com_github_google_cel_go//common/types:go_default_library",
"@com_zx2c4_golang_wireguard_wgctrl//wgtypes",
+ "@io_etcd_go_etcd_api_v3//mvccpb",
"@io_etcd_go_etcd_client_v3//:client",
"@io_etcd_go_etcd_client_v3//concurrency",
"@org_golang_google_genproto_googleapis_api//expr/v1alpha1",
diff --git a/metropolis/node/core/curator/impl_follower.go b/metropolis/node/core/curator/impl_follower.go
index 2a7f45e..ac68e6d 100644
--- a/metropolis/node/core/curator/impl_follower.go
+++ b/metropolis/node/core/curator/impl_follower.go
@@ -56,7 +56,7 @@
rpc.Trace(ctx).Printf("could not get current leader's data: 0 kvs")
return status.Errorf(codes.Internal, "could not get current leader's data")
}
- node, err := nodeUnmarshal(res.Kvs[0].Value)
+ node, err := nodeUnmarshal(res.Kvs[0])
if err != nil {
rpc.Trace(ctx).Printf("could not unmarshal leader node %s: %v", lock.NodeId, err)
return status.Errorf(codes.Unavailable, "could not unmarshal leader node")
diff --git a/metropolis/node/core/curator/impl_leader_background.go b/metropolis/node/core/curator/impl_leader_background.go
index 1b07607..11c4134 100644
--- a/metropolis/node/core/curator/impl_leader_background.go
+++ b/metropolis/node/core/curator/impl_leader_background.go
@@ -82,7 +82,7 @@
isClusterNode := make(map[string]bool)
var clusterNodes []*Node
for _, kv := range res.Responses[0].GetResponseRange().Kvs {
- node, err := nodeUnmarshal(kv.Value)
+ node, err := nodeUnmarshal(kv)
if err != nil {
return fmt.Errorf("could not unmarshal node %q: %w", kv.Key, err)
}
diff --git a/metropolis/node/core/curator/impl_leader_curator.go b/metropolis/node/core/curator/impl_leader_curator.go
index d668dc4..cf88b37 100644
--- a/metropolis/node/core/curator/impl_leader_curator.go
+++ b/metropolis/node/core/curator/impl_leader_curator.go
@@ -10,6 +10,7 @@
"net"
"time"
+ "go.etcd.io/etcd/api/v3/mvccpb"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"google.golang.org/protobuf/proto"
@@ -174,7 +175,7 @@
id: NodeEtcdPrefix.ExtractID(string(key)),
}
if len(value) > 0 {
- node, err := nodeUnmarshal(value)
+ node, err := nodeUnmarshal(&mvccpb.KeyValue{Key: key, Value: value})
if err != nil {
return nil, err
}
diff --git a/metropolis/node/core/curator/impl_leader_management.go b/metropolis/node/core/curator/impl_leader_management.go
index 494271f..4321fbe 100644
--- a/metropolis/node/core/curator/impl_leader_management.go
+++ b/metropolis/node/core/curator/impl_leader_management.go
@@ -84,7 +84,7 @@
kvs := res.Responses[0].GetResponseRange().Kvs
var nodes []*Node
for _, kv := range kvs {
- node, err := nodeUnmarshal(kv.Value)
+ node, err := nodeUnmarshal(kv)
if err != nil {
rpc.Trace(ctx).Printf("Unmarshalling node %q failed: %v", kv.Value, err)
continue
@@ -205,7 +205,7 @@
// node.
kvs := res.Responses[0].GetResponseRange().Kvs
for _, kv := range kvs {
- node, err := nodeUnmarshal(kv.Value)
+ node, err := nodeUnmarshal(kv)
if err != nil {
rpc.Trace(ctx).Printf("Unmarshalling node %q failed: %v", kv.Value, err)
continue
diff --git a/metropolis/node/core/curator/proto/private/storage.proto b/metropolis/node/core/curator/proto/private/storage.proto
index 681bb0a..ab3e932 100644
--- a/metropolis/node/core/curator/proto/private/storage.proto
+++ b/metropolis/node/core/curator/proto/private/storage.proto
@@ -8,9 +8,13 @@
// Node describes a single node's state in etcd. This is only ever visible to
// the curator, and fully managed by the curator.
//
-// Serialized nodes are stored in /nodes/$id, where $id is the node's ID as
-// calculated from its public key.
+// Serialized nodes are stored in /nodes/$id, where $id is the node's ID.
message Node {
+ // ID of the node. Unique across all nodes. Opaque but human-readable.
+ // The ID is stored redundantly in etcd in both the key and the value,
+ // which allows to detect corruption where a node is stored under the
+ // wrong key.
+ string id = 10;
// The node's public key.
bytes public_key = 1;
// Node's individual cluster part of the data partition encryption key. It
diff --git a/metropolis/node/core/curator/state_node.go b/metropolis/node/core/curator/state_node.go
index e574f9d..52af3df 100644
--- a/metropolis/node/core/curator/state_node.go
+++ b/metropolis/node/core/curator/state_node.go
@@ -24,6 +24,7 @@
"net/netip"
"sort"
+ "go.etcd.io/etcd/api/v3/mvccpb"
clientv3 "go.etcd.io/etcd/client/v3"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
@@ -259,6 +260,7 @@
// etcd.
func (n *Node) proto() *ppb.Node {
msg := &ppb.Node{
+ Id: n.id,
ClusterUnlockKey: n.clusterUnlockKey,
PublicKey: n.pubkey,
JoinKey: n.jkey,
@@ -313,14 +315,26 @@
return msg
}
-func nodeUnmarshal(data []byte) (*Node, error) {
+func nodeUnmarshal(kv *mvccpb.KeyValue) (*Node, error) {
+ id := NodeEtcdPrefix.ExtractID(string(kv.Key))
+ if id == "" {
+ return nil, fmt.Errorf("invalid node key %q", kv.Key)
+ }
var msg ppb.Node
- if err := proto.Unmarshal(data, &msg); err != nil {
- return nil, fmt.Errorf("could not unmarshal proto: %w", err)
+ if err := proto.Unmarshal(kv.Value, &msg); err != nil {
+ return nil, fmt.Errorf("could not unmarshal proto of node %s: %w", id, err)
+ }
+ valueID := msg.Id
+ if valueID == "" {
+ // Backward compatibility
+ valueID = identity.NodeID(msg.PublicKey)
+ }
+ if id != valueID {
+ return nil, fmt.Errorf("node ID mismatch (etcd key: %q, value: %q)", id, valueID)
}
n := &Node{
clusterUnlockKey: msg.ClusterUnlockKey,
- id: identity.NodeID(msg.PublicKey),
+ id: id,
pubkey: msg.PublicKey,
jkey: msg.JoinKey,
state: msg.FsmState,
@@ -423,7 +437,7 @@
if len(kvs) != 1 {
return nil, errNodeNotFound
}
- node, err := nodeUnmarshal(kvs[0].Value)
+ node, err := nodeUnmarshal(kvs[0])
if err != nil {
rpc.Trace(ctx).Printf("could not unmarshal node: %v", err)
return nil, status.Errorf(codes.Unavailable, "could not unmarshal node")