Review comments for TPM attestation
Lots of comments and an updated boot test. Generously increase the timeout to eliminate random CI failures.
Test Plan: Boot test works
Bug: T499
X-Origin-Diff: phab/D319
GitOrigin-RevId: cf17fe7c599f670ff8b6f0ac60486f2a04f13a5a
diff --git a/core/api/api/schema.proto b/core/api/api/schema.proto
index 7b4bdaf..3f1b7e3 100644
--- a/core/api/api/schema.proto
+++ b/core/api/api/schema.proto
@@ -28,214 +28,316 @@
// The ClusterManagement service is used by an authenticated administrative user
// to manage node membership in an existing Smalltown cluster.
service ClusterManagement {
- // Add a node to the cluster, subject to successful remote attestation.
- rpc AddNode(AddNodeRequest) returns (AddNodeResponse) {}
+ // Add a node to the cluster, subject to successful remote attestation.
+ rpc AddNode (AddNodeRequest) returns (AddNodeResponse) {
+ }
- // Remove a node from the cluster.
- rpc RemoveNode(RemoveNodeRequest) returns (RemoveNodeRequest) {}
+ // Remove a node from the cluster.
+  rpc RemoveNode (RemoveNodeRequest) returns (RemoveNodeResponse) {
+ }
- // List all cluster nodes
- rpc ListNodes(ListNodesRequest) returns (ListNodesResponse) {}
+ // List all cluster nodes.
+ rpc ListNodes (ListNodesRequest) returns (ListNodesResponse) {
+ }
- // NewEnrolmentConfig generates a new enrolment config for adding new nodes to
- // the cluster
- rpc NewEnrolmentConfig(NewEnrolmentConfigRequest)
- returns (NewEnrolmentConfigResponse) {}
+ // NewEnrolmentConfig generates a new enrolment config for adding new nodes to
+ // the cluster.
+ rpc NewEnrolmentConfig (NewEnrolmentConfigRequest) returns (NewEnrolmentConfigResponse) {
+ }
- rpc ListEnrolmentConfigs(ListEnrolmentConfigsRequest)
- returns (ListEnrolmentConfigsResponse) {}
+ rpc ListEnrolmentConfigs (ListEnrolmentConfigsRequest) returns (ListEnrolmentConfigsResponse) {
+ }
- rpc RemoveEnrolmentConfig(RemoveEnrolmentConfigRequest)
- returns (RemoveEnrolmentConfigResponse) {}
+ rpc RemoveEnrolmentConfig (RemoveEnrolmentConfigRequest) returns (RemoveEnrolmentConfigResponse) {
+ }
}
-message NewEnrolmentConfigRequest { string name = 1; }
-message NewEnrolmentConfigResponse { EnrolmentConfig enrolment_config = 1; }
+message NewEnrolmentConfigRequest {
+ string name = 1;
+}
+message NewEnrolmentConfigResponse {
+ EnrolmentConfig enrolment_config = 1;
+}
-message ListEnrolmentConfigsRequest {}
+message ListEnrolmentConfigsRequest {
+}
message ListEnrolmentConfigsResponse {
- repeated EnrolmentConfig enrolment_config = 1;
+ repeated EnrolmentConfig enrolment_config = 1;
}
message RemoveEnrolmentConfigRequest {
- // TODO(lorenz): How do we want to remove EnrolmentConfigs?
+ // TODO(lorenz): How do we want to remove EnrolmentConfigs?
}
-message RemoveEnrolmentConfigResponse {}
+message RemoveEnrolmentConfigResponse {
+}
-// NodeService runs on all nodes and allows active masters to perform things
-// like attestation or grab other system state. Callers are authenticated via
+// NodeService runs on all nodes and allows active masters to perform operations
+// like attestation or requesting other system state. Callers are authenticated via
// TLS using the certificate from the EnrolmentConfig. Any client needs to
// authenticate the node it's talking to by getting the public key from the
// consensus service to verify against.
service NodeService {
- rpc JoinCluster(JoinClusterRequest) returns (JoinClusterResponse) {}
+ rpc JoinCluster (JoinClusterRequest) returns (JoinClusterResponse) {
+ }
}
// NodeManagementService runs on all masters, is identified by the
-// NodeManagementService TLS certificate and is where nodes report to when they
-// initially join or are restarted and need to be unlocked again.
+// NodeManagementService TLS certificate. It is used by nodes to
+// initially join the cluster or subsequently request the cluster unlock secret
+// (refer to EnrolmentConfig for the various node provisioning states).
service NodeManagementService {
- // NewTPM2NodeRegister is called by a node as soon as it is properly
- // initialized. Then any number of policies can determine weather and when to
- // add the node to the cluster.
- //
- // The idea behind this is that we just deliver everything we have trust-wise
- // and then it's up to the customer or his policies to either adopt this node
- // or not since TPM trust hierarchies are a mess in general.
- rpc NewTPM2NodeRegister(stream TPM2FlowRequest)
- returns (stream TPM2FlowResponse) {}
+ // NewTPM2NodeRegister is called by a node as soon as it is properly
+ // initialized. Then any number of policies can determine whether and when to
+ // add the node to the cluster.
+ //
+ // The idea behind this is that we just deliver everything we have trust-wise
+  // and then it's up to the customer or his policies to either adopt this node or not.
+ //
+ // TPM trust hierarchies highly vary between vendors and can require complex
+ // policy decisions. The NMS cannot determine on its own whether a given TPM
+  // meets customer policy; therefore, the decision is configurable via the policy engine.
+ rpc NewTPM2NodeRegister (stream TPM2FlowRequest) returns (stream TPM2FlowResponse) {
+ }
- // Nodes that were rebooted request their global unlock secret from here.
- rpc TPM2Unlock(stream TPM2UnlockFlowRequeset)
- returns (stream TPM2UnlockFlowResponse) {}
+ // Nodes that were rebooted request their global unlock secret subject to attestation.
+ rpc TPM2Unlock (stream TPM2UnlockFlowRequeset) returns (stream TPM2UnlockFlowResponse) {
+ }
}
-/*
-This flow needs to run in a single TLS session, so we force that with
-bidirectional streaming. It works like this New Node NodeManagementService
-TPM2RegisterRequest ------>
- <------ TPM2AttestRequest
-TPM2AttestResponse ------>
-NewNodeInfo ------>
-*/
-
+// TPM2FlowRequest is a bidirectional stream that is used for the attestation state machine.
+//
+// gRPC with TLS guarantees ordered delivery and channel binding, ensuring that other
+// nodes cannot inject messages related to an in-progress attestation flow.
+//
+// TPM2RegisterRequest ------>
+// <------ TPM2AttestRequest
+// TPM2AttestResponse ------>
+// NewNodeInfo ------>
+//
message TPM2FlowRequest {
- oneof Stage {
- TPM2RegisterRequest register = 1;
- TPM2AttestResponse attest_response = 2;
- NewNodeInfo new_node_info = 3;
- }
+ oneof Stage {
+ TPM2RegisterRequest register = 1;
+ TPM2AttestResponse attest_response = 2;
+ NewNodeInfo new_node_info = 3;
+ }
}
message TPM2FlowResponse {
- oneof Stage { TPM2AttestRequest attest_request = 1; }
+ oneof Stage {
+ TPM2AttestRequest attest_request = 1;
+ }
}
-// EnrolmentConfig is attached to an installation payload
+// EnrolmentConfig is attached to an installation payload. It represents a node's only
+// unencrypted state outside of the secure partition.
+//
+// - If no EnrolmentConfig is found, a new node is bootstrapped.
+//
+// - If an EnrolmentConfig is present and contains a secret, it will attempt to register
+// with the master hosts and will subsequently clear its enrolment secret.
+//
+// - If an EnrolmentConfig is found and does not contain a secret, the node is presumed
+// to be registered and will attempt to retrieve the cluster unlock key.
+//
+// The EnrolmentConfig is one of the inputs for the integrity mechanism. This ensures
+// that masters_cert has not been tampered with.
message EnrolmentConfig {
- bytes enrolment_secret = 1;
- bytes masters_cert = 2; // X.509 DER certificate of the NodeManagement service
- repeated bytes master_ips = 3; // IPs where the NodeManagement service runs
- // Filled in by node after it is enrolled
- string node_id = 4;
+ // Present only during initialization.
+ bytes enrolment_secret = 1;
+
+ // X.509 DER certificate of the NodeManagement service.
+ // The certificate will never change during a cluster's lifetime.
+ bytes masters_cert = 2;
+
+ // IPs where the NodeManagement service runs. We hardcode the list of
+ // master hosts to sidestep the bootstrap discovery (external DNS? multicast? ...).
+ // IPs are the lowest common denominator across all deployment targets and have
+ // zero external dependencies, but they can change. We cannot assume the existence
+ // of a properly-managed external discovery service.
+ //
+ // TODO(leo): We will have to figure out how to update these in a running cluster
+ // without breaking attestation.
+ repeated bytes master_ips = 3;
+
+ // Filled in by node after it is enrolled
+ string node_id = 4;
}
+// TPM2RegisterRequest is the first message in the attestation state machine,
+// sent by the node to the master (see flowchart).
message TPM2RegisterRequest {
- bytes ak_public = 9; // AK public portion, TPM2_PUBLIC
- bytes ek_pubkey = 5; // TPM EK public key, PKIX
- bytes ek_cert = 6; // TPM EK certificate, X.509 DER (only if available in TPM)
+ // AK public portion, TPM2_PUBLIC. The AK is a new key we create in the node's TPM
+ // endorsement hierarchy (see core/pkg/tpm). It is derived from the TPM's
+  // unique and permanent endorsement hierarchy primary seed using a specific template.
+ // The master verifies that the AK matches the expected template and is generated
+ // on the same TPM as the given EK (binding EK and AK).
+ bytes ak_public = 9;
+
+ // TPM EK public key, PKIX. Derived from the primary seed like the AK, but using
+ // a standardized template (rather than a custom template like the AK) such that
+ // it can be used for endorsement. The EK cannot sign things, only the AK can.
+ bytes ek_pubkey = 5;
+
+ // TPM EK certificate, X.509 DER (only if available in TPM).
+ //
+ // Some vendors issue an EK certificate for a TPM's EK that proves
+ // that the given TPM has in fact been manufactured by them. Unfortunately, this
+ // certificate is not always embedded in the TPM (see T578).
+ bytes ek_cert = 6;
}
+// The master issues a TPM2AttestRequest after verifying the AK in the TPM2RegisterRequest.
message TPM2AttestRequest {
- bytes ak_challenge = 1;
- bytes ak_challenge_secret = 2;
- bytes quote_nonce = 3;
+ // A challenge encrypted with the node's EK. The node decrypts the challenge, verifies
+ // that it matches the key that is being attested (i.e. the AK, using the EK) and
+ // returns the decrypted challenge.
+ //
+ // What actually happens is a LOT more complex than that, bordering on insanity.
+ // Start reading in credactivation_compat.go.
+ bytes ak_challenge = 1;
+ bytes ak_challenge_secret = 2;
+
+ // Replay protection nonce used in the next step.
+ bytes quote_nonce = 3;
}
+// TPM2AttestResponse is the node's attestation response returned to the master.
message TPM2AttestResponse {
- bytes quote = 1;
- bytes quote_signature = 4;
-
- bytes ak_challenge_solution = 2;
- repeated bytes pcrs = 3; // All 16 SHA256 SRTM PCRs in order
+ // Internal hash of all PCRs and a number of other TPM states.
+ bytes quote = 1;
+ // Signature for quote using the node's AK.
+ bytes quote_signature = 4;
+ // Solution for ak_challenge (nonce) decrypted using the node's AK.
+ bytes ak_challenge_solution = 2;
+ // All 16 SHA256 SRTM PCRs in order. Verified by comparing to the
+ // hash included in the quote (see VerifyAttestPlatform).
+ repeated bytes pcrs = 3;
}
+// NewNodeInfo is submitted by the node along with its TPM2AttestResponse.
+// The data is then persisted by the master after verifying the
+// attestation response. The info is channel bound to the successful attestation.
message NewNodeInfo {
- EnrolmentConfig enrolment_config = 1;
+ EnrolmentConfig enrolment_config = 1;
- bytes ip = 11; // IP of the node
+ bytes ip = 11; // IP of the node
- bytes id_cert = 4; // ID certificate, X.509 DER
+ bytes id_cert = 4; // ID certificate, X.509 DER
- // Part of the encryption key for cluster unlock (32 byte), to be XOR'ed with
- // the node-local part on the TPM
- bytes global_unlock_key = 7;
+ // Part of the encryption key for cluster unlock (32 byte), to be XOR'ed with
+  // the node-local part on the TPM. Each node has its own individual cluster unlock key.
+ bytes global_unlock_key = 7;
}
-message TPM2UnlockInit { bytes nonce = 1; }
+message TPM2UnlockInit {
+ bytes nonce = 1;
+}
message TPM2UnlockRequest {
- string node_id = 4;
- repeated bytes pcrs = 1;
- bytes quote = 2;
- bytes quote_signature = 3;
+ string node_id = 4;
+ repeated bytes pcrs = 1;
+ bytes quote = 2;
+ bytes quote_signature = 3;
}
-message TPM2UnlockResponse { bytes global_unlock_key = 1; }
+message TPM2UnlockResponse {
+ bytes global_unlock_key = 1;
+}
message TPM2UnlockFlowRequeset {
- oneof Stage { TPM2UnlockRequest unlock_request = 1; }
+ oneof Stage {
+ TPM2UnlockRequest unlock_request = 1;
+ }
}
message TPM2UnlockFlowResponse {
- oneof Stage {
- TPM2UnlockInit unlock_init = 1;
- TPM2UnlockResponse unlock_response = 2;
- }
+ oneof Stage {
+ TPM2UnlockInit unlock_init = 1;
+ TPM2UnlockResponse unlock_response = 2;
+ }
}
// ConsensusCertificates is a node's individual etcd certificates.
// When provisioning a new node, the existing node sends the new node
// its certificates after authenticating it.
message ConsensusCertificates {
- bytes ca = 1;
- bytes crl = 2;
- bytes cert = 3;
- bytes key = 4;
+ bytes ca = 1;
+ bytes crl = 2;
+ bytes cert = 3;
+ bytes key = 4;
}
message JoinClusterRequest {
- // Cluster bootstrap URI for etcd. The caller will set this to the
- // list of existing nodes in the cluster. This value is only used during
- // bootstrap.
- string initialCluster = 2;
- // New node's etcd client certificates
- ConsensusCertificates certs = 3;
+ // Cluster bootstrap URI for etcd. The caller will set this to the
+ // list of existing nodes in the cluster. This value is only used during bootstrap.
+ string initialCluster = 2;
+ // New node's etcd client certificates.
+ ConsensusCertificates certs = 3;
}
-message JoinClusterResponse {}
+message JoinClusterResponse {
+}
message AddNodeRequest {
- string node_id = 1;
- // TODO: Add things like role
+ string node_id = 1;
+ // TODO: Add things like role
}
-message AddNodeResponse {}
+message AddNodeResponse {
+}
-message RemoveNodeRequest {}
+message RemoveNodeRequest {
+}
-message RemoveNodeResponse {}
+message RemoveNodeResponse {
+}
-message ListNodesRequest {}
+message ListNodesRequest {
+}
-message ListNodesResponse { repeated Node nodes = 1; }
+message ListNodesResponse {
+ repeated Node nodes = 1;
+}
message NodeTPM2 {
- bytes ak_pub = 1; // TPM2T_PUBLIC
- bytes ek_pubkey = 2; // PKIX DER
- bytes ek_cert = 3; // X.509 DER
+ bytes ak_pub = 1; // TPM2T_PUBLIC
+ bytes ek_pubkey = 2; // PKIX DER
+ bytes ek_cert = 3; // X.509 DER
}
-// Node describes a single node's state
+// Node describes a single node's state in etcd
message Node {
- bytes id_cert = 5;
- bytes global_unlock_key = 7;
- bytes address = 3;
- enum State {
- UNININITALIZED = 0; // WARNING: In this state the node has not been adopted and thus cannot be fully trusted
- MASTER = 1; // A full master node with Consensus, NMS & Kubernetes control plane
- WORKER = 2; // A worker node with just Kubelet and supporting services
- }
- State state = 9;
+ // Individual node service certificate.
+ bytes id_cert = 1;
+ // Node's individual cluster part of the disk encryption key.
+ bytes global_unlock_key = 2;
+ // Node address. This is currently an IPv4 address because that is what the
+ // network service uses, but would also support IPv6. See EnrolmentConfig for a
+ // discussion on why we use IP addresses rather than some kind of resolver.
+ bytes address = 3;
- oneof integrity { NodeTPM2 tpm2 = 6; }
+ enum State {
+ // WARNING: In this state the node has not been adopted and thus cannot be fully trusted
+ UNININITALIZED = 0;
+ // A full master node with Consensus, NMS & Kubernetes control plane
+ MASTER = 1;
+ // A worker node with just Kubelet and supporting services
+ WORKER = 2;
+ }
+ State state = 9;
- // etcd State (might later be moved to a separate type)
+ // Which integrity mechanism is used to verify this node's state.
+ oneof integrity {
+ NodeTPM2 tpm2 = 6;
+ }
- // etcd member ID
- uint64 id = 1;
- // etcd member name
- string name = 2;
- // Whether the etcd member is synced with the cluster.
- bool synced = 4;
-}
\ No newline at end of file
+ // etcd State (might later be moved to a separate type)
+ // We will separate etcd state from nodes and remove this.
+
+ // etcd member ID
+ uint64 id = 20;
+ // etcd member name
+ string name = 21;
+ // Whether the etcd member is synced with the cluster.
+ bool synced = 22;
+}
diff --git a/core/internal/api/nodemanagement.go b/core/internal/api/nodemanagement.go
index 2becd00..f193d5c 100644
--- a/core/internal/api/nodemanagement.go
+++ b/core/internal/api/nodemanagement.go
@@ -109,7 +109,7 @@
func (s *Server) TPM2Unlock(unlockServer api.NodeManagementService_TPM2UnlockServer) error {
nonce := make([]byte, 32)
if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
- return status.Error(codes.Unavailable, "failed to get randonmess")
+ return status.Error(codes.Unavailable, "failed to get randomness")
}
if err := unlockServer.Send(&api.TPM2UnlockFlowResponse{
Stage: &api.TPM2UnlockFlowResponse_UnlockInit{
@@ -191,7 +191,7 @@
challengeNonce := make([]byte, 32)
if _, err := io.ReadFull(rand.Reader, challengeNonce); err != nil {
- return status.Error(codes.Unavailable, "failed to get randonmess")
+ return status.Error(codes.Unavailable, "failed to get randomness")
}
challenge, challengeBlob, err := tpm.MakeAKChallenge(registerReq.EkPubkey, registerReq.AkPublic, challengeNonce)
if err != nil {
@@ -199,7 +199,7 @@
}
nonce := make([]byte, 32)
if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
- return status.Error(codes.Unavailable, "failed to get randonmess")
+ return status.Error(codes.Unavailable, "failed to get randomness")
}
if err := registerServer.Send(&api.TPM2FlowResponse{Stage: &api.TPM2FlowResponse_AttestRequest{AttestRequest: &api.TPM2AttestRequest{
AkChallenge: challenge,
diff --git a/core/internal/integrity/common.go b/core/internal/integrity/common.go
index 6850a12..52196ce 100644
--- a/core/internal/integrity/common.go
+++ b/core/internal/integrity/common.go
@@ -33,6 +33,7 @@
)
// Agent specifices the interface which every integrity agent needs to fulfill
+// TODO: This interface is not yet used, we call the TPM2 agent directly.
type Agent interface {
// Initialize needs to be called once and initializes the systems required to maintain integrity
// on the given platform.
@@ -42,8 +43,8 @@
// Initialize returns the cryptographic identity that it's bound to.
Initialize(newNode api.NewNodeInfo, enrolment api.EnrolmentConfig) (string, error)
- // Unlock performs all required actions to assure the integrity of the platform and retrieves
- // the unlock key in a secure manner
+ // Unlock performs all required actions to assure the integrity of the platform and securely retrieves
+ // the unlock key.
Unlock(enrolment api.EnrolmentConfig) ([]byte, error)
}
diff --git a/core/pkg/tpm/credactivation_compat.go b/core/pkg/tpm/credactivation_compat.go
index 0a848d2..039f8d5 100644
--- a/core/pkg/tpm/credactivation_compat.go
+++ b/core/pkg/tpm/credactivation_compat.go
@@ -18,9 +18,11 @@
// This file is adapted from github.com/google/go-tpm/tpm2/credactivation which outputs broken
// challenges for unknown reasons. They use u16 length-delimited outputs for the challenge blobs
-// which is incorrect.
+// which is incorrect. Rather than rewriting the routine, we only applied minimal fixes to it
+// and skip the ECC part of the issue (because we would rather trust the proprietary RSA implementation).
+//
// TODO(lorenz): I'll eventually deal with this upstream, but for now just fix it here (it's not that)
-// much code after all.
+// much code after all (https://github.com/google/go-tpm/issues/121)
import (
"crypto/aes"
diff --git a/core/pkg/tpm/tpm.go b/core/pkg/tpm/tpm.go
index bb92289..d659d3a 100644
--- a/core/pkg/tpm/tpm.go
+++ b/core/pkg/tpm/tpm.go
@@ -265,7 +265,7 @@
func loadAK() error {
var err error
- // Rationale: The AK is a EK-equivalent key and used only for attestation. Using a non-primary
+ // Rationale: The AK is an EK-equivalent key and used only for attestation. Using a non-primary
// key here would require us to store the wrapped version somewhere, which is inconvenient.
// This being a primary key in the Endorsement hierarchy means that it can always be recreated
// and can never be "destroyed". Under our security model this is of no concern since we identify
diff --git a/core/scripts/test_boot.sh b/core/scripts/test_boot.sh
index 03cf8c4..e927c94 100755
--- a/core/scripts/test_boot.sh
+++ b/core/scripts/test_boot.sh
@@ -9,7 +9,7 @@
# (see https://github.com/bazelbuild/bazel/blob/master/tools/bash/runfiles/runfiles.bash)
set kubectl_path "external/kubernetes/cmd/kubectl/linux_amd64_pure_stripped/kubectl"
-set timeout 60
+set timeout 120
proc print_stderr {msg} {
send_error "\[TEST\] $msg\n"
@@ -38,7 +38,7 @@
spawn $kubectl_path cluster-info dump -s https://localhost:6443 --username none --password none --insecure-skip-tls-verify=true
expect "User \"system:anonymous\" cannot list resource \"nodes\" in API group \"\" at the cluster scope" {} default {
- print_stderr "Failed while waiting for encrypted storage\n"
+ print_stderr "Failed while waiting for kubectl test\n"
exit 1
}