Review comments for TPM attestation

Lots of comments and an updated boot test. Generously increase the timeout to eliminate random CI failures.

Test Plan: Boot test works

Bug: T499

X-Origin-Diff: phab/D319
GitOrigin-RevId: cf17fe7c599f670ff8b6f0ac60486f2a04f13a5a
diff --git a/core/api/api/schema.proto b/core/api/api/schema.proto
index 7b4bdaf..3f1b7e3 100644
--- a/core/api/api/schema.proto
+++ b/core/api/api/schema.proto
@@ -28,214 +28,316 @@
 // The ClusterManagement service is used by an authenticated administrative user
 // to manage node membership in an existing Smalltown cluster.
 service ClusterManagement {
-  // Add a node to the cluster, subject to successful remote attestation.
-  rpc AddNode(AddNodeRequest) returns (AddNodeResponse) {}
+    // Add a node to the cluster, subject to successful remote attestation.
+    rpc AddNode (AddNodeRequest) returns (AddNodeResponse) {
+    }
 
-  // Remove a node from the cluster.
-  rpc RemoveNode(RemoveNodeRequest) returns (RemoveNodeRequest) {}
+    // Remove a node from the cluster.
+    rpc RemoveNode (RemoveNodeRequest) returns (RemoveNodeRequest) {
+    }
 
-  // List all cluster nodes
-  rpc ListNodes(ListNodesRequest) returns (ListNodesResponse) {}
+    // List all cluster nodes.
+    rpc ListNodes (ListNodesRequest) returns (ListNodesResponse) {
+    }
 
-  // NewEnrolmentConfig generates a new enrolment config for adding new nodes to
-  // the cluster
-  rpc NewEnrolmentConfig(NewEnrolmentConfigRequest)
-      returns (NewEnrolmentConfigResponse) {}
+    // NewEnrolmentConfig generates a new enrolment config for adding new nodes to
+    // the cluster.
+    rpc NewEnrolmentConfig (NewEnrolmentConfigRequest) returns (NewEnrolmentConfigResponse) {
+    }
 
-  rpc ListEnrolmentConfigs(ListEnrolmentConfigsRequest)
-      returns (ListEnrolmentConfigsResponse) {}
+    rpc ListEnrolmentConfigs (ListEnrolmentConfigsRequest) returns (ListEnrolmentConfigsResponse) {
+    }
 
-  rpc RemoveEnrolmentConfig(RemoveEnrolmentConfigRequest)
-      returns (RemoveEnrolmentConfigResponse) {}
+    rpc RemoveEnrolmentConfig (RemoveEnrolmentConfigRequest) returns (RemoveEnrolmentConfigResponse) {
+    }
 }
 
-message NewEnrolmentConfigRequest { string name = 1; }
-message NewEnrolmentConfigResponse { EnrolmentConfig enrolment_config = 1; }
+message NewEnrolmentConfigRequest {
+    string name = 1;
+}
+message NewEnrolmentConfigResponse {
+    EnrolmentConfig enrolment_config = 1;
+}
 
-message ListEnrolmentConfigsRequest {}
+message ListEnrolmentConfigsRequest {
+}
 message ListEnrolmentConfigsResponse {
-  repeated EnrolmentConfig enrolment_config = 1;
+    repeated EnrolmentConfig enrolment_config = 1;
 }
 
 message RemoveEnrolmentConfigRequest {
-  // TODO(lorenz): How do we want to remove EnrolmentConfigs?
+    // TODO(lorenz): How do we want to remove EnrolmentConfigs?
 }
-message RemoveEnrolmentConfigResponse {}
+message RemoveEnrolmentConfigResponse {
+}
 
-// NodeService runs on all nodes and allows active masters to perform things
-// like attestation or grab other system state. Callers are authenticated via
+// NodeService runs on all nodes and allows active masters to perform operations
+// like attestation or requesting other system state. Callers are authenticated via
 // TLS using the certificate from the EnrolmentConfig. Any client needs to
 // authenticate the node it's talking to by getting the public key from the
 // consensus service to verify against.
 service NodeService {
-  rpc JoinCluster(JoinClusterRequest) returns (JoinClusterResponse) {}
+    rpc JoinCluster (JoinClusterRequest) returns (JoinClusterResponse) {
+    }
 }
 
 // NodeManagementService runs on all masters, is identified by the
-// NodeManagementService TLS certificate and is where nodes report to when they
-// initially join or are restarted and need to be unlocked again.
+// NodeManagementService TLS certificate. It is used by nodes to
+// initially join the cluster or subsequently request the cluster unlock secret
+// (refer to EnrolmentConfig for the various node provisioning states).
 service NodeManagementService {
-  // NewTPM2NodeRegister is called by a node as soon as it is properly
-  // initialized. Then any number of policies can determine weather and when to
-  // add the node to the cluster.
-  //
-  // The idea behind this is that we just deliver everything we have trust-wise
-  // and then it's up to the customer or his policies to either adopt this node
-  // or not since TPM trust hierarchies are a mess in general.
-  rpc NewTPM2NodeRegister(stream TPM2FlowRequest)
-      returns (stream TPM2FlowResponse) {}
+    // NewTPM2NodeRegister is called by a node as soon as it is properly
+    // initialized. Then any number of policies can determine whether and when to
+    // add the node to the cluster.
+    //
+    // The idea behind this is that we just deliver everything we have trust-wise
+    // and then it's up to the customer or his policies to either adopt this node.
+    //
+    // TPM trust hierarchies highly vary between vendors and can require complex
+    // policy decisions. The NMS cannot determine on its own whether a given TPM
+    // meets customer policy, therefore, the decision is configurable via the policy engine.
+    rpc NewTPM2NodeRegister (stream TPM2FlowRequest) returns (stream TPM2FlowResponse) {
+    }
 
-  // Nodes that were rebooted request their global unlock secret from here.
-  rpc TPM2Unlock(stream TPM2UnlockFlowRequeset)
-      returns (stream TPM2UnlockFlowResponse) {}
+    // Nodes that were rebooted request their global unlock secret subject to attestation.
+    rpc TPM2Unlock (stream TPM2UnlockFlowRequeset) returns (stream TPM2UnlockFlowResponse) {
+    }
 }
 
-/*
-This flow needs to run in a single TLS session, so we force that with
-bidirectional streaming. It works like this New Node NodeManagementService
-TPM2RegisterRequest ------>
-                   <------  TPM2AttestRequest
-TPM2AttestResponse  ------>
-NewNodeInfo         ------>
-*/
-
+// TPM2FlowRequest is a bidirectional stream that is used for the attestation state machine.
+//
+// gRPC with TLS guarantees ordered delivery and channel binding, ensuring that other
+// nodes cannot inject messages related to an in-progress attestation flow.
+//
+//   TPM2RegisterRequest ------>
+//                       <------  TPM2AttestRequest
+//   TPM2AttestResponse  ------>
+//   NewNodeInfo         ------>
+//
 message TPM2FlowRequest {
-  oneof Stage {
-    TPM2RegisterRequest register = 1;
-    TPM2AttestResponse attest_response = 2;
-    NewNodeInfo new_node_info = 3;
-  }
+    oneof Stage {
+        TPM2RegisterRequest register = 1;
+        TPM2AttestResponse attest_response = 2;
+        NewNodeInfo new_node_info = 3;
+    }
 }
 
 message TPM2FlowResponse {
-  oneof Stage { TPM2AttestRequest attest_request = 1; }
+    oneof Stage {
+        TPM2AttestRequest attest_request = 1;
+    }
 }
 
-// EnrolmentConfig is attached to an installation payload
+// EnrolmentConfig is attached to an installation payload. It represents a node's only
+// unencrypted state outside of the secure partition.
+//
+//  - If no EnrolmentConfig is found, a new node is bootstrapped.
+//
+//  - If an EnrolmentConfig is present and contains a secret, it will attempt to register
+//    with the master hosts and will subsequently clear its enrolment secret.
+//
+//  - If an EnrolmentConfig is found and does not contain a secret, the node is presumed
+//    to be registered and will attempt to retrieve the cluster unlock key.
+//
+// The EnrolmentConfig is one of the inputs for the integrity mechanism. This ensures
+// that masters_cert has not been tampered with.
 message EnrolmentConfig {
-  bytes enrolment_secret = 1;
-  bytes masters_cert = 2; // X.509 DER certificate of the NodeManagement service
-  repeated bytes master_ips = 3; // IPs where the NodeManagement service runs
-  // Filled in by node after it is enrolled
-  string node_id = 4;
+    // Present only during initialization.
+    bytes enrolment_secret = 1;
+
+    // X.509 DER certificate of the NodeManagement service.
+    // The certificate will never change during a cluster's lifetime.
+    bytes masters_cert = 2;
+
+    // IPs where the NodeManagement service runs. We hardcode the list of
+    // master hosts to sidestep the bootstrap discovery (external DNS? multicast? ...).
+    // IPs are the lowest common denominator across all deployment targets and have
+    // zero external dependencies, but they can change. We cannot assume the existence
+    // of a properly-managed external discovery service.
+    //
+    // TODO(leo): We will have to figure out how to update these in a running cluster
+    // without breaking attestation.
+    repeated bytes master_ips = 3;
+
+    // Filled in by node after it is enrolled
+    string node_id = 4;
 }
 
+// TPM2RegisterRequest is the first message in the attestation state machine,
+// sent by the node to the master (see flowchart).
 message TPM2RegisterRequest {
-  bytes ak_public = 9; // AK public portion, TPM2_PUBLIC
-  bytes ek_pubkey = 5; // TPM EK public key, PKIX
-  bytes ek_cert = 6; // TPM EK certificate, X.509 DER (only if available in TPM)
+    // AK public portion, TPM2_PUBLIC. The AK is a new key we create in the node's TPM
+    // endorsement hierarchy (see core/pkg/tpm). It is derived from the TPM's
+    // unique and permanent endorsment hierarchy primary seed using a specific template.
+    // The master verifies that the AK matches the expected template and is generated
+    // on the same TPM as the given EK (binding EK and AK).
+    bytes ak_public = 9;
+
+    // TPM EK public key, PKIX. Derived from the primary seed like the AK, but using
+    // a standardized template (rather than a custom template like the AK) such that
+    // it can be used for endorsement. The EK cannot sign things, only the AK can.
+    bytes ek_pubkey = 5;
+
+    // TPM EK certificate, X.509 DER (only if available in TPM).
+    //
+    // Some vendors issue an EK certificate for a TPM's EK that proves
+    // that the given TPM has in fact been manufactured by them. Unfortunately, this
+    // certificate is not always embedded in the TPM (see T578).
+    bytes ek_cert = 6;
 }
 
+// The master issues a TPM2AttestRequest after verifying the AK in the TPM2RegisterRequest.
 message TPM2AttestRequest {
-  bytes ak_challenge = 1;
-  bytes ak_challenge_secret = 2;
-  bytes quote_nonce = 3;
+    // A challenge encrypted with the node's EK. The node decrypts the challenge, verifies
+    // that it matches the key that is being attested (i.e. the AK, using the EK) and
+    // returns the decrypted challenge.
+    //
+    // What actually happens is a LOT more complex than that, bordering on insanity.
+    // Start reading in credactivation_compat.go.
+    bytes ak_challenge = 1;
+    bytes ak_challenge_secret = 2;
+
+    // Replay protection nonce used in the next step.
+    bytes quote_nonce = 3;
 }
 
+// TPM2AttestResponse is returned the node's attestation response.
 message TPM2AttestResponse {
-  bytes quote = 1;
-  bytes quote_signature = 4;
-
-  bytes ak_challenge_solution = 2;
-  repeated bytes pcrs = 3; // All 16 SHA256 SRTM PCRs in order
+    // Internal hash of all PCRs and a number of other TPM states.
+    bytes quote = 1;
+    // Signature for quote using the node's AK.
+    bytes quote_signature = 4;
+    // Solution for ak_challenge (nonce) decrypted using the node's AK.
+    bytes ak_challenge_solution = 2;
+    // All 16 SHA256 SRTM PCRs in order. Verified by comparing to the
+    // hash included in the quote (see VerifyAttestPlatform).
+    repeated bytes pcrs = 3;
 }
 
+// NewNodeInfo is submitted by the node along with its TPM2AttestResponse.
+// The data returned is then persisted by the master after verifying the
+// attestation response. The info is channel bound to the successful attestation.
 message NewNodeInfo {
-  EnrolmentConfig enrolment_config = 1;
+    EnrolmentConfig enrolment_config = 1;
 
-  bytes ip = 11; // IP of the node
+    bytes ip = 11; // IP of the node
 
-  bytes id_cert = 4; // ID certificate, X.509 DER
+    bytes id_cert = 4; // ID certificate, X.509 DER
 
-  // Part of the encryption key for cluster unlock (32 byte), to be XOR'ed with
-  // the node-local part on the TPM
-  bytes global_unlock_key = 7;
+    // Part of the encryption key for cluster unlock (32 byte), to be XOR'ed with
+    // the node-local part on the TPM. Each nodes has its individual cluster unlock key.
+    bytes global_unlock_key = 7;
 }
 
-message TPM2UnlockInit { bytes nonce = 1; }
+message TPM2UnlockInit {
+    bytes nonce = 1;
+}
 
 message TPM2UnlockRequest {
-  string node_id = 4;
-  repeated bytes pcrs = 1;
-  bytes quote = 2;
-  bytes quote_signature = 3;
+    string node_id = 4;
+    repeated bytes pcrs = 1;
+    bytes quote = 2;
+    bytes quote_signature = 3;
 }
 
-message TPM2UnlockResponse { bytes global_unlock_key = 1; }
+message TPM2UnlockResponse {
+    bytes global_unlock_key = 1;
+}
 
 message TPM2UnlockFlowRequeset {
-  oneof Stage { TPM2UnlockRequest unlock_request = 1; }
+    oneof Stage {
+        TPM2UnlockRequest unlock_request = 1;
+    }
 }
 
 message TPM2UnlockFlowResponse {
-  oneof Stage {
-    TPM2UnlockInit unlock_init = 1;
-    TPM2UnlockResponse unlock_response = 2;
-  }
+    oneof Stage {
+        TPM2UnlockInit unlock_init = 1;
+        TPM2UnlockResponse unlock_response = 2;
+    }
 }
 
 // ConsensusCertificates is a node's individual etcd certificates.
 // When provisioning a new node, the existing node sends the new node
 // its certificates after authenticating it.
 message ConsensusCertificates {
-  bytes ca = 1;
-  bytes crl = 2;
-  bytes cert = 3;
-  bytes key = 4;
+    bytes ca = 1;
+    bytes crl = 2;
+    bytes cert = 3;
+    bytes key = 4;
 }
 
 message JoinClusterRequest {
-  // Cluster bootstrap URI for etcd. The caller will set this to the
-  // list of existing nodes in the cluster. This value is only used during
-  // bootstrap.
-  string initialCluster = 2;
-  // New node's etcd client certificates
-  ConsensusCertificates certs = 3;
+    // Cluster bootstrap URI for etcd. The caller will set this to the
+    // list of existing nodes in the cluster. This value is only used during bootstrap.
+    string initialCluster = 2;
+    // New node's etcd client certificates.
+    ConsensusCertificates certs = 3;
 }
 
-message JoinClusterResponse {}
+message JoinClusterResponse {
+}
 
 message AddNodeRequest {
-  string node_id = 1;
-  // TODO: Add things like role
+    string node_id = 1;
+    // TODO: Add things like role
 }
 
-message AddNodeResponse {}
+message AddNodeResponse {
+}
 
-message RemoveNodeRequest {}
+message RemoveNodeRequest {
+}
 
-message RemoveNodeResponse {}
+message RemoveNodeResponse {
+}
 
-message ListNodesRequest {}
+message ListNodesRequest {
+}
 
-message ListNodesResponse { repeated Node nodes = 1; }
+message ListNodesResponse {
+    repeated Node nodes = 1;
+}
 
 message NodeTPM2 {
-  bytes ak_pub = 1;    // TPM2T_PUBLIC
-  bytes ek_pubkey = 2; // PKIX DER
-  bytes ek_cert = 3;   // X.509 DER
+    bytes ak_pub = 1; // TPM2T_PUBLIC
+    bytes ek_pubkey = 2; // PKIX DER
+    bytes ek_cert = 3; // X.509 DER
 }
 
-// Node describes a single node's state
+// Node describes a single node's state in etcd
 message Node {
-  bytes id_cert = 5;
-  bytes global_unlock_key = 7;
-  bytes address = 3;
-  enum State {
-    UNININITALIZED = 0; // WARNING: In this state the node has not been adopted and thus cannot be fully trusted
-    MASTER = 1; // A full master node with Consensus, NMS & Kubernetes control plane
-    WORKER = 2; // A worker node with just Kubelet and supporting services
-  }
-  State state = 9;
+    // Individual node service certificate.
+    bytes id_cert = 1;
+    // Node's individual cluster part of the disk encryption key.
+    bytes global_unlock_key = 2;
+    // Node address. This is currently an IPv4 address because that is what the
+    // network service uses, but would also support IPv6. See EnrolmentConfig for a
+    // discussion on why we use IP addresses rather than some kind of resolver.
+    bytes address = 3;
 
-  oneof integrity { NodeTPM2 tpm2 = 6; }
+    enum State {
+      // WARNING: In this state the node has not been adopted and thus cannot be fully trusted
+      UNININITALIZED = 0;
+      // A full master node with Consensus, NMS & Kubernetes control plane
+      MASTER = 1;
+      // A worker node with just Kubelet and supporting services
+      WORKER = 2;
+    }
+    State state = 9;
 
-  // etcd State (might later be moved to a separate type)
+    // Which integrity mechanism is used to verify this node's state.
+    oneof integrity {
+        NodeTPM2 tpm2 = 6;
+    }
 
-  // etcd member ID
-  uint64 id = 1;
-  // etcd member name
-  string name = 2;
-  // Whether the etcd member is synced with the cluster.
-  bool synced = 4;
-}
\ No newline at end of file
+    // etcd State (might later be moved to a separate type)
+    // We will separate etcd state from nodes and remove this.
+
+    // etcd member ID
+    uint64 id = 20;
+    // etcd member name
+    string name = 21;
+    // Whether the etcd member is synced with the cluster.
+    bool synced = 22;
+}
diff --git a/core/internal/api/nodemanagement.go b/core/internal/api/nodemanagement.go
index 2becd00..f193d5c 100644
--- a/core/internal/api/nodemanagement.go
+++ b/core/internal/api/nodemanagement.go
@@ -109,7 +109,7 @@
 func (s *Server) TPM2Unlock(unlockServer api.NodeManagementService_TPM2UnlockServer) error {
 	nonce := make([]byte, 32)
 	if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
-		return status.Error(codes.Unavailable, "failed to get randonmess")
+		return status.Error(codes.Unavailable, "failed to get randomness")
 	}
 	if err := unlockServer.Send(&api.TPM2UnlockFlowResponse{
 		Stage: &api.TPM2UnlockFlowResponse_UnlockInit{
@@ -191,7 +191,7 @@
 
 	challengeNonce := make([]byte, 32)
 	if _, err := io.ReadFull(rand.Reader, challengeNonce); err != nil {
-		return status.Error(codes.Unavailable, "failed to get randonmess")
+		return status.Error(codes.Unavailable, "failed to get randomness")
 	}
 	challenge, challengeBlob, err := tpm.MakeAKChallenge(registerReq.EkPubkey, registerReq.AkPublic, challengeNonce)
 	if err != nil {
@@ -199,7 +199,7 @@
 	}
 	nonce := make([]byte, 32)
 	if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
-		return status.Error(codes.Unavailable, "failed to get randonmess")
+		return status.Error(codes.Unavailable, "failed to get randomness")
 	}
 	if err := registerServer.Send(&api.TPM2FlowResponse{Stage: &api.TPM2FlowResponse_AttestRequest{AttestRequest: &api.TPM2AttestRequest{
 		AkChallenge:       challenge,
diff --git a/core/internal/integrity/common.go b/core/internal/integrity/common.go
index 6850a12..52196ce 100644
--- a/core/internal/integrity/common.go
+++ b/core/internal/integrity/common.go
@@ -33,6 +33,7 @@
 )
 
 // Agent specifices the interface which every integrity agent needs to fulfill
+// TODO: This interface is not yet used, we call the TPM2 agent directly.
 type Agent interface {
 	// Initialize needs to be called once and initializes the systems required to maintain integrity
 	// on the given platform.
@@ -42,8 +43,8 @@
 	// Initialize returns the cryptographic identity that it's bound to.
 	Initialize(newNode api.NewNodeInfo, enrolment api.EnrolmentConfig) (string, error)
 
-	// Unlock performs all required actions to assure the integrity of the platform and retrieves
-	// the unlock key in a secure manner
+	// Unlock performs all required actions to assure the integrity of the platform and securely retrieves
+	// the unlock key.
 	Unlock(enrolment api.EnrolmentConfig) ([]byte, error)
 }
 
diff --git a/core/pkg/tpm/credactivation_compat.go b/core/pkg/tpm/credactivation_compat.go
index 0a848d2..039f8d5 100644
--- a/core/pkg/tpm/credactivation_compat.go
+++ b/core/pkg/tpm/credactivation_compat.go
@@ -18,9 +18,11 @@
 
 // This file is adapted from github.com/google/go-tpm/tpm2/credactivation which outputs broken
 // challenges for unknown reasons. They use u16 length-delimited outputs for the challenge blobs
-// which is incorrect.
+// which is incorrect. Rather than rewriting the routine, we only applied minimal fixes to it
+// and skip the ECC part of the issue (because we would rather trust the proprietary RSA implementation).
+//
 // TODO(lorenz): I'll eventually deal with this upstream, but for now just fix it here (it's not that)
-// much code after all.
+// much code after all (https://github.com/google/go-tpm/issues/121)
 
 import (
 	"crypto/aes"
diff --git a/core/pkg/tpm/tpm.go b/core/pkg/tpm/tpm.go
index bb92289..d659d3a 100644
--- a/core/pkg/tpm/tpm.go
+++ b/core/pkg/tpm/tpm.go
@@ -265,7 +265,7 @@
 
 func loadAK() error {
 	var err error
-	// Rationale: The AK is a EK-equivalent key and used only for attestation. Using a non-primary
+	// Rationale: The AK is an EK-equivalent key and used only for attestation. Using a non-primary
 	// key here would require us to store the wrapped version somewhere, which is inconvenient.
 	// This being a primary key in the Endorsement hierarchy means that it can always be recreated
 	// and can never be "destroyed". Under our security model this is of no concern since we identify
diff --git a/core/scripts/test_boot.sh b/core/scripts/test_boot.sh
index 03cf8c4..e927c94 100755
--- a/core/scripts/test_boot.sh
+++ b/core/scripts/test_boot.sh
@@ -9,7 +9,7 @@
 # (see https://github.com/bazelbuild/bazel/blob/master/tools/bash/runfiles/runfiles.bash)
 set kubectl_path "external/kubernetes/cmd/kubectl/linux_amd64_pure_stripped/kubectl"
 
-set timeout 60
+set timeout 120
 
 proc print_stderr {msg} {
   send_error "\[TEST\] $msg\n"
@@ -38,7 +38,7 @@
 spawn $kubectl_path cluster-info dump -s https://localhost:6443 --username none --password none --insecure-skip-tls-verify=true
 
 expect "User \"system:anonymous\" cannot list resource \"nodes\" in API group \"\" at the cluster scope" {} default {
-  print_stderr "Failed while waiting for encrypted storage\n"
+  print_stderr "Failed while waiting for kubectl test\n"
   exit 1
 }