c/bmaas/bmdb: implement OS installation flow

This adds two new tags: OSInstallationRequest and
OSInstallationReport. It also implements interacting with these tags
from the agent side.

This doesn't yet implement any admin/user-facing API to actually request
OS installation; for now we just exercise this in tests.

Change-Id: I2e31a8369a3a8670bb92bcacfb8231a0d5e1b9fd
Reviewed-on: https://review.monogon.dev/c/monogon/+/1011
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
Tested-by: Jenkins CI
diff --git a/cloud/bmaas/server/BUILD.bazel b/cloud/bmaas/server/BUILD.bazel
index ee491d1..2c96a03 100644
--- a/cloud/bmaas/server/BUILD.bazel
+++ b/cloud/bmaas/server/BUILD.bazel
@@ -40,5 +40,6 @@
         "//metropolis/node/core/rpc",
         "@com_github_google_uuid//:uuid",
         "@org_golang_google_grpc//:go_default_library",
+        "@org_golang_google_protobuf//proto",
     ],
 )
diff --git a/cloud/bmaas/server/agent_callback_service.go b/cloud/bmaas/server/agent_callback_service.go
index f058213..b6e0e71 100644
--- a/cloud/bmaas/server/agent_callback_service.go
+++ b/cloud/bmaas/server/agent_callback_service.go
@@ -3,6 +3,7 @@
 import (
 	"context"
 	"crypto/ed25519"
+	"encoding/hex"
 	"errors"
 	"fmt"
 	"time"
@@ -57,6 +58,7 @@
 			return fmt.Errorf("AuthenticateAgentConnection: %w", err)
 		}
 		if len(agents) < 1 {
+			klog.Errorf("No agent for %s/%s", machineId.String(), hex.EncodeToString(pk))
 			return errAgentUnauthenticated
 		}
 		return nil
@@ -76,7 +78,7 @@
 	if req.HardwareReport != nil {
 		hwraw, err = proto.Marshal(req.HardwareReport)
 		if err != nil {
-			return nil, status.Errorf(codes.InvalidArgument, "could not serialize harcware report: %v", err)
+			return nil, status.Errorf(codes.InvalidArgument, "could not serialize hardware report: %v", err)
 		}
 	}
 
@@ -92,6 +94,16 @@
 				return fmt.Errorf("hardware report upsert: %w", err)
 			}
 		}
+		// Upsert os installation report if submitted. Propagate failures, same
+		// as for the hardware report above, so that a report which could not be
+		// stored is not silently ignored.
+		if req.InstallationReport != nil {
+			err = q.MachineSetOSInstallationReport(ctx, model.MachineSetOSInstallationReportParams{
+				MachineID:  machineId,
+				Generation: req.InstallationReport.Generation,
+			})
+			if err != nil {
+				return fmt.Errorf("installation report upsert: %w", err)
+			}
+		}
 		return q.MachineSetAgentHeartbeat(ctx, model.MachineSetAgentHeartbeatParams{
 			MachineID:        machineId,
 			AgentHeartbeatAt: time.Now(),
@@ -101,6 +113,35 @@
 		klog.Errorf("Could not submit heartbeat: %v", err)
 		return nil, status.Error(codes.Unavailable, "could not submit heartbeat")
 	}
+	klog.Infof("Heartbeat from %s/%s", machineId.String(), hex.EncodeToString(pk))
 
-	return &apb.AgentHeartbeatResponse{}, nil
+	// Get installation request for machine if present.
+	var installRequest *apb.OSInstallationRequest
+	err = session.Transact(ctx, func(q *model.Queries) error {
+		reqs, err := q.GetExactMachineForOSInstallation(ctx, model.GetExactMachineForOSInstallationParams{
+			MachineID: machineId,
+			Limit:     1,
+		})
+		if err != nil {
+			return fmt.Errorf("GetExactMachineForOSInstallation: %w", err)
+		}
+		if len(reqs) > 0 {
+			raw := reqs[0].OsInstallationRequestRaw
+			var preq apb.OSInstallationRequest
+			if err := proto.Unmarshal(raw, &preq); err != nil {
+				return fmt.Errorf("could not decode stored OS installation request: %w", err)
+			}
+			installRequest = &preq
+		}
+		return nil
+	})
+	if err != nil {
+		// Do not fail entire request. Instead, just log an error.
+		// TODO(q3k): alert on this
+		klog.Errorf("Failure during OS installation request retrieval: %v", err)
+	}
+
+	return &apb.AgentHeartbeatResponse{
+		InstallationRequest: installRequest,
+	}, nil
 }
diff --git a/cloud/bmaas/server/agent_callback_service_test.go b/cloud/bmaas/server/agent_callback_service_test.go
index bc3201a..320bb68 100644
--- a/cloud/bmaas/server/agent_callback_service_test.go
+++ b/cloud/bmaas/server/agent_callback_service_test.go
@@ -9,6 +9,7 @@
 
 	"github.com/google/uuid"
 	"google.golang.org/grpc"
+	"google.golang.org/protobuf/proto"
 
 	"source.monogon.dev/cloud/bmaas/bmdb"
 	"source.monogon.dev/cloud/bmaas/bmdb/model"
@@ -114,3 +115,135 @@
 	// TODO(q3k): test hardware report being attached once we have some debug API
 	// for tags.
 }
+
+// TestOSInstallationFlow exercises the agent's OS installation request/report
+// functionality. It walks a single machine through the following sequence and
+// checks the installation request returned by each heartbeat:
+//
+//  1. No request is stored: heartbeats carry no installation request.
+//  2. A request for generation 123 is stored: heartbeats return it.
+//  3. A report for generation 123 is submitted: no request is returned.
+//  4. A request for generation 234 is stored: heartbeats return it.
+func TestOSInstallationFlow(t *testing.T) {
+	s := dut()
+	ctx, ctxC := context.WithCancel(context.Background())
+	defer ctxC()
+	s.Start(ctx)
+
+	pub, priv, err := ed25519.GenerateKey(rand.Reader)
+	if err != nil {
+		t.Fatalf("could not generate keypair: %v", err)
+	}
+
+	sess, err := s.bmdb.StartSession(ctx)
+	if err != nil {
+		t.Fatalf("could not start session: %v", err)
+	}
+
+	// heartbeat dials the server with credentials built from priv via
+	// rpc.NewEphemeralCredentials and sends one heartbeat for machine mid,
+	// attaching report if it is non-nil.
+	heartbeat := func(mid uuid.UUID, report *apb.OSInstallationReport) (*apb.AgentHeartbeatResponse, error) {
+		creds, err := rpc.NewEphemeralCredentials(priv, nil)
+		if err != nil {
+			t.Fatalf("could not generate ephemeral credentials: %v", err)
+		}
+		conn, err := grpc.Dial(s.ListenPublic, grpc.WithTransportCredentials(creds))
+		if err != nil {
+			t.Fatalf("Dial failed: %v", err)
+		}
+		defer conn.Close()
+
+		stub := apb.NewAgentCallbackClient(conn)
+		return stub.Heartbeat(ctx, &apb.AgentHeartbeatRequest{
+			MachineId:          mid.String(),
+			HardwareReport:     &apb.AgentHardwareReport{},
+			InstallationReport: report,
+		})
+	}
+
+	// setInstallationRequest stores an OS installation request with the given
+	// generation on machine mid.
+	setInstallationRequest := func(mid uuid.UUID, generation int64) error {
+		raw, err := proto.Marshal(&apb.OSInstallationRequest{Generation: generation})
+		if err != nil {
+			t.Fatalf("could not serialize OS installation request: %v", err)
+		}
+		return sess.Transact(ctx, func(q *model.Queries) error {
+			return q.MachineSetOSInstallationRequest(ctx, model.MachineSetOSInstallationRequestParams{
+				MachineID:                mid,
+				Generation:               generation,
+				OsInstallationRequestRaw: raw,
+			})
+		})
+	}
+
+	// expectInstallationRequest heartbeats a few times just to make sure every
+	// response carries an installation request for the given generation.
+	expectInstallationRequest := func(mid uuid.UUID, generation int64) {
+		for i := 0; i < 3; i++ {
+			hbr, err := heartbeat(mid, nil)
+			if err != nil {
+				t.Fatalf("heartbeat: %v", err)
+			}
+			if hbr.InstallationRequest == nil || hbr.InstallationRequest.Generation != generation {
+				t.Fatalf("expected installation request for generation %d, got %+v", generation, hbr.InstallationRequest)
+			}
+		}
+	}
+
+	// Create machine with no OS installation request.
+	var machine model.Machine
+	err = sess.Transact(ctx, func(q *model.Queries) error {
+		machine, err = q.NewMachine(ctx)
+		if err != nil {
+			return err
+		}
+		err = q.MachineAddProvided(ctx, model.MachineAddProvidedParams{
+			MachineID:  machine.MachineID,
+			Provider:   model.ProviderEquinix,
+			ProviderID: "123",
+		})
+		if err != nil {
+			return err
+		}
+		return q.MachineSetAgentStarted(ctx, model.MachineSetAgentStartedParams{
+			MachineID:      machine.MachineID,
+			AgentStartedAt: time.Now(),
+			AgentPublicKey: pub,
+		})
+	})
+	if err != nil {
+		t.Fatalf("could not create machine: %v", err)
+	}
+
+	// Expect successful heartbeat, but no OS installation request.
+	hbr, err := heartbeat(machine.MachineID, nil)
+	if err != nil {
+		t.Fatalf("heartbeat: %v", err)
+	}
+	if hbr.InstallationRequest != nil {
+		t.Fatalf("expected no installation request")
+	}
+
+	// Now add an OS installation request tag, and expect it to be returned.
+	if err := setInstallationRequest(machine.MachineID, 123); err != nil {
+		t.Fatalf("could not add os installation request to machine: %v", err)
+	}
+	expectInstallationRequest(machine.MachineID, 123)
+
+	// Submit a report, expect no more request.
+	hbr, err = heartbeat(machine.MachineID, &apb.OSInstallationReport{Generation: 123})
+	if err != nil {
+		t.Fatalf("heartbeat: %v", err)
+	}
+	if hbr.InstallationRequest != nil {
+		t.Fatalf("expected no installation request")
+	}
+
+	// Submit a newer request, expect it to be returned.
+	if err := setInstallationRequest(machine.MachineID, 234); err != nil {
+		t.Fatalf("could not update installation request: %v", err)
+	}
+	expectInstallationRequest(machine.MachineID, 234)
+}
diff --git a/cloud/bmaas/server/api/agent.proto b/cloud/bmaas/server/api/agent.proto
index c08c767..0ed29c3 100644
--- a/cloud/bmaas/server/api/agent.proto
+++ b/cloud/bmaas/server/api/agent.proto
@@ -21,6 +21,14 @@
   // TODO(lorenz): implement
 }
 
+// OSInstallationReport is submitted from the agent to the BMDB server after
+// successful OS installation.
+message OSInstallationReport {
+  // generation must be set to the same value as 'generation' in the
+  // OSInstallationRequest which triggered the OS installation.
+  int64 generation = 1;
+}
+
 message AgentHeartbeatRequest {
   // MachineID that this agent represents. Technically not necessary since
   // keypairs between agents should be unique, but this provides an extra layer
@@ -29,8 +37,26 @@
   // Optional hardware report to be upserted for this machine. An agent should
   // submit one at least once after it's started, as early as it can.
   AgentHardwareReport hardware_report = 2;
+  // Optional installation report sent to be upserted to this machine. An agent
+  // should submit one after it successfully installed an operating system for
+  // a given OSInstallationRequest.
+  OSInstallationReport installation_report = 3;
+}
+
+// OSInstallationRequest is provided to the agent by the BMDB server, from
+// a corresponding BMDB tag, when an OS installation request is pending.
+message OSInstallationRequest {
+  // generation is the 'version' of the OS installation request, and will always
+  // be incremented within the BMDB when a new OS installation request is
+  // submitted. The agent must pipe this through to the OSInstallationReport to
+  // let the rest of the system know which OS installation request it actually
+  // fulfilled.
+  int64 generation = 1;
+  // TODO(lorenz): implement
 }
 
 message AgentHeartbeatResponse {
-  // Agent actions (like install, reboot, etc) go here.
+  // If set, the control plane is requesting the installation of an operating
+  // system.
+  OSInstallationRequest installation_request = 1;
 }
\ No newline at end of file