cloud/bmaas: do not start/recover agent on installed machines

This prevents us from perpetually restarting machines (or otherwise
attempting to get the agent running) on machines that have a 'fulfilled'
installation request, ie. machines with a machine generation in their
installation request/report tags.

The recovery flow will still kick in when a new installation request
comes in after a machine has been sucesffuly installed, and that will
still cause a reboot loop (as even rebooting the machine does not allow
us to start an agent anymore if it has been installed). But we'll fix
that later.

Change-Id: I49bb81be5d35ef600e18021bdf98bb207bb8d5a7
Reviewed-on: https://review.monogon.dev/c/monogon/+/1672
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
Tested-by: Jenkins CI
diff --git a/cloud/bmaas/bmdb/queries_test.go b/cloud/bmaas/bmdb/queries_test.go
index 8490930..facc54f 100644
--- a/cloud/bmaas/bmdb/queries_test.go
+++ b/cloud/bmaas/bmdb/queries_test.go
@@ -6,6 +6,8 @@
 	"testing"
 	"time"
 
+	"github.com/google/uuid"
+
 	"source.monogon.dev/cloud/bmaas/bmdb/model"
 )
 
@@ -37,13 +39,14 @@
 
 	// It should be, by default, not a candidate for agent start as it's not yet
 	// provided by any provider.
-	expectNoCandidates := func() {
+	expectCandidates := func(want int) {
+		t.Helper()
 		if err := session.Transact(ctx, func(q *model.Queries) error {
 			candidates, err := q.GetMachinesForAgentStart(ctx, 1)
 			if err != nil {
 				t.Fatalf("Could not retrieve machines for agent start: %v", err)
 			}
-			if want, got := 0, len(candidates); want != got {
+			if got := len(candidates); want != got {
 				t.Fatalf("Wanted %d machines for agent start, got %+v", want, candidates)
 			}
 			return nil
@@ -51,7 +54,6 @@
 			t.Fatal(err)
 		}
 	}
-	expectNoCandidates()
 
 	// Provide machine, and check it is now a candidate.
 	if err := session.Transact(ctx, func(q *model.Queries) error {
@@ -63,21 +65,7 @@
 	}); err != nil {
 		t.Fatalf("could not add provided tag to machine: %v", err)
 	}
-	if err := session.Transact(ctx, func(q *model.Queries) error {
-		candidates, err := q.GetMachinesForAgentStart(ctx, 1)
-		if err != nil {
-			t.Fatalf("Could not retrieve machines for agent start: %v", err)
-		}
-		if want, got := 1, len(candidates); want != got {
-			t.Fatalf("Wanted %d machines for agent start, got %+v", want, candidates)
-		}
-		if want, got := machine.MachineID, candidates[0].MachineID; want != got {
-			t.Fatalf("Wanted %s for agent start, got %s", want, got)
-		}
-		return nil
-	}); err != nil {
-		t.Fatal(err)
-	}
+	expectCandidates(1)
 
 	// Add a start tag. Machine shouldn't be a candidate anymore.
 	if err := session.Transact(ctx, func(q *model.Queries) error {
@@ -89,7 +77,63 @@
 	}); err != nil {
 		t.Fatalf("could not add provided tag to machine: %v", err)
 	}
-	expectNoCandidates()
+	expectCandidates(0)
+
+	// Add a new machine which has an unfulfilled installation request. It should be
+	// a candidate.
+	if err = session.Transact(ctx, func(q *model.Queries) error {
+		machine, err = q.NewMachine(ctx)
+		if err != nil {
+			return err
+		}
+		if err := q.MachineAddProvided(ctx, model.MachineAddProvidedParams{
+			MachineID:  machine.MachineID,
+			Provider:   model.ProviderEquinix,
+			ProviderID: "234",
+		}); err != nil {
+			return err
+		}
+		if err := q.MachineSetOSInstallationRequest(ctx, model.MachineSetOSInstallationRequestParams{
+			MachineID:  machine.MachineID,
+			Generation: 10,
+		}); err != nil {
+			return err
+		}
+		return nil
+	}); err != nil {
+		t.Fatalf("could not add new machine with installation request: %v", err)
+	}
+	expectCandidates(1)
+
+	// Fulfill installation request on machine with an older generation. it should
+	// remain a candidate.
+	if err = session.Transact(ctx, func(q *model.Queries) error {
+		if err := q.MachineSetOSInstallationReport(ctx, model.MachineSetOSInstallationReportParams{
+			MachineID:  machine.MachineID,
+			Generation: 9,
+		}); err != nil {
+			return err
+		}
+		return nil
+	}); err != nil {
+		t.Fatalf("could not fulfill installation request with older generation: %v", err)
+	}
+	expectCandidates(1)
+
+	// Fulfill installation request with correct generation. The machine should not
+	// be a candidate anymore.
+	if err = session.Transact(ctx, func(q *model.Queries) error {
+		if err := q.MachineSetOSInstallationReport(ctx, model.MachineSetOSInstallationReportParams{
+			MachineID:  machine.MachineID,
+			Generation: 10,
+		}); err != nil {
+			return err
+		}
+		return nil
+	}); err != nil {
+		t.Fatalf("could not fulfill installation request with current generation: %v", err)
+	}
+	expectCandidates(0)
 }
 
 // TestAgentRecovery exercises GetMachinesForAgentRecovery though a few
@@ -117,24 +161,40 @@
 		started time.Time
 		// heartbeat will add a AgentHeartbeat tag for a given time, if set.
 		heartbeat time.Time
+		// requestGeneration will populate a OSInstallationRequest if not zero.
+		requestGeneration int64
+		// requestGeneration will populate a OSInstallationResponse if not zero.
+		reportGeneration int64
 	}{
 		// No start, no heartbeat -> no recovery expected.
-		{false, time.Time{}, time.Time{}},
+		{false, time.Time{}, time.Time{}, 0, 0},
 		// Started recently, no heartbeat -> no recovery expected.
-		{false, time.Now(), time.Time{}},
+		{false, time.Now(), time.Time{}, 0, 0},
 		// Started a while ago, heartbeat active -> no recovery expected.
-		{false, time.Now().Add(-40 * time.Minute), time.Now()},
+		{false, time.Now().Add(-40 * time.Minute), time.Now(), 0, 0},
 
 		// Started a while ago, no heartbeat -> recovery expected.
-		{true, time.Now().Add(-40 * time.Minute), time.Time{}},
+		{true, time.Now().Add(-40 * time.Minute), time.Time{}, 0, 0},
 		// Started a while ago, no recent heartbeat -> recovery expected.
-		{true, time.Now().Add(-40 * time.Minute), time.Now().Add(-20 * time.Minute)},
+		{true, time.Now().Add(-40 * time.Minute), time.Now().Add(-20 * time.Minute), 0, 0},
+
+		// Installation request without report -> recovery expected.
+		{true, time.Now().Add(-40 * time.Minute), time.Time{}, 10, 0},
+		{true, time.Now().Add(-40 * time.Minute), time.Now().Add(-20 * time.Minute), 10, 0},
+		// Installation request mismatching report -> recovery expected.
+		{true, time.Now().Add(-40 * time.Minute), time.Time{}, 10, 9},
+		{true, time.Now().Add(-40 * time.Minute), time.Now().Add(-20 * time.Minute), 10, 9},
+		// Installation request matching report -> no recovery expected.
+		{false, time.Now().Add(-40 * time.Minute), time.Time{}, 10, 10},
+		{false, time.Now().Add(-40 * time.Minute), time.Now().Add(-20 * time.Minute), 10, 10},
 	} {
+		var machineID uuid.UUID
 		if err := session.Transact(ctx, func(q *model.Queries) error {
 			machine, err := q.NewMachine(ctx)
 			if err != nil {
 				return fmt.Errorf("NewMachine: %w", err)
 			}
+			machineID = machine.MachineID
 			if err := q.MachineAddProvided(ctx, model.MachineAddProvidedParams{
 				MachineID:  machine.MachineID,
 				Provider:   model.ProviderEquinix,
@@ -159,26 +219,49 @@
 					return fmt.Errorf("MachineSetAgentHeartbeat: %w", err)
 				}
 			}
+			if scenario.requestGeneration != 0 {
+				if err := q.MachineSetOSInstallationRequest(ctx, model.MachineSetOSInstallationRequestParams{
+					MachineID:  machine.MachineID,
+					Generation: scenario.requestGeneration,
+				}); err != nil {
+					return fmt.Errorf("MachineSetOSInstallationRequest: %w", err)
+				}
+			}
+			if scenario.reportGeneration != 0 {
+				if err := q.MachineSetOSInstallationReport(ctx, model.MachineSetOSInstallationReportParams{
+					MachineID:  machine.MachineID,
+					Generation: scenario.reportGeneration,
+				}); err != nil {
+					return fmt.Errorf("MachineSetOSInstallationReport: %w", err)
+				}
+			}
 			return nil
 		}); err != nil {
 			t.Errorf("%d: setup failed: %v", i, err)
 			continue
 		}
 
+		found := false
 		if err := session.Transact(ctx, func(q *model.Queries) error {
-			candidates, err := q.GetMachineForAgentRecovery(ctx, 1)
+			candidates, err := q.GetMachineForAgentRecovery(ctx, 100)
 			if err != nil {
 				return fmt.Errorf("GetMachinesForAgentRecovery: %w", err)
 			}
-			if scenario.wantRun && len(candidates) == 0 {
-				return fmt.Errorf("machine unscheduled for recovery")
-			}
-			if !scenario.wantRun && len(candidates) != 0 {
-				return fmt.Errorf("machine scheduled for recovery")
+			for _, c := range candidates {
+				if c.MachineID == machineID {
+					found = true
+					break
+				}
 			}
 			return nil
 		}); err != nil {
-			t.Errorf("%d: test failed: %v", i, err)
+			t.Errorf("%d: failed to retrieve candidates: %v", i, err)
+		}
+		if scenario.wantRun && !found {
+			t.Errorf("%d: expected recovery but not scheduled", i)
+		}
+		if !scenario.wantRun && found {
+			t.Errorf("%d: expected no recovery but is scheduled", i)
 		}
 	}
 }