cloud/bmaas: do not start/recover agent on installed machines
This prevents us from perpetually restarting machines (or otherwise
attempting to get the agent running) on machines that have a 'fulfilled'
installation request, ie. machines with a machine generation in their
installation request/report tags.
The recovery flow will still kick in when a new installation request
comes in after a machine has been sucesffuly installed, and that will
still cause a reboot loop (as even rebooting the machine does not allow
us to start an agent anymore if it has been installed). But we'll fix
that later.
Change-Id: I49bb81be5d35ef600e18021bdf98bb207bb8d5a7
Reviewed-on: https://review.monogon.dev/c/monogon/+/1672
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
Tested-by: Jenkins CI
diff --git a/cloud/bmaas/bmdb/model/queries_workflows.sql b/cloud/bmaas/bmdb/model/queries_workflows.sql
index 07ebf77..62e4a40 100644
--- a/cloud/bmaas/bmdb/model/queries_workflows.sql
+++ b/cloud/bmaas/bmdb/model/queries_workflows.sql
@@ -15,10 +15,15 @@
LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process IN ('ShepherdAccess', 'ShepherdAgentStart', 'ShepherdRecovery')
LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdAgentStart'
LEFT JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
+LEFT JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
+LEFT JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
WHERE
machine_agent_started.machine_id IS NULL
- -- TODO(q3k): exclude machines which are not expected to run the agent (eg.
- -- are already exposed to a user).
+ -- Do not start on machines that have a fulfilled OS installation request.
+ AND (
+ machine_os_installation_request.machine_id IS NULL
+ OR machine_os_installation_request.generation IS DISTINCT FROM machine_os_installation_report.generation
+ )
AND work.machine_id IS NULL
AND work_backoff.machine_id IS NULL
LIMIT $1;
@@ -35,11 +40,16 @@
INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process IN ('ShepherdAccess', 'ShepherdAgentStart', 'ShepherdRecovery')
LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdRecovery'
-LEFT JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
+INNER JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
LEFT JOIN machine_agent_heartbeat ON machines.machine_id = machine_agent_heartbeat.machine_id
+LEFT JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
+LEFT JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
WHERE
- -- Only act on machines where the agent is expected to have been started.
- machine_agent_started.machine_id IS NOT NULL
+ -- Do not recover machines that have a fulfilled OS installation request.
+ (
+ machine_os_installation_request.machine_id IS NULL
+ OR machine_os_installation_request.generation IS DISTINCT FROM machine_os_installation_report.generation
+ )
AND (
-- No heartbeat 30 minutes after starting the agent.
(
diff --git a/cloud/bmaas/bmdb/queries_test.go b/cloud/bmaas/bmdb/queries_test.go
index 8490930..facc54f 100644
--- a/cloud/bmaas/bmdb/queries_test.go
+++ b/cloud/bmaas/bmdb/queries_test.go
@@ -6,6 +6,8 @@
"testing"
"time"
+ "github.com/google/uuid"
+
"source.monogon.dev/cloud/bmaas/bmdb/model"
)
@@ -37,13 +39,14 @@
// It should be, by default, not a candidate for agent start as it's not yet
// provided by any provider.
- expectNoCandidates := func() {
+ expectCandidates := func(want int) {
+ t.Helper()
if err := session.Transact(ctx, func(q *model.Queries) error {
candidates, err := q.GetMachinesForAgentStart(ctx, 1)
if err != nil {
t.Fatalf("Could not retrieve machines for agent start: %v", err)
}
- if want, got := 0, len(candidates); want != got {
+ if got := len(candidates); want != got {
t.Fatalf("Wanted %d machines for agent start, got %+v", want, candidates)
}
return nil
@@ -51,7 +54,6 @@
t.Fatal(err)
}
}
- expectNoCandidates()
// Provide machine, and check it is now a candidate.
if err := session.Transact(ctx, func(q *model.Queries) error {
@@ -63,21 +65,7 @@
}); err != nil {
t.Fatalf("could not add provided tag to machine: %v", err)
}
- if err := session.Transact(ctx, func(q *model.Queries) error {
- candidates, err := q.GetMachinesForAgentStart(ctx, 1)
- if err != nil {
- t.Fatalf("Could not retrieve machines for agent start: %v", err)
- }
- if want, got := 1, len(candidates); want != got {
- t.Fatalf("Wanted %d machines for agent start, got %+v", want, candidates)
- }
- if want, got := machine.MachineID, candidates[0].MachineID; want != got {
- t.Fatalf("Wanted %s for agent start, got %s", want, got)
- }
- return nil
- }); err != nil {
- t.Fatal(err)
- }
+ expectCandidates(1)
// Add a start tag. Machine shouldn't be a candidate anymore.
if err := session.Transact(ctx, func(q *model.Queries) error {
@@ -89,7 +77,63 @@
}); err != nil {
t.Fatalf("could not add provided tag to machine: %v", err)
}
- expectNoCandidates()
+ expectCandidates(0)
+
+ // Add a new machine which has an unfulfilled installation request. It should be
+ // a candidate.
+ if err = session.Transact(ctx, func(q *model.Queries) error {
+ machine, err = q.NewMachine(ctx)
+ if err != nil {
+ return err
+ }
+ if err := q.MachineAddProvided(ctx, model.MachineAddProvidedParams{
+ MachineID: machine.MachineID,
+ Provider: model.ProviderEquinix,
+ ProviderID: "234",
+ }); err != nil {
+ return err
+ }
+ if err := q.MachineSetOSInstallationRequest(ctx, model.MachineSetOSInstallationRequestParams{
+ MachineID: machine.MachineID,
+ Generation: 10,
+ }); err != nil {
+ return err
+ }
+ return nil
+ }); err != nil {
+ t.Fatalf("could not add new machine with installation request: %v", err)
+ }
+ expectCandidates(1)
+
+ // Fulfill installation request on machine with an older generation. it should
+ // remain a candidate.
+ if err = session.Transact(ctx, func(q *model.Queries) error {
+ if err := q.MachineSetOSInstallationReport(ctx, model.MachineSetOSInstallationReportParams{
+ MachineID: machine.MachineID,
+ Generation: 9,
+ }); err != nil {
+ return err
+ }
+ return nil
+ }); err != nil {
+ t.Fatalf("could not fulfill installation request with older generation: %v", err)
+ }
+ expectCandidates(1)
+
+ // Fulfill installation request with correct generation. The machine should not
+ // be a candidate anymore.
+ if err = session.Transact(ctx, func(q *model.Queries) error {
+ if err := q.MachineSetOSInstallationReport(ctx, model.MachineSetOSInstallationReportParams{
+ MachineID: machine.MachineID,
+ Generation: 10,
+ }); err != nil {
+ return err
+ }
+ return nil
+ }); err != nil {
+ t.Fatalf("could not fulfill installation request with current generation: %v", err)
+ }
+ expectCandidates(0)
}
// TestAgentRecovery exercises GetMachinesForAgentRecovery though a few
@@ -117,24 +161,40 @@
started time.Time
// heartbeat will add a AgentHeartbeat tag for a given time, if set.
heartbeat time.Time
+ // requestGeneration will populate a OSInstallationRequest if not zero.
+ requestGeneration int64
+ // requestGeneration will populate a OSInstallationResponse if not zero.
+ reportGeneration int64
}{
// No start, no heartbeat -> no recovery expected.
- {false, time.Time{}, time.Time{}},
+ {false, time.Time{}, time.Time{}, 0, 0},
// Started recently, no heartbeat -> no recovery expected.
- {false, time.Now(), time.Time{}},
+ {false, time.Now(), time.Time{}, 0, 0},
// Started a while ago, heartbeat active -> no recovery expected.
- {false, time.Now().Add(-40 * time.Minute), time.Now()},
+ {false, time.Now().Add(-40 * time.Minute), time.Now(), 0, 0},
// Started a while ago, no heartbeat -> recovery expected.
- {true, time.Now().Add(-40 * time.Minute), time.Time{}},
+ {true, time.Now().Add(-40 * time.Minute), time.Time{}, 0, 0},
// Started a while ago, no recent heartbeat -> recovery expected.
- {true, time.Now().Add(-40 * time.Minute), time.Now().Add(-20 * time.Minute)},
+ {true, time.Now().Add(-40 * time.Minute), time.Now().Add(-20 * time.Minute), 0, 0},
+
+ // Installation request without report -> recovery expected.
+ {true, time.Now().Add(-40 * time.Minute), time.Time{}, 10, 0},
+ {true, time.Now().Add(-40 * time.Minute), time.Now().Add(-20 * time.Minute), 10, 0},
+ // Installation request mismatching report -> recovery expected.
+ {true, time.Now().Add(-40 * time.Minute), time.Time{}, 10, 9},
+ {true, time.Now().Add(-40 * time.Minute), time.Now().Add(-20 * time.Minute), 10, 9},
+ // Installation request matching report -> no recovery expected.
+ {false, time.Now().Add(-40 * time.Minute), time.Time{}, 10, 10},
+ {false, time.Now().Add(-40 * time.Minute), time.Now().Add(-20 * time.Minute), 10, 10},
} {
+ var machineID uuid.UUID
if err := session.Transact(ctx, func(q *model.Queries) error {
machine, err := q.NewMachine(ctx)
if err != nil {
return fmt.Errorf("NewMachine: %w", err)
}
+ machineID = machine.MachineID
if err := q.MachineAddProvided(ctx, model.MachineAddProvidedParams{
MachineID: machine.MachineID,
Provider: model.ProviderEquinix,
@@ -159,26 +219,49 @@
return fmt.Errorf("MachineSetAgentHeartbeat: %w", err)
}
}
+ if scenario.requestGeneration != 0 {
+ if err := q.MachineSetOSInstallationRequest(ctx, model.MachineSetOSInstallationRequestParams{
+ MachineID: machine.MachineID,
+ Generation: scenario.requestGeneration,
+ }); err != nil {
+ return fmt.Errorf("MachineSetOSInstallationRequest: %w", err)
+ }
+ }
+ if scenario.reportGeneration != 0 {
+ if err := q.MachineSetOSInstallationReport(ctx, model.MachineSetOSInstallationReportParams{
+ MachineID: machine.MachineID,
+ Generation: scenario.reportGeneration,
+ }); err != nil {
+ return fmt.Errorf("MachineSetOSInstallationReport: %w", err)
+ }
+ }
return nil
}); err != nil {
t.Errorf("%d: setup failed: %v", i, err)
continue
}
+ found := false
if err := session.Transact(ctx, func(q *model.Queries) error {
- candidates, err := q.GetMachineForAgentRecovery(ctx, 1)
+ candidates, err := q.GetMachineForAgentRecovery(ctx, 100)
if err != nil {
return fmt.Errorf("GetMachinesForAgentRecovery: %w", err)
}
- if scenario.wantRun && len(candidates) == 0 {
- return fmt.Errorf("machine unscheduled for recovery")
- }
- if !scenario.wantRun && len(candidates) != 0 {
- return fmt.Errorf("machine scheduled for recovery")
+ for _, c := range candidates {
+ if c.MachineID == machineID {
+ found = true
+ break
+ }
}
return nil
}); err != nil {
- t.Errorf("%d: test failed: %v", i, err)
+ t.Errorf("%d: failed to retrieve candidates: %v", i, err)
+ }
+ if scenario.wantRun && !found {
+ t.Errorf("%d: expected recovery but not scheduled", i)
+ }
+ if !scenario.wantRun && found {
+ t.Errorf("%d: expected no recovery but is scheduled", i)
}
}
}