cloud/bmaas: do not start/recover agent on installed machines
This prevents us from perpetually restarting machines (or otherwise
attempting to get the agent running) on machines that have a 'fulfilled'
installation request, ie. machines with a machine generation in their
installation request/report tags.
The recovery flow will still kick in when a new installation request
comes in after a machine has been sucesffuly installed, and that will
still cause a reboot loop (as even rebooting the machine does not allow
us to start an agent anymore if it has been installed). But we'll fix
that later.
Change-Id: I49bb81be5d35ef600e18021bdf98bb207bb8d5a7
Reviewed-on: https://review.monogon.dev/c/monogon/+/1672
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
Tested-by: Jenkins CI
diff --git a/cloud/bmaas/bmdb/model/queries_workflows.sql b/cloud/bmaas/bmdb/model/queries_workflows.sql
index 07ebf77..62e4a40 100644
--- a/cloud/bmaas/bmdb/model/queries_workflows.sql
+++ b/cloud/bmaas/bmdb/model/queries_workflows.sql
@@ -15,10 +15,15 @@
LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process IN ('ShepherdAccess', 'ShepherdAgentStart', 'ShepherdRecovery')
LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdAgentStart'
LEFT JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
+LEFT JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
+LEFT JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
WHERE
machine_agent_started.machine_id IS NULL
- -- TODO(q3k): exclude machines which are not expected to run the agent (eg.
- -- are already exposed to a user).
+ -- Do not start on machines that have a fulfilled OS installation request.
+ AND (
+ machine_os_installation_request.machine_id IS NULL
+ OR machine_os_installation_request.generation IS DISTINCT FROM machine_os_installation_report.generation
+ )
AND work.machine_id IS NULL
AND work_backoff.machine_id IS NULL
LIMIT $1;
@@ -35,11 +40,16 @@
INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process IN ('ShepherdAccess', 'ShepherdAgentStart', 'ShepherdRecovery')
LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdRecovery'
-LEFT JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
+INNER JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
LEFT JOIN machine_agent_heartbeat ON machines.machine_id = machine_agent_heartbeat.machine_id
+LEFT JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
+LEFT JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
WHERE
- -- Only act on machines where the agent is expected to have been started.
- machine_agent_started.machine_id IS NOT NULL
+ -- Do not recover machines that have a fulfilled OS installation request.
+ (
+ machine_os_installation_request.machine_id IS NULL
+ OR machine_os_installation_request.generation IS DISTINCT FROM machine_os_installation_report.generation
+ )
AND (
-- No heartbeat 30 minutes after starting the agent.
(