cloud/bmaas/bmdb/scruffy: initialize, implement BMDB metrics
This creates a new BMaaS component, Scruffy the Janitor.
Scruffy will run a bunch of housekeeping jobs that aren't tied to a
particular provider or even region. Currently Scruffy just collects BMDB
metrics by periodically polling the BMDB SQL database.
Change-Id: Icafa714811757eaaf31fed43184ded8512bde067
Reviewed-on: https://review.monogon.dev/c/monogon/+/1819
Tested-by: Jenkins CI
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
diff --git a/cloud/bmaas/bmdb/model/queries_stats.sql b/cloud/bmaas/bmdb/model/queries_stats.sql
new file mode 100644
index 0000000..d868a6c
--- /dev/null
+++ b/cloud/bmaas/bmdb/model/queries_stats.sql
@@ -0,0 +1,110 @@
+-- name: CountActiveWork :many
+-- Return number of active work, grouped by process.
+SELECT COUNT(*), work.process
+FROM work
+GROUP BY (work.process);
+
+-- name: CountActiveBackoffs :many
+-- Return number of active backoffs, grouped by process.
+SELECT COUNT(*), work_backoff.process
+FROM work_backoff
+GROUP BY (work_backoff.process);
+
+-- name: CountMachines :one
+SELECT COUNT(*)
+FROM machines;
+
+-- name: CountMachinesProvided :one
+SELECT COUNT(*)
+FROM machine_provided;
+
+-- name: CountMachinesAgentHeartbeating :one
+SELECT COUNT(*)
+FROM machines
+ INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
+ INNER JOIN machine_agent_heartbeat ON machines.machine_id = machine_agent_heartbeat.machine_id
+WHERE
+ now() < machine_agent_heartbeat.agent_heartbeat_at + interval '10 minute';
+
+-- name: CountMachinesInstallationPending :one
+SELECT COUNT(*)
+FROM machines
+ INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
+ INNER JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
+ LEFT JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
+WHERE
+ machine_os_installation_request.generation IS DISTINCT FROM machine_os_installation_report.generation;
+
+-- name: CountMachinesInstallationComplete :one
+SELECT COUNT(*)
+FROM machines
+ INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
+ INNER JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
+ INNER JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
+WHERE
+ machine_os_installation_request.generation IS NOT DISTINCT FROM machine_os_installation_report.generation;
+
+-- name: CountMachinesForAgentStart :one
+-- Return number of machines eligible for agent start.
+-- ONCHANGE(queries_workflows.sql): constraints must be kept in sync with GetMachinesForAgentStart.
+SELECT COUNT(machine_provided)
+FROM machines
+ INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
+ LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process IN ('ShepherdAccess', 'ShepherdAgentStart', 'ShepherdRecovery')
+ LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdAgentStart'
+ LEFT JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
+ LEFT JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
+ LEFT JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
+WHERE
+ work.machine_id IS NULL
+ AND work_backoff.machine_id IS NULL
+ AND machine_agent_started.machine_id IS NULL
+ -- If there is a pending installation request, it must not have been fulfilled already.
+ AND (
+ machine_os_installation_request.machine_id IS NULL
+ OR machine_os_installation_report.generation IS DISTINCT FROM machine_os_installation_request.generation
+ );
+
+-- name: CountMachinesForAgentRecovery :one
+-- Return number of machines eligible for agent recovery.
+-- ONCHANGE(queries_workflows.sql): constraints must be kept in sync with GetMachineForAgentRecovery.
+SELECT COUNT(machine_provided)
+FROM machines
+ INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
+ LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process IN ('ShepherdAccess', 'ShepherdAgentStart', 'ShepherdRecovery')
+ LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdRecovery'
+ INNER JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
+ LEFT JOIN machine_agent_heartbeat ON machines.machine_id = machine_agent_heartbeat.machine_id
+ LEFT JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
+ LEFT JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
+WHERE
+ work.machine_id IS NULL
+ AND work_backoff.machine_id IS NULL
+ -- Only act on machines where the agent is expected to have been started:
+ -- 1. If there is a pending installation request, it must not have been fulfilled already.
+ AND (
+ machine_os_installation_request.machine_id IS NULL
+ OR machine_os_installation_report.generation IS DISTINCT FROM machine_os_installation_request.generation
+ )
+ -- 2. The agent must have never heartbeat or must have stopped heartbeating.
+ AND (
+ -- No heartbeat 30 minutes after starting the agent.
+ ( machine_agent_heartbeat.machine_id IS NULL
+ AND now() > (machine_agent_started.agent_started_at + interval '30 minutes')
+ )
+ -- Heartbeats ceased for 10 minutes.
+ OR ( machine_agent_heartbeat.machine_id IS NOT NULL
+ AND now() > (machine_agent_heartbeat.agent_heartbeat_at + interval '10 minutes')
+ )
+ );
+
+-- name: ListMachineHardware :many
+SELECT
+ machine_provided.*,
+ machine_hardware_report.*
+FROM machines
+ INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
+ INNER JOIN machine_hardware_report ON machines.machine_id = machine_hardware_report.machine_id
+WHERE machines.machine_id > $1
+ORDER BY machines.machine_id ASC
+LIMIT $2;