cloud/bmaas/bmdb/scruffy: initialize, implement BMDB metrics

This creates a new BMaaS component, Scruffy the Janitor.

Scruffy will run a bunch of housekeeping jobs that aren't tied to a
particular provider or even region. Currently Scruffy just collects BMDB
metrics by periodically polling the BMDB SQL database.

Change-Id: Icafa714811757eaaf31fed43184ded8512bde067
Reviewed-on: https://review.monogon.dev/c/monogon/+/1819
Tested-by: Jenkins CI
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
diff --git a/cloud/bmaas/bmdb/model/BUILD.bazel b/cloud/bmaas/bmdb/model/BUILD.bazel
index aea0d7b..bb00a08 100644
--- a/cloud/bmaas/bmdb/model/BUILD.bazel
+++ b/cloud/bmaas/bmdb/model/BUILD.bazel
@@ -8,6 +8,7 @@
     migrations = glob(["migrations/*sql"]),
     queries = [
         "queries_base.sql",
+        "queries_stats.sql",
         "queries_tags.sql",
         "queries_workflows.sql",
     ],
@@ -15,7 +16,10 @@
 
 go_library(
     name = "model",
-    srcs = ["migrations.go"],
+    srcs = [
+        "interfaces.go",
+        "migrations.go",
+    ],
     embed = [
         ":sqlc_model",  # keep
     ],
diff --git a/cloud/bmaas/bmdb/model/interfaces.go b/cloud/bmaas/bmdb/model/interfaces.go
new file mode 100644
index 0000000..9c8f601
--- /dev/null
+++ b/cloud/bmaas/bmdb/model/interfaces.go
@@ -0,0 +1,73 @@
+package model
+
+import "context"
+
+// MetricValue is a prometheus-style labeled numerical metric value. In other
+// words, it's a number accompanied by string key/value pairs.
+type MetricValue struct {
+	Count  int64
+	Labels map[string]string
+}
+
+// WrapSimpleMetric turns a SQL model function which returns a single number into
+// a function which returns one-length MetricValue list with no labels.
+func WrapSimpleMetric(fn func(*Queries, context.Context) (int64, error)) func(*Queries, context.Context) ([]MetricValue, error) {
+	return func(q *Queries, ctx context.Context) ([]MetricValue, error) {
+		v, err := fn(q, ctx)
+		if err != nil {
+			return nil, err
+		}
+		return []MetricValue{
+			{
+				Count:  v,
+				Labels: nil,
+			},
+		}, nil
+	}
+}
+
+// A SQLMetricRow is a row that is the result of some kind of SQL 'metric query'.
+// For each such query we define in our *.sql files, a corresponding
+// implementation exists here.
+type SQLMetricRow interface {
+	Value() MetricValue
+}
+
+// Value implements SQLMetricRow for a row of the result of the
+// CountActiveBackoffs SQL metric query.
+func (c CountActiveBackoffsRow) Value() MetricValue {
+	return MetricValue{
+		Count: c.Count,
+		Labels: map[string]string{
+			"process": string(c.Process),
+		},
+	}
+}
+
+// Value implements SQLMetricRow for a row of the result of the
+// CountActiveWork SQL metric query.
+func (c CountActiveWorkRow) Value() MetricValue {
+	return MetricValue{
+		Count: c.Count,
+		Labels: map[string]string{
+			"process": string(c.Process),
+		},
+	}
+}
+
+// WrapLabeledMetric turns a SQL model function which returns a list of rows
+// implementing SQLMetricRow into a function which returns a list of MetricValues
+// with labels corresponding to the data returned in the rows.
+func WrapLabeledMetric[M SQLMetricRow](fn func(*Queries, context.Context) ([]M, error)) func(*Queries, context.Context) ([]MetricValue, error) {
+	return func(q *Queries, ctx context.Context) ([]MetricValue, error) {
+		v, err := fn(q, ctx)
+		if err != nil {
+			return nil, err
+		}
+		res := make([]MetricValue, len(v))
+		for i, vv := range v {
+			res[i] = vv.Value()
+		}
+		return res, nil
+	}
+}
diff --git a/cloud/bmaas/bmdb/model/queries_stats.sql b/cloud/bmaas/bmdb/model/queries_stats.sql
new file mode 100644
index 0000000..d868a6c
--- /dev/null
+++ b/cloud/bmaas/bmdb/model/queries_stats.sql
@@ -0,0 +1,110 @@
+-- name: CountActiveWork :many
+-- Return number of active work, grouped by process.
+SELECT COUNT(*), work.process
+FROM work
+GROUP BY (work.process);
+
+-- name: CountActiveBackoffs :many
+-- Return number of active backoffs, grouped by process.
+SELECT COUNT(*), work_backoff.process
+FROM work_backoff
+GROUP BY (work_backoff.process);
+
+-- name: CountMachines :one
+SELECT COUNT(*)
+FROM machines;
+
+-- name: CountMachinesProvided :one
+SELECT COUNT(*)
+FROM machine_provided;
+
+-- name: CountMachinesAgentHeartbeating :one
+SELECT COUNT(*)
+FROM machines
+    INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
+    INNER JOIN machine_agent_heartbeat ON machines.machine_id = machine_agent_heartbeat.machine_id
+WHERE
+    now() < machine_agent_heartbeat.agent_heartbeat_at + interval '10 minute';
+
+-- name: CountMachinesInstallationPending :one
+SELECT COUNT(*)
+FROM machines
+    INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
+    INNER JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
+    LEFT JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
+WHERE
+        machine_os_installation_request.generation IS DISTINCT FROM machine_os_installation_report.generation;
+
+-- name: CountMachinesInstallationComplete :one
+SELECT COUNT(*)
+FROM machines
+         INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
+         INNER JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
+         INNER JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
+WHERE
+    machine_os_installation_request.generation IS NOT DISTINCT FROM machine_os_installation_report.generation;
+
+-- name: CountMachinesForAgentStart :one
+-- Return number of machines eligible for agent start.
+-- ONCHANGE(queries_workflows.sql): constraints must be kept in sync with GetMachinesForAgentStart.
+SELECT COUNT(machine_provided)
+FROM machines
+         INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
+         LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process IN ('ShepherdAccess', 'ShepherdAgentStart', 'ShepherdRecovery')
+         LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdAgentStart'
+         LEFT JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
+         LEFT JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
+         LEFT JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
+WHERE
+    work.machine_id IS NULL
+    AND work_backoff.machine_id IS NULL
+    AND machine_agent_started.machine_id IS NULL
+    -- If there is a pending installation request, it must not have been fulfilled already.
+    AND (
+        machine_os_installation_request.machine_id IS NULL
+        OR machine_os_installation_report.generation IS DISTINCT FROM machine_os_installation_request.generation
+    );
+
+-- name: CountMachinesForAgentRecovery :one
+-- Return number of machines eligible for agent recovery.
+-- ONCHANGE(queries_workflows.sql): constraints must be kept in sync with GetMachineForAgentRecovery.
+SELECT COUNT(machine_provided)
+FROM machines
+         INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
+         LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process IN ('ShepherdAccess', 'ShepherdAgentStart', 'ShepherdRecovery')
+         LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdRecovery'
+         INNER JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
+         LEFT JOIN machine_agent_heartbeat ON machines.machine_id = machine_agent_heartbeat.machine_id
+         LEFT JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
+         LEFT JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
+WHERE
+    work.machine_id IS NULL
+    AND work_backoff.machine_id IS NULL
+    -- Only act on machines where the agent is expected to have been started:
+    -- 1. If there is a pending installation request, it must not have been fulfilled already.
+    AND (
+        machine_os_installation_request.machine_id IS NULL
+        OR machine_os_installation_report.generation IS DISTINCT FROM machine_os_installation_request.generation
+    )
+    -- 2. The agent must have never heartbeat or must have stopped heartbeating.
+    AND (
+        -- No heartbeat 30 minutes after starting the agent.
+        ( machine_agent_heartbeat.machine_id IS NULL
+          AND now() > (machine_agent_started.agent_started_at + interval '30 minutes')
+        )
+        -- Heartbeats ceased for 10 minutes.
+        OR ( machine_agent_heartbeat.machine_id IS NOT NULL
+          AND now() > (machine_agent_heartbeat.agent_heartbeat_at + interval '10 minutes')
+        )
+    );
+
+-- name: ListMachineHardware :many
+SELECT
+    machine_provided.*,
+    machine_hardware_report.*
+FROM machines
+         INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
+         INNER JOIN machine_hardware_report ON machines.machine_id = machine_hardware_report.machine_id
+WHERE machines.machine_id > $1
+ORDER BY machines.machine_id ASC
+LIMIT $2;
diff --git a/cloud/bmaas/bmdb/model/queries_workflows.sql b/cloud/bmaas/bmdb/model/queries_workflows.sql
index 62e4a40..854c15d 100644
--- a/cloud/bmaas/bmdb/model/queries_workflows.sql
+++ b/cloud/bmaas/bmdb/model/queries_workflows.sql
@@ -8,6 +8,7 @@
 -- name: GetMachinesForAgentStart :many
 -- Get machines that need agent started for the first time. Machine can be
 -- assumed to be 'new', with no previous attempts or failures.
+-- ONCHANGE(queries_stats.sql): constraints must be kept in sync with StatsMachinesForAgentStart.
 SELECT
     machine_provided.*
 FROM machines
@@ -34,6 +35,7 @@
 -- some point, or the machine got rebooted or somehow else lost the agent. Assume
 -- some work needs to be performed on the shepherd side to diagnose and recover
 -- whatever state the machine truly is in.
+-- ONCHANGE(queries_stats.sql): constraints must be kept in sync with StatsMachinesForAgentRecovery.
 SELECT
     machine_provided.*
 FROM machines