cloud/bmaas/bmdb/scruffy: initialize, implement BMDB metrics
This creates a new BMaaS component, Scruffy the Janitor.
Scruffy will run a bunch of housekeeping jobs that aren't tied to a
particular provider or even region. Currently Scruffy just collects BMDB
metrics by periodically polling the BMDB SQL database.
Change-Id: Icafa714811757eaaf31fed43184ded8512bde067
Reviewed-on: https://review.monogon.dev/c/monogon/+/1819
Tested-by: Jenkins CI
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
diff --git a/cloud/bmaas/bmdb/model/BUILD.bazel b/cloud/bmaas/bmdb/model/BUILD.bazel
index aea0d7b..bb00a08 100644
--- a/cloud/bmaas/bmdb/model/BUILD.bazel
+++ b/cloud/bmaas/bmdb/model/BUILD.bazel
@@ -8,6 +8,7 @@
migrations = glob(["migrations/*sql"]),
queries = [
"queries_base.sql",
+ "queries_stats.sql",
"queries_tags.sql",
"queries_workflows.sql",
],
@@ -15,7 +16,10 @@
go_library(
name = "model",
- srcs = ["migrations.go"],
+ srcs = [
+ "interfaces.go",
+ "migrations.go",
+ ],
embed = [
":sqlc_model", # keep
],
diff --git a/cloud/bmaas/bmdb/model/interfaces.go b/cloud/bmaas/bmdb/model/interfaces.go
new file mode 100644
index 0000000..9c8f601
--- /dev/null
+++ b/cloud/bmaas/bmdb/model/interfaces.go
@@ -0,0 +1,73 @@
+package model
+
+import "context"
+
+// MetricValue is a prometheus-style labeled numerical metric value. In other
+// words, it's a number accompanied by string key/value pairs.
+type MetricValue struct {
+ Count int64
+ Labels map[string]string
+}
+
+// WrapSimpleMetric turns a SQL model function which returns a single number into
+// a function which returns one-length MetricValue list with no labels.
+func WrapSimpleMetric(fn func(*Queries, context.Context) (int64, error)) func(*Queries, context.Context) ([]MetricValue, error) {
+ return func(q *Queries, ctx context.Context) ([]MetricValue, error) {
+ v, err := fn(q, ctx)
+ if err != nil {
+ return nil, err
+ }
+ return []MetricValue{
+ {
+ Count: v,
+ Labels: nil,
+ },
+ }, nil
+ }
+}
+
+// A SQLMetricRow is a row that is the result of some kind of SQL 'metric query'.
+// For each such query we define in our *.sql files, a corresponding
+// implementation exists here.
+type SQLMetricRow interface {
+ Value() MetricValue
+}
+
+// Value implements SQLMetricRow for a row of the result of the
+// CountActiveBackoffs SQL metric query.
+func (c CountActiveBackoffsRow) Value() MetricValue {
+ return MetricValue{
+ Count: c.Count,
+ Labels: map[string]string{
+ "process": string(c.Process),
+ },
+ }
+}
+
+// Value implements SQLMetricRow for a row of the result of the
+// CountActiveWork SQL metric query.
+func (c CountActiveWorkRow) Value() MetricValue {
+ return MetricValue{
+ Count: c.Count,
+ Labels: map[string]string{
+ "process": string(c.Process),
+ },
+ }
+}
+
+// WrapLabeledMetric turns a SQL model function which returns a list of rows
+// implementing SQLMetricRow into a function which returns a list of MetricValues
+// with labels corresponding to the data returned in the rows.
+func WrapLabeledMetric[M SQLMetricRow](fn func(*Queries, context.Context) ([]M, error)) func(*Queries, context.Context) ([]MetricValue, error) {
+ return func(q *Queries, ctx context.Context) ([]MetricValue, error) {
+ v, err := fn(q, ctx)
+ if err != nil {
+ return nil, err
+ }
+ res := make([]MetricValue, len(v))
+ for i, vv := range v {
+ res[i] = vv.Value()
+ }
+ return res, nil
+ }
+}
diff --git a/cloud/bmaas/bmdb/model/queries_stats.sql b/cloud/bmaas/bmdb/model/queries_stats.sql
new file mode 100644
index 0000000..d868a6c
--- /dev/null
+++ b/cloud/bmaas/bmdb/model/queries_stats.sql
@@ -0,0 +1,110 @@
+-- name: CountActiveWork :many
+-- Return number of active work, grouped by process.
+SELECT COUNT(*), work.process
+FROM work
+GROUP BY (work.process);
+
+-- name: CountActiveBackoffs :many
+-- Return number of active backoffs, grouped by process.
+SELECT COUNT(*), work_backoff.process
+FROM work_backoff
+GROUP BY (work_backoff.process);
+
+-- name: CountMachines :one
+SELECT COUNT(*)
+FROM machines;
+
+-- name: CountMachinesProvided :one
+SELECT COUNT(*)
+FROM machine_provided;
+
+-- name: CountMachinesAgentHeartbeating :one
+SELECT COUNT(*)
+FROM machines
+ INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
+ INNER JOIN machine_agent_heartbeat ON machines.machine_id = machine_agent_heartbeat.machine_id
+WHERE
+ now() < machine_agent_heartbeat.agent_heartbeat_at + interval '10 minute';
+
+-- name: CountMachinesInstallationPending :one
+SELECT COUNT(*)
+FROM machines
+ INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
+ INNER JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
+ LEFT JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
+WHERE
+ machine_os_installation_request.generation IS DISTINCT FROM machine_os_installation_report.generation;
+
+-- name: CountMachinesInstallationComplete :one
+SELECT COUNT(*)
+FROM machines
+ INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
+ INNER JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
+ INNER JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
+WHERE
+ machine_os_installation_request.generation IS NOT DISTINCT FROM machine_os_installation_report.generation;
+
+-- name: CountMachinesForAgentStart :one
+-- Return number of machines eligible for agent start.
+-- ONCHANGE(queries_workflows.sql): constraints must be kept in sync with GetMachinesForAgentStart.
+SELECT COUNT(machine_provided)
+FROM machines
+ INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
+ LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process IN ('ShepherdAccess', 'ShepherdAgentStart', 'ShepherdRecovery')
+ LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdAgentStart'
+ LEFT JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
+ LEFT JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
+ LEFT JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
+WHERE
+ work.machine_id IS NULL
+ AND work_backoff.machine_id IS NULL
+ AND machine_agent_started.machine_id IS NULL
+ -- If there is a pending installation request, it must not have been fulfilled already.
+ AND (
+ machine_os_installation_request.machine_id IS NULL
+ OR machine_os_installation_report.generation IS DISTINCT FROM machine_os_installation_request.generation
+ );
+
+-- name: CountMachinesForAgentRecovery :one
+-- Return number of machines eligible for agent recovery.
+-- ONCHANGE(queries_workflows.sql): constraints must be kept in sync with GetMachineForAgentRecovery.
+SELECT COUNT(machine_provided)
+FROM machines
+ INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
+ LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process IN ('ShepherdAccess', 'ShepherdAgentStart', 'ShepherdRecovery')
+ LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdRecovery'
+ INNER JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
+ LEFT JOIN machine_agent_heartbeat ON machines.machine_id = machine_agent_heartbeat.machine_id
+ LEFT JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
+ LEFT JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
+WHERE
+ work.machine_id IS NULL
+ AND work_backoff.machine_id IS NULL
+ -- Only act on machines where the agent is expected to have been started:
+ -- 1. If there is a pending installation request, it must not have been fulfilled already.
+ AND (
+ machine_os_installation_request.machine_id IS NULL
+ OR machine_os_installation_report.generation IS DISTINCT FROM machine_os_installation_request.generation
+ )
+ -- 2. The agent must have never heartbeat or must have stopped heartbeating.
+ AND (
+ -- No heartbeat 30 minutes after starting the agent.
+ ( machine_agent_heartbeat.machine_id IS NULL
+ AND now() > (machine_agent_started.agent_started_at + interval '30 minutes')
+ )
+ -- Heartbeats ceased for 10 minutes.
+ OR ( machine_agent_heartbeat.machine_id IS NOT NULL
+ AND now() > (machine_agent_heartbeat.agent_heartbeat_at + interval '10 minutes')
+ )
+ );
+
+-- name: ListMachineHardware :many
+SELECT
+ machine_provided.*,
+ machine_hardware_report.*
+FROM machines
+ INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
+ INNER JOIN machine_hardware_report ON machines.machine_id = machine_hardware_report.machine_id
+WHERE machines.machine_id > $1
+ORDER BY machines.machine_id ASC
+LIMIT $2;
diff --git a/cloud/bmaas/bmdb/model/queries_workflows.sql b/cloud/bmaas/bmdb/model/queries_workflows.sql
index 62e4a40..854c15d 100644
--- a/cloud/bmaas/bmdb/model/queries_workflows.sql
+++ b/cloud/bmaas/bmdb/model/queries_workflows.sql
@@ -8,6 +8,7 @@
-- name: GetMachinesForAgentStart :many
-- Get machines that need agent started for the first time. Machine can be
-- assumed to be 'new', with no previous attempts or failures.
+-- ONCHANGE(queries_stats.sql): constraints must be kept in sync with StatsMachinesForAgentStart.
SELECT
machine_provided.*
FROM machines
@@ -34,6 +35,7 @@
-- some point, or the machine got rebooted or somehow else lost the agent. Assume
-- some work needs to be performed on the shepherd side to diagnose and recover
-- whatever state the machine truly is in.
+-- ONCHANGE(queries_stats.sql): constraints must be kept in sync with StatsMachinesForAgentRecovery.
SELECT
machine_provided.*
FROM machines