c/bmaas/bmdb: implement backoff and history
This augments the existing Work mechanism with a Fail outcome/method
which allows insertion of a machine & process backoff until a deadline
expires.
We also add a history/audit table which contains information about the
work history of a machine - when some work started, finished, failed or
got cancelled.
Change-Id: If890a412977c1d3c7ff3baa69987fb74932818a0
Reviewed-on: https://review.monogon.dev/c/monogon/+/1086
Tested-by: Jenkins CI
Reviewed-by: Leopold Schabel <leo@monogon.tech>
diff --git a/cloud/bmaas/bmdb/model/migrations/1672749980_backoff.down.sql b/cloud/bmaas/bmdb/model/migrations/1672749980_backoff.down.sql
new file mode 100644
index 0000000..2aa3ced
--- /dev/null
+++ b/cloud/bmaas/bmdb/model/migrations/1672749980_backoff.down.sql
@@ -0,0 +1,3 @@
+DROP TABLE work_backoff;
+DROP TABLE work_history;
+DROP TYPE work_history_event;
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/model/migrations/1672749980_backoff.up.sql b/cloud/bmaas/bmdb/model/migrations/1672749980_backoff.up.sql
new file mode 100644
index 0000000..24f90e5
--- /dev/null
+++ b/cloud/bmaas/bmdb/model/migrations/1672749980_backoff.up.sql
@@ -0,0 +1,53 @@
+CREATE TYPE work_history_event AS ENUM (
+ 'Started',
+ 'Finished',
+ 'Failed',
+ 'Canceled'
+);
+
+-- Audit trail of work history for a given machine.
+CREATE TABLE work_history(
+ -- The machine subject to this audit entry. As we want to allow keeping
+ -- information about deleted machines, this is not a foreign key.
+ machine_id UUID NOT NULL,
+
+ -- TODO(q3k): session history?
+
+ -- Process acting on this machine which caused an audit entry to be created.
+ process process NOT NULL,
+ -- Process lifecycle event (started, finished, etc) that caused this audit
+ -- entry to be created.
+ event work_history_event NOT NULL,
+ -- Time at which this entry was created.
+ timestamp TIMESTAMPTZ NOT NULL,
+
+ -- Failure cause, only set when event == Failed.
+ failed_cause STRING
+);
+
+CREATE INDEX ON work_history (machine_id);
+
+-- Backoff entries are created by failed work items, and effectively act as
+-- a Lockout-tagout entry for a given machine and a given process.
+--
+-- Currently, there is no way to fully backoff an entire machine, just
+-- individual processes from a given machine.
+--
+-- Backoff entries are only valid as long as 'until' is before now(), after that
+-- they are ignored by workflow queries. Insertion queries act as upserts,
+-- and thus the backoff entries do not need to be garbage collected, as they do
+-- not grow unbounded (maximumum one entry per process/machine).
+CREATE TABLE work_backoff(
+ -- The machine affected by this backoff.
+ machine_id UUID NOT NULL REFERENCES machines(machine_id) ON DELETE CASCADE,
+ -- The process that this machine should not be subjected to.
+ process process NOT NULL,
+ -- Until when the backoff is enforced.
+ until TIMESTAMPTZ NOT NULL,
+
+ -- Error reported by process/work when this backoff was inserted.
+ -- Human-readable.
+ cause STRING NOT NULL,
+
+ UNIQUE(machine_id, process)
+);
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/model/queries_base.sql b/cloud/bmaas/bmdb/model/queries_base.sql
index 60d1737..8257fb8 100644
--- a/cloud/bmaas/bmdb/model/queries_base.sql
+++ b/cloud/bmaas/bmdb/model/queries_base.sql
@@ -42,3 +42,29 @@
AND session_id = $2
AND process = $3;
+
+-- name: WorkHistoryInsert :exec
+-- Insert an entry into the work_history audit table.
+INSERT INTO work_history (
+ machine_id, process, event, timestamp, failed_cause
+) VALUES (
+ $1, $2, $3, now(), $4
+);
+
+-- name: WorkBackoffInsert :exec
+-- Upsert a backoff for a given machine/process.
+INSERT INTO work_backoff (
+ machine_id, process, cause, until
+) VALUES (
+ $1, $2, $3, now() + (sqlc.arg(seconds)::int * interval '1 second')
+) ON CONFLICT (machine_id, process) DO UPDATE SET
+ cause = $3,
+ until = now() + (sqlc.arg(seconds)::int * interval '1 second')
+;
+
+-- name: ListHistoryOf :many
+-- Retrieve full audit history of a machine.
+SELECT *
+FROM work_history
+WHERE machine_id = $1
+ORDER BY timestamp ASC;
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/model/queries_workflows.sql b/cloud/bmaas/bmdb/model/queries_workflows.sql
index d2e9966..b7b60fe 100644
--- a/cloud/bmaas/bmdb/model/queries_workflows.sql
+++ b/cloud/bmaas/bmdb/model/queries_workflows.sql
@@ -6,12 +6,14 @@
FROM machines
INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process = 'ShepherdInstall'
+LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdInstall'
LEFT JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
WHERE
machine_agent_started.machine_id IS NULL
-- TODO(q3k): exclude machines which are not expected to run the agent (eg.
-- are already exposed to a user).
AND work.machine_id IS NULL
+ AND work_backoff.machine_id IS NULL
LIMIT $1;
-- name: GetMachineForAgentRecovery :many
@@ -25,6 +27,7 @@
FROM machines
INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process = 'ShepherdInstall'
+LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdInstall'
LEFT JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
LEFT JOIN machine_agent_heartbeat ON machines.machine_id = machine_agent_heartbeat.machine_id
WHERE
@@ -43,6 +46,7 @@
)
)
AND work.machine_id IS NULL
+ AND work_backoff.machine_id IS NULL
LIMIT $1;
-- name: AuthenticateAgentConnection :many
@@ -75,5 +79,4 @@
machine_os_installation_report.machine_id IS NULL
OR machine_os_installation_report.generation != machine_os_installation_request.generation
)
-LIMIT $2;
-
+LIMIT $2;
\ No newline at end of file