cloud/bmaas: split ShepherdAccess into Shepherd{AgentStart,Recovery}
This effectively undoes our previous attempted consolidation of all
Shepherd accesses under one tag. We now use two separate tags for the
two main Shepherd work processes, and mutually exclude them in SQL.
We do this so that we can see more clearly in work history (and in
general when processing machines) what the Shepherd is actually trying
to do to a machine.
The downside of this implementation is that we now extend the BMDB/ETP
model to being able to mutually exclude different processes. This is
easy enough to express in SQL, but might make future generic modelling
more difficult.
An alternative would be to add an extra field to work/work history that
acts as an informative field for operators to know the details of a work
item. We might still want to do that in the future. However, that field
being freeform, we could not really rely on it for machine parsing.
Change-Id: I9578ac000f6112514fe587e9fddf7e85671c6437
Reviewed-on: https://review.monogon.dev/c/monogon/+/1584
Reviewed-by: Leopold Schabel <leo@monogon.tech>
Tested-by: Jenkins CI
diff --git a/cloud/bmaas/bmdb/model/migrations/1681980576_extra_shepherd_processes.down.sql b/cloud/bmaas/bmdb/model/migrations/1681980576_extra_shepherd_processes.down.sql
new file mode 100644
index 0000000..e01997f
--- /dev/null
+++ b/cloud/bmaas/bmdb/model/migrations/1681980576_extra_shepherd_processes.down.sql
@@ -0,0 +1,2 @@
+-- Not removing added enum values, as the 'up' migration has 'if not exists',
+-- and there is no harm in keeping unused enum values around.
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/model/migrations/1681980576_extra_shepherd_processes.up.sql b/cloud/bmaas/bmdb/model/migrations/1681980576_extra_shepherd_processes.up.sql
new file mode 100644
index 0000000..6e7c630
--- /dev/null
+++ b/cloud/bmaas/bmdb/model/migrations/1681980576_extra_shepherd_processes.up.sql
@@ -0,0 +1,6 @@
+-- Add two more process kinds, ShepherdAgentStart and ShepherdRecovery, for agent
+-- start and recovery by the shepherd respectively. These deprecate the previous
+-- ShepherdAccess process. The two processes mutually exclude each other.
+
+ALTER TYPE process ADD VALUE IF NOT EXISTS 'ShepherdAgentStart';
+ALTER TYPE process ADD VALUE IF NOT EXISTS 'ShepherdRecovery';
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/model/queries_workflows.sql b/cloud/bmaas/bmdb/model/queries_workflows.sql
index 8f22f41..07ebf77 100644
--- a/cloud/bmaas/bmdb/model/queries_workflows.sql
+++ b/cloud/bmaas/bmdb/model/queries_workflows.sql
@@ -12,8 +12,8 @@
machine_provided.*
FROM machines
INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
-LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process = 'ShepherdAccess'
-LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdAccess'
+LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process IN ('ShepherdAccess', 'ShepherdAgentStart', 'ShepherdRecovery')
+LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdAgentStart'
LEFT JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
WHERE
machine_agent_started.machine_id IS NULL
@@ -33,8 +33,8 @@
machine_provided.*
FROM machines
INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
-LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process = 'ShepherdAccess'
-LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdAccess'
+LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process IN ('ShepherdAccess', 'ShepherdAgentStart', 'ShepherdRecovery')
+LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdRecovery'
LEFT JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
LEFT JOIN machine_agent_heartbeat ON machines.machine_id = machine_agent_heartbeat.machine_id
WHERE
diff --git a/cloud/bmaas/bmdb/sessions_test.go b/cloud/bmaas/bmdb/sessions_test.go
index e697150..b8625a9 100644
--- a/cloud/bmaas/bmdb/sessions_test.go
+++ b/cloud/bmaas/bmdb/sessions_test.go
@@ -201,7 +201,7 @@
if time.Now().After(deadline) {
t.Fatalf("Deadline expired")
}
- work, err := session.Work(ctx, model.ProcessShepherdAccess, func(q *model.Queries) ([]uuid.UUID, error) {
+ work, err := session.Work(ctx, model.ProcessShepherdAgentStart, func(q *model.Queries) ([]uuid.UUID, error) {
machines, err := q.GetMachinesForAgentStart(ctx, 1)
if err != nil {
return nil, err
@@ -274,7 +274,7 @@
var err error
backoffs, err = q.WorkBackoffOf(ctx, model.WorkBackoffOfParams{
MachineID: machine.MachineID,
- Process: model.ProcessShepherdAccess,
+ Process: model.ProcessShepherdAgentStart,
})
return err
})
@@ -350,7 +350,7 @@
doneC := make(chan struct{})
errC := make(chan error)
go func() {
- work, err := session.Work(ctx, model.ProcessShepherdAccess, func(q *model.Queries) ([]uuid.UUID, error) {
+ work, err := session.Work(ctx, model.ProcessShepherdAgentStart, func(q *model.Queries) ([]uuid.UUID, error) {
machines, err := q.GetMachinesForAgentStart(ctx, 1)
if err != nil {
return nil, err
@@ -384,8 +384,11 @@
errC <- err
}()
<-startedC
+
// Work on the machine has started. Attempting to get more machines now should
// return no machines.
+
+ // Mutual exclusion with AgentStart:
err = session.Transact(ctx, func(q *model.Queries) error {
machines, err := q.GetMachinesForAgentStart(ctx, 1)
if err != nil {
@@ -399,6 +402,22 @@
if err != nil {
t.Fatalf("Failed to retrieve machines for start in parallel: %v", err)
}
+
+ // Mutual exclusion with Recovery:
+ err = session.Transact(ctx, func(q *model.Queries) error {
+ machines, err := q.GetMachineForAgentRecovery(ctx, 1)
+ if err != nil {
+ return err
+ }
+ if len(machines) > 0 {
+ t.Errorf("Expected no machines ready for agent recovery.")
+ }
+ return nil
+ })
+ if err != nil {
+ t.Fatalf("Failed to retrieve machines for recovery in parallel: %v", err)
+ }
+
// Finish working on machine.
close(doneC)
err = <-errC
@@ -446,7 +465,7 @@
if want, got := machine, el.MachineID; want.String() != got.String() {
t.Errorf("Wanted %d history event machine ID to be %s, got %s", i, want, got)
}
- if want, got := model.ProcessShepherdAccess, el.Process; want != got {
+ if want, got := model.ProcessShepherdAgentStart, el.Process; want != got {
t.Errorf("Wanted %d history event process to be %s, got %s", i, want, got)
}
}
@@ -498,7 +517,7 @@
})
workOnce := func(ctx context.Context, workerID int, session *Session) error {
- work, err := session.Work(ctx, model.ProcessShepherdAccess, func(q *model.Queries) ([]uuid.UUID, error) {
+ work, err := session.Work(ctx, model.ProcessShepherdAgentStart, func(q *model.Queries) ([]uuid.UUID, error) {
machines, err := q.GetMachinesForAgentStart(ctx, 1)
if err != nil {
return nil, err