cloud: rename ShepherdInstall to ShepherdAccess and clean up other 'installation' references

ShepherdInstall should've been called ShepherdAgentStart from the
beginning, but got named incorrectly because of a momentary lapse of
reason.

Instead of simply renaming it to that, we widen the Process to cover
anything that involves the Shepherds directly accessing the underlying
provider machines, be it starting the agent, trying to fix the machine,
or rebooting a machine. This wide meaning makes sure no two shepherds
ever work on the same machine, regardless of the actual workflow each
one is performing.

Change-Id: Ic247919d1bcf1c5ec9fcf7125f17b90413068ed5
Reviewed-on: https://review.monogon.dev/c/monogon/+/1138
Tested-by: Jenkins CI
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/cloud/bmaas/bmdb/model/migrations/1667232160_agent_tags.up.sql b/cloud/bmaas/bmdb/model/migrations/1667232160_agent_tags.up.sql
index e1afca2..1d2144c 100644
--- a/cloud/bmaas/bmdb/model/migrations/1667232160_agent_tags.up.sql
+++ b/cloud/bmaas/bmdb/model/migrations/1667232160_agent_tags.up.sql
@@ -53,5 +53,6 @@
     CONSTRAINT "primary" PRIMARY KEY(machine_id)
 );
 
--- Used by the Shepherd when performing direct actions against a machine.
-ALTER TYPE process ADD VALUE IF NOT EXISTS 'ShepherdInstall';
+-- Used by the Shepherd when performing mutations against the underlying machine
+-- (e.g. SSH access, restarts, ...).
+ALTER TYPE process ADD VALUE IF NOT EXISTS 'ShepherdAccess';
diff --git a/cloud/bmaas/bmdb/model/queries_workflows.sql b/cloud/bmaas/bmdb/model/queries_workflows.sql
index b0132a3..8f22f41 100644
--- a/cloud/bmaas/bmdb/model/queries_workflows.sql
+++ b/cloud/bmaas/bmdb/model/queries_workflows.sql
@@ -6,14 +6,14 @@
 WHERE machine_provided.provider = $1;
 
 -- name: GetMachinesForAgentStart :many
--- Get machines that need agent installed for the first time. Machine can be
+-- Get machines that need agent started for the first time. Machine can be
 -- assumed to be 'new', with no previous attempts or failures.
 SELECT
     machine_provided.*
 FROM machines
 INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
-LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process = 'ShepherdInstall'
-LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdInstall'
+LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process = 'ShepherdAccess'
+LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdAccess'
 LEFT JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
 WHERE
   machine_agent_started.machine_id IS NULL
@@ -24,17 +24,17 @@
 LIMIT $1;
 
 -- name: GetMachineForAgentRecovery :many
--- Get machines that need agent installed after something went wrong. Either
+-- Get machines that need agent restarted after something went wrong. Either
 -- the agent started but never responded, or the agent stopped responding at
--- some point, or the machine is being reinstalled after failure. Assume some
--- work needs to be performed on the shepherd side to diagnose and recover
+-- some point, or the machine got rebooted or otherwise lost the agent. Assume
+-- some work needs to be performed on the shepherd side to diagnose and recover
 -- whatever state the machine truly is in.
 SELECT
     machine_provided.*
 FROM machines
 INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
-LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process = 'ShepherdInstall'
-LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdInstall'
+LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process = 'ShepherdAccess'
+LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdAccess'
 LEFT JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
 LEFT JOIN machine_agent_heartbeat ON machines.machine_id = machine_agent_heartbeat.machine_id
 WHERE
diff --git a/cloud/bmaas/bmdb/sessions_test.go b/cloud/bmaas/bmdb/sessions_test.go
index 097c79e..bd29d5d 100644
--- a/cloud/bmaas/bmdb/sessions_test.go
+++ b/cloud/bmaas/bmdb/sessions_test.go
@@ -193,7 +193,7 @@
 	}
 
 	// Work on machine, but fail it with a backoff.
-	work, err := session.Work(ctx, model.ProcessShepherdInstall, func(q *model.Queries) ([]uuid.UUID, error) {
+	work, err := session.Work(ctx, model.ProcessShepherdAccess, func(q *model.Queries) ([]uuid.UUID, error) {
 		machines, err := q.GetMachinesForAgentStart(ctx, 1)
 		if err != nil {
 			return nil, err
@@ -218,12 +218,12 @@
 			return err
 		}
 		if len(machines) > 0 {
-			t.Errorf("Expected no machines ready for installation.")
+			t.Errorf("Expected no machines ready for agent start.")
 		}
 		return nil
 	})
 	if err != nil {
-		t.Errorf("Failed to retrieve machines for installation: %v", err)
+		t.Errorf("Failed to retrieve machines for agent start: %v", err)
 	}
 
 	// Instead of waiting for the backoff to expire, set it again, but this time
@@ -231,7 +231,7 @@
 	err = session.Transact(ctx, func(q *model.Queries) error {
 		return q.WorkBackoffInsert(ctx, model.WorkBackoffInsertParams{
 			MachineID: machine.MachineID,
-			Process:   model.ProcessShepherdInstall,
+			Process:   model.ProcessShepherdAccess,
 			Seconds:   0,
 		})
 	})
@@ -249,18 +249,17 @@
 			return err
 		}
 		if len(machines) != 1 {
-			t.Errorf("Expected exactly one machine ready for installation.")
+			t.Errorf("Expected exactly one machine ready for agent start.")
 		}
 		return nil
 	})
 	if err != nil {
-		t.Errorf("Failed to retrieve machines for installation: %v", err)
+		t.Errorf("Failed to retrieve machines for agent start: %v", err)
 	}
 }
 
-// TestInstallationWorkflow exercises the agent installation workflow within the
-// BMDB.
-func TestInstallationWorkflow(t *testing.T) {
+// TestAgentStartWorkflow exercises the agent start workflow within the BMDB.
+func TestAgentStartWorkflow(t *testing.T) {
 	b := dut()
 	conn, err := b.Open(true)
 	if err != nil {
@@ -297,7 +296,7 @@
 	doneC := make(chan struct{})
 	errC := make(chan error)
 	go func() {
-		work, err := session.Work(ctx, model.ProcessShepherdInstall, func(q *model.Queries) ([]uuid.UUID, error) {
+		work, err := session.Work(ctx, model.ProcessShepherdAccess, func(q *model.Queries) ([]uuid.UUID, error) {
 			machines, err := q.GetMachinesForAgentStart(ctx, 1)
 			if err != nil {
 				return nil, err
@@ -339,12 +338,12 @@
 			return err
 		}
 		if len(machines) > 0 {
-			t.Errorf("Expected no machines ready for installation.")
+			t.Errorf("Expected no machines ready for agent start.")
 		}
 		return nil
 	})
 	if err != nil {
-		t.Fatalf("Failed to retrieve machines for installation in parallel: %v", err)
+		t.Fatalf("Failed to retrieve machines for start in parallel: %v", err)
 	}
 	// Finish working on machine.
 	close(doneC)
@@ -352,19 +351,20 @@
 	if err != nil {
 		t.Fatalf("Failed to finish work on machine: %v", err)
 	}
-	// That machine is now installed, so we still expect no work to have to be done.
+	// That machine has its agent started, so we still expect no work to have to be
+	// done.
 	err = session.Transact(ctx, func(q *model.Queries) error {
 		machines, err := q.GetMachinesForAgentStart(ctx, 1)
 		if err != nil {
 			return err
 		}
 		if len(machines) > 0 {
-			t.Errorf("Expected still no machines ready for installation.")
+			t.Errorf("Expected still no machines ready for agent start.")
 		}
 		return nil
 	})
 	if err != nil {
-		t.Fatalf("Failed to retrieve machines for installation after work finished: %v", err)
+		t.Fatalf("Failed to retrieve machines for agent start after work finished: %v", err)
 	}
 
 	// Check history has been recorded.
@@ -392,15 +392,15 @@
 		if want, got := machine, el.MachineID; want.String() != got.String() {
 			t.Errorf("Wanted %d history event machine ID to be %s, got %s", i, want, got)
 		}
-		if want, got := model.ProcessShepherdInstall, el.Process; want != got {
+		if want, got := model.ProcessShepherdAccess, el.Process; want != got {
 			t.Errorf("Wanted %d history event process to be %s, got %s", i, want, got)
 		}
 	}
 }
 
-// TestInstallationWorkflowParallel starts work on three machines by six workers
+// TestAgentStartWorkflowParallel starts work on three machines by six workers
 // and makes sure that there are no scheduling conflicts between them.
-func TestInstallationWorkflowParallel(t *testing.T) {
+func TestAgentStartWorkflowParallel(t *testing.T) {
 	b := dut()
 	conn, err := b.Open(true)
 	if err != nil {
@@ -444,7 +444,7 @@
 	})
 
 	workOnce := func(ctx context.Context, workerID int, session *Session) error {
-		work, err := session.Work(ctx, model.ProcessShepherdInstall, func(q *model.Queries) ([]uuid.UUID, error) {
+		work, err := session.Work(ctx, model.ProcessShepherdAccess, func(q *model.Queries) ([]uuid.UUID, error) {
 			machines, err := q.GetMachinesForAgentStart(ctx, 1)
 			if err != nil {
 				return nil, err
diff --git a/cloud/shepherd/equinix/manager/initializer.go b/cloud/shepherd/equinix/manager/initializer.go
index fd2e00b..5b1ea49 100644
--- a/cloud/shepherd/equinix/manager/initializer.go
+++ b/cloud/shepherd/equinix/manager/initializer.go
@@ -216,7 +216,7 @@
 	}
 	defer t.work.Cancel(ctx)
 
-	klog.Infof("Machine %q needs installation, fetching corresponding packngo device %q...", t.id, t.pid)
+	klog.Infof("Machine %q needs agent start, fetching corresponding packngo device %q...", t.id, t.pid)
 	dev, err := c.cl.GetDevice(ctx, c.sharedConfig.ProjectId, t.pid.String())
 	if err != nil {
 		klog.Errorf("failed to fetch device %q: %v", t.pid, err)
@@ -314,7 +314,7 @@
 	klog.Infof("Starting agent on device (ID: %s, PID %s)", t.id, t.pid)
 	apk, err := ir.startAgent(ctx, sgn, *t.dev)
 	if err != nil {
-		return fmt.Errorf("while installing the agent: %w", err)
+		return fmt.Errorf("while starting the agent: %w", err)
 	}
 
 	// Agent startup succeeded. Set the appropriate BMDB tag, and release the
@@ -341,7 +341,7 @@
 	ir.config.DBQueryLimiter.Wait(ctx)
 
 	var machine *model.MachineProvided
-	work, err := sess.Work(ctx, model.ProcessShepherdInstall, func(q *model.Queries) ([]uuid.UUID, error) {
+	work, err := sess.Work(ctx, model.ProcessShepherdAccess, func(q *model.Queries) ([]uuid.UUID, error) {
 		machines, err := q.GetMachinesForAgentStart(ctx, 1)
 		if err != nil {
 			return nil, err