cloud/bmaas: split ShepherdAccess into Shepherd{AgentStart,Recovery}

This effectively undoes our previously attempted consolidation of all
Shepherd accesses under one tag. We now use two separate tags for the
two main Shepherd work processes, and mutually exclude them in SQL.

We do this so that we can see more clearly in work history (and in
general when processing machines) what the Shepherd is actually trying
to do to a machine.

The downside of this implementation is that we now extend the BMDB/ETP
model to allow mutually excluding different processes. This is easy
enough to express in SQL, but might make future generic modelling more
difficult.
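
One way to express this kind of exclusion is a NOT EXISTS predicate in
each work-selection query. The sketch below is illustrative only: the
table and column names (machines, work, process, needs_agent_start) and
the 'ShepherdRecovery' literal are assumptions, not the actual BMDB
schema or sqlc queries.

    -- Hypothetical sketch: pick machines that need the agent started,
    -- skipping any machine that currently has a recovery work item.
    SELECT m.machine_id
    FROM machines m
    WHERE m.needs_agent_start  -- placeholder for the real condition
      AND NOT EXISTS (
        SELECT 1 FROM work w
        WHERE w.machine_id = m.machine_id
          AND w.process = 'ShepherdRecovery'
      )
    LIMIT $1;

The query feeding the recovery loop would carry the symmetric predicate
against the agent-start tag.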

An alternative would be to add an extra field to work/work history that
informs operators about the details of a work item. We might still want
to do that in the future. However, since such a field would be freeform,
we could not rely on it being machine-parseable.

Change-Id: I9578ac000f6112514fe587e9fddf7e85671c6437
Reviewed-on: https://review.monogon.dev/c/monogon/+/1584
Reviewed-by: Leopold Schabel <leo@monogon.tech>
Tested-by: Jenkins CI
diff --git a/cloud/shepherd/equinix/manager/control_loop.go b/cloud/shepherd/equinix/manager/control_loop.go
index aa4138e..4896c15 100644
--- a/cloud/shepherd/equinix/manager/control_loop.go
+++ b/cloud/shepherd/equinix/manager/control_loop.go
@@ -26,12 +26,17 @@
 	// work is a machine lock facilitated by BMDB that prevents machines from
 	// being processed by multiple workers at the same time.
 	work *bmdb.Work
+	// backoff is configured from processInfo.defaultBackoff but can be overridden by
+	// processMachine to set a different backoff policy for specific failure modes.
+	backoff bmdb.Backoff
 }
 
 // controlLoop is implemented by any component which should act as a BMDB-based
 // control loop. Implementing these methods allows the given component to be
 // started using RunControlLoop.
 type controlLoop interface {
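+	// getProcessInfo returns the BMDB process under which this control loop
+	// implementation locks its work, alongside its default backoff policy.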
+	getProcessInfo() processInfo
+
 	// getMachines must return the list of machines ready to be processed by the
 	// control loop for a given control loop implementation.
 	getMachines(ctx context.Context, q *model.Queries, limit int32) ([]model.MachineProvided, error)
@@ -45,6 +50,11 @@
 	getControlLoopConfig() *ControlLoopConfig
 }
 
+// processInfo carries the per-implementation constants of a control loop:
+// the BMDB process it locks work under and the default backoff applied when
+// processing a machine fails.
+type processInfo struct {
+	process        model.Process
+	defaultBackoff bmdb.Backoff
+}
+
 // ControlLoopConfig should be embedded in every component which acts as a
 // control loop. RegisterFlags should be called by the component whenever it is
 // registering its own flags. Check should be called whenever the component is
@@ -135,10 +145,12 @@
 // run the control loop(s) (depending on opts.Parallelism) blocking the current
 // goroutine until the given context expires and all provisioners quit.
 func (r *controlLoopRunner) run(ctx context.Context, conn *bmdb.Connection) error {
+	pinfo := r.loop.getProcessInfo()
+
 	eg := errgroup.Group{}
 	for j := 0; j < r.config.Parallelism; j += 1 {
 		eg.Go(func() error {
-			return r.runOne(ctx, conn)
+			return r.runOne(ctx, conn, &pinfo)
 		})
 	}
 	return eg.Wait()
@@ -146,7 +158,7 @@
 
 // run the control loop blocking the current goroutine until the given context
 // expires.
-func (r *controlLoopRunner) runOne(ctx context.Context, conn *bmdb.Connection) error {
+func (r *controlLoopRunner) runOne(ctx context.Context, conn *bmdb.Connection, pinfo *processInfo) error {
 	var err error
 
 	// Maintain a BMDB session as long as possible.
@@ -159,7 +171,7 @@
 			}
 		}
 		// Inside that session, run the main logic.
-		err := r.runInSession(ctx, sess)
+		err := r.runInSession(ctx, sess, pinfo)
 
 		switch {
 		case err == nil:
@@ -180,8 +192,8 @@
 // runInSession executes one iteration of the control loop within a BMDB session.
 // This control loop attempts to start or re-start the agent on any machines that
 // need this per the BMDB.
-func (r *controlLoopRunner) runInSession(ctx context.Context, sess *bmdb.Session) error {
-	t, err := r.source(ctx, sess)
+func (r *controlLoopRunner) runInSession(ctx context.Context, sess *bmdb.Session, pinfo *processInfo) error {
+	t, err := r.source(ctx, sess, pinfo)
 	if err != nil {
 		return fmt.Errorf("could not source machine: %w", err)
 	}
@@ -192,12 +204,7 @@
 
 	if err := r.loop.processMachine(ctx, t); err != nil {
 		klog.Errorf("Failed to process machine %s: %v", t.machine.MachineID, err)
-		backoff := bmdb.Backoff{
-			Initial:  time.Minute,
-			Maximum:  2 * time.Hour,
-			Exponent: 1.1,
-		}
-		err = t.work.Fail(ctx, &backoff, fmt.Sprintf("failed to process: %v", err))
+		err = t.work.Fail(ctx, &t.backoff, fmt.Sprintf("failed to process: %v", err))
 		return err
 	}
 	return nil
@@ -207,11 +214,11 @@
 // control loop, locked by a work item. If both task and error are nil, then
 // there are no machines that need to be initialized. The returned work item in task
 // _must_ be canceled or finished by the caller.
-func (r *controlLoopRunner) source(ctx context.Context, sess *bmdb.Session) (*task, error) {
+func (r *controlLoopRunner) source(ctx context.Context, sess *bmdb.Session, pinfo *processInfo) (*task, error) {
 	r.config.DBQueryLimiter.Wait(ctx)
 
 	var machine *model.MachineProvided
-	work, err := sess.Work(ctx, model.ProcessShepherdAccess, func(q *model.Queries) ([]uuid.UUID, error) {
+	work, err := sess.Work(ctx, pinfo.process, func(q *model.Queries) ([]uuid.UUID, error) {
 		machines, err := r.loop.getMachines(ctx, q, 1)
 		if err != nil {
 			return nil, err
@@ -234,5 +241,6 @@
 	return &task{
 		machine: machine,
 		work:    work,
+		backoff: pinfo.defaultBackoff,
 	}, nil
 }
diff --git a/cloud/shepherd/equinix/manager/initializer.go b/cloud/shepherd/equinix/manager/initializer.go
index 95f9d39..c90f8d8 100644
--- a/cloud/shepherd/equinix/manager/initializer.go
+++ b/cloud/shepherd/equinix/manager/initializer.go
@@ -20,6 +20,7 @@
 	"k8s.io/klog/v2"
 
 	apb "source.monogon.dev/cloud/agent/api"
+	"source.monogon.dev/cloud/bmaas/bmdb"
 	"source.monogon.dev/cloud/bmaas/bmdb/model"
 	ecl "source.monogon.dev/cloud/shepherd/equinix/wrapngo"
 )
@@ -147,6 +148,17 @@
 	}, nil
 }
 
+func (c *Initializer) getProcessInfo() processInfo {
+	return processInfo{
+		process: model.ProcessShepherdAgentStart,
+		defaultBackoff: bmdb.Backoff{
+			Initial:  5 * time.Minute,
+			Maximum:  4 * time.Hour,
+			Exponent: 1.2,
+		},
+	}
+}
+
 func (c *Initializer) getMachines(ctx context.Context, q *model.Queries, limit int32) ([]model.MachineProvided, error) {
 	return q.GetMachinesForAgentStart(ctx, limit)
 }
diff --git a/cloud/shepherd/equinix/manager/recoverer.go b/cloud/shepherd/equinix/manager/recoverer.go
index 4ec73af..85ec440 100644
--- a/cloud/shepherd/equinix/manager/recoverer.go
+++ b/cloud/shepherd/equinix/manager/recoverer.go
@@ -8,6 +8,7 @@
 
 	"k8s.io/klog/v2"
 
+	"source.monogon.dev/cloud/bmaas/bmdb"
 	"source.monogon.dev/cloud/bmaas/bmdb/model"
 	ecl "source.monogon.dev/cloud/shepherd/equinix/wrapngo"
 )
@@ -40,6 +41,17 @@
 	}, nil
 }
 
+func (r *Recoverer) getProcessInfo() processInfo {
+	return processInfo{
+		process: model.ProcessShepherdRecovery,
+		defaultBackoff: bmdb.Backoff{
+			Initial:  1 * time.Minute,
+			Maximum:  1 * time.Hour,
+			Exponent: 1.2,
+		},
+	}
+}
+
 func (r *Recoverer) getMachines(ctx context.Context, q *model.Queries, limit int32) ([]model.MachineProvided, error) {
 	return q.GetMachineForAgentRecovery(ctx, limit)
 }