c/bmaas/bmdb: implement backoff and history

This augments the existing Work mechanism with a Fail outcome/method
which allows insertion of a machine & process backoff until a deadline
expires.

We also add a history/audit table which contains information about the
work history of a machine - when some work started, finished, failed or
got cancelled.

Change-Id: If890a412977c1d3c7ff3baa69987fb74932818a0
Reviewed-on: https://review.monogon.dev/c/monogon/+/1086
Tested-by: Jenkins CI
Reviewed-by: Leopold Schabel <leo@monogon.tech>
diff --git a/cloud/bmaas/bmdb/sessions.go b/cloud/bmaas/bmdb/sessions.go
index ccb1510..4336121 100644
--- a/cloud/bmaas/bmdb/sessions.go
+++ b/cloud/bmaas/bmdb/sessions.go
@@ -289,6 +289,14 @@
 			}
 			return fmt.Errorf("could not start work on %q: %w", mids[0], err)
 		}
+		err = q.WorkHistoryInsert(ctx, model.WorkHistoryInsertParams{
+			MachineID: mids[0],
+			Event:     model.WorkHistoryEventStarted,
+			Process:   process,
+		})
+		if err != nil {
+			return fmt.Errorf("could not insert history event: %w", err)
+		}
 		return nil
 	})
 	if err != nil {
@@ -329,11 +337,19 @@
 	// will be invalidated soon and so will the work being performed on this
 	// machine.
 	err := w.s.Transact(ctx, func(q *model.Queries) error {
-		return q.FinishWork(ctx, model.FinishWorkParams{
+		err := q.FinishWork(ctx, model.FinishWorkParams{
 			MachineID: w.Machine,
 			SessionID: w.s.UUID,
 			Process:   w.process,
 		})
+		if err != nil {
+			return err
+		}
+		return q.WorkHistoryInsert(ctx, model.WorkHistoryInsertParams{
+			MachineID: w.Machine,
+			Process:   w.process,
+			Event:     model.WorkHistoryEventCanceled,
+		})
 	})
 	if err != nil {
 		klog.Errorf("Failed to cancel work %q on %q (sess %q): %v", w.process, w.Machine, w.s.UUID, err)
@@ -362,6 +378,61 @@
 		if err != nil {
 			return err
 		}
+		err = q.WorkHistoryInsert(ctx, model.WorkHistoryInsertParams{
+			MachineID: w.Machine,
+			Process:   w.process,
+			Event:     model.WorkHistoryEventFinished,
+		})
+		if err != nil {
+			return err
+		}
 		return fn(q)
 	})
 }
+
+// Fail marks the work as failed and optionally introduces a backoff of the
+// given duration. While that backoff is active, no further work for this
+// machine/process will be started. A nil backoff, or one shorter than one
+// second, inserts no backoff; longer ones are truncated to whole seconds. The
+// given cause is an operator-readable string that is persisted in the work
+// history/audit table and alongside the backoff, if any.
+func (w *Work) Fail(ctx context.Context, backoff *time.Duration, cause string) error {
+	if w.done {
+		return fmt.Errorf("already finished")
+	}
+	w.done = true
+
+	return w.s.Transact(ctx, func(q *model.Queries) error {
+		err := q.FinishWork(ctx, model.FinishWorkParams{
+			MachineID: w.Machine,
+			SessionID: w.s.UUID,
+			Process:   w.process,
+		})
+		if err != nil {
+			return err
+		}
+		err = q.WorkHistoryInsert(ctx, model.WorkHistoryInsertParams{
+			MachineID: w.Machine,
+			Process:   w.process,
+			Event:     model.WorkHistoryEventFailed,
+			FailedCause: sql.NullString{
+				String: cause,
+				Valid:  true,
+			},
+		})
+		if err != nil {
+			return err
+		}
+		if backoff == nil || backoff.Seconds() < 1.0 {
+			return nil
+		}
+		seconds := int64(backoff.Seconds())
+		klog.Infof("Adding backoff for %q on machine %q (%d seconds)", w.process, w.Machine, seconds)
+		return q.WorkBackoffInsert(ctx, model.WorkBackoffInsertParams{
+			MachineID: w.Machine,
+			Process:   w.process,
+			Seconds:   seconds,
+			Cause:     cause,
+		})
+	})
+}