cloud/bmaas: implement exponential backoffs This lets work fail with a proper exponential backoff. This is important not just to not hammer external systems, but also to not end up with extremely long and verbose historical logs for repeatedly failing processes. To implement these, we have to slightly alter our model: instead of always persisting backoffs for a given machine, we only persist them as long as the last work for a given process has failed, deleting pertinent backoffs (if any) on success. Then, the existence of a backoff item is used to calculate the value of the next backoff. The change also introduces an explicit period, in seconds, to the backoff item. It is currently implemented as a nullable field, but a future migration/update might make it non-nullable (and delete any straggling backoffs that still don't have the period set). Change-Id: I958fcd957dae1156349224f07fb8d4836955d375 Reviewed-on: https://review.monogon.dev/c/monogon/+/1565 Tested-by: Jenkins CI Reviewed-by: Lorenz Brun <lorenz@monogon.tech>

commit: 20312b40ed4644c64581b4cc8d93a0fc0035fc71 [log] [tgz]
author: Serge Bazanski <serge@monogon.tech> Wed Apr 19 13:49:47 2023 +0200
committer: Serge Bazanski <serge@monogon.tech> Wed Apr 19 15:42:44 2023 +0000
tree: 5b812982e0b5bfd7ef0c10659393d37ad0062d62
parent: ff619354a126df5acffd78ee2f072a17d074e7ab [diff] [blame]
diff --git a/cloud/shepherd/equinix/manager/initializer.go b/cloud/shepherd/equinix/manager/initializer.go
index 4193f4c..bd1dbba 100644
--- a/cloud/shepherd/equinix/manager/initializer.go
+++ b/cloud/shepherd/equinix/manager/initializer.go

@@ -154,10 +154,7 @@
 func (c *Initializer) processMachine(ctx context.Context, t *task) error {
 	dev, err := c.cl.GetDevice(ctx, c.sharedConfig.ProjectId, t.machine.ProviderID)
 	if err != nil {
-		klog.Errorf("failed to fetch device %q: %v", t.machine.ProviderID, err)
-		d := 30 * time.Second
-		err = t.work.Fail(ctx, &d, "failed to fetch device from equinix")
-		return err
+		return fmt.Errorf("while fetching device %q: %v", t.machine.ProviderID, err)
 	}
 
 	// Start the agent.
commit	20312b40ed4644c64581b4cc8d93a0fc0035fc71	[log] [tgz]
author	Serge Bazanski <serge@monogon.tech>	Wed Apr 19 13:49:47 2023 +0200
committer	Serge Bazanski <serge@monogon.tech>	Wed Apr 19 15:42:44 2023 +0000
tree	5b812982e0b5bfd7ef0c10659393d37ad0062d62
parent	ff619354a126df5acffd78ee2f072a17d074e7ab [diff] [blame]