cloud/shepherd/equinix: implement recoverer This implements basic recovery functionality for 'stuck' agents. The shepherd will notice machines with an agent that either never sent a heartbeat, or stopped sending heartbeats, and will remove their agent started tags and reboot the machine. Then, the main agent start logic should kick in again. More complex recovery flows can be implemented later; this will do for now. Change-Id: I2c1b0d0465e4e302cdecce950a041581c2dc8548 Reviewed-on: https://review.monogon.dev/c/monogon/+/1560 Tested-by: Jenkins CI Reviewed-by: Tim Windelschmidt <tim@monogon.tech>

commit: ae00468363b0006ecf1ae90ed3833bbe54820df5 [log] [tgz]
author: Serge Bazanski <serge@monogon.tech> Tue Apr 18 13:28:48 2023 +0200
committer: Serge Bazanski <serge@monogon.tech> Wed Apr 19 13:55:01 2023 +0000
tree: 3dff4cdf264bed17e66f7aed2c8085b67738104d
parent: 86a714d6e81bb524dc59fda7baa63b45e7180489 [diff] [blame]
diff --git a/cloud/shepherd/equinix/manager/fakequinix_test.go b/cloud/shepherd/equinix/manager/fakequinix_test.go
index 3970373..d9e5683 100644
--- a/cloud/shepherd/equinix/manager/fakequinix_test.go
+++ b/cloud/shepherd/equinix/manager/fakequinix_test.go

@@ -19,6 +19,7 @@
 	devices      map[string]*packngo.Device
 	reservations map[string]*packngo.HardwareReservation
 	sshKeys      map[string]*packngo.SSHKey
+	reboots      map[string]int
 }
 
 // newFakequinix makes a fakequinix with a given fake project ID and number of
@@ -29,6 +30,7 @@
 		devices:      make(map[string]*packngo.Device),
 		reservations: make(map[string]*packngo.HardwareReservation),
 		sshKeys:      make(map[string]*packngo.SSHKey),
+		reboots:      make(map[string]int),
 	}
 
 	for i := 0; i < numReservations; i++ {
@@ -168,5 +170,14 @@
 	return key, nil
 }
 
+func (f *fakequinix) RebootDevice(_ context.Context, did string) error {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	f.reboots[did]++
+
+	return nil
+}
+
 func (f *fakequinix) Close() {
 }
commit	ae00468363b0006ecf1ae90ed3833bbe54820df5	[log] [tgz]
author	Serge Bazanski <serge@monogon.tech>	Tue Apr 18 13:28:48 2023 +0200
committer	Serge Bazanski <serge@monogon.tech>	Wed Apr 19 13:55:01 2023 +0000
tree	3dff4cdf264bed17e66f7aed2c8085b67738104d
parent	86a714d6e81bb524dc59fda7baa63b45e7180489 [diff] [blame]